Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
498 changes: 498 additions & 0 deletions src/sagemaker/hyperpod/cli/cluster_stack_utils.py

Large diffs are not rendered by default.

38 changes: 33 additions & 5 deletions src/sagemaker/hyperpod/cli/commands/cluster_stack.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@
from sagemaker.hyperpod.common.telemetry.constants import Feature
from sagemaker.hyperpod.common.utils import setup_logging
from sagemaker.hyperpod.cli.utils import convert_datetimes
from sagemaker.hyperpod.cli.cluster_stack_utils import (
StackNotFoundError,
delete_stack_with_confirmation
)

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -292,8 +296,11 @@ def list_cluster_stacks(region, debug, status):

@click.command("cluster-stack")
@click.argument("stack-name", required=True)
@click.option("--retain-resources", help="Comma-separated list of logical resource IDs to retain during deletion (only works on DELETE_FAILED stacks). Resource names are shown in failed deletion output, or use AWS CLI: 'aws cloudformation list-stack-resources --stack-name STACK_NAME --region REGION'")
@click.option("--region", required=True, help="AWS region (required)")
@click.option("--debug", is_flag=True, help="Enable debug logging")
def delete(stack_name: str, debug: bool) -> None:
@_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "delete_cluster_stack_cli")
def delete_cluster_stack(stack_name: str, retain_resources: str, region: str, debug: bool) -> None:
"""Delete a HyperPod cluster stack.

Removes the specified CloudFormation stack and all associated AWS resources.
Expand All @@ -305,12 +312,34 @@ def delete(stack_name: str, debug: bool) -> None:
.. code-block:: bash

# Delete a cluster stack
hyp delete hyp-cluster my-stack-name
hyp delete cluster-stack my-stack-name --region us-west-2

# Delete with retained resources (only works on DELETE_FAILED stacks)
hyp delete cluster-stack my-stack-name --retain-resources S3Bucket-TrainingData,EFSFileSystem-Models --region us-west-2
"""
logger = setup_logging(logging.getLogger(__name__), debug)

logger.info(f"Deleting stack: {stack_name}")
logger.info("This feature is not yet implemented.")
try:
# Use the high-level orchestration function with CLI-specific callbacks
delete_stack_with_confirmation(
stack_name=stack_name,
region=region,
retain_resources_str=retain_resources or "",
message_callback=click.echo,
confirm_callback=lambda msg: click.confirm("Continue?", default=False),
success_callback=lambda msg: click.echo(f"✓ {msg}")
)

except StackNotFoundError:
click.secho(f"❌ Stack '{stack_name}' not found", fg='red')
except click.ClickException:
# Re-raise ClickException for proper CLI error handling
raise
except Exception as e:
logger.error(f"Failed to delete stack: {e}")
if debug:
logger.exception("Detailed error information:")
raise click.ClickException(str(e))

@click.command("cluster")
@click.option("--cluster-name", required=True, help="The name of the cluster to update")
Expand Down Expand Up @@ -376,4 +405,3 @@ def update_cluster(

logger.info("Cluster has been updated")
click.secho(f"Cluster {cluster_name} has been updated")

54 changes: 52 additions & 2 deletions src/sagemaker/hyperpod/cli/common_utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import sys
from typing import Mapping, Type
from typing import Mapping, Type, List, Dict, Any
import click
import pkgutil
import json
Expand Down Expand Up @@ -68,4 +68,54 @@ def load_schema_for_version(
f"Could not load schema.json for version {version} "
f"(looked in package {ver_pkg})"
)
return json.loads(raw)
return json.loads(raw)


def parse_comma_separated_list(value: str) -> List[str]:
    """Split *value* on commas into a list of trimmed, non-empty strings.

    Generic utility that can be reused across commands.

    Args:
        value: Comma-separated string such as ``"item1, item2,item3"``.

    Returns:
        The trimmed, non-empty items; an empty list for a falsy input.
    """
    items: List[str] = []
    if value:
        for raw in value.split(","):
            trimmed = raw.strip()
            if trimmed:
                items.append(trimmed)
    return items


def categorize_resources_by_type(resources: List[Dict[str, Any]],
                                 type_mappings: Dict[str, List[str]]) -> Dict[str, List[str]]:
    """Bucket resources into named categories by their CloudFormation type.

    Args:
        resources: Resource dictionaries carrying ``ResourceType`` and
            ``LogicalResourceId`` keys (missing keys default to ``""``).
        type_mappings: Mapping of category name -> list of resource-type
            prefixes that belong to that category.

    Returns:
        Mapping of category -> logical resource IDs, with empty categories
        removed. Resources matching no prefix land in an ``"Other"`` bucket.
    """
    buckets: Dict[str, List[str]] = {name: [] for name in type_mappings}
    buckets["Other"] = []

    for res in resources:
        res_type = res.get("ResourceType", "")
        logical_id = res.get("LogicalResourceId", "")

        # First category whose prefix list matches wins; fall back to "Other".
        target = "Other"
        for name, prefixes in type_mappings.items():
            if any(res_type.startswith(prefix) for prefix in prefixes):
                target = name
                break
        buckets[target].append(logical_id)

    # Drop categories that received no resources.
    return {name: ids for name, ids in buckets.items() if ids}
3 changes: 2 additions & 1 deletion src/sagemaker/hyperpod/cli/hyp_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from sagemaker.hyperpod.cli.commands.cluster import list_cluster, set_cluster_context, get_cluster_context, \
get_monitoring
from sagemaker.hyperpod.cli.commands.cluster_stack import create_cluster_stack, describe_cluster_stack, \
list_cluster_stacks, update_cluster
list_cluster_stacks, update_cluster, delete_cluster_stack
from sagemaker.hyperpod.cli.commands.training import (
pytorch_create,
list_jobs,
Expand Down Expand Up @@ -190,6 +190,7 @@ def exec():
delete.add_command(pytorch_delete)
delete.add_command(js_delete)
delete.add_command(custom_delete)
delete.add_command(delete_cluster_stack)

list_pods.add_command(pytorch_list_pods)
list_pods.add_command(js_list_pods)
Expand Down
110 changes: 110 additions & 0 deletions src/sagemaker/hyperpod/cluster_management/hp_cluster_stack.py
Original file line number Diff line number Diff line change
Expand Up @@ -537,6 +537,116 @@ def check_status(stack_name: str, region: Optional[str] = None):
"""
return HpClusterStack._get_stack_status_helper(stack_name, region)

@staticmethod
def delete(stack_name: str, region: Optional[str] = None, retain_resources: Optional[List[str]] = None,
           logger: Optional[logging.Logger] = None) -> None:
    """Deletes a HyperPod cluster CloudFormation stack.

    Removes the specified CloudFormation stack and all associated AWS resources.
    This operation cannot be undone and proceeds automatically without confirmation.
    The call returns as soon as deletion is initiated (it does not wait for
    completion); poll progress with ``check_status``. Note that ``describe``
    will not show deleted stacks.

    **Parameters:**

    .. list-table::
       :header-rows: 1
       :widths: 20 20 60

       * - Parameter
         - Type
         - Description
       * - stack_name
         - str
         - Name of the CloudFormation stack to delete
       * - region
         - str, optional
         - AWS region where the stack exists (defaults to the boto3 session region)
       * - retain_resources
         - List[str], optional
         - List of logical resource IDs to retain during deletion (only works on DELETE_FAILED stacks)
       * - logger
         - logging.Logger, optional
         - Logger instance for output messages. Uses default logger if not provided

    **Raises:**

    ValueError: When stack doesn't exist or retain_resources limitation is encountered
    RuntimeError: When CloudFormation deletion fails
    Exception: For other deletion errors

    .. dropdown:: Usage Examples
       :open:

       .. code-block:: python

          >>> # Delete a stack (automatically proceeds without confirmation)
          >>> HpClusterStack.delete("my-stack-name")
          >>>
          >>> # Delete in specific region
          >>> HpClusterStack.delete("my-stack-name", region="us-west-2")
          >>>
          >>> # Delete with retained resources (only works on DELETE_FAILED stacks)
          >>> HpClusterStack.delete("my-stack-name", retain_resources=["S3Bucket", "EFSFileSystem"])
          >>>
          >>> # Delete with custom logger
          >>> import logging
          >>> logger = logging.getLogger(__name__)
          >>> HpClusterStack.delete("my-stack-name", logger=logger)
    """
    # Imported lazily to avoid a circular import between the SDK and CLI layers.
    from sagemaker.hyperpod.cli.cluster_stack_utils import (
        delete_stack_with_confirmation,
        StackNotFoundError
    )

    if logger is None:
        logger = logging.getLogger(__name__)

    # The shared utility takes a comma-separated string; join the list form.
    retain_resources_str = ",".join(retain_resources) if retain_resources else ""

    def sdk_confirm_callback(message: str) -> bool:
        """SDK-specific confirmation callback - always auto-confirms."""
        logger.info(f"Auto-confirming: {message}")
        return True

    try:
        delete_stack_with_confirmation(
            stack_name=stack_name,
            region=region or boto3.session.Session().region_name,
            retain_resources_str=retain_resources_str,
            message_callback=logger.info,
            confirm_callback=sdk_confirm_callback,
            success_callback=logger.info
        )
    except StackNotFoundError:
        error_msg = f"Stack '{stack_name}' not found"
        logger.error(error_msg)
        raise ValueError(error_msg)
    except Exception as e:
        error_str = str(e)

        # Handle CloudFormation retain-resources limitation with clear exception for SDK.
        if retain_resources and "specify which resources to retain only when the stack is in the DELETE_FAILED state" in error_str:
            error_msg = (
                f"CloudFormation limitation: retain_resources can only be used on stacks in DELETE_FAILED state. "
                f"Current stack state allows normal deletion. Try deleting without retain_resources first, "
                f"then retry with retain_resources if deletion fails."
            )
            logger.error(error_msg)
            raise ValueError(error_msg)

        # Handle termination protection.
        if "TerminationProtection is enabled" in error_str:
            error_msg = (
                f"Stack deletion blocked: Termination Protection is enabled. "
                f"Disable termination protection first using AWS CLI or Console."
            )
            logger.error(error_msg)
            raise RuntimeError(error_msg)

        # Handle other errors.
        logger.error(f"Failed to delete stack: {error_str}")
        raise RuntimeError(f"Stack deletion failed: {error_str}")


def _yaml_to_json_string(yaml_path) -> str:
"""Convert YAML file to JSON string"""
Expand Down
Loading
Loading