-
Notifications
You must be signed in to change notification settings - Fork 69
Feature: Delete Cluster Command #250
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
fdaf009
ced09aa
bdeaba0
3000a00
b9447d5
4c552d7
ec69411
12585f3
4cdc25b
034f8a5
3a87ee8
b48f761
578a937
15a0f96
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -537,6 +537,116 @@ def check_status(stack_name: str, region: Optional[str] = None): | |
| """ | ||
| return HpClusterStack._get_stack_status_helper(stack_name, region) | ||
|
|
||
| @staticmethod | ||
| def delete(stack_name: str, region: Optional[str] = None, retain_resources: Optional[List[str]] = None, | ||
| logger: Optional[logging.Logger] = None) -> None: | ||
| """Deletes a HyperPod cluster CloudFormation stack. | ||
|
|
||
| Removes the specified CloudFormation stack and all associated AWS resources. | ||
| This operation cannot be undone and proceeds automatically without confirmation. | ||
|
|
||
| **Parameters:** | ||
|
|
||
| .. list-table:: | ||
| :header-rows: 1 | ||
| :widths: 20 20 60 | ||
|
|
||
| * - Parameter | ||
| - Type | ||
| - Description | ||
| * - stack_name | ||
| - str | ||
| - Name of the CloudFormation stack to delete | ||
| * - region | ||
| - str, optional | ||
| - AWS region where the stack exists. Falls back to the region of the default boto3 session when not provided | ||
| * - retain_resources | ||
| - List[str], optional | ||
| - List of logical resource IDs to retain during deletion (only works on DELETE_FAILED stacks) | ||
| * - logger | ||
| - logging.Logger, optional | ||
| - Logger instance for output messages. Uses ``logging.getLogger(__name__)`` if not provided | ||
|
|
||
| **Raises:** | ||
|
|
||
| ValueError: When the stack is not found, or when retain_resources is rejected because the stack is not in the DELETE_FAILED state | ||
| RuntimeError: When termination protection is enabled, or when deletion fails for any other reason (all other errors from the deletion helper are re-raised as RuntimeError) | ||
|
|
||
| .. dropdown:: Usage Examples | ||
| :open: | ||
|
|
||
| .. code-block:: python | ||
|
|
||
| >>> # Delete a stack (automatically proceeds without confirmation) | ||
| >>> HpClusterStack.delete("my-stack-name") | ||
| >>> | ||
| >>> # Delete in specific region | ||
| >>> HpClusterStack.delete("my-stack-name", region="us-west-2") | ||
| >>> | ||
| >>> # Delete with retained resources (only works on DELETE_FAILED stacks) | ||
| >>> HpClusterStack.delete("my-stack-name", retain_resources=["S3Bucket", "EFSFileSystem"]) | ||
| >>> | ||
| >>> # Delete with custom logger | ||
| >>> import logging | ||
| >>> logger = logging.getLogger(__name__) | ||
| >>> HpClusterStack.delete("my-stack-name", logger=logger) | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is the user waiting on the delete to complete ? Is there a way to check the status of a delete ? Does the describe command handle this ?
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. the delete method returns immediately, i followed how the create SDK cmd works and that returns immediately as well you can check the stack status with check_status, but you cant check the actual status of a delete. the describe cmd to describe the stack will not show any deleted stacks. |
||
| """ | ||
| # NOTE(review): imported inside the method rather than at module level - presumably to avoid a circular import with the CLI package; confirm | ||
| from sagemaker.hyperpod.cli.cluster_stack_utils import ( | ||
| delete_stack_with_confirmation, | ||
| StackNotFoundError | ||
| ) | ||
|
|
||
| if logger is None: | ||
| logger = logging.getLogger(__name__) | ||
|
|
||
| # Convert retain_resources list to comma-separated string for the utility function | ||
| retain_resources_str = ",".join(retain_resources) if retain_resources else "" | ||
|
|
||
| def sdk_confirm_callback(message: str) -> bool: | ||
| """SDK-specific confirmation callback - always auto-confirms.""" | ||
| logger.info(f"Auto-confirming: {message}") | ||
| return True | ||
|
|
||
| try: | ||
| delete_stack_with_confirmation( | ||
| stack_name=stack_name, | ||
| region=region or boto3.session.Session().region_name, | ||
| retain_resources_str=retain_resources_str, | ||
| message_callback=logger.info, | ||
| confirm_callback=sdk_confirm_callback, | ||
| success_callback=logger.info | ||
| ) | ||
| except StackNotFoundError: | ||
| error_msg = f"Stack '{stack_name}' not found" | ||
| logger.error(error_msg) | ||
| raise ValueError(error_msg) | ||
| # Catch-all boundary: known failures are recognised by matching the error text below; anything unrecognised is re-raised as RuntimeError | ||
| except Exception as e: | ||
| error_str = str(e) | ||
|
|
||
| # Handle CloudFormation retain-resources limitation with clear exception for SDK | ||
| if retain_resources and "specify which resources to retain only when the stack is in the DELETE_FAILED state" in error_str: | ||
| error_msg = ( | ||
| f"CloudFormation limitation: retain_resources can only be used on stacks in DELETE_FAILED state. " | ||
| f"Current stack state allows normal deletion. Try deleting without retain_resources first, " | ||
| f"then retry with retain_resources if deletion fails." | ||
| ) | ||
| logger.error(error_msg) | ||
| raise ValueError(error_msg) | ||
|
|
||
| # Handle termination protection | ||
| if "TerminationProtection is enabled" in error_str: | ||
| error_msg = ( | ||
| f"Stack deletion blocked: Termination Protection is enabled. " | ||
| f"Disable termination protection first using AWS CLI or Console." | ||
| ) | ||
| logger.error(error_msg) | ||
| raise RuntimeError(error_msg) | ||
|
|
||
| # Handle other errors | ||
| logger.error(f"Failed to delete stack: {error_str}") | ||
| raise RuntimeError(f"Stack deletion failed: {error_str}") | ||
|
|
||
|
|
||
| def _yaml_to_json_string(yaml_path) -> str: | ||
| """Convert YAML file to JSON string""" | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.