diff --git a/src/sagemaker/hyperpod/cli/cluster_stack_utils.py b/src/sagemaker/hyperpod/cli/cluster_stack_utils.py new file mode 100644 index 00000000..5d3c7ad5 --- /dev/null +++ b/src/sagemaker/hyperpod/cli/cluster_stack_utils.py @@ -0,0 +1,498 @@ +""" +CloudFormation cluster stack deletion utilities. + +This module provides utilities for managing CloudFormation stack deletion operations +with support for both CLI and SDK interfaces through a callback pattern. + +Public Interface: + delete_stack_with_confirmation() - Main orchestration function for stack deletion + StackNotFoundError - Exception raised when stack is not found + +All other functions are private implementation details and should not be used directly. +""" + +import boto3 +import click +import logging +from typing import List, Dict, Any, Optional, Tuple, Callable +from botocore.exceptions import ClientError +from sagemaker.hyperpod.cli.common_utils import ( + parse_comma_separated_list, + categorize_resources_by_type +) + + +class _StackNotFoundError(Exception): + """Exception raised when a CloudFormation stack is not found.""" + pass + + +# Make the exception available with the original name +StackNotFoundError = _StackNotFoundError + +MessageCallback = Callable[[str], None] +ConfirmCallback = Callable[[str], bool] +SuccessCallback = Callable[[str], None] + + +def _get_stack_resources(stack_name: str, region: str, logger: Optional[logging.Logger] = None) -> List[Dict[str, Any]]: + """Get all resources in a CloudFormation stack. 
+ + Args: + stack_name: Name of the CloudFormation stack + region: AWS region for CloudFormation operations + logger: Optional logger for debug information + + Returns: + List of resource summaries from CloudFormation + + Raises: + _StackNotFoundError: When stack doesn't exist + ClientError: For other CloudFormation errors + """ + if logger: + logger.debug(f"Fetching resources for stack '{stack_name}' in region '{region}'") + + cf_client = boto3.client('cloudformation', region_name=region) + try: + resources_response = cf_client.list_stack_resources(StackName=stack_name) + resources = resources_response.get('StackResourceSummaries', []) + + if logger: + logger.debug(f"Found {len(resources)} resources in stack '{stack_name}'") + + return resources + except ClientError as e: + error_code = e.response['Error']['Code'] + if error_code == 'ValidationError' and "does not exist" in str(e): + raise _StackNotFoundError(f"Stack '{stack_name}' not found") + raise + + +def _validate_retain_resources(retain_list: List[str], existing_resources: List[Dict[str, Any]]) -> Tuple[List[str], List[str]]: + """Validate that retain resources exist in the stack. 
+ + Args: + retain_list: List of logical resource IDs to retain + existing_resources: List of existing stack resources + + Returns: + Tuple of (valid_resources, invalid_resources) + """ + if not retain_list: + return [], [] + + existing_resource_names = {r.get('LogicalResourceId', '') for r in existing_resources} + valid_retain_resources = [] + invalid_retain_resources = [] + + for resource in retain_list: + if resource in existing_resource_names: + valid_retain_resources.append(resource) + else: + invalid_retain_resources.append(resource) + + return valid_retain_resources, invalid_retain_resources + + +def _categorize_stack_resources(resources: List[Dict[str, Any]]) -> Dict[str, List[str]]: + """Categorize CloudFormation resources by type using generic utility.""" + type_mappings = { + "EC2 Instances": ["AWS::EC2::Instance"], + "Networking": ["AWS::EC2::VPC", "AWS::EC2::Subnet", "AWS::EC2::SecurityGroup", + "AWS::EC2::InternetGateway", "AWS::EC2::RouteTable", "AWS::EC2::Route"], + "IAM": ["AWS::IAM::Role", "AWS::IAM::Policy", "AWS::IAM::InstanceProfile"], + "Storage": ["AWS::S3::Bucket", "AWS::EBS::Volume", "AWS::EFS::FileSystem"] + } + + return categorize_resources_by_type(resources, type_mappings) + + +def _compare_resource_states(original_resources: List[Dict[str, Any]], current_resources: List[Dict[str, Any]]) -> Tuple[set[str], set[str]]: + """Compare original and current resource states to identify changes. 
+ + Args: + original_resources: Resources before deletion attempt + current_resources: Resources after deletion attempt + + Returns: + Tuple of (deleted_resources, remaining_resources) + """ + original_names = {r['LogicalResourceId'] for r in original_resources} + current_names = {r['LogicalResourceId'] for r in current_resources} + + deleted_resources = original_names - current_names + remaining_resources = current_names + + return deleted_resources, remaining_resources + + +def _display_deletion_warning(categorized_resources: Dict[str, List[str]], message_callback: MessageCallback) -> None: + """Display warning about resources to be deleted.""" + total_count = sum(len(item_list) for item_list in categorized_resources.values()) + message_callback(f"\n⚠ WARNING: This will delete the following {total_count} resources:\n") + + for category, item_list in categorized_resources.items(): + if item_list: + message_callback(f"{category} ({len(item_list)}):") + for item in item_list: + message_callback(f" - {item}") + message_callback("") + + +def _display_invalid_resources_warning(invalid_resources: List[str], message_callback: MessageCallback) -> None: + """Display warning about invalid retain resources.""" + if not invalid_resources: + return + + message_callback(f"⚠️ Warning: The following {len(invalid_resources)} resources don't exist in the stack:") + for resource in invalid_resources: + message_callback(f" - {resource} (not found)") + message_callback("") + + +def _display_retention_info(retained_items: List[str], message_callback: MessageCallback) -> None: + """Display information about items that will be retained.""" + if retained_items: + message_callback(f"\nThe following {len(retained_items)} resources will be RETAINED:") + for item in retained_items: + message_callback(f" ✓ {item} (retained)") + + + + +def _handle_termination_protection_error(stack_name: str, region: str, message_callback: MessageCallback) -> None: + """Handle termination protection error.""" + 
message_callback("❌ Stack deletion blocked: Termination Protection is enabled") + message_callback("") + message_callback("To delete this stack, first disable termination protection:") + message_callback(f"aws cloudformation update-termination-protection --no-enable-termination-protection --stack-name {stack_name} --region {region}") + message_callback("") + message_callback("Then retry the delete command.") + + +def _handle_retention_limitation_error(stack_name: str, retain_resources: str, region: str, message_callback: MessageCallback) -> None: + """Handle CloudFormation retention limitation error.""" + message_callback("❌ CloudFormation limitation: --retain-resources only works on failed deletions") + message_callback("") + message_callback("💡 Recommended workflow:") + message_callback("1. First try deleting without --retain-resources:") + message_callback(f" hyp delete cluster-stack {stack_name} --region {region}") + message_callback("") + message_callback("2. If deletion fails, the stack will be in DELETE_FAILED state") + message_callback("3. Then retry with --retain-resources to keep specific resources:") + message_callback(f" hyp delete cluster-stack {stack_name} --retain-resources {retain_resources} --region {region}") + + +def _handle_generic_deletion_error(error_str: str, message_callback: MessageCallback) -> None: + """Handle generic deletion errors.""" + if "does not exist" in error_str: + message_callback("❌ Stack not found") + elif "AccessDenied" in error_str: + message_callback("❌ Access denied. Check AWS permissions") + else: + message_callback(f"❌ Error deleting stack: {error_str}") + + +def _handle_partial_deletion_failure(stack_name: str, region: str, original_resources: List[Dict[str, Any]], + retain_list: List[str], message_callback: MessageCallback) -> None: + """Handle partial deletion failures by showing what succeeded vs failed. 
+ + Args: + stack_name: Name of the stack + region: AWS region + original_resources: Resources before deletion attempt + retain_list: List of resources that were supposed to be retained + message_callback: Function to call for outputting messages + """ + message_callback("✗ Stack deletion failed") + + try: + cf_client = boto3.client('cloudformation', region_name=region) + current_resources_response = cf_client.list_stack_resources(StackName=stack_name) + current_resources = current_resources_response.get('StackResourceSummaries', []) + + deleted_resources, remaining_resources = _compare_resource_states( + original_resources, current_resources + ) + + # Show what was successfully deleted + if deleted_resources: + message_callback("") + message_callback(f"Successfully deleted ({len(deleted_resources)}):") + for resource in deleted_resources: + message_callback(f" ✓ {resource}") + + # Show what failed to delete (excluding retained resources) + failed_resources = remaining_resources - set(retain_list) if retain_list else remaining_resources + if failed_resources: + message_callback("") + message_callback(f"Failed to delete ({len(failed_resources)}):") + for resource in failed_resources: + message_callback(f" ✗ {resource} (DependencyViolation: has dependent resources)") + + # Show retained resources + if retain_list: + message_callback("") + message_callback(f"Successfully retained as requested ({len(retain_list)}):") + for resource in retain_list: + message_callback(f" ✓ {resource} (retained)") + + message_callback("") + message_callback("💡 Note: Some resources may have dependencies preventing deletion") + message_callback(" Check the AWS CloudFormation console for detailed dependency information") + + except Exception: + # If we can't get current resources, show generic error + message_callback("Unable to determine which resources were deleted") + +def _parse_retain_resources(retain_resources_str: str) -> List[str]: + """Parse comma-separated retain resources 
string.""" + return parse_comma_separated_list(retain_resources_str) + + +def _perform_stack_deletion(stack_name: str, region: str, retain_list: List[str], + logger: Optional[logging.Logger] = None) -> None: + """Perform the actual CloudFormation stack deletion. + + This is a private low-level function that directly calls the CloudFormation delete_stack API. + Use delete_stack_with_confirmation() for the public interface. + + Args: + stack_name: Name of the stack to delete + region: AWS region + retain_list: List of resources to retain during deletion + logger: Optional logger for debug information + + Raises: + ClientError: If deletion fails due to CloudFormation errors + Exception: For other deletion failures + """ + if logger: + logger.debug(f"Initiating deletion of stack '{stack_name}' in region '{region}'") + if retain_list: + logger.debug(f"Retaining resources: {retain_list}") + + cf_client = boto3.client('cloudformation', region_name=region) + + delete_params = {'StackName': stack_name} + if retain_list: + delete_params['RetainResources'] = retain_list + + cf_client.delete_stack(**delete_params) + + if logger: + logger.info(f"Stack '{stack_name}' deletion initiated successfully") + + + + +def _get_stack_resources_and_validate_retention(stack_name: str, region: str, retain_resources_str: str, + logger: Optional[logging.Logger] = None) -> Tuple[List[Dict[str, Any]], List[str], List[str]]: + """Get stack resources and validate retention list. 
+ + Args: + stack_name: Name of the CloudFormation stack + region: AWS region + retain_resources_str: Comma-separated retain resources string + logger: Optional logger for debug information + + Returns: + Tuple of (all_resources, valid_retain_list, invalid_retain_list) + + Raises: + StackNotFoundError: When stack doesn't exist + """ + resources = _get_stack_resources(stack_name, region, logger) + if not resources: + raise _StackNotFoundError(f"No resources found in stack '{stack_name}'") + + retain_list = _parse_retain_resources(retain_resources_str) + valid_retain, invalid_retain = _validate_retain_resources(retain_list, resources) + + if logger and retain_list: + logger.debug(f"Retention validation - Valid: {len(valid_retain)}, Invalid: {len(invalid_retain)}") + + return resources, valid_retain, invalid_retain + + +def _handle_stack_deletion_error(error: Exception, stack_name: str, region: str, retain_resources: Optional[str] = None, + message_callback: Optional[MessageCallback] = None, + logger: Optional[logging.Logger] = None) -> bool: + """Handle various CloudFormation deletion errors with customizable output. 
+ + Args: + error: The exception that occurred + stack_name: Name of the stack being deleted + region: AWS region + retain_resources: Original retain resources string (for error messages) + message_callback: Function to call for outputting messages (default: click.echo) + logger: Optional logger for debug information + + Returns: + True if error was handled gracefully (don't re-raise), False if should re-raise + """ + if message_callback is None: + message_callback = click.echo + + error_str = str(error) + + if logger: + logger.debug(f"Handling deletion error for stack '{stack_name}': {error_str}") + + # Handle termination protection specifically + if "TerminationProtection is enabled" in error_str: + _handle_termination_protection_error(stack_name, region, message_callback) + return False # Should re-raise + + # Handle CloudFormation retain-resources limitation + # Always re-raise for SDK usage to ensure clear exceptions + if retain_resources and "specify which resources to retain only when the stack is in the DELETE_FAILED state" in error_str: + _handle_retention_limitation_error(stack_name, retain_resources, region, message_callback) + return False # ensure SDK gets the exception + + # Handle other deletion errors + _handle_generic_deletion_error(error_str, message_callback) + return False # Should re-raise + + +def _display_stack_deletion_confirmation(resources: List[Dict[str, Any]], valid_retain_list: List[str], + invalid_retain_list: List[str], + message_callback: Optional[MessageCallback] = None, + confirm_callback: Optional[ConfirmCallback] = None, + logger: Optional[logging.Logger] = None) -> bool: + """Display deletion warnings and get user confirmation with customizable output. 
+ + Args: + resources: All stack resources + valid_retain_list: Valid resources to retain + invalid_retain_list: Invalid resources that don't exist + message_callback: Function to call for outputting messages (default: click.echo) + confirm_callback: Function to call for confirmation (default: click.confirm) + logger: Optional logger for debug information + + Returns: + True if user confirms deletion, False otherwise + """ + if message_callback is None: + message_callback = click.echo + if confirm_callback is None: + confirm_callback = lambda msg: click.confirm("Continue?", default=False) + + if logger: + logger.debug(f"Displaying confirmation for {len(resources)} resources, {len(valid_retain_list)} to retain") + + # Show warning for invalid retain resources + _display_invalid_resources_warning(invalid_retain_list, message_callback) + + # Display deletion warning + resource_categories = _categorize_stack_resources(resources) + _display_deletion_warning(resource_categories, message_callback) + + # Show retention info + _display_retention_info(valid_retain_list, message_callback) + + return confirm_callback("Continue with deletion?") + + +def _handle_stack_deletion_partial_failure(stack_name: str, region: str, original_resources: List[Dict[str, Any]], + retain_list: List[str], message_callback: Optional[MessageCallback] = None) -> None: + """Handle partial deletion failures by showing what succeeded vs failed. 
+ + Args: + stack_name: Name of the stack + region: AWS region + original_resources: Resources before deletion attempt + retain_list: List of resources that were supposed to be retained + message_callback: Function to call for outputting messages (default: click.echo) + """ + if message_callback is None: + message_callback = click.echo + + _handle_partial_deletion_failure(stack_name, region, original_resources, retain_list, message_callback) + + + + +def delete_stack_with_confirmation(stack_name: str, region: str, retain_resources_str: str = "", + message_callback: Optional[MessageCallback] = None, + confirm_callback: Optional[ConfirmCallback] = None, + success_callback: Optional[SuccessCallback] = None, + logger: Optional[logging.Logger] = None) -> None: + """ + This is the main public interface for stack deletion, supporting both CLI and SDK + usage through customizable callback functions. It handles resource validation, + user confirmation, deletion execution, and comprehensive error handling. 
+ + Args: + stack_name: Name of the stack to delete + region: AWS region + retain_resources_str: Comma-separated retain resources string + message_callback: Function to call for outputting messages (default: click.echo) + confirm_callback: Function to call for confirmation (default: click.confirm) + success_callback: Function to call on successful deletion (default: click.echo) + logger: Optional logger for debug information + + Raises: + StackNotFoundError: When stack doesn't exist + click.ClickException: For CLI usage + Exception: For SDK usage (depending on callback implementation) + + Example: + # CLI usage + delete_stack_with_confirmation( + stack_name="my-stack", + region="us-west-2", + message_callback=click.echo, + confirm_callback=lambda msg: click.confirm("Continue?", default=False) + ) + + # SDK usage + delete_stack_with_confirmation( + stack_name="my-stack", + region="us-west-2", + message_callback=logger.info, + confirm_callback=lambda msg: True # Auto-confirm + ) + """ + if message_callback is None: + message_callback = click.echo + if success_callback is None: + success_callback = lambda msg: click.echo(f"✓ {msg}") + + if logger: + logger.info(f"Starting deletion workflow for stack '{stack_name}' in region '{region}'") + + # 1. Get and validate resources + resources, valid_retain, invalid_retain = _get_stack_resources_and_validate_retention( + stack_name, region, retain_resources_str, logger + ) + + # 2. Display warnings and get confirmation + if not _display_stack_deletion_confirmation(resources, valid_retain, invalid_retain, + message_callback, confirm_callback, logger): + message_callback("Operation cancelled.") + return + + # 3. 
Perform deletion + try: + _perform_stack_deletion(stack_name, region, valid_retain, logger) + success_callback(f"Stack '{stack_name}' deletion initiated successfully") + except Exception as e: + # Handle deletion errors + should_handle_gracefully = _handle_stack_deletion_error( + e, stack_name, region, retain_resources_str, message_callback, logger + ) + + if should_handle_gracefully: + return # Exit gracefully for retention limitation error + + # For other errors, try to show partial failure info if possible + try: + _handle_stack_deletion_partial_failure(stack_name, region, resources, valid_retain, message_callback) + except Exception: + if logger: + logger.debug("Failed to show partial failure information") + + # Re-raise the original exception + raise diff --git a/src/sagemaker/hyperpod/cli/commands/cluster_stack.py b/src/sagemaker/hyperpod/cli/commands/cluster_stack.py index 285ba1f7..e6921ae3 100644 --- a/src/sagemaker/hyperpod/cli/commands/cluster_stack.py +++ b/src/sagemaker/hyperpod/cli/commands/cluster_stack.py @@ -18,6 +18,10 @@ from sagemaker.hyperpod.common.telemetry.constants import Feature from sagemaker.hyperpod.common.utils import setup_logging from sagemaker.hyperpod.cli.utils import convert_datetimes +from sagemaker.hyperpod.cli.cluster_stack_utils import ( + StackNotFoundError, + delete_stack_with_confirmation +) logger = logging.getLogger(__name__) @@ -292,8 +296,11 @@ def list_cluster_stacks(region, debug, status): @click.command("cluster-stack") @click.argument("stack-name", required=True) +@click.option("--retain-resources", help="Comma-separated list of logical resource IDs to retain during deletion (only works on DELETE_FAILED stacks). 
Resource names are shown in failed deletion output, or use AWS CLI: 'aws cloudformation list-stack-resources --stack-name STACK_NAME --region REGION'") +@click.option("--region", required=True, help="AWS region (required)") @click.option("--debug", is_flag=True, help="Enable debug logging") -def delete(stack_name: str, debug: bool) -> None: +@_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "delete_cluster_stack_cli") +def delete_cluster_stack(stack_name: str, retain_resources: str, region: str, debug: bool) -> None: """Delete a HyperPod cluster stack. Removes the specified CloudFormation stack and all associated AWS resources. @@ -305,12 +312,34 @@ def delete(stack_name: str, debug: bool) -> None: .. code-block:: bash # Delete a cluster stack - hyp delete hyp-cluster my-stack-name + hyp delete cluster-stack my-stack-name --region us-west-2 + + # Delete with retained resources (only works on DELETE_FAILED stacks) + hyp delete cluster-stack my-stack-name --retain-resources S3Bucket-TrainingData,EFSFileSystem-Models --region us-west-2 """ logger = setup_logging(logging.getLogger(__name__), debug) - logger.info(f"Deleting stack: {stack_name}") - logger.info("This feature is not yet implemented.") + try: + # Use the high-level orchestration function with CLI-specific callbacks + delete_stack_with_confirmation( + stack_name=stack_name, + region=region, + retain_resources_str=retain_resources or "", + message_callback=click.echo, + confirm_callback=lambda msg: click.confirm("Continue?", default=False), + success_callback=lambda msg: click.echo(f"✓ {msg}") + ) + + except StackNotFoundError: + click.secho(f"❌ Stack '{stack_name}' not found", fg='red') + except click.ClickException: + # Re-raise ClickException for proper CLI error handling + raise + except Exception as e: + logger.error(f"Failed to delete stack: {e}") + if debug: + logger.exception("Detailed error information:") + raise click.ClickException(str(e)) @click.command("cluster") @click.option("--cluster-name", 
required=True, help="The name of the cluster to update") @@ -376,4 +405,3 @@ def update_cluster( logger.info("Cluster has been updated") click.secho(f"Cluster {cluster_name} has been updated") - diff --git a/src/sagemaker/hyperpod/cli/common_utils.py b/src/sagemaker/hyperpod/cli/common_utils.py index 02233b85..ac8f85ef 100644 --- a/src/sagemaker/hyperpod/cli/common_utils.py +++ b/src/sagemaker/hyperpod/cli/common_utils.py @@ -1,5 +1,5 @@ import sys -from typing import Mapping, Type +from typing import Mapping, Type, List, Dict, Any import click import pkgutil import json @@ -68,4 +68,54 @@ def load_schema_for_version( f"Could not load schema.json for version {version} " f"(looked in package {ver_pkg})" ) - return json.loads(raw) \ No newline at end of file + return json.loads(raw) + + +def parse_comma_separated_list(value: str) -> List[str]: + """ + Parse a comma-separated string into a list of strings. + Generic utility that can be reused across commands. + + Args: + value: Comma-separated string like "item1,item2,item3" + + Returns: + List of trimmed strings + """ + if not value: + return [] + return [item.strip() for item in value.split(",") if item.strip()] + + +def categorize_resources_by_type(resources: List[Dict[str, Any]], + type_mappings: Dict[str, List[str]]) -> Dict[str, List[str]]: + """ + Generic function to categorize resources by type. 
+ + Args: + resources: List of resource dictionaries with 'ResourceType' and 'LogicalResourceId' + type_mappings: Dictionary mapping category names to lists of resource types + + Returns: + Dictionary of category -> list of resource names + """ + categorized = {category: [] for category in type_mappings.keys()} + categorized["Other"] = [] + + for resource in resources: + resource_type = resource.get("ResourceType", "") + logical_id = resource.get("LogicalResourceId", "") + + # Find which category this resource type belongs to + category_found = False + for category, types in type_mappings.items(): + if any(resource_type.startswith(rt) for rt in types): + categorized[category].append(logical_id) + category_found = True + break + + if not category_found: + categorized["Other"].append(logical_id) + + # Remove empty categories + return {k: v for k, v in categorized.items() if v} diff --git a/src/sagemaker/hyperpod/cli/hyp_cli.py b/src/sagemaker/hyperpod/cli/hyp_cli.py index 9012dee8..94036db1 100644 --- a/src/sagemaker/hyperpod/cli/hyp_cli.py +++ b/src/sagemaker/hyperpod/cli/hyp_cli.py @@ -10,7 +10,7 @@ from sagemaker.hyperpod.cli.commands.cluster import list_cluster, set_cluster_context, get_cluster_context, \ get_monitoring from sagemaker.hyperpod.cli.commands.cluster_stack import create_cluster_stack, describe_cluster_stack, \ - list_cluster_stacks, update_cluster + list_cluster_stacks, update_cluster, delete_cluster_stack from sagemaker.hyperpod.cli.commands.training import ( pytorch_create, list_jobs, @@ -190,6 +190,7 @@ def exec(): delete.add_command(pytorch_delete) delete.add_command(js_delete) delete.add_command(custom_delete) +delete.add_command(delete_cluster_stack) list_pods.add_command(pytorch_list_pods) list_pods.add_command(js_list_pods) diff --git a/src/sagemaker/hyperpod/cluster_management/hp_cluster_stack.py b/src/sagemaker/hyperpod/cluster_management/hp_cluster_stack.py index 7857b3a0..a42f20b2 100644 --- 
a/src/sagemaker/hyperpod/cluster_management/hp_cluster_stack.py +++ b/src/sagemaker/hyperpod/cluster_management/hp_cluster_stack.py @@ -537,6 +537,116 @@ def check_status(stack_name: str, region: Optional[str] = None): """ return HpClusterStack._get_stack_status_helper(stack_name, region) + @staticmethod + def delete(stack_name: str, region: Optional[str] = None, retain_resources: Optional[List[str]] = None, + logger: Optional[logging.Logger] = None) -> None: + """Deletes a HyperPod cluster CloudFormation stack. + + Removes the specified CloudFormation stack and all associated AWS resources. + This operation cannot be undone and proceeds automatically without confirmation. + + **Parameters:** + + .. list-table:: + :header-rows: 1 + :widths: 20 20 60 + + * - Parameter + - Type + - Description + * - stack_name + - str + - Name of the CloudFormation stack to delete + * - region + - str, optional + - AWS region where the stack exists + * - retain_resources + - List[str], optional + - List of logical resource IDs to retain during deletion (only works on DELETE_FAILED stacks) + * - logger + - logging.Logger, optional + - Logger instance for output messages. Uses default logger if not provided + + **Raises:** + + ValueError: When stack doesn't exist or retain_resources limitation is encountered + RuntimeError: When CloudFormation deletion fails + Exception: For other deletion errors + + .. dropdown:: Usage Examples + :open: + + .. 
code-block:: python + + >>> # Delete a stack (automatically proceeds without confirmation) + >>> HpClusterStack.delete("my-stack-name") + >>> + >>> # Delete in specific region + >>> HpClusterStack.delete("my-stack-name", region="us-west-2") + >>> + >>> # Delete with retained resources (only works on DELETE_FAILED stacks) + >>> HpClusterStack.delete("my-stack-name", retain_resources=["S3Bucket", "EFSFileSystem"]) + >>> + >>> # Delete with custom logger + >>> import logging + >>> logger = logging.getLogger(__name__) + >>> HpClusterStack.delete("my-stack-name", logger=logger) + """ + from sagemaker.hyperpod.cli.cluster_stack_utils import ( + delete_stack_with_confirmation, + StackNotFoundError + ) + + if logger is None: + logger = logging.getLogger(__name__) + + # Convert retain_resources list to comma-separated string for the utility function + retain_resources_str = ",".join(retain_resources) if retain_resources else "" + + def sdk_confirm_callback(message: str) -> bool: + """SDK-specific confirmation callback - always auto-confirms.""" + logger.info(f"Auto-confirming: {message}") + return True + + try: + delete_stack_with_confirmation( + stack_name=stack_name, + region=region or boto3.session.Session().region_name, + retain_resources_str=retain_resources_str, + message_callback=logger.info, + confirm_callback=sdk_confirm_callback, + success_callback=logger.info + ) + except StackNotFoundError: + error_msg = f"Stack '{stack_name}' not found" + logger.error(error_msg) + raise ValueError(error_msg) + except Exception as e: + error_str = str(e) + + # Handle CloudFormation retain-resources limitation with clear exception for SDK + if retain_resources and "specify which resources to retain only when the stack is in the DELETE_FAILED state" in error_str: + error_msg = ( + f"CloudFormation limitation: retain_resources can only be used on stacks in DELETE_FAILED state. " + f"Current stack state allows normal deletion. 
Try deleting without retain_resources first, " + f"then retry with retain_resources if deletion fails." + ) + logger.error(error_msg) + raise ValueError(error_msg) + + # Handle termination protection + if "TerminationProtection is enabled" in error_str: + error_msg = ( + f"Stack deletion blocked: Termination Protection is enabled. " + f"Disable termination protection first using AWS CLI or Console." + ) + logger.error(error_msg) + raise RuntimeError(error_msg) + + # Handle other errors + logger.error(f"Failed to delete stack: {error_str}") + raise RuntimeError(f"Stack deletion failed: {error_str}") + def _yaml_to_json_string(yaml_path) -> str: """Convert YAML file to JSON string""" diff --git a/test/integration_tests/cluster_management/test_cli_cluster_stack_deletion.py b/test/integration_tests/cluster_management/test_cli_cluster_stack_deletion.py new file mode 100644 index 00000000..5ffc0150 --- /dev/null +++ b/test/integration_tests/cluster_management/test_cli_cluster_stack_deletion.py @@ -0,0 +1,135 @@ +""" +Integration tests for CLI cluster stack deletion functionality. + +Tests the basic happy path user workflow for deleting cluster stacks via CLI commands. +Focuses on core functionality with minimal stack creation/deletion overhead. + +Detailed error handling and edge cases are covered by unit tests. 
import json
import time
import uuid
import warnings

import boto3
import pytest
from click.testing import CliRunner

from sagemaker.hyperpod.cli.commands.cluster_stack import delete_cluster_stack
from test.integration_tests.cluster_management.utils import (
    assert_command_succeeded,
    assert_yes_no_prompt_displayed,
    assert_success_message_displayed,
)


# --------- Test Configuration ---------
REGION = "us-east-2"
TEST_STACK_PREFIX = "hyperpod-cli-delete-test"


@pytest.fixture(scope="module")
def runner():
    """Click test runner for CLI commands."""
    return CliRunner()


@pytest.fixture(scope="module")
def cfn_client():
    """CloudFormation client for test infrastructure."""
    return boto3.client('cloudformation', region_name=REGION)


def create_test_stack(cfn_client, stack_name):
    """Create a minimal one-role stack and block until creation completes.

    Args:
        cfn_client: boto3 CloudFormation client used to create the stack.
        stack_name: Name of the stack; also prefixes the IAM role name.

    Raises:
        Any error raised by ``create_stack`` or by the
        ``stack_create_complete`` waiter (polls every 10s, up to 30 times).
    """
    template = {
        "AWSTemplateFormatVersion": "2010-09-09",
        "Description": "Test stack for CLI deletion integration tests",
        "Resources": {
            "TestRole": {
                "Type": "AWS::IAM::Role",
                "Properties": {
                    "RoleName": f"{stack_name}-test-role",
                    "AssumeRolePolicyDocument": {
                        "Version": "2012-10-17",
                        "Statement": [
                            {
                                "Effect": "Allow",
                                "Principal": {"Service": "lambda.amazonaws.com"},
                                "Action": "sts:AssumeRole"
                            }
                        ]
                    }
                }
            }
        },
        "Outputs": {
            "RoleName": {
                "Description": "Name of the test role",
                "Value": {"Ref": "TestRole"}
            }
        }
    }

    cfn_client.create_stack(
        StackName=stack_name,
        TemplateBody=json.dumps(template),
        # The template names its IAM role explicitly, which requires this capability.
        Capabilities=['CAPABILITY_NAMED_IAM'],
        Tags=[
            {"Key": "Purpose", "Value": "IntegrationTest"},
            {"Key": "Component", "Value": "CLI-Delete-Test"}
        ]
    )

    # Wait for stack creation to complete
    waiter = cfn_client.get_waiter('stack_create_complete')
    waiter.wait(StackName=stack_name, WaiterConfig={'Delay': 10, 'MaxAttempts': 30})


def wait_for_stack_delete_complete(cfn_client, stack_name, timeout_minutes=10):
    """Wait for stack deletion to complete.

    Args:
        cfn_client: boto3 CloudFormation client.
        stack_name: Name of the stack being deleted.
        timeout_minutes: Upper bound on the wait; polling is every 15s, so
            the waiter gets ``timeout_minutes * 4`` attempts.

    Returns:
        True once the stack is gone (including when the lookup reports that
        the stack does not exist).

    Raises:
        Any waiter error whose message does not contain "does not exist".
    """
    try:
        waiter = cfn_client.get_waiter('stack_delete_complete')
        waiter.wait(
            StackName=stack_name,
            WaiterConfig={'Delay': 15, 'MaxAttempts': timeout_minutes * 4}
        )
        return True
    except Exception as e:
        if "does not exist" in str(e):
            return True  # Stack was deleted
        raise


def _cleanup_stack(cfn_client, stack_name):
    """Best-effort removal of a leftover test stack; never raises.

    Cleanup failures are surfaced as warnings so they cannot mask the
    original test failure, while KeyboardInterrupt/SystemExit still propagate.
    """
    try:
        cfn_client.delete_stack(StackName=stack_name)
    except Exception as cleanup_error:
        warnings.warn(
            f"Failed to clean up test stack '{stack_name}': {cleanup_error}"
        )


# --------- CLI Delete Tests ---------

def test_delete_with_user_confirmation(runner, cfn_client):
    """Test CLI deletion happy path with user confirmation."""
    stack_name = f"{TEST_STACK_PREFIX}-happy-{str(uuid.uuid4())[:8]}"
    stack_deleted = False

    try:
        # Creation sits inside the try so a failed or timed-out create is
        # still cleaned up instead of leaking the stack.
        create_test_stack(cfn_client, stack_name)

        # Test deletion with confirmation prompt (simulate 'y' response)
        result = runner.invoke(delete_cluster_stack, [
            stack_name,
            "--region", REGION
        ], input='y\n', catch_exceptions=False)

        assert_command_succeeded(result)
        assert_yes_no_prompt_displayed(result)
        assert_success_message_displayed(result, ["deletion", "initiated"])

        # Wait for deletion to complete
        wait_for_stack_delete_complete(cfn_client, stack_name)

        # Verify stack is deleted: only a service error stating the stack is
        # gone counts, not an arbitrary exception.
        with pytest.raises(cfn_client.exceptions.ClientError, match="does not exist"):
            cfn_client.describe_stacks(StackName=stack_name)

        stack_deleted = True
    finally:
        # Runs on assertion failures and interrupts alike.
        if not stack_deleted:
            _cleanup_stack(cfn_client, stack_name)
import json
import time
import uuid
import warnings

import boto3
import pytest

from sagemaker.hyperpod.cluster_management.hp_cluster_stack import HpClusterStack


# --------- Test Configuration ---------
REGION = "us-east-2"
TEST_STACK_PREFIX = "hyperpod-sdk-delete-test"


@pytest.fixture(scope="module")
def cfn_client():
    """CloudFormation client for test infrastructure."""
    return boto3.client('cloudformation', region_name=REGION)


def create_test_stack(cfn_client, stack_name):
    """Create a minimal one-role stack and block until creation completes.

    Args:
        cfn_client: boto3 CloudFormation client used to create the stack.
        stack_name: Name of the stack; also prefixes the IAM role name.

    Raises:
        Any error raised by ``create_stack`` or by the
        ``stack_create_complete`` waiter (polls every 10s, up to 30 times).
    """
    template = {
        "AWSTemplateFormatVersion": "2010-09-09",
        "Description": "Test stack for SDK deletion integration tests",
        "Resources": {
            "TestRole": {
                "Type": "AWS::IAM::Role",
                "Properties": {
                    "RoleName": f"{stack_name}-sdk-test-role",
                    "AssumeRolePolicyDocument": {
                        "Version": "2012-10-17",
                        "Statement": [
                            {
                                "Effect": "Allow",
                                "Principal": {"Service": "lambda.amazonaws.com"},
                                "Action": "sts:AssumeRole"
                            }
                        ]
                    }
                }
            }
        },
        "Outputs": {
            "RoleName": {
                "Description": "Name of the test role",
                "Value": {"Ref": "TestRole"}
            }
        }
    }

    cfn_client.create_stack(
        StackName=stack_name,
        TemplateBody=json.dumps(template),
        # The template names its IAM role explicitly, which requires this capability.
        Capabilities=['CAPABILITY_NAMED_IAM'],
        Tags=[
            {"Key": "Purpose", "Value": "SDKIntegrationTest"},
            {"Key": "Component", "Value": "SDK-Delete-Test"}
        ]
    )

    # Wait for stack creation to complete
    waiter = cfn_client.get_waiter('stack_create_complete')
    waiter.wait(StackName=stack_name, WaiterConfig={'Delay': 10, 'MaxAttempts': 30})


def wait_for_stack_delete_complete(cfn_client, stack_name, timeout_minutes=10):
    """Wait for stack deletion to complete.

    Args:
        cfn_client: boto3 CloudFormation client.
        stack_name: Name of the stack being deleted.
        timeout_minutes: Upper bound on the wait; polling is every 15s, so
            the waiter gets ``timeout_minutes * 4`` attempts.

    Returns:
        True once the stack is gone (including when the lookup reports that
        the stack does not exist).

    Raises:
        Any waiter error whose message does not contain "does not exist".
    """
    try:
        waiter = cfn_client.get_waiter('stack_delete_complete')
        waiter.wait(
            StackName=stack_name,
            WaiterConfig={'Delay': 15, 'MaxAttempts': timeout_minutes * 4}
        )
        return True
    except Exception as e:
        if "does not exist" in str(e):
            return True  # Stack was deleted
        raise


def _cleanup_stack(cfn_client, stack_name):
    """Best-effort removal of a leftover test stack; never raises.

    Cleanup failures are surfaced as warnings so they cannot mask the
    original test failure, while KeyboardInterrupt/SystemExit still propagate.
    """
    try:
        cfn_client.delete_stack(StackName=stack_name)
    except Exception as cleanup_error:
        warnings.warn(
            f"Failed to clean up test stack '{stack_name}': {cleanup_error}"
        )


# --------- SDK Delete Tests ---------

def test_sdk_delete_basic_functionality(cfn_client):
    """Test basic SDK deletion functionality with auto-confirmation."""
    stack_name = f"{TEST_STACK_PREFIX}-basic-{str(uuid.uuid4())[:8]}"
    stack_deleted = False

    try:
        # Creation sits inside the try so a failed or timed-out create is
        # still cleaned up instead of leaking the stack.
        create_test_stack(cfn_client, stack_name)

        # Delete using SDK (should auto-confirm)
        HpClusterStack.delete(
            stack_name=stack_name,
            region=REGION
        )

        # Wait for deletion to complete
        wait_for_stack_delete_complete(cfn_client, stack_name)

        # Verify stack is deleted: only a service error stating the stack is
        # gone counts, not an arbitrary exception.
        with pytest.raises(cfn_client.exceptions.ClientError, match="does not exist"):
            cfn_client.describe_stacks(StackName=stack_name)

        stack_deleted = True
    finally:
        # Runs on assertion failures and interrupts alike.
        if not stack_deleted:
            _cleanup_stack(cfn_client, stack_name)
import pytest
from unittest.mock import Mock, patch, MagicMock
import click
import logging
from botocore.exceptions import ClientError

from sagemaker.hyperpod.cli.cluster_stack_utils import (
    StackNotFoundError,
    delete_stack_with_confirmation,
    MessageCallback,
    ConfirmCallback,
    SuccessCallback
)
from sagemaker.hyperpod.cli.common_utils import (
    parse_comma_separated_list,
    categorize_resources_by_type
)


class TestStackDeletionWorkflow:
    """Drive delete_stack_with_confirmation against a mocked CloudFormation client."""

    def setup_method(self):
        """Prepare the region, stack name and a four-resource stack listing."""
        self.region = 'us-west-2'
        self.stack_name = 'test-stack'

        # (logical id, resource type, physical id) for each fake stack resource
        rows = (
            ('EC2Instance1', 'AWS::EC2::Instance', 'i-1234567890abcdef0'),
            ('VPCStack', 'AWS::EC2::VPC', 'vpc-1234567890abcdef0'),
            ('S3Bucket1', 'AWS::S3::Bucket', 's3-bucket-name'),
            ('IAMRole1', 'AWS::IAM::Role', 'MyRole'),
        )
        self.sample_resources = [
            {
                'LogicalResourceId': logical_id,
                'ResourceType': resource_type,
                'PhysicalResourceId': physical_id,
            }
            for logical_id, resource_type, physical_id in rows
        ]

    def _install_client(self, mock_boto3_client):
        """Make boto3.client return a mock whose stack listing is the sample resources."""
        cf = Mock()
        mock_boto3_client.return_value = cf
        cf.list_stack_resources.return_value = {
            'StackResourceSummaries': self.sample_resources
        }
        return cf

    @staticmethod
    def _new_callbacks(confirmed):
        """Build the three callback mocks; the confirm mock answers ``confirmed``."""
        return {
            'message_callback': Mock(),
            'confirm_callback': Mock(return_value=confirmed),
            'success_callback': Mock(),
        }

    def _delete(self, callbacks, retain=""):
        """Invoke the function under test for the fixture stack."""
        delete_stack_with_confirmation(
            stack_name=self.stack_name,
            region=self.region,
            retain_resources_str=retain,
            **callbacks
        )

    @patch('boto3.client')
    def test_delete_stack_with_confirmation_success(self, mock_boto3_client):
        """A confirmed deletion lists resources, deletes, and fires every callback."""
        cf = self._install_client(mock_boto3_client)
        callbacks = self._new_callbacks(confirmed=True)

        self._delete(callbacks)

        # CloudFormation was queried and asked to delete exactly once each
        cf.list_stack_resources.assert_called_once_with(StackName=self.stack_name)
        cf.delete_stack.assert_called_once_with(StackName=self.stack_name)

        for key in ('message_callback', 'confirm_callback', 'success_callback'):
            assert callbacks[key].called

    @patch('boto3.client')
    def test_delete_stack_with_confirmation_cancelled(self, mock_boto3_client):
        """Declining the prompt skips deletion and reports the cancellation."""
        cf = self._install_client(mock_boto3_client)
        callbacks = self._new_callbacks(confirmed=False)

        self._delete(callbacks)

        cf.delete_stack.assert_not_called()
        callbacks['message_callback'].assert_any_call("Operation cancelled.")
        assert not callbacks['success_callback'].called

    @patch('boto3.client')
    def test_delete_stack_with_confirmation_stack_not_found(self, mock_boto3_client):
        """A 'does not exist' ValidationError surfaces as StackNotFoundError."""
        cf = Mock()
        mock_boto3_client.return_value = cf
        cf.list_stack_resources.side_effect = ClientError(
            {'Error': {'Code': 'ValidationError', 'Message': 'Stack does not exist'}},
            'ListStackResources'
        )

        callbacks = {
            'message_callback': Mock(),
            'confirm_callback': Mock(),
            'success_callback': Mock(),
        }

        with pytest.raises(StackNotFoundError):
            self._delete(callbacks)

    @patch('boto3.client')
    def test_delete_stack_with_retain_resources(self, mock_boto3_client):
        """Valid retain names are forwarded to delete_stack as RetainResources."""
        cf = self._install_client(mock_boto3_client)

        self._delete(self._new_callbacks(confirmed=True), retain="S3Bucket1,VPCStack")

        cf.delete_stack.assert_called_once_with(
            StackName=self.stack_name,
            RetainResources=['S3Bucket1', 'VPCStack']
        )

    @patch('boto3.client')
    def test_delete_stack_with_invalid_retain_resources(self, mock_boto3_client):
        """Unknown retain names trigger a warning and are dropped from the call."""
        cf = self._install_client(mock_boto3_client)
        callbacks = self._new_callbacks(confirmed=True)

        self._delete(callbacks, retain="S3Bucket1,NonExistentResource")

        # A warning about the unknown resource must have been emitted
        warnings_seen = [
            entry for entry in callbacks['message_callback'].call_args_list
            if "don't exist in the stack" in str(entry)
        ]
        assert len(warnings_seen) > 0

        # Only the resource that exists is retained
        cf.delete_stack.assert_called_once_with(
            StackName=self.stack_name,
            RetainResources=['S3Bucket1']
        )

    @patch('boto3.client')
    def test_delete_stack_termination_protection_error(self, mock_boto3_client):
        """A termination-protection failure raises and is reported via the message callback."""
        cf = self._install_client(mock_boto3_client)
        cf.delete_stack.side_effect = Exception(
            "Stack cannot be deleted while TerminationProtection is enabled"
        )
        callbacks = self._new_callbacks(confirmed=True)

        with pytest.raises(Exception):
            self._delete(callbacks)

        protection_notices = [
            entry for entry in callbacks['message_callback'].call_args_list
            if 'Termination Protection is enabled' in str(entry)
        ]
        assert len(protection_notices) > 0

    @patch('boto3.client')
    def test_delete_stack_retention_limitation_error(self, mock_boto3_client):
        """CloudFormation's DELETE_FAILED-only retention rule propagates as an exception."""
        cf = self._install_client(mock_boto3_client)
        cf.delete_stack.side_effect = Exception(
            "specify which resources to retain only when the stack is in the DELETE_FAILED state"
        )
        callbacks = self._new_callbacks(confirmed=True)

        with pytest.raises(Exception, match="specify which resources to retain only when the stack is in the DELETE_FAILED state"):
            self._delete(callbacks, retain="S3Bucket1")

    def test_delete_stack_with_logger(self):
        """A supplied logger receives info-level output."""
        logger = Mock(spec=logging.Logger)
        on_message = Mock()
        on_confirm = Mock(return_value=False)  # Cancel to avoid actual deletion

        with patch('boto3.client') as mock_boto3_client:
            self._install_client(mock_boto3_client)

            delete_stack_with_confirmation(
                stack_name=self.stack_name,
                region=self.region,
                retain_resources_str="",
                message_callback=on_message,
                confirm_callback=on_confirm,
                logger=logger
            )

        assert logger.info.called


class TestCallbackTypes:
    """Check that plain functions satisfy the callback type aliases."""

    def test_message_callback_type(self):
        """A str -> None function is usable as a MessageCallback."""
        def _sink(message: str) -> None:
            pass

        handler: MessageCallback = _sink
        handler("test message")

    def test_confirm_callback_type(self):
        """A str -> bool function is usable as a ConfirmCallback."""
        def _always_yes(message: str) -> bool:
            return True

        handler: ConfirmCallback = _always_yes
        answer = handler("test message")
        assert answer is True

    def test_success_callback_type(self):
        """A str -> None function is usable as a SuccessCallback."""
        def _sink(message: str) -> None:
            pass

        handler: SuccessCallback = _sink
        handler("test message")


class TestGenericUtilities:
    """Test suite for generic utilities from common_utils."""

    def test_parse_comma_separated_list(self):
        """Normal, padded, empty and None inputs all parse as expected."""
        cases = (
            ("item1,item2,item3", ["item1", "item2", "item3"]),  # normal case
            ("item1, item2 , item3", ["item1", "item2", "item3"]),  # with spaces
            ("", []),  # empty string
            (None, []),  # None
        )
        for raw, expected in cases:
            assert parse_comma_separated_list(raw) == expected

    def test_categorize_resources_by_type(self):
        """Resources are grouped under the category that lists their type."""
        resources = [
            {"ResourceType": "AWS::EC2::Instance", "LogicalResourceId": "MyInstance"},
            {"ResourceType": "AWS::S3::Bucket", "LogicalResourceId": "MyBucket"},
            {"ResourceType": "AWS::Lambda::Function", "LogicalResourceId": "MyFunction"}
        ]
        type_mappings = {
            "Compute": ["AWS::EC2::Instance", "AWS::Lambda::Function"],
            "Storage": ["AWS::S3::Bucket"]
        }

        grouped = categorize_resources_by_type(resources, type_mappings)

        expected = {
            "Compute": ["MyInstance", "MyFunction"],
            "Storage": ["MyBucket"]
        }
        assert grouped == expected


class TestStackNotFoundError:
    """Test suite for StackNotFoundError exception."""

    def test_stack_not_found_error_creation(self):
        """The exception can be raised and matched on its message."""
        with pytest.raises(StackNotFoundError, match="Test stack not found"):
            raise StackNotFoundError("Test stack not found")

    def test_stack_not_found_error_inheritance(self):
        """The exception is an Exception subclass."""
        instance = StackNotFoundError("Test error")
        assert isinstance(instance, Exception)


if __name__ == '__main__':
    pytest.main([__file__])
import pytest
from unittest.mock import Mock, patch, MagicMock
from click.testing import CliRunner
import click
import json

from sagemaker.hyperpod.cli.commands.cluster_stack import delete_cluster_stack
from sagemaker.hyperpod.cli.cluster_stack_utils import StackNotFoundError

# Patch targets shared by every test that stubs the deletion workflow.
_DELETE_TARGET = 'sagemaker.hyperpod.cli.commands.cluster_stack.delete_stack_with_confirmation'
_LOGGING_TARGET = 'sagemaker.hyperpod.cli.commands.cluster_stack.setup_logging'


def _client_error(code, message, operation):
    """Build a botocore ClientError with the given code, message and operation."""
    from botocore.exceptions import ClientError
    return ClientError({'Error': {'Code': code, 'Message': message}}, operation)


class TestDeleteClusterStack:
    """Test suite for delete cluster-stack command."""

    def setup_method(self):
        """Create a fresh Click runner for each test."""
        self.runner = CliRunner()

    def _run(self, *cli_args):
        """Invoke the delete command with the given argv and return the Click result."""
        return self.runner.invoke(delete_cluster_stack, list(cli_args))

    @staticmethod
    def _assert_delete_kwargs(mock_delete_stack, **expected):
        """Assert the workflow was called once with the expected keyword values."""
        mock_delete_stack.assert_called_once()
        _, passed = mock_delete_stack.call_args
        for key, value in expected.items():
            assert passed[key] == value

    @patch(_DELETE_TARGET)
    @patch(_LOGGING_TARGET)
    def test_successful_deletion_without_retention(self, mock_setup_logging, mock_delete_stack):
        """Without --retain-resources the workflow receives an empty retain string."""
        mock_setup_logging.return_value = Mock()

        outcome = self._run('test-stack', '--region', 'us-west-2', '--debug')

        assert outcome.exit_code == 0
        self._assert_delete_kwargs(
            mock_delete_stack,
            stack_name='test-stack',
            region='us-west-2',
            retain_resources_str="",
        )

    @patch(_DELETE_TARGET)
    @patch(_LOGGING_TARGET)
    def test_successful_deletion_with_retention(self, mock_setup_logging, mock_delete_stack):
        """The --retain-resources value is forwarded unchanged."""
        mock_setup_logging.return_value = Mock()

        outcome = self._run(
            'test-stack', '--retain-resources', 'S3BucketStack,VPCStack', '--region', 'us-west-2'
        )

        assert outcome.exit_code == 0
        self._assert_delete_kwargs(
            mock_delete_stack,
            stack_name='test-stack',
            region='us-west-2',
            retain_resources_str='S3BucketStack,VPCStack',
        )

    @patch(_DELETE_TARGET)
    @patch(_LOGGING_TARGET)
    def test_stack_not_found(self, mock_setup_logging, mock_delete_stack):
        """A missing stack prints an error marker but still exits 0."""
        mock_setup_logging.return_value = Mock()
        mock_delete_stack.side_effect = StackNotFoundError("Stack 'non-existent-stack' not found")

        outcome = self._run('non-existent-stack', '--region', 'us-west-2')

        assert outcome.exit_code == 0
        assert "❌ Stack 'non-existent-stack' not found" in outcome.output

    @patch(_DELETE_TARGET)
    @patch(_LOGGING_TARGET)
    def test_termination_protection_enabled(self, mock_setup_logging, mock_delete_stack):
        """A termination-protection ClientError exits 1 and shows the reason."""
        mock_setup_logging.return_value = Mock()
        mock_delete_stack.side_effect = _client_error(
            'ValidationError',
            'Stack cannot be deleted while TerminationProtection is enabled',
            'DeleteStack',
        )

        outcome = self._run('protected-stack', '--region', 'us-west-2')

        assert outcome.exit_code == 1
        assert "TerminationProtection is enabled" in outcome.output

    @patch(_DELETE_TARGET)
    @patch(_LOGGING_TARGET)
    def test_cloudformation_retention_limitation(self, mock_setup_logging, mock_delete_stack):
        """CloudFormation's retain-resources limitation exits 1 with its message."""
        mock_setup_logging.return_value = Mock()
        mock_delete_stack.side_effect = _client_error(
            'ValidationError',
            'specify which resources to retain only when the stack is in the DELETE_FAILED state',
            'DeleteStack',
        )

        outcome = self._run(
            'test-stack', '--retain-resources', 'S3BucketStack', '--region', 'us-west-2'
        )

        # CLI re-raises as ClickException, so exit code is 1
        assert outcome.exit_code == 1
        assert "DELETE_FAILED state" in outcome.output

    @patch(_DELETE_TARGET)
    @patch(_LOGGING_TARGET)
    def test_access_denied_error(self, mock_setup_logging, mock_delete_stack):
        """An AccessDenied ClientError exits 1 with its message."""
        mock_setup_logging.return_value = Mock()
        mock_delete_stack.side_effect = _client_error(
            'AccessDenied', 'Access denied', 'ListStackResources'
        )

        outcome = self._run('test-stack', '--region', 'us-west-2')

        # ClickException results in exit code 1
        assert outcome.exit_code == 1
        assert "Access denied" in outcome.output

    @patch(_DELETE_TARGET)
    @patch(_LOGGING_TARGET)
    def test_retain_resources_parsing(self, mock_setup_logging, mock_delete_stack):
        """The raw, space-padded retain string reaches the workflow untouched."""
        mock_setup_logging.return_value = Mock()
        padded = ' S3BucketStack , VPCStack , IAMRole1 '

        outcome = self._run('test-stack', '--retain-resources', padded, '--region', 'us-west-2')

        assert outcome.exit_code == 0
        self._assert_delete_kwargs(
            mock_delete_stack,
            retain_resources_str=' S3BucketStack , VPCStack , IAMRole1 ',
        )

    @patch(_DELETE_TARGET)
    @patch(_LOGGING_TARGET)
    def test_debug_logging(self, mock_setup_logging, mock_delete_stack):
        """Running with --debug sets up logging exactly once."""
        mock_setup_logging.return_value = Mock()

        outcome = self._run('test-stack', '--region', 'us-west-2', '--debug')

        assert outcome.exit_code == 0
        mock_setup_logging.assert_called_once()

    def test_command_help(self):
        """--help lists the summary, description and every option."""
        outcome = self._run('--help')

        assert outcome.exit_code == 0
        expected_snippets = (
            "Delete a HyperPod cluster stack.",
            "--retain-resources",
            "--region",
            "--debug",
            "Removes the specified CloudFormation stack and all associated AWS resources.",
        )
        for snippet in expected_snippets:
            assert snippet in outcome.output

    def test_required_region_flag(self):
        """Omitting --region is a usage error."""
        outcome = self._run('test-stack')

        assert outcome.exit_code == 2  # Click returns 2 for missing required options
        assert "Missing option '--region'" in outcome.output

    @patch(_DELETE_TARGET)
    @patch(_LOGGING_TARGET)
    def test_generic_error_handling(self, mock_setup_logging, mock_delete_stack):
        """An unexpected exception exits 1 and its message is shown."""
        mock_setup_logging.return_value = Mock()
        mock_delete_stack.side_effect = Exception("Unexpected error occurred")

        outcome = self._run('test-stack', '--region', 'us-west-2')

        assert outcome.exit_code == 1
        assert "Unexpected error occurred" in outcome.output


if __name__ == '__main__':
    pytest.main([__file__])
@patch('sagemaker.hyperpod.cluster_management.hp_cluster_stack.boto3.session.Session') + @patch('sagemaker.hyperpod.cluster_management.hp_cluster_stack.logging.getLogger') + def test_delete_successful_without_retention(self, mock_get_logger, mock_session, mock_delete_stack): + """Test successful stack deletion without resource retention.""" + # Setup mocks + mock_logger = MagicMock() + mock_get_logger.return_value = mock_logger + mock_session.return_value.region_name = 'us-west-2' + + # Execute delete + HpClusterStack.delete('test-stack', region='us-west-2') + + # Verify function calls + mock_delete_stack.assert_called_once() + call_args = mock_delete_stack.call_args + assert call_args[1]['stack_name'] == 'test-stack' + assert call_args[1]['region'] == 'us-west-2' + assert call_args[1]['retain_resources_str'] == "" + + @patch('sagemaker.hyperpod.cli.cluster_stack_utils.delete_stack_with_confirmation') + @patch('sagemaker.hyperpod.cluster_management.hp_cluster_stack.boto3.session.Session') + @patch('sagemaker.hyperpod.cluster_management.hp_cluster_stack.logging.getLogger') + def test_delete_successful_with_retention(self, mock_get_logger, mock_session, mock_delete_stack): + """Test successful stack deletion with resource retention.""" + # Setup mocks + mock_logger = MagicMock() + mock_get_logger.return_value = mock_logger + mock_session.return_value.region_name = 'us-west-2' + + # Execute delete with retention + HpClusterStack.delete('test-stack', region='us-west-2', retain_resources=['S3Bucket', 'EFSFileSystem']) + + # Verify function calls + mock_delete_stack.assert_called_once() + call_args = mock_delete_stack.call_args + assert call_args[1]['stack_name'] == 'test-stack' + assert call_args[1]['region'] == 'us-west-2' + assert call_args[1]['retain_resources_str'] == 'S3Bucket,EFSFileSystem' + + @patch('sagemaker.hyperpod.cli.cluster_stack_utils.delete_stack_with_confirmation') + @patch('sagemaker.hyperpod.cluster_management.hp_cluster_stack.boto3.session.Session') 
@patch('sagemaker.hyperpod.cluster_management.hp_cluster_stack.logging.getLogger')
def test_delete_with_auto_confirm(self, mock_get_logger, mock_session, mock_delete_stack):
    """Check that the confirm callback handed to the deletion helper auto-confirms.

    The callback must return True and log the prompt at info level with an
    "Auto-confirming: " prefix.
    """
    # Arrange: logger returned by getLogger and the session region.
    logger_mock = MagicMock()
    mock_get_logger.return_value = logger_mock
    mock_session.return_value.region_name = 'us-west-2'

    # Act
    HpClusterStack.delete('test-stack', region='us-west-2')

    # Assert: the helper was invoked exactly once.
    mock_delete_stack.assert_called_once()
    passed_kwargs = mock_delete_stack.call_args[1]

    # Call the captured callback directly; it should approve and log.
    auto_confirm = passed_kwargs['confirm_callback']
    assert auto_confirm("Test confirmation message") is True

    logger_mock.info.assert_called_with("Auto-confirming: Test confirmation message")


@patch('sagemaker.hyperpod.cli.cluster_stack_utils.delete_stack_with_confirmation')
@patch('sagemaker.hyperpod.cluster_management.hp_cluster_stack.boto3.session.Session')
@patch('sagemaker.hyperpod.cluster_management.hp_cluster_stack.logging.getLogger')
def test_delete_with_custom_logger(self, mock_get_logger, mock_session, mock_delete_stack):
    """Check that a caller-supplied logger receives message and success output."""
    # Arrange
    supplied_logger = MagicMock()
    mock_session.return_value.region_name = 'us-west-2'

    # Act
    HpClusterStack.delete('test-stack', region='us-west-2', logger=supplied_logger)

    # Assert
    mock_delete_stack.assert_called_once()
    passed_kwargs = mock_delete_stack.call_args[1]

    # Both callbacks are expected to forward to the supplied logger's info().
    on_message = passed_kwargs['message_callback']
    on_success = passed_kwargs['success_callback']

    on_message("Test message")
    on_success("Test success")

    supplied_logger.info.assert_any_call("Test message")
    supplied_logger.info.assert_any_call("Test success")


@patch('sagemaker.hyperpod.cli.cluster_stack_utils.delete_stack_with_confirmation')
@patch('sagemaker.hyperpod.cluster_management.hp_cluster_stack.boto3.session.Session')
@patch('sagemaker.hyperpod.cluster_management.hp_cluster_stack.logging.getLogger')
def test_delete_uses_default_region(self, mock_get_logger, mock_session, mock_delete_stack):
    """Check that the session's region is forwarded when no region is passed."""
    # Arrange
    logger_mock = MagicMock()
    mock_get_logger.return_value = logger_mock
    mock_session.return_value.region_name = 'us-east-1'

    # Act: no explicit region argument.
    HpClusterStack.delete('test-stack')

    # Assert
    mock_delete_stack.assert_called_once()
    passed_kwargs = mock_delete_stack.call_args[1]
    assert passed_kwargs['region'] == 'us-east-1'


@patch('sagemaker.hyperpod.cli.cluster_stack_utils.delete_stack_with_confirmation')
@patch('sagemaker.hyperpod.cluster_management.hp_cluster_stack.boto3.session.Session')
@patch('sagemaker.hyperpod.cluster_management.hp_cluster_stack.logging.getLogger')
def test_delete_stack_not_found(self, mock_get_logger, mock_session, mock_delete_stack):
    """Check that StackNotFoundError from the helper surfaces as ValueError."""
    from sagemaker.hyperpod.cli.cluster_stack_utils import StackNotFoundError

    # Arrange: the helper reports a missing stack.
    logger_mock = MagicMock()
    mock_get_logger.return_value = logger_mock
    mock_session.return_value.region_name = 'us-west-2'
    mock_delete_stack.side_effect = StackNotFoundError("Stack 'non-existent-stack' not found")

    # Act / Assert
    with self.assertRaises(ValueError) as raised:
        HpClusterStack.delete('non-existent-stack', region='us-west-2')

    assert "Stack 'non-existent-stack' not found" in str(raised.exception)
    logger_mock.error.assert_called_with("Stack 'non-existent-stack' not found")


@patch('sagemaker.hyperpod.cli.cluster_stack_utils.delete_stack_with_confirmation')
@patch('sagemaker.hyperpod.cluster_management.hp_cluster_stack.boto3.session.Session')
@patch('sagemaker.hyperpod.cluster_management.hp_cluster_stack.logging.getLogger')
def test_delete_termination_protection_enabled(self, mock_get_logger, mock_session, mock_delete_stack):
    """Check that a termination-protection ClientError becomes a RuntimeError."""
    from botocore.exceptions import ClientError

    # Arrange
    logger_mock = MagicMock()
    mock_get_logger.return_value = logger_mock
    mock_session.return_value.region_name = 'us-west-2'

    # The helper raises the ValidationError CloudFormation-style response below.
    protection_response = {
        'Error': {
            'Code': 'ValidationError',
            'Message': 'Stack cannot be deleted while TerminationProtection is enabled',
        }
    }
    mock_delete_stack.side_effect = ClientError(protection_response, 'DeleteStack')

    # Act / Assert
    with self.assertRaises(RuntimeError) as raised:
        HpClusterStack.delete('protected-stack', region='us-west-2')

    assert "Termination Protection is enabled" in str(raised.exception)
    logger_mock.error.assert_called()


@patch('sagemaker.hyperpod.cli.cluster_stack_utils.delete_stack_with_confirmation')
@patch('sagemaker.hyperpod.cluster_management.hp_cluster_stack.boto3.session.Session')
@patch('sagemaker.hyperpod.cluster_management.hp_cluster_stack.logging.getLogger')
def test_delete_retention_limitation(self, mock_get_logger, mock_session, mock_delete_stack):
    """Check that the DELETE_FAILED-only retention error becomes a ValueError."""
    from botocore.exceptions import ClientError

    # Arrange
    logger_mock = MagicMock()
    mock_get_logger.return_value = logger_mock
    mock_session.return_value.region_name = 'us-west-2'

    # The helper raises the retention-limitation ValidationError below.
    retention_response = {
        'Error': {
            'Code': 'ValidationError',
            'Message': 'specify which resources to retain only when the stack is in the DELETE_FAILED state',
        }
    }
    mock_delete_stack.side_effect = ClientError(retention_response, 'DeleteStack')

    # Act / Assert: deletion is requested with a resource to retain.
    with self.assertRaises(ValueError) as raised:
        HpClusterStack.delete('test-stack', region='us-west-2', retain_resources=['S3Bucket'])

    assert "retain_resources can only be used on stacks in DELETE_FAILED state" in str(raised.exception)
    logger_mock.error.assert_called()


@patch('sagemaker.hyperpod.cli.cluster_stack_utils.delete_stack_with_confirmation')
@patch('sagemaker.hyperpod.cluster_management.hp_cluster_stack.boto3.session.Session')
@patch('sagemaker.hyperpod.cluster_management.hp_cluster_stack.logging.getLogger')
def test_delete_access_denied_error(self, mock_get_logger, mock_session, mock_delete_stack):
    """Check that an AccessDenied ClientError becomes a RuntimeError."""
    from botocore.exceptions import ClientError

    # Arrange
    logger_mock = MagicMock()
    mock_get_logger.return_value = logger_mock
    mock_session.return_value.region_name = 'us-west-2'

    denied_response = {'Error': {'Code': 'AccessDenied', 'Message': 'Access denied'}}
    mock_delete_stack.side_effect = ClientError(denied_response, 'ListStackResources')

    # Act / Assert
    with self.assertRaises(RuntimeError) as raised:
        HpClusterStack.delete('test-stack', region='us-west-2')

    assert "Stack deletion failed" in str(raised.exception)
    logger_mock.error.assert_called()


@patch('sagemaker.hyperpod.cli.cluster_stack_utils.delete_stack_with_confirmation')
@patch('sagemaker.hyperpod.cluster_management.hp_cluster_stack.boto3.session.Session')
@patch('sagemaker.hyperpod.cluster_management.hp_cluster_stack.logging.getLogger')
def test_delete_generic_error(self, mock_get_logger, mock_session, mock_delete_stack):
    """Check that an arbitrary Exception from the helper becomes a RuntimeError."""
    # Arrange
    logger_mock = MagicMock()
    mock_get_logger.return_value = logger_mock
    mock_session.return_value.region_name = 'us-west-2'

    mock_delete_stack.side_effect = Exception("Unexpected error occurred")

    # Act / Assert
    with self.assertRaises(RuntimeError) as raised:
        HpClusterStack.delete('test-stack', region='us-west-2')

    assert "Stack deletion failed: Unexpected error occurred" in str(raised.exception)
    logger_mock.error.assert_called_with("Failed to delete stack: Unexpected error occurred")


@patch('sagemaker.hyperpod.cli.cluster_stack_utils.delete_stack_with_confirmation')
@patch('sagemaker.hyperpod.cluster_management.hp_cluster_stack.boto3.session.Session')
def test_delete_uses_default_logger_when_none_provided(self, mock_session, mock_delete_stack):
    """Check the callbacks passed to the helper when no logger argument is given.

    getLogger is not patched here; the test only verifies that both callbacks
    expose ``__self__``, i.e. look like bound methods.
    """
    # Arrange
    mock_session.return_value.region_name = 'us-west-2'

    # Act: no logger argument.
    HpClusterStack.delete('test-stack', region='us-west-2')

    # Assert
    mock_delete_stack.assert_called_once()
    passed_kwargs = mock_delete_stack.call_args[1]

    # Look both callbacks up first, then check each one.
    callbacks = [passed_kwargs[key] for key in ('message_callback', 'success_callback')]
    for callback in callbacks:
        assert hasattr(callback, '__self__')


@patch('sagemaker.hyperpod.cli.cluster_stack_utils.delete_stack_with_confirmation')
@patch('sagemaker.hyperpod.cluster_management.hp_cluster_stack.boto3.session.Session')
@patch('sagemaker.hyperpod.cluster_management.hp_cluster_stack.logging.getLogger')
def test_delete_empty_retain_resources_list(self, mock_get_logger, mock_session, mock_delete_stack):
    """Check that an empty retain_resources list is forwarded as an empty string."""
    # Arrange
    logger_mock = MagicMock()
    mock_get_logger.return_value = logger_mock
    mock_session.return_value.region_name = 'us-west-2'

    # Act
    HpClusterStack.delete('test-stack', region='us-west-2', retain_resources=[])

    # Assert
    mock_delete_stack.assert_called_once()
    passed_kwargs = mock_delete_stack.call_args[1]
    assert passed_kwargs['retain_resources_str'] == ""


@patch('sagemaker.hyperpod.cli.cluster_stack_utils.delete_stack_with_confirmation')
@patch('sagemaker.hyperpod.cluster_management.hp_cluster_stack.boto3.session.Session')
@patch('sagemaker.hyperpod.cluster_management.hp_cluster_stack.logging.getLogger')
def test_delete_none_retain_resources(self, mock_get_logger, mock_session, mock_delete_stack):
    """Check that retain_resources=None is forwarded as an empty string."""
    # Arrange
    logger_mock = MagicMock()
    mock_get_logger.return_value = logger_mock
    mock_session.return_value.region_name = 'us-west-2'

    # Act
    HpClusterStack.delete('test-stack', region='us-west-2', retain_resources=None)

    # Assert
    mock_delete_stack.assert_called_once()
    passed_kwargs = mock_delete_stack.call_args[1]
    assert passed_kwargs['retain_resources_str'] == ""