From 33788987ca464e92bf5e982af65301424116ccda Mon Sep 17 00:00:00 2001
From: Hanwen
Date: Fri, 13 Dec 2024 08:35:14 -0800
Subject: [PATCH] Add validators to check head node instance type and shared
 storage type w.r.t. cluster size

The requirements set by these validators are minimums. Users should leave
more safety margin based on their workloads.

Signed-off-by: Hanwen
---
 cli/src/pcluster/config/cluster_config.py     | 16 +++++++++
 .../pcluster/validators/cluster_validators.py | 36 +++++++++++++++++++
 2 files changed, 52 insertions(+)

diff --git a/cli/src/pcluster/config/cluster_config.py b/cli/src/pcluster/config/cluster_config.py
index 42c7cfc16f..2053eb7a31 100644
--- a/cli/src/pcluster/config/cluster_config.py
+++ b/cli/src/pcluster/config/cluster_config.py
@@ -88,6 +88,7 @@
     FsxArchitectureOsValidator,
     HeadNodeImdsValidator,
     HeadNodeLaunchTemplateValidator,
+    HeadNodeMemorySizeValidator,
     HostedZoneValidator,
     InstanceArchitectureCompatibilityValidator,
     IntelHpcArchitectureValidator,
@@ -108,6 +109,7 @@
     SchedulerDisableSudoAccessForDefaultUserValidator,
     SchedulerOsValidator,
     SchedulerValidator,
+    SharedEbsPerformanceBottleNeckValidator,
     SharedFileCacheNotHomeValidator,
     SharedStorageMountDirValidator,
     SharedStorageNameValidator,
@@ -3030,6 +3032,7 @@ def _register_validators(self, context: ValidatorContext = None):  # noqa: C901
         self._register_validator(MultiNetworkInterfacesInstancesValidator, queues=self.scheduling.queues)
         checked_images = []
         capacity_reservation_id_max_count_map = {}
+        total_max_compute_nodes = 0
         for index, queue in enumerate(self.scheduling.queues):
             queue_image = self.image_dict[queue.name]
             if index == 0:
@@ -3064,6 +3067,7 @@ def _register_validators(self, context: ValidatorContext = None):  # noqa: C901
                 self._register_validator(AmiOsCompatibleValidator, os=self.image.os, image_id=queue_image)
 
             for compute_resource in queue.compute_resources:
+                total_max_compute_nodes += compute_resource.max_count
                 self._register_validator(
                     InstanceArchitectureCompatibilityValidator,
                     instance_type_info_list=list(compute_resource.instance_type_info_map.values()),
@@ -3180,6 +3184,18 @@ def _register_validators(self, context: ValidatorContext = None):  # noqa: C901
                         compute_resource_tags=compute_resource.get_tags(),
                     )
 
+        self._register_validator(
+            HeadNodeMemorySizeValidator,
+            head_node_instance_type=self.head_node.instance_type,
+            total_max_compute_nodes=total_max_compute_nodes,
+        )
+        if self.shared_storage:
+            for storage in self.shared_storage:
+                if isinstance(storage, SharedEbs):
+                    self._register_validator(
+                        SharedEbsPerformanceBottleNeckValidator,
+                        total_max_compute_nodes=total_max_compute_nodes,
+                    )
         for capacity_reservation_id, num_of_instances in capacity_reservation_id_max_count_map.items():
             self._register_validator(
                 CapacityReservationSizeValidator,
diff --git a/cli/src/pcluster/validators/cluster_validators.py b/cli/src/pcluster/validators/cluster_validators.py
index 548db963c4..407143500e 100644
--- a/cli/src/pcluster/validators/cluster_validators.py
+++ b/cli/src/pcluster/validators/cluster_validators.py
@@ -1311,6 +1311,42 @@ def _validate(self, imds_secured: bool, scheduler: str):
             )
 
 
+class HeadNodeMemorySizeValidator(Validator):
+    """
+    Head Node Memory Size Validator.
+
+    Verify that the head node has enough memory to manage the compute nodes.
+    """
+
+    def _validate(self, head_node_instance_type: str, total_max_compute_nodes: int):
+        head_node_memory = (
+            AWSApi.instance().ec2.get_instance_type_info(head_node_instance_type).ec2memory_size_in_mib() / 1024
+        )
+        # Assume the OS takes up 0.6 GB of memory. Check only up to 16 GB, enough to rule out small instance types.
+        required_memory = min(total_max_compute_nodes / 25 + 0.6, 16)
+        if head_node_memory < required_memory:
+            self._add_failure(
+                f"Head node instance type {head_node_instance_type} has {head_node_memory} GB of memory. "
+                f"Please choose a head node instance type with at least {required_memory} GB of memory"
+                f" to manage {total_max_compute_nodes} compute nodes.",
+                FailureLevel.ERROR,
+            )
+
+
+class SharedEbsPerformanceBottleNeckValidator(Validator):
+    """Warn about a potential performance bottleneck when using shared EBS storage."""
+
+    def _validate(self, total_max_compute_nodes: int):
+        if total_max_compute_nodes > 100:
+            self._add_failure(
+                "EBS shared storage is mounted on the head node and shared to the compute nodes. "
+                "Therefore, the head node network bandwidth is a performance bottleneck "
+                "if the compute nodes rely on this shared storage. "
+                "Please use FSx or EFS for better performance.",
+                FailureLevel.WARNING,
+            )
+
+
 class ComputeResourceLaunchTemplateValidator(_LaunchTemplateValidator):
     """Try to launch the requested instances (in dry-run mode) to verify configuration parameters."""
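
As a quick illustration of the sizing rule added in HeadNodeMemorySizeValidator, the standalone sketch below (not part of the diff) reimplements only the arithmetic so the requirement can be estimated offline. The head node memory is passed in by the caller instead of being looked up through AWSApi, and the example cluster sizes and instance sizes in the comments are assumptions chosen for illustration.

# Illustrative sketch (not part of the patch): same formula as the validator,
# 1 GB per 25 compute nodes plus 0.6 GB assumed for the OS, capped at 16 GB.
def required_head_node_memory_gb(total_max_compute_nodes: int) -> float:
    """Return the minimum head node memory (GB) the validator would require."""
    return min(total_max_compute_nodes / 25 + 0.6, 16)


if __name__ == "__main__":
    # 50 nodes -> 2.6 GB, 300 nodes -> 12.6 GB (an 8 GiB head node would fail),
    # 1000 nodes -> hits the 16 GB cap.
    for nodes in (50, 300, 1000):
        print(f"{nodes} compute nodes -> at least {required_head_node_memory_gb(nodes)} GB of head node memory")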