Skip to content

Commit

Permalink
Add validators to check head node instance type and shared storage ty…
Browse files Browse the repository at this point in the history
…pe w.r.t cluster size

The requirements set in these validators are minimum. Users should leave more safety margin considering their workloads.

Signed-off-by: Hanwen <[email protected]>
  • Loading branch information
hanwen-cluster committed Dec 18, 2024
1 parent a839865 commit 3378898
Show file tree
Hide file tree
Showing 2 changed files with 52 additions and 0 deletions.
16 changes: 16 additions & 0 deletions cli/src/pcluster/config/cluster_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@
FsxArchitectureOsValidator,
HeadNodeImdsValidator,
HeadNodeLaunchTemplateValidator,
HeadNodeMemorySizeValidator,
HostedZoneValidator,
InstanceArchitectureCompatibilityValidator,
IntelHpcArchitectureValidator,
Expand All @@ -108,6 +109,7 @@
SchedulerDisableSudoAccessForDefaultUserValidator,
SchedulerOsValidator,
SchedulerValidator,
SharedEbsPerformanceBottleNeckValidator,
SharedFileCacheNotHomeValidator,
SharedStorageMountDirValidator,
SharedStorageNameValidator,
Expand Down Expand Up @@ -3030,6 +3032,7 @@ def _register_validators(self, context: ValidatorContext = None): # noqa: C901
self._register_validator(MultiNetworkInterfacesInstancesValidator, queues=self.scheduling.queues)
checked_images = []
capacity_reservation_id_max_count_map = {}
total_max_compute_nodes = 0
for index, queue in enumerate(self.scheduling.queues):
queue_image = self.image_dict[queue.name]
if index == 0:
Expand Down Expand Up @@ -3064,6 +3067,7 @@ def _register_validators(self, context: ValidatorContext = None): # noqa: C901
self._register_validator(AmiOsCompatibleValidator, os=self.image.os, image_id=queue_image)

for compute_resource in queue.compute_resources:
total_max_compute_nodes += compute_resource.max_count
self._register_validator(
InstanceArchitectureCompatibilityValidator,
instance_type_info_list=list(compute_resource.instance_type_info_map.values()),
Expand Down Expand Up @@ -3180,6 +3184,18 @@ def _register_validators(self, context: ValidatorContext = None): # noqa: C901
compute_resource_tags=compute_resource.get_tags(),
)

self._register_validator(
HeadNodeMemorySizeValidator,
head_node_instance_type=self.head_node.instance_type,
total_max_compute_nodes=total_max_compute_nodes,
)
if self.shared_storage:
for storage in self.shared_storage:
if isinstance(storage, SharedEbs):
self._register_validator(
SharedEbsPerformanceBottleNeckValidator,
total_max_compute_nodes=total_max_compute_nodes,
)
for capacity_reservation_id, num_of_instances in capacity_reservation_id_max_count_map.items():
self._register_validator(
CapacityReservationSizeValidator,
Expand Down
36 changes: 36 additions & 0 deletions cli/src/pcluster/validators/cluster_validators.py
Original file line number Diff line number Diff line change
Expand Up @@ -1311,6 +1311,42 @@ def _validate(self, imds_secured: bool, scheduler: str):
)


class HeadNodeMemorySizeValidator(Validator):
"""
Head Node Instance Type Validator.
Verify if the Head Node has enough memory to manage compute nodes.
"""

def _validate(self, head_node_instance_type: str, total_max_compute_nodes: int):
head_node_memory = (
AWSApi.instance().ec2.get_instance_type_info(head_node_instance_type).ec2memory_size_in_mib() / 1024
)
# Assume OS takes up 0.6GB memory. Only check upto 16GB memory to prevent usage of small instance types.
required_memory = min(total_max_compute_nodes / 25 + 0.6, 16)
if head_node_memory < required_memory:
self._add_failure(
f"Head node instance type {head_node_instance_type} has {head_node_memory} GB of memory. "
f"Please choose a head node instance type with at least {required_memory} GB of memory"
f" to manage {total_max_compute_nodes} compute nodes.",
FailureLevel.ERROR,
)


class SharedEbsPerformanceBottleNeckValidator(Validator):
"""Warn potential performance bottleneck of using Shared EBS."""

def _validate(self, total_max_compute_nodes: int):
if total_max_compute_nodes > 100:
self._add_failure(
"EBS shared storage is mounted on the head node and shared to the compute nodes. "
"Therefore, the head node network bandwidth is a performance bottle neck "
"if the compute nodes rely on this shared storage. "
"Please use FSx and EFS for better performance.",
FailureLevel.WARNING,
)


class ComputeResourceLaunchTemplateValidator(_LaunchTemplateValidator):
"""Try to launch the requested instances (in dry-run mode) to verify configuration parameters."""

Expand Down

0 comments on commit 3378898

Please sign in to comment.