Skip to content

Commit

Permalink
Merge branch 'develop' into wip/mgiacomo/390/dfsm2/forced-update-0108
Browse files Browse the repository at this point in the history
  • Loading branch information
gmarciani authored Jan 16, 2024
2 parents 088ce00 + 2b10b88 commit 9ad63df
Show file tree
Hide file tree
Showing 5 changed files with 75 additions and 1 deletion.
4 changes: 3 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@ CHANGELOG
- Add support for Python 3.11, 3.12 in pcluster CLI and aws-parallelcluster-batch-cli.
- Upgrade Python to version 3.12 and NodeJS to version 18 in ParallelCluster Lambda Layer.
- Build network interfaces using network card index from `NetworkCardIndex` list of EC2 DescribeInstances response,
instead of looping over `MaximumNetworkCards` range.
instead of looping over `MaximumNetworkCards` range.
- Fail cluster creation when using instance types P3, G3, P2 and G2 because their GPU architecture is not compatible with the Open Source NVIDIA GPU drivers (OpenRM) introduced as part of the 3.8.0 release.

3.8.0
------
Expand Down Expand Up @@ -51,6 +52,7 @@ CHANGELOG
- Upgrade NVIDIA driver to version 535.129.03.
- Upgrade CUDA Toolkit to version 12.2.2.
- Use Open Source NVIDIA GPU drivers (OpenRM) as NVIDIA kernel module for Linux instead of NVIDIA closed source module.
- This change removes support for P3, G3, P2 and G2 instances, whose GPU architecture is not supported by OpenRM. The Open Source NVIDIA driver only works on platforms that have the GSP (GPU System Processor).
- Remove support of `all_or_nothing_batch` configuration parameter in the Slurm resume program, in favor of the new `Scheduling/ScalingStrategy` cluster configuration.
- Changed cluster alarms naming convention to '[cluster-name]-[component-name]-[metric]'.
- Change default EBS volume types in ADC regions from `gp2` to `gp3`, for both the root and additional volumes.
Expand Down
1 change: 1 addition & 0 deletions cli/src/pcluster/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
DELETION_POLICIES_WITH_SNAPSHOT = DELETION_POLICIES + ["Snapshot"]
SUPPORTED_ARCHITECTURES = ["x86_64", "arm64"]
SUPPORTED_OSES_FOR_ARCHITECTURE = {"x86_64": SUPPORTED_OSES, "arm64": SUPPORTED_OSES}
# Instance type families (the part of the instance type before the first "."),
# e.g. "p3" for "p3.2xlarge", that are rejected when used with an official
# ParallelCluster AMI because the NVIDIA OpenRM drivers do not support them.
NVIDIA_OPENRM_UNSUPPORTED_INSTANCE_TYPES = ["p3", "p3dn", "p2", "g3", "g3s", "g2"]
SLURM = "slurm"
AWSBATCH = "awsbatch"

Expand Down
14 changes: 14 additions & 0 deletions cli/src/pcluster/validators/ec2_validators.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from pcluster.aws.aws_resources import CapacityReservationInfo
from pcluster.aws.common import AWSClientError
from pcluster.config.common import CapacityType
from pcluster.constants import NVIDIA_OPENRM_UNSUPPORTED_INSTANCE_TYPES
from pcluster.utils import get_resource_name_from_resource_arn
from pcluster.validators.common import FailureLevel, Validator

Expand Down Expand Up @@ -147,6 +148,19 @@ def _validate(self, instance_type: str, image: str):
FailureLevel.ERROR,
)

if (
image_info
and "AWS ParallelCluster AMI" in image_info.description
and instance_type.split(".")[0] in NVIDIA_OPENRM_UNSUPPORTED_INSTANCE_TYPES
):
self._add_failure(
f"The instance type '{instance_type}' is not supported by NVIDIA OpenRM drivers. "
f"OpenRM can only be used on any Turing or later GPU architectures. "
f"Please consider using a different instance type or building a custom AMI "
f"with closed source NVIDIA drivers.",
FailureLevel.ERROR,
)

def _validate_base_ami(self, image: str):
try:
ami_id = imagebuilder_utils.get_ami_id(image)
Expand Down
1 change: 1 addition & 0 deletions cli/tests/pcluster/models/test_imagebuilder.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@
},
}
],
"Description": "AWS ParallelCluster AMI for alinux2",
},
None,
[
Expand Down
56 changes: 56 additions & 0 deletions cli/tests/pcluster/validators/test_ec2_validators.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,7 @@ def test_instance_type_memory_info_validator(mocker, instance_type, instance_typ
{
"ImageId": "ami-0185634c5a8a37250",
"Architecture": "x86_64",
"Description": "AWS ParallelCluster AMI",
"BlockDeviceMappings": [
{
"DeviceName": "/dev/xvda",
Expand All @@ -266,6 +267,7 @@ def test_instance_type_memory_info_validator(mocker, instance_type, instance_typ
{
"ImageId": "ami-0185634c5a8a37250",
"Architecture": "x86_64",
"Description": "AWS ParallelCluster AMI",
"BlockDeviceMappings": [
{
"DeviceName": "/dev/xvda",
Expand Down Expand Up @@ -299,6 +301,7 @@ def test_instance_type_memory_info_validator(mocker, instance_type, instance_typ
{
"ImageId": "ami-0185634c5a8a37250",
"Architecture": "x86_64",
"Description": "AWS ParallelCluster AMI",
"BlockDeviceMappings": [
{
"DeviceName": "/dev/xvda",
Expand All @@ -316,6 +319,59 @@ def test_instance_type_memory_info_validator(mocker, instance_type, instance_typ
["m6g.xlarge", "c5.xlarge"],
[],
),
(
"p3.2xlarge",
"ami-0185634c5a8a37250",
"The instance type 'p3.2xlarge' is not supported by NVIDIA OpenRM drivers. "
"OpenRM can only be used on any Turing or later GPU architectures. "
"Please consider using a different instance type or building a custom AMI "
"with closed source NVIDIA drivers.",
{
"ImageId": "ami-0185634c5a8a37250",
"Architecture": "x86_64",
"Description": "AWS ParallelCluster AMI",
"BlockDeviceMappings": [
{
"DeviceName": "/dev/xvda",
"Ebs": {
"DeleteOnTermination": True,
"SnapshotId": "snap-0a20b6671bc5e3ead",
"VolumeSize": 25,
"VolumeType": "gp2",
"Encrypted": False,
},
}
],
},
None,
["p3.2xlarge", "c5.xlarge"],
["x86_64"],
),
(
"p3.2xlarge",
"ami-0185634c5a8a37250",
None,
{
"ImageId": "ami-0185634c5a8a37250",
"Architecture": "x86_64",
"Description": "Custom AMI",
"BlockDeviceMappings": [
{
"DeviceName": "/dev/xvda",
"Ebs": {
"DeleteOnTermination": True,
"SnapshotId": "snap-0a20b6671bc5e3ead",
"VolumeSize": 25,
"VolumeType": "gp2",
"Encrypted": False,
},
}
],
},
None,
["p3.2xlarge", "c5.xlarge"],
["x86_64"],
),
],
)
def test_instance_type_base_ami_compatible_validator(
Expand Down

0 comments on commit 9ad63df

Please sign in to comment.