Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[DFSM] Support live updates on compute nodes. #6003

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
80 changes: 21 additions & 59 deletions cli/src/pcluster/resources/compute_node/user_data.sh
Original file line number Diff line number Diff line change
Expand Up @@ -59,66 +59,29 @@ datasource_list: [ Ec2, None ]
output:
all: "| tee -a /var/log/cloud-init-output.log | logger -t user-data -s 2>/dev/console"
write_files:
- path: /tmp/dna.json
permissions: '0644'
owner: root:root
content: |
{
"cluster": {
"cluster_name": "${ClusterName}",
"stack_name": "${AWS::StackName}",
"stack_arn": "${AWS::StackId}",
"enable_efa": "${EnableEfa}",
"raid_shared_dir": "${RAIDSharedDir}",
"raid_type": "${RAIDType}",
"base_os": "${BaseOS}",
"region": "${AWS::Region}",
"shared_storage_type": "${SharedStorageType}",
"efs_fs_ids": "${EFSIds}",
"efs_shared_dirs": "${EFSSharedDirs}",
"efs_encryption_in_transits": "${EFSEncryptionInTransits}",
"efs_iam_authorizations": "${EFSIamAuthorizations}",
"fsx_fs_ids": "${FSXIds}",
"fsx_mount_names": "${FSXMountNames}",
"fsx_dns_names": "${FSXDNSNames}",
"fsx_volume_junction_paths": "${FSXVolumeJunctionPaths}",
"fsx_fs_types": "${FSXFileSystemTypes}",
"fsx_shared_dirs": "${FSXSharedDirs}",
"scheduler": "${Scheduler}",
"ephemeral_dir": "${EphemeralDir}",
"ebs_shared_dirs": "${EbsSharedDirs}",
"proxy": "${ProxyServer}",
"slurm_ddb_table": "${SlurmDynamoDBTable}",
"log_group_name": "${LogGroupName}",
"dns_domain": "${ClusterDNSDomain}",
"hosted_zone": "${ClusterHostedZone}",
"node_type": "ComputeFleet",
"cluster_user": "${OSUser}",
"enable_intel_hpc_platform": "${IntelHPCPlatform}",
"cw_logging_enabled": "${CWLoggingEnabled}",
"log_rotation_enabled": "${LogRotationEnabled}",
"scheduler_queue_name": "${QueueName}",
"scheduler_compute_resource_name": "${ComputeResourceName}",
"enable_efa_gdr": "${EnableEfaGdr}",
"custom_node_package": "${CustomNodePackage}",
"custom_awsbatchcli_package": "${CustomAwsBatchCliPackage}",
"use_private_hostname": "${UsePrivateHostname}",
"head_node_private_ip": "${HeadNodePrivateIp}",
"directory_service": {
"enabled": "${DirectoryServiceEnabled}"
},
"disable_sudo_access_for_default_user":"${DisableSudoAccessForDefault}"
}
}
- path: /etc/chef/client.rb
permissions: '0644'
owner: root:root
content: cookbook_path ['/etc/chef/cookbooks']
- path: /tmp/extra.json
permissions: '0644'
- path: /etc/cfn/cfn-hup.conf
permissions: '0400'
owner: root:root
content: |
${ExtraJson}
[main]
stack=${AWS::StackId}
region=${AWS::Region}
url=${CloudFormationUrl}
role=${CfnInitRole}
interval=2
- path: /etc/cfn/hooks.d/parallelcluster-update.conf
permissions: '0400'
owner: root:root
content: |
[parallelcluster-update]
triggers=post.update
path=Resources.${LaunchTemplateResourceId}.Metadata.AWS::CloudFormation::Init
action=PATH=/usr/local/bin:/bin:/usr/bin:/opt/aws/bin; . /etc/profile.d/pcluster.sh; cfn-init -v --stack ${AWS::StackName} --resource ${LaunchTemplateResourceId} --configsets update --region ${AWS::Region} --url ${CloudFormationUrl} --role ${CfnInitRole}
runas=root
- path: /tmp/bootstrap.sh
permissions: '0744'
owner: root:root
Expand Down Expand Up @@ -161,6 +124,10 @@ write_files:
fi
}

export PATH=/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin:/opt/aws/bin

cfn-init -s ${AWS::StackName} -v -c deployFiles -r ${LaunchTemplateResourceId} --region ${AWS::Region} --url ${CloudFormationUrl} --role ${CfnInitRole} || error_exit 'Failed to bootstrap the compute node. Please check /var/log/cfn-init.log in the compute node, or check the cfn-init.log in CloudWatch logs. Please refer to https://docs.aws.amazon.com/parallelcluster/latest/ug/troubleshooting-v3.html#troubleshooting-v3-get-logs for more details on ParallelCluster logs.'

[ -f /etc/profile.d/proxy.sh ] && . /etc/profile.d/proxy.sh

# Configure AWS CLI using the expected overrides, if any.
Expand All @@ -187,7 +154,6 @@ write_files:
cookbook_url=${!custom_cookbook}
fi
fi
export PATH=/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin:/opt/aws/bin
export parallelcluster_version=aws-parallelcluster-${ParallelClusterVersion}
export cookbook_version=${CookbookVersion}
export chef_version=${ChefVersion}
Expand All @@ -208,13 +174,9 @@ write_files:
fi
cd /tmp

mkdir -p /etc/chef/ohai/hints
touch /etc/chef/ohai/hints/ec2.json

# measure start time
start=$(date +%s)

jq --argfile f1 /tmp/dna.json --argfile f2 /tmp/extra.json -n '$f1 * $f2' > /etc/chef/dna.json || ( echo "jq not installed or invalid extra_json"; cp /tmp/dna.json /etc/chef/dna.json)
{
pushd /etc/chef &&
cinc-client --local-mode --config /etc/chef/client.rb --log_level info --force-formatter --no-color --chef-zero-port 8889 --json-attributes /etc/chef/dna.json --override-runlist aws-parallelcluster-entrypoints::init &&
Expand Down
21 changes: 21 additions & 0 deletions cli/src/pcluster/templates/cdk_builder_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -967,6 +967,27 @@ def _build_policy(self) -> List[iam.PolicyStatement]:
)
],
),
iam.PolicyStatement(
sid="CloudFormation",
actions=[
"cloudformation:DescribeStackResource",
],
effect=iam.Effect.ALLOW,
resources=[
self._format_arn(service="cloudformation", resource=f"stack/{Stack.of(self).stack_name}-*/*"),
],
),
iam.PolicyStatement(
sid="DynamoDBTable",
actions=["dynamodb:UpdateItem", "dynamodb:PutItem", "dynamodb:GetItem"],
effect=iam.Effect.ALLOW,
resources=[
self._format_arn(
service="dynamodb",
resource=f"table/{PCLUSTER_DYNAMODB_PREFIX}{Stack.of(self).stack_name}",
)
],
),
]


Expand Down
1 change: 1 addition & 0 deletions cli/src/pcluster/templates/cluster_stack.py
Original file line number Diff line number Diff line change
Expand Up @@ -449,6 +449,7 @@ def _add_fleet_and_scheduler_resources(self, cleanup_lambda, cleanup_lambda_role
dynamodb_table=self.scheduler_resources.dynamodb_table if self.scheduler_resources else None,
head_eni=self._head_eni,
slurm_construct=self.scheduler_resources,
cluster_bucket=self.bucket,
)

def _add_login_nodes_resources(self):
Expand Down
3 changes: 3 additions & 0 deletions cli/src/pcluster/templates/compute_fleet_stack.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ def __init__(
dynamodb_table,
head_eni,
slurm_construct: SlurmConstruct,
cluster_bucket,
):
super().__init__(scope, id)
self._cleanup_lambda = cleanup_lambda
Expand All @@ -61,6 +62,7 @@ def __init__(
self._dynamodb_table = dynamodb_table
self._head_eni = head_eni
self._slurm_construct = slurm_construct
self._cluster_bucket = cluster_bucket

self.launch_templates = {}
self.managed_compute_fleet_instance_roles = {}
Expand Down Expand Up @@ -95,6 +97,7 @@ def _add_resources(self):
head_eni=self._head_eni,
slurm_construct=self._slurm_construct,
compute_security_group=self._compute_security_group,
cluster_bucket=self._cluster_bucket,
)

self.managed_compute_fleet_instance_roles.update(queues_stack.managed_compute_instance_roles)
Expand Down
Loading
Loading