@@ -52,6 +52,8 @@ VOLUME_SIZE=10 # in GB
5252KEY_NAME=" ${AWS_KEY_NAME:- aws_auto} " # Key path is assumed to be ~/.ssh/${KEY_NAME}.pem
5353SECURITY_GROUP=" ${AWS_SECURITY_GROUP:- } "
5454
55+ RETRIES=5 # Number of retries when a problem occurs during instance processing
56+
5557# --------------------
5658# Internal variables (do not modify)
5759# --------------------
@@ -65,13 +67,16 @@ CREATED_SECURITY_GROUP=""
#######################################
# Cleanup function to delete the created security group on exit.
# Does nothing when no security group was created by this script.
# Globals:   CREATED_SECURITY_GROUP (read) - id of the group to delete,
#            empty when there is nothing to clean up
# Outputs:   a confirmation on stdout, or a warning on stderr when the
#            deletion fails and the group has to be removed manually
# Returns:   0 always; cleanup is best-effort and must not mask the
#            script's own exit status
#######################################
cleanup() {
  # Expand without padding so an empty value is really seen as empty.
  local group_id="${CREATED_SECURITY_GROUP:-}"

  if [[ -z "${group_id}" ]]; then
    return 0
  fi

  if aws ec2 delete-security-group --group-id "${group_id}"; then
    echo "Cleaned up security group: ${group_id}"
  else
    echo "Failed to clean up security group ${group_id}; you should delete it manually." >&2
  fi
  return 0
}
7277
7378check_prerequisites () {
74- if (( BASH_VERSINFO[0 ] < 4 )) ; then
79+ if (( BASH_VERSINFO[0 ] < 4 )) ; then
7580 echo " Error: This script requires Bash version 4 or higher." >&2
7681 exit 1
7782 fi
@@ -148,36 +153,28 @@ get_arch() {
148153 fi
149154}
150155
151- process_instance () {
152- INSTANCE_NAME=$1
153- AMI_ID=$2
154- echo " Running instance for ${INSTANCE_NAME} with AMI ${AMI_ID} "
155-
156- INSTANCE_ID=$( aws ec2 run-instances \
157- --image-id ${AMI_ID} \
158- --instance-type ${INSTANCE_NAME} \
159- --key-name ${KEY_NAME} \
160- --block-device-mappings " DeviceName=/dev/sda1,Ebs={VolumeSize=${VOLUME_SIZE} }" \
161- --associate-public-ip-address \
162- --security-group-ids ${SECURITY_GROUP} \
163- --count " 1" --query ' Instances[0].InstanceId' --output text)
164-
165- echo " Waiting for instance ${INSTANCE_ID} to be ready..."
166- aws ec2 wait instance-status-ok --instance-ids ${INSTANCE_ID}
167- echo " Started instance: ${INSTANCE_ID} "
168-
169- PUBLIC_IP=$( aws ec2 describe-instances \
170- --instance-ids ${INSTANCE_ID} \
171- --query " Reservations[0].Instances[0].PublicIpAddress" --output text)
172- echo " Instance ${INSTANCE_ID} public IP: ${PUBLIC_IP} "
#######################################
# Poll a host over SSH until a trivial remote command succeeds.
# Globals:   SSH_COMMAND (read) - ssh invocation including its options
# Arguments: $1 - IP address or hostname of the instance
#            $2 - maximum number of attempts (optional, default 30)
#            $3 - seconds to wait between attempts (optional, default 5)
# Outputs:   a diagnostic on stderr when SSH never becomes ready
# Returns:   0 as soon as SSH answers, 1 once all attempts are used up
#######################################
ensure_ssh_ready() {
  local ip="$1"
  local max_attempts="${2:-30}"
  local delay="${3:-5}"
  local i # keep the loop counter out of the global scope

  for ((i = 1; i <= max_attempts; i++)); do
    # SSH_COMMAND is intentionally unquoted: it holds the command plus its
    # options and has to be split into separate words.
    # stdin is detached so ssh cannot swallow input meant for the caller.
    # shellcheck disable=SC2086
    if ${SSH_COMMAND} "ubuntu@${ip}" "echo SSH Ready" < /dev/null > /dev/null 2>&1; then
      return 0
    fi
    # No point in sleeping once the final attempt has failed.
    if ((i < max_attempts)); then
      sleep "${delay}"
    fi
  done

  echo "SSH did not become ready on ${ip} after waiting." >&2
  return 1
}
173167
168+ do_remote_work () {
169+ local ip=" $1 "
174170 git ls-files -z | rsync -avz --partial --progress --from0 --files-from=- -e " ${SSH_COMMAND} " \
175- ./ ubuntu@${PUBLIC_IP} :~ /${PROJECT_DIR}
176- ${SSH_COMMAND} ubuntu@${PUBLIC_IP} << EOF
171+ ./ ubuntu@${ip} :~ /${PROJECT_DIR} || return 1
172+
173+ ${SSH_COMMAND} ubuntu@${ip} << EOF
177174 set -e # Exit on error
178175 cd ~/${PROJECT_DIR}
179176
180- echo "Updating and installing dependencies on ${INSTANCE_NAME} ..."
177+ echo "Updating and installing dependencies..."
181178 sudo apt update
182179 sudo DEBIAN_FRONTEND=noninteractive apt install -y \
183180 linux-tools-common linux-tools-generic g++ clang cmake python3
@@ -202,14 +199,52 @@ process_instance() {
202199 CC=clang CXX=clang++ cmake -B build . && cmake --build build
203200 ./scripts/generate_multiple_tables.py clang++
204201EOF
202+ }
203+
#######################################
# Launch one EC2 instance, run the remote workload on it, copy the results
# back and terminate the instance again.
#
# Once an instance has been launched it is always terminated, whatever
# happens in between, so a failed step cannot leave a running instance.
#
# Globals:   KEY_NAME, VOLUME_SIZE, SECURITY_GROUP (read) - launch settings
#            RETRIES (read) - number of attempts for the remote work
#            SSH_COMMAND, PROJECT_DIR (read) - used to fetch the outputs
# Arguments: $1 - EC2 instance type; also names the ./outputs/<type>/ dir
#            $2 - AMI id to boot
# Outputs:   progress on stdout, problems on stderr
# Returns:   0 when every step succeeded, 1 otherwise
#######################################
process_instance() {
  # Upper-case names are kept so helpers called from here can still read
  # them through dynamic scoping; `local` stops them leaking any further.
  local INSTANCE_NAME="$1"
  local AMI_ID="$2"
  local INSTANCE_ID=""
  local PUBLIC_IP=""
  local max_attempts="${RETRIES:-5}"
  local attempt
  local ssh_ready=false
  local work_succeeded=false
  local status=0

  echo "Running instance for ${INSTANCE_NAME} with AMI ${AMI_ID}"

  INSTANCE_ID=$(aws ec2 run-instances \
    --image-id "${AMI_ID}" \
    --instance-type "${INSTANCE_NAME}" \
    --key-name "${KEY_NAME}" \
    --block-device-mappings "DeviceName=/dev/sda1,Ebs={VolumeSize=${VOLUME_SIZE}}" \
    --associate-public-ip-address \
    --security-group-ids "${SECURITY_GROUP}" \
    --count "1" --query 'Instances[0].InstanceId' --output text) || INSTANCE_ID=""

  # Without an instance id there is nothing to wait for or to terminate.
  if [[ -z "${INSTANCE_ID}" || "${INSTANCE_ID}" == "None" ]]; then
    echo "Failed to launch an instance for ${INSTANCE_NAME}." >&2
    return 1
  fi

  echo "Waiting for instance ${INSTANCE_ID} to be ready..."
  if aws ec2 wait instance-status-ok --instance-ids "${INSTANCE_ID}"; then
    echo "Started instance: ${INSTANCE_ID}"

    PUBLIC_IP=$(aws ec2 describe-instances \
      --instance-ids "${INSTANCE_ID}" \
      --query "Reservations[0].Instances[0].PublicIpAddress" --output text) || PUBLIC_IP=""
    echo "Instance ${INSTANCE_ID} public IP: ${PUBLIC_IP}"

    # `--output text` prints the literal "None" for a missing attribute.
    if [[ -z "${PUBLIC_IP}" || "${PUBLIC_IP}" == "None" ]]; then
      echo "Could not determine a public IP for instance ${INSTANCE_ID}." >&2
      status=1
    elif ensure_ssh_ready "${PUBLIC_IP}"; then
      ssh_ready=true
    else
      status=1
    fi
  else
    echo "Instance ${INSTANCE_ID} did not reach status 'ok'." >&2
    status=1
  fi

  # Remote work and the output copy only make sense over a working SSH link.
  if [[ "${ssh_ready}" == true ]]; then
    for ((attempt = 1; attempt <= max_attempts; attempt++)); do
      if do_remote_work "${PUBLIC_IP}"; then
        echo "Remote work completed successfully"
        work_succeeded=true
        break
      fi
      if ((attempt < max_attempts)); then
        echo "Attempt ${attempt} failed, retrying..."
        sleep 10
      else
        echo "Attempt ${attempt} failed; giving up after ${max_attempts} attempts." >&2
      fi
    done
    if [[ "${work_succeeded}" != true ]]; then
      status=1
    fi

    # Outputs are fetched even after a failed run: partial results and logs
    # are still useful for diagnosing what went wrong.
    mkdir -p "./outputs/${INSTANCE_NAME}"
    if ! rsync -avz --partial --progress -e "${SSH_COMMAND}" \
      "ubuntu@${PUBLIC_IP}:~/${PROJECT_DIR}/outputs/" "./outputs/${INSTANCE_NAME}/"; then
      echo "Failed to copy outputs from ${PUBLIC_IP}" >&2
      status=1
    fi
  fi

  if aws ec2 terminate-instances --instance-ids "${INSTANCE_ID}"; then
    echo "Terminated instance: ${INSTANCE_ID}"
  else
    echo "Failed to terminate instance ${INSTANCE_ID}; you should terminate it manually." >&2
    status=1
  fi

  return "${status}"
}
214249
215250main () {
@@ -219,11 +254,12 @@ main () {
219254 create_security_group
220255
221256 echo " Launching ${# INSTANCES_aarch64[@]} aarch64 instances and ${# INSTANCES_x86_64[@]} x86_64 instances in parallel..."
222- for INSTANCE_NAME in " ${INSTANCES_x86_64 [@]} " " ${INSTANCES_aarch64 [@]} " ; do
257+ for INSTANCE_NAME in " ${INSTANCES_aarch64 [@]} " " ${INSTANCES_x86_64 [@]} " ; do
223258 ARCH=$( get_arch " $INSTANCE_NAME " )
224259 AMI_ID=" ${AMI_MAP[$ARCH]} "
225260
226261 process_instance " ${INSTANCE_NAME} " " ${AMI_ID} " 2>&1 | tee " ${INSTANCE_NAME} .log" &
262+ sleep 1
227263 done
228264
229265 # Wait for all background jobs to finish
0 commit comments