Skip to content

Commit 90e0420

Browse files
committed
make the AWS script more robust
1 parent 201a466 commit 90e0420

File tree

2 files changed

+72
-36
lines changed

2 files changed

+72
-36
lines changed

scripts/aws_tests.bash

Lines changed: 69 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,8 @@ VOLUME_SIZE=10 # in GB
5252
KEY_NAME="${AWS_KEY_NAME:-aws_auto}" # Key path is assumed to be ~/.ssh/${KEY_NAME}.pem
5353
SECURITY_GROUP="${AWS_SECURITY_GROUP:-}"
5454

55+
RETRIES=5 # Number of retries when a problem occurs during instance processing
56+
5557
# --------------------
5658
# Internal variables (do not modify)
5759
# --------------------
@@ -65,13 +67,16 @@ CREATED_SECURITY_GROUP=""
6567
# Cleanup function to delete created security group on exit
6668
#######################################
# Delete the security group recorded in CREATED_SECURITY_GROUP, if any.
# Globals:   CREATED_SECURITY_GROUP (read) - ID of the group to delete;
#            empty or unset means there is nothing to clean up
# Arguments: none
# Outputs:   a confirmation on stdout, or a failure notice on stderr
# Returns:   0 in every case; a failed deletion is only reported, so the
#            caller is never aborted by the cleanup itself
#######################################
cleanup() {
  # Nothing to do when no group ID was recorded.
  [[ -n "${CREATED_SECURITY_GROUP:-}" ]] || return 0

  # Delete exactly once and report what actually happened instead of
  # announcing a cleanup that may have failed.
  if aws ec2 delete-security-group --group-id "${CREATED_SECURITY_GROUP}"; then
    echo "Cleaned up security group: ${CREATED_SECURITY_GROUP}"
  else
    echo "Failed to clean up security group ${CREATED_SECURITY_GROUP}; you should delete it manually." >&2
  fi
  return 0
}
7277

7378
check_prerequisites() {
74-
if ((BASH_VERSINFO[0] < 4)); then
79+
if (( BASH_VERSINFO[0] < 4 )); then
7580
echo "Error: This script requires Bash version 4 or higher." >&2
7681
exit 1
7782
fi
@@ -148,36 +153,28 @@ get_arch() {
148153
fi
149154
}
150155

151-
process_instance() {
152-
INSTANCE_NAME=$1
153-
AMI_ID=$2
154-
echo "Running instance for ${INSTANCE_NAME} with AMI ${AMI_ID}"
155-
156-
INSTANCE_ID=$(aws ec2 run-instances \
157-
--image-id ${AMI_ID} \
158-
--instance-type ${INSTANCE_NAME} \
159-
--key-name ${KEY_NAME} \
160-
--block-device-mappings "DeviceName=/dev/sda1,Ebs={VolumeSize=${VOLUME_SIZE}}" \
161-
--associate-public-ip-address \
162-
--security-group-ids ${SECURITY_GROUP} \
163-
--count "1" --query 'Instances[0].InstanceId' --output text)
164-
165-
echo "Waiting for instance ${INSTANCE_ID} to be ready..."
166-
aws ec2 wait instance-status-ok --instance-ids ${INSTANCE_ID}
167-
echo "Started instance: ${INSTANCE_ID}"
168-
169-
PUBLIC_IP=$(aws ec2 describe-instances \
170-
--instance-ids ${INSTANCE_ID} \
171-
--query "Reservations[0].Instances[0].PublicIpAddress" --output text)
172-
echo "Instance ${INSTANCE_ID} public IP: ${PUBLIC_IP}"
156+
#######################################
# Poll a host until a trivial SSH command succeeds.
# Globals:   SSH_COMMAND (read) - command used to open the SSH session
# Arguments: $1 - IP address of the host to probe
#            $2 - maximum number of probes (optional, default 30)
#            $3 - seconds to pause between probes (optional, default 5)
# Outputs:   a diagnostic on stderr when no probe succeeded
# Returns:   0 as soon as one probe succeeds, 1 when all of them fail
#######################################
ensure_ssh_ready() {
  local ip="$1"
  local max_attempts="${2:-30}"
  local delay="${3:-5}"
  local attempt

  for ((attempt = 1; attempt <= max_attempts; attempt++)); do
    # SSH_COMMAND is left unquoted on purpose: it is presumably the ssh
    # binary plus its options in one string and must be word-split here.
    # shellcheck disable=SC2086
    if ${SSH_COMMAND} "ubuntu@${ip}" "echo SSH Ready" >/dev/null 2>&1; then
      return 0
    fi
    # Pausing after the final probe would only delay the failure report.
    if ((attempt < max_attempts)); then
      sleep "${delay}"
    fi
  done

  echo "SSH did not become ready on ${ip} after waiting." >&2
  return 1
}
173167

168+
do_remote_work() {
169+
local ip="$1"
174170
git ls-files -z | rsync -avz --partial --progress --from0 --files-from=- -e "${SSH_COMMAND}" \
175-
./ ubuntu@${PUBLIC_IP}:~/${PROJECT_DIR}
176-
${SSH_COMMAND} ubuntu@${PUBLIC_IP} << EOF
171+
./ ubuntu@${ip}:~/${PROJECT_DIR} || return 1
172+
173+
${SSH_COMMAND} ubuntu@${ip} << EOF
177174
set -e # Exit on error
178175
cd ~/${PROJECT_DIR}
179176
180-
echo "Updating and installing dependencies on ${INSTANCE_NAME}..."
177+
echo "Updating and installing dependencies..."
181178
sudo apt update
182179
sudo DEBIAN_FRONTEND=noninteractive apt install -y \
183180
linux-tools-common linux-tools-generic g++ clang cmake python3
@@ -202,14 +199,52 @@ process_instance() {
202199
CC=clang CXX=clang++ cmake -B build . && cmake --build build
203200
./scripts/generate_multiple_tables.py clang++
204201
EOF
202+
}
203+
204+
#######################################
# Launch one EC2 instance, run the remote workload on it, copy the outputs
# back and terminate the instance again.
# Globals:   KEY_NAME, VOLUME_SIZE, SECURITY_GROUP, RETRIES, SSH_COMMAND,
#            PROJECT_DIR (all read)
# Arguments: $1 - EC2 instance type; also names ./outputs/<type>/
#            $2 - AMI ID to boot
# Outputs:   progress messages on stdout, failures on stderr
# Returns:   0 when the remote work succeeded and the instance was
#            terminated, 1 otherwise
#######################################
process_instance() {
  # Kept under their original names (now local) so that helpers called from
  # here can still read them through dynamic scoping.
  local INSTANCE_NAME=$1
  local AMI_ID=$2
  local INSTANCE_ID=""
  local PUBLIC_IP=""
  local max_attempts="${RETRIES:-5}"
  local attempt
  local ssh_ok=0
  local status=0

  echo "Running instance for ${INSTANCE_NAME} with AMI ${AMI_ID}"

  # SECURITY_GROUP stays unquoted so that a value holding several
  # space-separated group IDs still expands to separate arguments.
  # shellcheck disable=SC2086
  INSTANCE_ID=$(aws ec2 run-instances \
    --image-id "${AMI_ID}" \
    --instance-type "${INSTANCE_NAME}" \
    --key-name "${KEY_NAME}" \
    --block-device-mappings "DeviceName=/dev/sda1,Ebs={VolumeSize=${VOLUME_SIZE}}" \
    --associate-public-ip-address \
    --security-group-ids ${SECURITY_GROUP} \
    --count "1" --query 'Instances[0].InstanceId' --output text) || INSTANCE_ID=""

  # Without an instance ID there is nothing to wait for or to terminate.
  if [[ -z "${INSTANCE_ID}" || "${INSTANCE_ID}" == "None" ]]; then
    echo "Failed to launch instance for ${INSTANCE_NAME}" >&2
    return 1
  fi

  # From here on every failure only sets status, so that the termination
  # step at the bottom is always reached and the instance is not leaked.
  echo "Waiting for instance ${INSTANCE_ID} to be ready..."
  if aws ec2 wait instance-status-ok --instance-ids "${INSTANCE_ID}"; then
    echo "Started instance: ${INSTANCE_ID}"
  else
    echo "Instance ${INSTANCE_ID} did not reach status ok." >&2
    status=1
  fi

  if ((status == 0)); then
    PUBLIC_IP=$(aws ec2 describe-instances \
      --instance-ids "${INSTANCE_ID}" \
      --query "Reservations[0].Instances[0].PublicIpAddress" --output text) || PUBLIC_IP=""
    if [[ -z "${PUBLIC_IP}" || "${PUBLIC_IP}" == "None" ]]; then
      echo "Could not determine the public IP of instance ${INSTANCE_ID}." >&2
      status=1
    else
      echo "Instance ${INSTANCE_ID} public IP: ${PUBLIC_IP}"
    fi
  fi

  if ((status == 0)); then
    if ensure_ssh_ready "${PUBLIC_IP}"; then
      ssh_ok=1
    else
      status=1
    fi
  fi

  if ((ssh_ok)); then
    # Assume failure until one attempt succeeds, so that running out of
    # attempts is reported instead of passing silently.
    status=1
    for ((attempt = 1; attempt <= max_attempts; attempt++)); do
      if do_remote_work "${PUBLIC_IP}"; then
        echo "Remote work completed successfully"
        status=0
        break
      fi
      if ((attempt < max_attempts)); then
        echo "Attempt ${attempt} failed, retrying..."
        sleep 10
      else
        echo "Attempt ${attempt} failed; giving up after ${max_attempts} attempts." >&2
      fi
    done

    # Outputs are fetched even after a failed run; partial results may
    # still be useful, and a failed copy is only reported.
    mkdir -p "./outputs/${INSTANCE_NAME}"
    rsync -avz --partial --progress -e "${SSH_COMMAND}" \
      "ubuntu@${PUBLIC_IP}:~/${PROJECT_DIR}/outputs/" "./outputs/${INSTANCE_NAME}/" \
      || echo "Failed to copy outputs from ${PUBLIC_IP}" >&2
  fi

  if aws ec2 terminate-instances --instance-ids "${INSTANCE_ID}"; then
    echo "Terminated instance: ${INSTANCE_ID}"
  else
    echo "Failed to terminate instance ${INSTANCE_ID}; you should terminate it manually." >&2
    status=1
  fi

  return "${status}"
}
214249

215250
main () {
@@ -219,11 +254,12 @@ main () {
219254
create_security_group
220255

221256
echo "Launching ${#INSTANCES_aarch64[@]} aarch64 instances and ${#INSTANCES_x86_64[@]} x86_64 instances in parallel..."
222-
for INSTANCE_NAME in "${INSTANCES_x86_64[@]}" "${INSTANCES_aarch64[@]}"; do
257+
for INSTANCE_NAME in "${INSTANCES_aarch64[@]}" "${INSTANCES_x86_64[@]}"; do
223258
ARCH=$(get_arch "$INSTANCE_NAME")
224259
AMI_ID="${AMI_MAP[$ARCH]}"
225260

226261
process_instance "${INSTANCE_NAME}" "${AMI_ID}" 2>&1 | tee "${INSTANCE_NAME}.log" &
262+
sleep 1
227263
done
228264

229265
# Wait for all background jobs to finish

scripts/generate_multiple_tables.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,9 @@
2323
volume_v = 100_000
2424
flag_combinations = [
2525
[],
26-
['-F6'],
26+
# ['-F6'],
2727
['-s'],
28-
['-F6', '-s'],
28+
# ['-F6', '-s'],
2929
]
3030

3131
# Get compiler label from command line
@@ -43,7 +43,7 @@ def get_cpu_model():
4343
if system == "Windows":
4444
return platform.processor()
4545
elif system == "Darwin":
46-
os.environ['PATH'] = os.environ['PATH'] + os.pathsep + '/usr/sbin'
46+
os.environ['PATH'] += os.pathsep + '/usr/sbin'
4747
command = ["sysctl", "-n", "machdep.cpu.brand_string"]
4848
return subprocess.check_output(command, env=env, text=True).strip()
4949
elif system == "Linux":

0 commit comments

Comments
 (0)