From cf74c55c46b417ec2d4f8b09725be721d20c1596 Mon Sep 17 00:00:00 2001
From: ludamad
Date: Tue, 18 Feb 2025 11:59:36 -0500
Subject: [PATCH 1/4] fix: aws_handle_evict recovery & termination

This should gracefully handle both recovering from spot evict and terminating in time
---
Reviewer note: in this revision `exit $?` runs right after
terminate_stdio_attached_processes, so it reports that helper's status
rather than the status collected by `wait "$child_pid"`. PATCH 4/4 removes
the helper call again, which restores `wait` immediately followed by `exit $?`.

 ci3/aws_handle_evict | 23 ++++++++++++++++++-----
 1 file changed, 18 insertions(+), 5 deletions(-)

diff --git a/ci3/aws_handle_evict b/ci3/aws_handle_evict
index bc0d6f07eabc..a582933a74dc 100755
--- a/ci3/aws_handle_evict
+++ b/ci3/aws_handle_evict
@@ -1,3 +1,7 @@
+#!/bin/bash
+# Capture initial PIDs using our stdio before running the command.
+initial_stdio_pids=$(fuser /dev/stdin /dev/stdout /dev/stderr 2>/dev/null | tr -s ' ' '\n')
+
 # Gracefully signals eviction status with a 155 exit code.
 # Runs the given command in the background and waits on it while polling for eviction status.
 bash -c "$1" &
@@ -5,21 +9,30 @@ child_pid=$!
 
 token=$(curl -sX PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 21600")
 
+function terminate_stdio_attached_processes {
+  # Get current PIDs using stdio and kill those that were not present at startup.
+  fuser /dev/stdin /dev/stdout /dev/stderr 2>/dev/null | tr -s ' ' '\n' \
+    | grep -vFxf <(echo "$initial_stdio_pids") \
+    | xargs -r kill -9
+}
+
 # Poll until the child finishes or a termination notice is detected
 while true; do
-  # Wait for process to come up, makes check below happen every 5 seconds
+  # Wait for process to come up, makes check below happen every 5 seconds.
   for i in {1..5}; do
     if ! kill -0 "$child_pid" 2>/dev/null; then
       wait "$child_pid"
+      terminate_stdio_attached_processes
       exit $?
     fi
     sleep 1
   done
-  # Check for imminent spot termination
-  if curl -fs -H "X-aws-ec2-metadata-token: $token" http://169.254.169.254/latest/meta-data/spot/termination-time &>/dev/null; then
-    # Termination notice found, exit with 155.
+
+  # Check for imminent spot termination.
+  if curl -fs -H "X-aws-ec2-metadata-token: $token" \
+      http://169.254.169.254/latest/meta-data/spot/termination-time &>/dev/null; then
     echo "Spot will be terminated! Exiting early."
-    fuser /dev/stdin /dev/stdout /dev/stderr 2>/dev/null | tr -s ' ' '\n' | grep -v $$ | xargs -r kill -9
+    terminate_stdio_attached_processes
     exit 155
   fi
 done

From 95663f0e367125928d49e7b6e65f79b49f9f2307 Mon Sep 17 00:00:00 2001
From: ludamad
Date: Tue, 18 Feb 2025 12:00:34 -0500
Subject: [PATCH 2/4] Update aws_handle_evict

---
 ci3/aws_handle_evict | 1 +
 1 file changed, 1 insertion(+)

diff --git a/ci3/aws_handle_evict b/ci3/aws_handle_evict
index a582933a74dc..ccd2662f08fc 100755
--- a/ci3/aws_handle_evict
+++ b/ci3/aws_handle_evict
@@ -33,6 +33,7 @@ while true; do
       http://169.254.169.254/latest/meta-data/spot/termination-time &>/dev/null; then
     echo "Spot will be terminated! Exiting early."
     terminate_stdio_attached_processes
+    # Exit with 155 to signal wrapper to restart with on-demand.
     exit 155
   fi
 done

From d06506159061b798f8081313cf94ce9e7482781a Mon Sep 17 00:00:00 2001
From: ludamad
Date: Tue, 18 Feb 2025 12:26:09 -0500
Subject: [PATCH 3/4] Update aws_handle_evict

---
 ci3/aws_handle_evict | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/ci3/aws_handle_evict b/ci3/aws_handle_evict
index ccd2662f08fc..a4aa007c0448 100755
--- a/ci3/aws_handle_evict
+++ b/ci3/aws_handle_evict
@@ -1,6 +1,6 @@
 #!/bin/bash
 # Capture initial PIDs using our stdio before running the command.
-initial_stdio_pids=$(fuser /dev/stdin /dev/stdout /dev/stderr 2>/dev/null | tr -s ' ' '\n')
+initial_stdio_pids=$(fuser /dev/stdin /dev/stdout 2>/dev/null)
 
 # Gracefully signals eviction status with a 155 exit code.
 # Runs the given command in the background and waits on it while polling for eviction status.
@@ -10,10 +10,12 @@ child_pid=$!
 token=$(curl -sX PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 21600")
 
 function terminate_stdio_attached_processes {
-  # Get current PIDs using stdio and kill those that were not present at startup.
-  fuser /dev/stdin /dev/stdout /dev/stderr 2>/dev/null | tr -s ' ' '\n' \
-    | grep -vFxf <(echo "$initial_stdio_pids") \
-    | xargs -r kill -9
+  # Identify new PIDs not present initially and force kill them.
+  current_stdio_pids=$(fuser /dev/stdin /dev/stdout 2>/dev/null)
+  comm -13 \
+    <(echo "$initial_stdio_pids" | tr ' ' '\n' | sort) \
+    <(echo "$current_stdio_pids" | tr ' ' '\n' | sort) | \
+    xargs -r kill -9
 }
 
 # Poll until the child finishes or a termination notice is detected

From 3d4f49d298a55bff9175a848c5a5ea8f78a86aa9 Mon Sep 17 00:00:00 2001
From: ludamad
Date: Tue, 18 Feb 2025 12:50:46 -0500
Subject: [PATCH 4/4] Update aws_handle_evict

---
Reviewer note: `grep -v $$` is an unanchored, unquoted match, so any PID
that merely contains this shell's PID as a substring is also left out of
the kill list. `grep -vx "$$"` would exclude only the exact PID.

 ci3/aws_handle_evict | 27 ++++++---------------------
 1 file changed, 6 insertions(+), 21 deletions(-)

diff --git a/ci3/aws_handle_evict b/ci3/aws_handle_evict
index a4aa007c0448..c38c9251a8db 100755
--- a/ci3/aws_handle_evict
+++ b/ci3/aws_handle_evict
@@ -1,7 +1,3 @@
-#!/bin/bash
-# Capture initial PIDs using our stdio before running the command.
-initial_stdio_pids=$(fuser /dev/stdin /dev/stdout 2>/dev/null)
-
 # Gracefully signals eviction status with a 155 exit code.
 # Runs the given command in the background and waits on it while polling for eviction status.
 bash -c "$1" &
@@ -9,33 +5,22 @@ child_pid=$!
 
 token=$(curl -sX PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 21600")
 
-function terminate_stdio_attached_processes {
-  # Identify new PIDs not present initially and force kill them.
-  current_stdio_pids=$(fuser /dev/stdin /dev/stdout 2>/dev/null)
-  comm -13 \
-    <(echo "$initial_stdio_pids" | tr ' ' '\n' | sort) \
-    <(echo "$current_stdio_pids" | tr ' ' '\n' | sort) | \
-    xargs -r kill -9
-}
-
 # Poll until the child finishes or a termination notice is detected
 while true; do
-  # Wait for process to come up, makes check below happen every 5 seconds.
+  # Wait for process to come up, makes check below happen every 5 seconds
   for i in {1..5}; do
     if ! kill -0 "$child_pid" 2>/dev/null; then
       wait "$child_pid"
-      terminate_stdio_attached_processes
       exit $?
     fi
     sleep 1
   done
-
-  # Check for imminent spot termination.
-  if curl -fs -H "X-aws-ec2-metadata-token: $token" \
-      http://169.254.169.254/latest/meta-data/spot/termination-time &>/dev/null; then
+  # Check for imminent spot termination
+  if curl -fs -H "X-aws-ec2-metadata-token: $token" http://169.254.169.254/latest/meta-data/spot/termination-time &>/dev/null; then
+    # Termination notice found, exit with 155.
     echo "Spot will be terminated! Exiting early."
-    terminate_stdio_attached_processes
-    # Exit with 155 to signal wrapper to restart with on-demand.
+    pids=$(fuser /dev/stdin /dev/stdout 2>/dev/null)
+    echo "$pids" | tr -s ' ' '\n' | grep -v $$ | xargs -r kill -9
     exit 155
   fi
 done