From a7e9b884a143ecf1764db48be0c32a8e04e8e32b Mon Sep 17 00:00:00 2001 From: David Huber Date: Tue, 30 Jul 2024 13:05:14 +0000 Subject: [PATCH 1/7] Make PR dir if missing; check that driver is running before killing --- ci/scripts/driver.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ci/scripts/driver.sh b/ci/scripts/driver.sh index 0f53ebff6ff..afdf838c742 100755 --- a/ci/scripts/driver.sh +++ b/ci/scripts/driver.sh @@ -77,8 +77,9 @@ pr_list=$(${GH} pr list --repo "${REPO_URL}" --label "CI-${MACHINE_ID^}-Ready" - for pr in ${pr_list}; do pr_dir="${GFS_CI_ROOT}/PR/${pr}" + [[ ! -d ${pr_dir} ]] && mkdir -p ${pr_dir} db_list=$("${ROOT_DIR}/ci/scripts/utils/pr_list_database.py" --add_pr "${pr}" --dbfile "${pr_list_dbfile}") - output_ci_single="${GFS_CI_ROOT}/PR/${pr}/output_single.log" + output_ci_single="${pr_dir}/output_single.log" ############################################################# # Check if a Ready labeled PR has changed back from once set # and in that case completely kill the previose driver.sh cron @@ -107,7 +108,7 @@ for pr in ${pr_list}; do echo -e "${pstree_out}" | grep -Pow "(?<=\()[0-9]+(?=\))" | xargs kill fi else - ssh "${driver_HOST}" 'pstree -A -p "${driver_PID}" | grep -Eow "[0-9]+" | xargs kill' + ssh "${driver_HOST}" "if ps -p ${driver_PID} 2>&1; then pstree -A -p \"${driver_PID}\" | grep -Eow \"[0-9]+\" | xargs kill; fi" fi { echo "Driver PID: Requested termination of ${driver_PID} and children on ${driver_HOST}" From e8cfb15c1c4afd048876c5f43fb2e49d21aa1fe4 Mon Sep 17 00:00:00 2001 From: David Huber Date: Tue, 30 Jul 2024 13:24:52 +0000 Subject: [PATCH 2/7] Add comment on ssh kill command --- ci/scripts/driver.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/ci/scripts/driver.sh b/ci/scripts/driver.sh index afdf838c742..1485b68f230 100755 --- a/ci/scripts/driver.sh +++ b/ci/scripts/driver.sh @@ -108,6 +108,7 @@ for pr in ${pr_list}; do echo -e "${pstree_out}" | grep -Pow "(?<=\()[0-9]+(?=\))" | xargs kill 
fi else + # Check if the driver is still running on the head node; if so, kill it and all child processes ssh "${driver_HOST}" "if ps -p ${driver_PID} 2>&1; then pstree -A -p \"${driver_PID}\" | grep -Eow \"[0-9]+\" | xargs kill; fi" fi { From 7a1dd48eafde2e375cf3d42090ebffeb0a63699e Mon Sep 17 00:00:00 2001 From: David Huber Date: Tue, 30 Jul 2024 13:34:41 +0000 Subject: [PATCH 3/7] Ignore SC2029 when killing via ssh --- ci/scripts/driver.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ci/scripts/driver.sh b/ci/scripts/driver.sh index 1485b68f230..4afd9ccebe9 100755 --- a/ci/scripts/driver.sh +++ b/ci/scripts/driver.sh @@ -77,7 +77,7 @@ pr_list=$(${GH} pr list --repo "${REPO_URL}" --label "CI-${MACHINE_ID^}-Ready" - for pr in ${pr_list}; do pr_dir="${GFS_CI_ROOT}/PR/${pr}" - [[ ! -d ${pr_dir} ]] && mkdir -p ${pr_dir} + [[ ! -d ${pr_dir} ]] && mkdir -p "${pr_dir}" db_list=$("${ROOT_DIR}/ci/scripts/utils/pr_list_database.py" --add_pr "${pr}" --dbfile "${pr_list_dbfile}") output_ci_single="${pr_dir}/output_single.log" ############################################################# @@ -109,6 +109,7 @@ for pr in ${pr_list}; do fi else # Check if the driver is still running on the head node; if so, kill it and all child processes + #shellcheck disable=SC2029 ssh "${driver_HOST}" "if ps -p ${driver_PID} 2>&1; then pstree -A -p \"${driver_PID}\" | grep -Eow \"[0-9]+\" | xargs kill; fi" fi { From c5655f66780c201c2637ecd829e360e5a66aed90 Mon Sep 17 00:00:00 2001 From: David Huber Date: Wed, 31 Jul 2024 14:46:20 +0000 Subject: [PATCH 4/7] Remove if-defined parameter expansions from if-blocks for defined variables --- ci/scripts/check_ci.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ci/scripts/check_ci.sh b/ci/scripts/check_ci.sh index 04dd92f4a69..24c5e242c35 100755 --- a/ci/scripts/check_ci.sh +++ b/ci/scripts/check_ci.sh @@ -50,14 +50,14 @@ fi export GH rocotostat=$(command -v rocotostat) -if [[ -z ${rocotostat+x} ]]; 
then +if [[ -z ${rocotostat} ]]; then echo "rocotostat not found on system" exit 1 else echo "rocotostat being used from ${rocotostat}" fi rocotocheck=$(command -v rocotocheck) -if [[ -z ${rocotocheck+x} ]]; then +if [[ -z ${rocotocheck} ]]; then echo "rocotocheck not found on system" exit 1 else @@ -70,7 +70,7 @@ pr_list="" if [[ -f "${pr_list_dbfile}" ]]; then pr_list=$("${HOMEgfs}/ci/scripts/utils/pr_list_database.py" --dbfile "${pr_list_dbfile}" --list Open Running) || true fi -if [[ -z "${pr_list+x}" ]]; then +if [[ -z "${pr_list}" ]]; then echo "no PRs open and ready to run cases on .. exiting" exit 0 fi @@ -124,7 +124,7 @@ for pr in ${pr_list}; do for pslot_dir in "${pr_dir}/RUNTESTS/EXPDIR/"*; do pslot=$(basename "${pslot_dir}") || true - if [[ -z "${pslot+x}" ]]; then + if [[ -z "${pslot}" ]]; then echo "No experiments found in ${pslot_dir} .. exiting" exit 0 fi From 6721ece2004ea531811e6953a33b10890d52a77c Mon Sep 17 00:00:00 2001 From: David Huber Date: Wed, 31 Jul 2024 14:53:05 +0000 Subject: [PATCH 5/7] Remove more if-defined parameter expansions --- ci/scripts/driver.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/scripts/driver.sh b/ci/scripts/driver.sh index 4afd9ccebe9..50c9185b3ea 100755 --- a/ci/scripts/driver.sh +++ b/ci/scripts/driver.sh @@ -144,7 +144,7 @@ pr_list="" if [[ -f "${pr_list_dbfile}" ]]; then pr_list=$("${ROOT_DIR}/ci/scripts/utils/pr_list_database.py" --dbfile "${pr_list_dbfile}" --list Open Ready) || true fi -if [[ -z "${pr_list+x}" ]]; then +if [[ -z "${pr_list}" ]]; then echo "no PRs open and ready for checkout/build .. 
exiting" exit 0 fi @@ -158,7 +158,7 @@ fi for pr in ${pr_list}; do # Skip pr's that are currently Building for when overlapping driver scripts are being called from within cron pr_building=$("${ROOT_DIR}/ci/scripts/utils/pr_list_database.py" --display "${pr}" --dbfile "${pr_list_dbfile}" | grep Building) || true - if [[ -z "${pr_building+x}" ]]; then + if [[ -z "${pr_building}" ]]; then continue fi id=$("${GH}" pr view "${pr}" --repo "${REPO_URL}" --json id --jq '.id') From abc96602ad38ff670c888999c19ce6307297d640 Mon Sep 17 00:00:00 2001 From: David Huber Date: Wed, 31 Jul 2024 15:24:01 +0000 Subject: [PATCH 6/7] Skip PRs whose pr_building is non-empty instead of empty --- ci/scripts/driver.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/scripts/driver.sh b/ci/scripts/driver.sh index 50c9185b3ea..5858fee23d3 100755 --- a/ci/scripts/driver.sh +++ b/ci/scripts/driver.sh @@ -158,7 +158,7 @@ fi for pr in ${pr_list}; do # Skip pr's that are currently Building for when overlapping driver scripts are being called from within cron pr_building=$("${ROOT_DIR}/ci/scripts/utils/pr_list_database.py" --display "${pr}" --dbfile "${pr_list_dbfile}" | grep Building) || true - if [[ -z "${pr_building}" ]]; then + if [[ -n "${pr_building}" ]]; then continue fi id=$("${GH}" pr view "${pr}" --repo "${REPO_URL}" --json id --jq '.id') From 4ef3a00a447e3a78bb1db879f2c39be47b47a380 Mon Sep 17 00:00:00 2001 From: David Huber <69919478+DavidHuber-NOAA@users.noreply.github.com> Date: Tue, 6 Aug 2024 15:40:17 +0000 Subject: [PATCH 7/7] Report failed driver kill over ssh instead of pre-checking the PID --- ci/scripts/driver.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/scripts/driver.sh b/ci/scripts/driver.sh index 5858fee23d3..8a998173259 100755 --- a/ci/scripts/driver.sh +++ b/ci/scripts/driver.sh @@ -110,7 +110,7 @@ for pr in ${pr_list}; do else # Check if the driver is still running on the head node; if so, kill it and all child processes #shellcheck disable=SC2029 - ssh 
"${driver_HOST}" "if ps -p ${driver_PID} 2>&1; then pstree -A -p \"${driver_PID}\" | grep -Eow \"[0-9]+\" | xargs kill; fi" + ssh "${driver_HOST}" "pstree -A -p \"${driver_PID}\" | grep -Eow \"[0-9]+\" | xargs kill || echo \"Failed to kill process with PID: ${driver_PID}, it may not be valid.\"" fi { echo "Driver PID: Requested termination of ${driver_PID} and children on ${driver_HOST}"