diff --git a/ci/Jenkinsfile b/ci/Jenkinsfile index c799d3c4885..23285ee348a 100644 --- a/ci/Jenkinsfile +++ b/ci/Jenkinsfile @@ -4,13 +4,16 @@ def HOME = 'none' def caseList = '' // Location of the custom workspaces for each machine in the CI system. They are persitent for each iteration of the PR. def custom_workspace = [hera: '/scratch1/NCEPDEV/global/CI', orion: '/work2/noaa/stmp/CI/ORION', hercules: '/work2/noaa/stmp/CI/HERCULES'] +def repo_url = 'git@github.com:TerrenceMcGuinness-NOAA/global-workflow.git' +def STATUS = 'Passed' pipeline { + agent { label 'built-in' } options { skipDefaultCheckout() - parallelsAlwaysFailFast() + //parallelsAlwaysFailFast() } stages { // This initial stage is used to get the Machine name from the GitHub labels on the PR @@ -46,10 +49,8 @@ pipeline { properties([parameters([[$class: 'NodeParameterDefinition', allowedSlaves: ['built-in', 'Hera-EMC', 'Orion-EMC'], defaultSlaves: ['built-in'], name: '', nodeEligibility: [$class: 'AllNodeEligibility'], triggerIfResult: 'allCases']])]) HOME = "${WORKSPACE}" sh(script: "mkdir -p ${HOME}/RUNTESTS;rm -Rf ${HOME}/RUNTESTS/error.logs") - pullRequest.addLabel("CI-${Machine}-Building") - if (pullRequest.labels.any { value -> value.matches("CI-${Machine}-Ready") }) { - pullRequest.removeLabel("CI-${Machine}-Ready") - } + sh(script: """${GH} pr edit ${env.CHANGE_ID} --repo ${repo_url} --add-label "CI-${Machine}-Building" """) + sh(script: """${GH} pr edit ${env.CHANGE_ID} --repo ${repo_url} --remove-label "CI-${Machine}-Ready" """) } echo "Building and running on ${Machine} in directory ${HOME}" } @@ -108,10 +109,10 @@ pipeline { catch (Exception error_arch) { echo "Failed to archive error log ${line}: ${error_arch.getMessage()}" } } } - repo_url=sh(script: "${HOMEgfs}/ci/scripts/utils/publish_logs.py --file ${error_logs} --repo PR_BUILD_${env.CHANGE_ID}", returnStdout: true).trim() - gist_url=sh(script: "${HOMEgfs}/ci/scripts/utils/publish_logs.py --file ${error_logs} --gist PR_BUILD_${env.CHANGE_ID}", returnStdout: true).trim() try { - pullRequest.comment("Build failed on **${Machine}** with error logs:${error_logs_message}\n\nFollow link here to view the contents of the above file(s): [(link)](${gist_url})") + sh(script: "${HOMEgfs}/ci/scripts/utils/publish_logs.py --file ${error_logs} --repo PR_BUILD_${env.CHANGE_ID}") + gist_url=sh(script: "${HOMEgfs}/ci/scripts/utils/publish_logs.py --file ${error_logs} --gist PR_BUILD_${env.CHANGE_ID}", returnStdout: true).trim() + sh(script: """${GH} pr comment ${env.CHANGE_ID} --repo ${repo_url} --body "Build **FAILED** on **${Machine}** with error logs:\n\\`\\`\\`${error_logs_message}\n\\`\\`\\`\n\nFollow link here to view the contents of the above file(s): [(link)](${gist_url})" """) } catch (Exception error_comment) { echo "Failed to comment on PR: ${error_comment.getMessage()}" } @@ -119,15 +120,13 @@ pipeline { } } sh(script: './link_workflow.sh') - // sh(script: "echo ${HOMEgfs} > BUILT_semaphor") + sh(script: "echo ${HOMEgfs} > BUILT_semaphor") } } if (env.CHANGE_ID && system == 'gfs') { try { - if (pullRequest.labels.any { value -> value.matches("CI-${Machine}-Building") }) { - pullRequest.removeLabel("CI-${Machine}-Building") - } - pullRequest.addLabel("CI-${Machine}-Running") + sh(script: """${GH} pr edit ${env.CHANGE_ID} --repo ${repo_url} --add-label "CI-${Machine}-Running" """) + sh(script: """${GH} pr edit ${env.CHANGE_ID} --repo ${repo_url} --remove-label "CI-${Machine}-Building" """) } catch (Exception e) { echo "Failed to update label from Buildng to Running: ${e.getMessage()}" } @@ -144,6 +143,7 @@ pipeline { } stage('Run Tests') { + failFast false matrix { agent { label "${machine}-emc" } axes { @@ -175,19 +175,21 @@ pipeline { when { expression { return caseList.contains(Case) } } + failFast false steps { script { HOMEgfs = "${HOME}/gfs" // common HOMEgfs is used to launch the scripts that run the experiments pslot = sh(script: "${HOMEgfs}/ci/scripts/utils/ci_utils_wrapper.sh get_pslot ${HOME}/RUNTESTS ${Case}", returnStdout: true).trim() try { - sh(script: "${HOMEgfs}/ci/scripts/run-check_ci.sh ${HOME} ${pslot}") + sh(script: "${HOMEgfs}/ci/scripts/run-check_ci.sh ${HOME} ${pslot} ${system}") } catch (Exception error_experment) { - sh(script: "${HOMEgfs}/ci/scripts/utils/ci_utils_wrapper.sh cancel_all_batch_jobs ${HOME}/RUNTESTS") + sh(script: "${HOMEgfs}/ci/scripts/utils/ci_utils_wrapper.sh cancel_batch_jobs ${pslot}") ws(HOME) { def error_logs = "" def error_logs_message = "" - if (fileExists("RUNTESTS/error.logs")) { - def fileContent = readFile 'RUNTESTS/error.logs' + def error_file = "${HOME}/RUNTESTS/${pslot}_error.logs" + if (fileExists(error_file)) { + def fileContent = readFile error_file def lines = fileContent.readLines() for (line in lines) { echo "archiving: ${line}" @@ -201,15 +203,22 @@ pipeline { } } } - repo_url = sh(script: "${HOMEgfs}/ci/scripts/utils/publish_logs.py --file ${error_logs} --repo PR_${env.CHANGE_ID}", returnStdout: true).trim() - gist_url = sh(script: "${HOMEgfs}/ci/scripts/utils/publish_logs.py --file ${error_logs} --gist PR_${env.CHANGE_ID}", returnStdout: true).trim() try { - pullRequest.comment("Experiment ${Case} failed on ${Machine} with error logs: ${error_logs_message}\n\nFollow link here to view the contents of the above file(s): [(link)](${gist_url})") + gist_url = sh(script: "${HOMEgfs}/ci/scripts/utils/publish_logs.py --file ${error_logs} --gist PR_${env.CHANGE_ID}", returnStdout: true).trim() + sh(script: """${GH} pr comment ${env.CHANGE_ID} --repo ${repo_url} --body "Experiment ${Case} **FAILED** on ${Machine} with error logs:\n\\`\\`\\`\n${error_logs_message}\\`\\`\\`\n\nFollow link here to view the contents of the above file(s): [(link)](${gist_url})" """) + sh(script: "${HOMEgfs}/ci/scripts/utils/publish_logs.py --file ${error_logs} --repo PR_${env.CHANGE_ID}") } catch (Exception error_comment) { echo "Failed to comment on PR: ${error_comment.getMessage()}" } } else { - echo "No error logs found for failed cases in $HOME/RUNTESTS/error.logs" + echo "No error logs found for failed cases in $HOME/RUNTESTS/${pslot}_error.logs" + } + STATUS = 'Failed' + try { + sh(script: """${GH} pr edit ${env.CHANGE_ID} --repo ${repo_url} --remove-label "CI-${Machine}-Running" """, returnStatus: true) + sh(script: """${GH} pr edit ${env.CHANGE_ID} --repo ${repo_url} --add-label "CI-${Machine}-${STATUS}" """, returnStatus: true) + } catch (Exception e) { + echo "Failed to update label from Running to ${STATUS}: ${e.getMessage()}" } error("Failed to run experiments ${Case} on ${Machine}") } @@ -220,49 +229,25 @@ pipeline { } } } - } - - post { - always { - script { - if(env.CHANGE_ID) { - try { - for (label in pullRequest.labels) { - if (label.contains("${Machine}")) { - pullRequest.removeLabel(label) - } - } - } catch (Exception e) { - echo "Failed to remove labels: ${e.getMessage()}" - } - } - } - } - success { - script { - if(env.CHANGE_ID) { - try { - pullRequest.addLabel("CI-${Machine}-Passed") - def timestamp = new Date().format('MM dd HH:mm:ss', TimeZone.getTimeZone('America/New_York')) - pullRequest.comment("**CI SUCCESS** ${Machine} at ${timestamp}\n\nBuilt and ran in directory `${HOME}`") - } catch (Exception e) { - echo "Failed to add success label or comment: ${e.getMessage()}" - } + stage( 'FINALIZE' ) { + when { + expression { + STATUS == 'Passed' } } - } - failure { - script { - if(env.CHANGE_ID) { + agent { label "${machine}-emc" } + steps { + script { try { - pullRequest.addLabel("CI-${Machine}-Failed") - def timestamp = new Date().format('MM dd HH:mm:ss', TimeZone.getTimeZone('America/New_York')) - pullRequest.comment("**CI FAILED** ${Machine} at ${timestamp}
Built and ran in directory `${HOME}`") + sh(script: """${GH} pr edit ${env.CHANGE_ID} --repo ${repo_url} --remove-label "CI-${Machine}-Running" """, returnStatus: true) + sh(script: """${GH} pr edit ${env.CHANGE_ID} --repo ${repo_url} --remove-label "CI-${Machine}-Building" """, returnStatus: true) + sh(script: """${GH} pr edit ${env.CHANGE_ID} --repo ${repo_url} --add-label "CI-${Machine}-${STATUS}" """, returnStatus: true) + sh(script: """${GH} pr comment ${env.CHANGE_ID} --repo ${repo_url} --body "**CI ${STATUS}** ${Machine} at
Built and ran in directory `${HOME}`" """, returnStatus: true) } catch (Exception e) { - echo "Failed to add failure label or comment: ${e.getMessage()}" + echo "Failed to update label from Running to ${STATUS}: ${e.getMessage()}" } } } - } + } } } diff --git a/ci/scripts/check_ci.sh b/ci/scripts/check_ci.sh index 4ff7eefd265..b3c82aa9fd0 100755 --- a/ci/scripts/check_ci.sh +++ b/ci/scripts/check_ci.sh @@ -14,7 +14,8 @@ echo "Begin ${scriptname} at $(date -u)" || true export PS4='+ $(basename ${BASH_SOURCE})[${LINENO}]' GH=${HOME}/bin/gh -REPO_URL="https://github.com/NOAA-EMC/global-workflow.git" +#REPO_URL="https://github.com/NOAA-EMC/global-workflow.git" +REPO_URL="git@github.com:TerrenceMcGuinness-NOAA/global-workflow.git" ######################################################################### # Set up runtime environment varibles for accounts on supproted machines @@ -58,7 +59,7 @@ pr_list_dbfile="${GFS_CI_ROOT}/open_pr_list.db" pr_list="" if [[ -f "${pr_list_dbfile}" ]]; then - pr_list=$("${HOMEgfs}/ci/scripts/pr_list_database.py" --dbfile "${pr_list_dbfile}" --display | grep -v Failed | grep Running | awk '{print $1}') || true + pr_list=$("${HOMEgfs}/ci/scripts/utils/pr_list_database.py" --dbfile "${pr_list_dbfile}" --list Open Running) || true fi if [[ -z "${pr_list+x}" ]]; then echo "no PRs open and ready to run cases on .. exiting" @@ -90,7 +91,7 @@ for pr in ${pr_list}; do sed -i "1 i\`\`\`" "${output_ci}" sed -i "1 i\All CI Test Cases Passed on ${MACHINE_ID^}:" "${output_ci}" "${GH}" pr comment "${pr}" --repo "${REPO_URL}" --body-file "${output_ci}" - "${HOMEgfs}/ci/scripts/pr_list_database.py" --remove_pr "${pr}" --dbfile "${pr_list_dbfile}" + "${HOMEgfs}/ci/scripts/utils/pr_list_database.py" --remove_pr "${pr}" --dbfile "${pr_list_dbfile}" # Check to see if this PR that was opened by the weekly tests and if so close it if it passed on all platforms weekly_labels=$(${GH} pr view "${pr}" --repo "${REPO_URL}" --json headRefName,labels,author --jq 'select(.author.login | contains("emcbot")) | select(.headRefName | contains("weekly_ci")) | .labels[].name ') || true if [[ -n "${weekly_labels}" ]]; then @@ -123,31 +124,33 @@ for pr in ${pr_list}; do if [[ ! -f "${db}" ]]; then continue fi - rocoto_stat_output=$("${rocotostat}" -w "${xml}" -d "${db}" -s | grep -v CYCLE) || true - num_cycles=$(echo "${rocoto_stat_output}" | wc -l) || true - num_done=$(echo "${rocoto_stat_output}" | grep -c Done) || true - # num_succeeded=$("${rocotostat}" -w "${xml}" -d "${db}" -a | grep -c SUCCEEDED) || true - echo "${pslot} Total Cycles: ${num_cycles} number done: ${num_done}" || true - num_failed=$("${rocotostat}" -w "${xml}" -d "${db}" -a | grep -c -E 'FAIL|DEAD') || true - if [[ ${num_failed} -ne 0 ]]; then - "${GH}" pr edit --repo "${REPO_URL}" "${pr}" --remove-label "CI-${MACHINE_ID^}-Running" --add-label "CI-${MACHINE_ID^}-Failed" - error_logs=$("${rocotostat}" -d "${db}" -w "${xml}" | grep -E 'FAIL|DEAD' | awk '{print "-c", $1, "-t", $2}' | xargs "${rocotocheck}" -d "${db}" -w "${xml}" | grep join | awk '{print $2}') || true - { - echo "Experiment ${pslot} *** FAILED *** on ${MACHINE_ID^}" - echo "Experiment ${pslot} with ${num_failed} tasks failed at $(date +'%D %r')" || true - echo "Error logs:" - echo "${error_logs}" - } >> "${output_ci}" - sed -i "1 i\`\`\`" "${output_ci}" - "${GH}" pr comment "${pr}" --repo "${REPO_URL}" --body-file "${output_ci}" - "${HOMEgfs}/ci/scripts/pr_list_database.py" --remove_pr "${pr}" --dbfile "${pr_list_dbfile}" - for kill_cases in "${pr_dir}/RUNTESTS/"*; do - pslot=$(basename "${kill_cases}") - cancel_slurm_jobs "${pslot}" - done - break + + set +e + rocoto_state="$("${HOMEgfs}/ci/scripts/utils/rocotostat.py" -w "${xml}" -d "${db}")" + rocoto_error=$? + rm -f "${output_ci_single}" + if [[ "${rocoto_error}" -ne 0 ]]; then + "${GH}" pr edit --repo "${REPO_URL}" "${pr}" --remove-label "CI-${MACHINE_ID^}-Running" --add-label "CI-${MACHINE_ID^}-Failed" + error_logs=$("${rocotostat}" -d "${db}" -w "${xml}" | grep -E 'FAIL|DEAD' | awk '{print "-c", $1, "-t", $2}' | xargs "${rocotocheck}" -d "${db}" -w "${xml}" | grep join | awk '{print $2}') || true + # shellcheck disable=SC2086 + ${HOMEgfs}/ci/scripts/utils/publish_logs.py --file ${error_logs} --repo "PR_${pr}" > /dev/null + # shellcheck disable=SC2086 + gist_url="$("${HOMEgfs}/ci/scripts/utils/publish_logs.py" --file ${error_logs} --gist "PR_${pr}")" + { + echo "Experiment ${pslot} **${rocoto_state}** on ${MACHINE_ID^} at $(date +'%D %r')" || true + echo "" + echo "Error logs:" + echo "\`\`\`" + echo "${error_logs}" + echo "\`\`\`" + echo "Follow link here to view the contents of the above file(s): [(link)](${gist_url})" + } >> "${output_ci_single}" + "${GH}" pr comment "${pr}" --repo "${REPO_URL}" --body-file "${output_ci_single}" + "${HOMEgfs}/ci/scripts/utils/pr_list_database.py" --remove_pr "${pr}" --dbfile "${pr_list_dbfile}" + cancel_all_batch_jobs "${pr_dir}/RUNTESTS" + exit "${rocoto_error}" fi - if [[ "${num_done}" -eq "${num_cycles}" ]]; then + if [[ "${rocoto_state}" == "DONE" ]]; then #Remove Experment cases that completed successfully rm -Rf "${pslot_dir}" rm -Rf "${pr_dir}/RUNTESTS/COMROOT/${pslot}" @@ -157,7 +160,6 @@ for pr in ${pr_list}; do echo "Experiment ${pslot} **SUCCESS** on ${MACHINE_ID^} at ${DATE}" >> "${output_ci_single}" echo "Experiment ${pslot} *** SUCCESS *** at ${DATE}" >> "${output_ci}" "${GH}" pr comment "${pr}" --repo "${REPO_URL}" --body-file "${output_ci_single}" - fi done done diff --git a/ci/scripts/clone-build_ci.sh b/ci/scripts/clone-build_ci.sh index 798c98bf50a..11bc43aacbe 100755 --- a/ci/scripts/clone-build_ci.sh +++ b/ci/scripts/clone-build_ci.sh @@ -48,7 +48,7 @@ git clone "${REPO_URL}" cd global-workflow || exit 1 # checkout pull request -"${GH}" pr checkout "${PR}" --repo "${REPO_URL}" --recurse-submodules +"${GH}" pr checkout "${PR}" --repo "${REPO_URL}" --recurse-submodules -j 4 HOMEgfs="${PWD}" source "${HOMEgfs}/ush/detect_machine.sh" @@ -74,7 +74,7 @@ set +e source "${HOMEgfs}/ush/module-setup.sh" export BUILD_JOBS=8 rm -rf log.build -./build_all.sh -gu >> log.build 2>&1 +./build_all.sh -guk >> log.build 2>&1 build_status=$? DATE=$(date +'%D %r') @@ -83,6 +83,7 @@ if [[ ${build_status} != 0 ]]; then echo "Build: *** FAILED ***" echo "Build: Failed at ${DATE}" cat "${PWD}/log.build" + cat "${PWD}/logs/error.logs" } >> "${outfile}" exit "${build_status}" else diff --git a/ci/scripts/driver.sh b/ci/scripts/driver.sh index f37b5e3f2e9..a94c72facf4 100755 --- a/ci/scripts/driver.sh +++ b/ci/scripts/driver.sh @@ -18,7 +18,7 @@ set -eux # TODO using static build for GitHub CLI until fixed in HPC-Stack ################################################################# export GH=${HOME}/bin/gh -export REPO_URL=${REPO_URL:-"https://github.com/NOAA-EMC/global-workflow.git"} +export REPO_URL=git@github.com:TerrenceMcGuinness-NOAA/global-workflow.git ################################################################ # Setup the reletive paths to scripts and PS4 for better logging @@ -64,14 +64,14 @@ unset HOMEgfs pr_list_dbfile="${GFS_CI_ROOT}/open_pr_list.db" if [[ ! -f "${pr_list_dbfile}" ]]; then - "${ROOT_DIR}/ci/scripts/pr_list_database.py" --create --dbfile "${pr_list_dbfile}" + "${ROOT_DIR}/ci/scripts/utils/pr_list_database.py" --create --dbfile "${pr_list_dbfile}" fi pr_list=$(${GH} pr list --repo "${REPO_URL}" --label "CI-${MACHINE_ID^}-Ready" --state "open" | awk '{print $1}') || true for pr in ${pr_list}; do pr_dir="${GFS_CI_ROOT}/PR/${pr}" - db_list=$("${ROOT_DIR}/ci/scripts/pr_list_database.py" --add_pr "${pr}" --dbfile "${pr_list_dbfile}") + db_list=$("${ROOT_DIR}/ci/scripts/utils/pr_list_database.py" --add_pr "${pr}" --dbfile "${pr_list_dbfile}") output_ci_single="${GFS_CI_ROOT}/PR/${pr}/output_single.log" ############################################################# # Check if a Ready labeled PR has changed back from once set @@ -82,7 +82,7 @@ for pr in ${pr_list}; do if [[ "${db_list}" == *"already is in list"* ]]; then # Get the the PID and HOST of the driver.sh cron job # that is stored int he CI database for this PR - driver_ID=$("${ROOT_DIR}/ci/scripts/pr_list_database.py" --dbfile "${pr_list_dbfile}" --display "${pr}" | awk '{print $4}') || true + driver_ID=$("${ROOT_DIR}/ci/scripts/utils/pr_list_database.py" --dbfile "${pr_list_dbfile}" --display "${pr}" | awk '{print $4}') || true driver_PID=$(echo "${driver_ID}" | cut -d":" -f1) || true driver_HOST=$(echo "${driver_ID}" | cut -d":" -f2) || true host_name=$(hostname -s) @@ -95,10 +95,12 @@ for pr in ${pr_list}; do if [[ "${driver_PID}" -ne 0 ]]; then echo "Driver PID: ${driver_PID} no longer running this build having it killed" if [[ "${driver_HOST}" == "${host_name}" ]]; then - # shellcheck disable=SC2312 - pstree -A -p "${driver_PID}" | grep -Pow "(?<=\()[0-9]+(?=\))" | xargs kill + pstree_out="$(pstree -A -p "${driver_PID}")" + if [[ -n "${pstree_out}" ]]; then + #shellcheck disable=SC2312 + echo -e "${pstree_out}" | grep -Pow "(?<=\()[0-9]+(?=\))" | xargs kill + fi else - # shellcheck disable=SC2312 ssh "${driver_HOST}" 'pstree -A -p "${driver_PID}" | grep -Eow "[0-9]+" | xargs kill' fi { @@ -113,22 +115,25 @@ for pr in ${pr_list}; do else for case in ${experiments}; do case_name=$(basename "${case}") - cancel_slurm_jobs "${case_name}" + cancel_batch_jobs "${case_name}" { echo "Canceled all jobs for experiment ${case_name} in PR:${pr} on ${MACHINE_ID^}" } >> "${output_ci_single}" done fi - sed -i "1 i\`\`\`" "${output_ci_single}" + first_line=$(head -n 1 "${output_ci_single}") + if [[ "${first_line}" != '```' ]]; then + sed -i "1 i\`\`\`" "${output_ci_single}" + fi "${GH}" pr comment "${pr}" --repo "${REPO_URL}" --body-file "${output_ci_single}" - "${ROOT_DIR}/ci/scripts/pr_list_database.py" --remove_pr "${pr}" --dbfile "${pr_list_dbfile}" - "${ROOT_DIR}/ci/scripts/pr_list_database.py" --add_pr "${pr}" --dbfile "${pr_list_dbfile}" + "${ROOT_DIR}/ci/scripts/utils/pr_list_database.py" --remove_pr "${pr}" --dbfile "${pr_list_dbfile}" + "${ROOT_DIR}/ci/scripts/utils/pr_list_database.py" --add_pr "${pr}" --dbfile "${pr_list_dbfile}" fi done pr_list="" if [[ -f "${pr_list_dbfile}" ]]; then - pr_list=$("${ROOT_DIR}/ci/scripts/pr_list_database.py" --display --dbfile "${pr_list_dbfile}" | grep -v Failed | grep Open | grep Ready | awk '{print $1}') || true + pr_list=$("${ROOT_DIR}/ci/scripts/utils/pr_list_database.py" --dbfile "${pr_list_dbfile}" --list Open Ready) || true fi if [[ -z "${pr_list+x}" ]]; then echo "no PRs open and ready for checkout/build .. exiting" @@ -143,7 +148,7 @@ fi for pr in ${pr_list}; do # Skip pr's that are currently Building for when overlapping driver scripts are being called from within cron - pr_building=$("${ROOT_DIR}/ci/scripts/pr_list_database.py" --display "${pr}" --dbfile "${pr_list_dbfile}" | grep Building) || true + pr_building=$("${ROOT_DIR}/ci/scripts/utils/pr_list_database.py" --display "${pr}" --dbfile "${pr_list_dbfile}" | grep Building) || true if [[ -z "${pr_building+x}" ]]; then continue fi @@ -154,7 +159,7 @@ for pr in ${pr_list}; do driver_build_PID=$$ driver_build_HOST=$(hostname -s) "${GH}" pr edit --repo "${REPO_URL}" "${pr}" --remove-label "CI-${MACHINE_ID^}-Ready" --add-label "CI-${MACHINE_ID^}-Building" - "${ROOT_DIR}/ci/scripts/pr_list_database.py" --dbfile "${pr_list_dbfile}" --update_pr "${pr}" Open Building "${driver_build_PID}:${driver_build_HOST}" + "${ROOT_DIR}/ci/scripts/utils/pr_list_database.py" --dbfile "${pr_list_dbfile}" --update_pr "${pr}" Open Building "${driver_build_PID}:${driver_build_HOST}" rm -Rf "${pr_dir}" mkdir -p "${pr_dir}" { @@ -164,7 +169,10 @@ for pr in ${pr_list}; do echo "with PID: ${driver_build_PID} on host: ${driver_build_HOST}" echo "" } >> "${output_ci_single}" - sed -i "1 i\`\`\`" "${output_ci_single}" + first_line=$(head -n 1 "${output_ci_single}") + if [[ "${first_line}" != '```' ]]; then + sed -i "1 i\`\`\`" "${output_ci_single}" + fi "${GH}" pr comment "${pr}" --repo "${REPO_URL}" --body-file "${output_ci_single}" set +e "${ROOT_DIR}/ci/scripts/clone-build_ci.sh" -p "${pr}" -d "${pr_dir}" -o "${output_ci}" @@ -176,7 +184,7 @@ for pr in ${pr_list}; do # we need to exit this instance of the driver script ################################################################# if [[ ${ci_status} -ne 0 ]]; then - build_PID_check=$("${ROOT_DIR}/ci/scripts/pr_list_database.py" --display "${pr}" --dbfile "${pr_list_dbfile}" | awk '{print $4}' | cut -d":" -f1) || true + build_PID_check=$("${ROOT_DIR}/ci/scripts/utils/pr_list_database.py" --display "${pr}" --dbfile "${pr_list_dbfile}" | awk '{print $4}' | cut -d":" -f1) || true if [[ "${build_PID_check}" -ne "$$" ]]; then echo "Driver build PID: ${build_PID_check} no longer running this build ... exiting" exit 0 @@ -184,7 +192,7 @@ for pr in ${pr_list}; do fi set -e if [[ ${ci_status} -eq 0 ]]; then - "${ROOT_DIR}/ci/scripts/pr_list_database.py" --dbfile "${pr_list_dbfile}" --update_pr "${pr}" Open Built "0:0" + "${ROOT_DIR}/ci/scripts/utils/pr_list_database.py" --dbfile "${pr_list_dbfile}" --update_pr "${pr}" Open Built "0:0" #setup space to put an experiment # export RUNTESTS for yaml case files to pickup export RUNTESTS="${pr_dir}/RUNTESTS" @@ -226,24 +234,35 @@ for pr in ${pr_list}; do cat "${LOGFILE_PATH}" } >> "${output_ci}" "${GH}" pr edit "${pr}" --repo "${REPO_URL}" --remove-label "CI-${MACHINE_ID^}-Building" --add-label "CI-${MACHINE_ID^}-Failed" - "${ROOT_DIR}/ci/scripts/pr_list_database.py" --remove_pr "${pr}" --dbfile "${pr_list_dbfile}" + "${ROOT_DIR}/ci/scripts/utils/pr_list_database.py" --remove_pr "${pr}" --dbfile "${pr_list_dbfile}" "${GH}" pr comment "${pr}" --repo "${REPO_URL}" --body-file "${output_ci}" exit 1 fi done "${GH}" pr edit --repo "${REPO_URL}" "${pr}" --remove-label "CI-${MACHINE_ID^}-Building" --add-label "CI-${MACHINE_ID^}-Running" - "${ROOT_DIR}/ci/scripts/pr_list_database.py" --dbfile "${pr_list_dbfile}" --update_pr "${pr}" Open Running "0:0" + "${ROOT_DIR}/ci/scripts/utils/pr_list_database.py" --dbfile "${pr_list_dbfile}" --update_pr "${pr}" Open Running "0:0" "${GH}" pr comment "${pr}" --repo "${REPO_URL}" --body-file "${output_ci}" - else + else # failed to clone and build + { echo "Failed on cloning and building global-workflowi PR: ${pr}" echo "CI on ${MACHINE_ID^} failed to build on $(date) for repo ${REPO_URL}" || true } >> "${output_ci}" + "${GH}" pr edit "${pr}" --repo "${REPO_URL}" --remove-label "CI-${MACHINE_ID^}-Building" --add-label "CI-${MACHINE_ID^}-Failed" - "${ROOT_DIR}/ci/scripts/pr_list_database.py" --remove_pr "${pr}" --dbfile "${pr_list_dbfile}" + "${ROOT_DIR}/ci/scripts/utils/pr_list_database.py" --remove_pr "${pr}" --dbfile "${pr_list_dbfile}" + + if [[ -f "${HOMEgfs}/sorc/logs/error.logs" ]]; then + gist_URL=$("${ROOT_DIR}/ci/scripts/utils/ci_utils_wrapper.sh" publish_logs "PR_${pr}" "${HOMEgfs}/sorc" "${HOMEgfs}/sorc/logs/error.logs") + { + echo -e "\nError logs from build" + echo "Gist URL: ${gist_URL}" + } >> "${output_ci}" + fi "${GH}" pr comment "${pr}" --repo "${REPO_URL}" --body-file "${output_ci}" + fi done # looping over each open and labeled PR diff --git a/ci/scripts/run-check_ci.sh b/ci/scripts/run-check_ci.sh index 8e1e9270505..6f73d308062 100755 --- a/ci/scripts/run-check_ci.sh +++ b/ci/scripts/run-check_ci.sh @@ -4,11 +4,12 @@ set -eu ##################################################################################### # Script description: script to check the status of an experiment as reported -# by Rocoto +# by Rocoto ##################################################################################### TEST_DIR=${1:-${TEST_DIR:-?}} # Location of the root of the testing directory pslot=${2:-${pslot:-?}} # Name of the experiment being tested by this script +SYSTEM_BUILD_DIR=${3:-${SYSTEM_BUILD_DIR:-?}} # Name of the system build directory # TEST_DIR contains 2 directories; # 1. HOMEgfs: clone of the global-workflow @@ -23,7 +24,7 @@ pslot=${2:-${pslot:-?}} # Name of the experiment being tested by this scr # └── ${pslot} # Two system build directories created at build time gfs, and gdas # TODO: Make this configurable (for now all scripts run from gfs for CI at runtime) -HOMEgfs="${TEST_DIR}/gfs" +HOMEgfs="${TEST_DIR}/${SYSTEM_BUILD_DIR:-global-workflow}" RUNTESTS="${TEST_DIR}/RUNTESTS" run_check_logfile="${RUNTESTS}/ci-run_check.log" @@ -48,7 +49,7 @@ fi # Launch experiment echo "Launch experiment with Rocoto." rocotorun -v "${ROCOTO_VERBOSE:-0}" -w "${xml}" -d "${db}" -sleep 30 +sleep 10 if [[ ! -f "${db}" ]]; then echo "FATAL ERROR: Rocoto database file ${db} not found, experiment ${pslot} failed, ABORT!" exit 2 @@ -56,46 +57,48 @@ fi # Experiment launched rc=99 +set +e while true; do echo "Run rocotorun." rocotorun -v "${ROCOTO_VERBOSE:-0}" -w "${xml}" -d "${db}" # Wait before running rocotostat - sleep 30 + sleep 10 # Get job statistics echo "Gather Rocoto statistics" - rocotostat_output=$(rocotostat -w "${xml}" -d "${db}" -s | grep -v CYCLE) || true - num_cycles=$(echo "${rocotostat_output}" | wc -l) || true - num_done=$(echo "${rocotostat_output}" | grep -c Done) || true - num_succeeded=$(rocotostat -w "${xml}" -d "${db}" -a | grep -c SUCCEEDED) || true - num_failed=$(rocotostat -w "${xml}" -d "${db}" -a | grep -c -E 'FAIL|DEAD') || true + # shellcheck disable=SC2312 # We want to use the exit code of the command + eval=$("${HOMEgfs}/ci/scripts/utils/rocotostat.py" -w "${xml}" -d "${db}" --export) + error_stat=$? + eval "${eval}" - echo "${pslot} Total Cycles: ${num_cycles} number done: ${num_done}" + echo -e "(${pslot} on ${MACHINE_ID^})\n\tTotal Cycles: ${CYCLES_TOTAL}\n\tNumber Cycles done: ${CYCLES_DONE}\n\tState: ${ROCOTO_STATE}" - if [[ ${num_failed} -ne 0 ]]; then + if [[ ${error_stat} -ne 0 ]]; then { - echo "Experiment ${pslot} Terminated with ${num_failed} tasks failed at $(date)" || true - echo "Experiment ${pslot} Terminated: *FAILED*" + echo "Experiment ${pslot} Terminated with ${FAIL} tasks failed and ${DEAD} dead at $(date)" || true + echo "Experiment ${pslot} Terminated: *${ROCOTO_STATE}*" } | tee -a "${run_check_logfile}" - error_logs=$(rocotostat -d "${db}" -w "${xml}" | grep -E 'FAIL|DEAD' | awk '{print "-c", $1, "-t", $2}' | xargs rocotocheck -d "${db}" -w "${xml}" | grep join | awk '{print $2}') || true - { - echo "Error logs:" - echo "${error_logs}" - } | tee -a "${run_check_logfile}" - # rm -f "${RUNTESTS}/error.logs" - for log in ${error_logs}; do - echo "RUNTESTS${log#*RUNTESTS}" >> "${RUNTESTS}/error.logs" - done - rc=1 - break + if [[ "${DEAD}" -ne 0 ]]; then + error_logs=$(rocotostat -d "${db}" -w "${xml}" | grep -E 'FAIL|DEAD' | awk '{print "-c", $1, "-t", $2}' | xargs rocotocheck -d "${db}" -w "${xml}" | grep join | awk '{print $2}') || true + { + echo "Error logs:" + echo "${error_logs}" + } | tee -a "${run_check_logfile}" + rm -f "${RUNTESTS}/${pslot}_error.logs" + for log in ${error_logs}; do + echo "RUNTESTS${log#*RUNTESTS}" >> "${RUNTESTS}/${pslot}_error.logs" + done + fi + rc=1 + break fi - if [[ "${num_done}" -eq "${num_cycles}" ]]; then + if [[ "${ROCOTO_STATE}" == "DONE" ]]; then { - echo "Experiment ${pslot} Completed at $(date)" || true - echo "with ${num_succeeded} successfully completed jobs" || true + echo "Experiment ${pslot} Completed ${CYCLES_DONE} Cycles at $(date)" || true + echo "with ${SUCCEEDED} successfully completed jobs" || true echo "Experiment ${pslot} Completed: *SUCCESS*" } | tee -a "${run_check_logfile}" rc=0 diff --git a/ci/scripts/run_ci.sh b/ci/scripts/run_ci.sh index f50a4465d04..8af35bc471a 100755 --- a/ci/scripts/run_ci.sh +++ b/ci/scripts/run_ci.sh @@ -48,7 +48,8 @@ pr_list_dbfile="${GFS_CI_ROOT}/open_pr_list.db" pr_list="" if [[ -f "${pr_list_dbfile}" ]]; then - pr_list=$("${HOMEgfs}/ci/scripts/pr_list_database.py" --display --dbfile "${pr_list_dbfile}" | grep -v Failed | grep Open | grep Running | awk '{print $1}' | head -"${max_concurrent_pr}") || true + pr_list=$("${HOMEgfs}/ci/scripts/utils/pr_list_database.py" --dbfile "${pr_list_dbfile}" --list Open Running) || true + pr_list=$(echo "${pr_list}" | tr ' ' '\n' | head -n "${max_concurrent_pr}" | tr '\n' ' ') || true fi if [[ -z "${pr_list}" ]]; then echo "no open and built PRs that are ready for the cases to advance with rocotorun .. exiting" diff --git a/ci/scripts/utils/ci_utils.sh b/ci/scripts/utils/ci_utils.sh index ce2e0393072..72b496207a0 100755 --- a/ci/scripts/utils/ci_utils.sh +++ b/ci/scripts/utils/ci_utils.sh @@ -126,3 +126,30 @@ function create_experiment () { "${HOMEgfs}/${system}/workflow/create_experiment.py" --overwrite --yaml "${yaml_config}" } + +function publish_logs() { +# publish_logs function +# This function takes a directory path and a list of files as arguments. +# It calls the publish_logs.py script to publish the logs and returns its gist URL. +# Usage: publish_logs ... + local PR_header="$1" + local dir_path="$2" + local file="$3" + + local full_paths="" + while IFS= read -r line; do + full_path="${dir_path}/${line}" + if [[ -f "${full_path}" ]]; then + full_paths+="${full_path} " + else + echo "File ${full_path} does not exist" + fi + done < "${file}" + + if [[ -n "${full_paths}" ]]; then + # shellcheck disable=SC2027,SC2086 + ${HOMEgfs}/ci/scripts/utils/publish_logs.py --file ${full_paths} --repo ${PR_header} > /dev/null + URL="$("${HOMEgfs}/ci/scripts/utils/publish_logs.py" --file "${full_paths}" --gist "${PR_header}")" + fi + echo "${URL}" +} diff --git a/ci/scripts/pr_list_database.py b/ci/scripts/utils/pr_list_database.py similarity index 72% rename from ci/scripts/pr_list_database.py rename to ci/scripts/utils/pr_list_database.py index f525d64987a..849bc953580 100755 --- a/ci/scripts/pr_list_database.py +++ b/ci/scripts/utils/pr_list_database.py @@ -3,28 +3,27 @@ import sys import os from wxflow import SQLiteDB, SQLiteDBError +from githubpr import GitHubPR from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter, REMAINDER def full_path(string): """ full_path Get the absolute path of a file or directory. - Parameters ---------- string : str The relative path of the file or directory. - Returns ------- str The absolute path of the file or directory. - Raises ------ NotADirectoryError If the provided string does not represent a valid file or directory. """ + if os.path.isfile(string) or os.path.isdir(os.path.dirname(string)): return os.path.abspath(string) else: @@ -43,7 +42,7 @@ def create_table(db: SQLiteDB): db.create_table('pr_list', ['pr INTEGER PRIMARY KEY UNIQUE', 'state TEXT', 'status TEXT', 'reset_id INTEGER', 'cases TEXT']) -def add_pr(db: SQLiteDB, pr: str): +def add_pr(db: SQLiteDB, pr: str) -> bool: """ Add a pull request to the database. @@ -57,9 +56,11 @@ def add_pr(db: SQLiteDB, pr: str): entities = (pr, 'Open', 'Ready', 0, 'ci_repo') try: db.insert_data('pr_list', entities) + return True except (SQLiteDBError.IntegrityError) as e: if 'unique' in str(e).lower(): print(f"pr {pr} already is in list: nothing added") + return False def update_pr(db: SQLiteDB, args): @@ -68,7 +69,7 @@ def update_pr(db: SQLiteDB, args): Parameters ---------- - db : SQLiteDB + ci_database : SQLiteDB The database to update the pull request in. args : argparse.Namespace The command line arguments. @@ -83,16 +84,16 @@ def update_pr(db: SQLiteDB, args): db.update_data('pr_list', update, value, 'pr', args.update_pr[0]) -def display_db(db, display): +def display_db(db, display) -> list: """ Display the database. Parameters ---------- - ci_database : SQLiteDB + db : SQLiteDB The database to display. - display : list - The command line argument values. + args : display + The command line arguments. Returns ------- @@ -101,8 +102,10 @@ def display_db(db, display): """ values = [] if len(display) == 1: - rows = db.fetch_data('pr_list', ['pr', 'state', 'status', 'reset_id', 'cases'], f'pr = {display[0]}') - else: + rows = db.fetch_data('pr_list', ['pr', 'state', 'status', 'reset_id', 'cases'], f"pr = '{display[0]}'") + if len(display) == 2: + rows = db.fetch_data('pr_list', ['pr'], f"state = '{display[0]}' AND status = '{display[1]}'") + if len(display) == 0: rows = db.fetch_data('pr_list', ['pr', 'state', 'status', 'reset_id', 'cases']) for row in rows: values.append(' '.join(map(str, row))) @@ -110,6 +113,32 @@ def display_db(db, display): return values +def update_database(db: SQLiteDB) -> list: + """ + Update the database from the GitHub PRs + - only PRs from host machine are added to the database + - if the PR is already in the database it its added to the kill list + + Parameters + ---------- + db : SQLiteDB + The database to update. + + Returns + ------- + list + The kill list of pull requests. + """ + gh = GitHubPR() + pr_ready_list, pr_kill_list = gh.get_open_pr_list() + for pr in pr_ready_list: + if not add_pr(db, str(pr)): + if pr not in pr_kill_list: + pr_kill_list.append(pr) + pr_kill_list = list(set(pr_kill_list)) + return pr_kill_list + + def input_args(): """ Parse command line arguments. @@ -132,6 +161,9 @@ def input_args(): parser.add_argument('--update_pr', nargs=REMAINDER, metavar=('pr', 'state', 'status', 'reset_id', 'cases'), help='updates state and status of a given pr', required=False) parser.add_argument('--display', nargs='*', help='output pr table', required=False) + parser.add_argument('--list', nargs=2, metavar=('state', 'status'), required=False) + parser.add_argument('--update_database', help='use labels from Open GitHub PRs to update database state and produces a kill list', + action='store_true', required=False) args = parser.parse_args() return args @@ -159,5 +191,14 @@ def input_args(): if args.display is not None: for rows in display_db(ci_database, args.display): print(rows) + if args.list: + for rows in display_db(ci_database, [args.list[0], args.list[1]]): + print(rows, end=' ') + print() + if args.update_database: + pr_kill_list = update_database(ci_database) + for pr in pr_kill_list: + print(pr, end=' ') + print() ci_database.disconnect() diff --git a/ci/scripts/utils/rocotostat.py b/ci/scripts/utils/rocotostat.py new file mode 100755 index 00000000000..730def1d27a --- /dev/null +++ b/ci/scripts/utils/rocotostat.py @@ -0,0 +1,118 @@ +#!/usr/bin/env python3 + +import sys +import os + +from wxflow import Executable, which, Logger, CommandNotFoundError +from argparse import ArgumentParser, FileType + +logger = Logger(level=os.environ.get("LOGGING_LEVEL", "DEBUG"), colored_log=False) + + +def input_args(): + """ + Parse command-line arguments. + + Returns + ------- + args : Namespace + The parsed command-line arguments. + """ + + description = """ + Using rocotostat to get the status of all jobs this scripts + determines rocoto_state: if all cycles are done, then rocoto_state is Done. + Assuming rocotorun had just been run, and the rocoto_state is not Done, then + rocoto_state is Stalled if there are no jobs that are RUNNING, SUBMITTING, or QUEUED. + """ + + parser = ArgumentParser(description=description) + + parser.add_argument('-w', help='workflow_document', type=FileType('r'), required=True) + parser.add_argument('-d', help='database_file', metavar='Database File', type=FileType('r'), required=True) + parser.add_argument('--verbose', action='store_true', help='List the states and the number of jobs that are in each', required=False) + parser.add_argument('-v', action='store_true', help='List the states and the number of jobs that are in each', required=False) + parser.add_argument('--export', action='store_true', help='create and export list of the status values for bash', required=False) + + args = parser.parse_args() + + return args + + +def rocoto_statcount(): + """ + Run rocotostat and process its output. + """ + + args = input_args() + + try: + rocotostat = which("rocotostat") + except CommandNotFoundError: + logger.exception("rocotostat not found in PATH") + raise CommandNotFoundError("rocotostat not found in PATH") + + rocotostat_all = which("rocotostat") + rocotostat.add_default_arg(['-w', os.path.abspath(args.w.name), '-d', os.path.abspath(args.d.name), '-s']) + rocotostat_all.add_default_arg(['-w', os.path.abspath(args.w.name), '-d', os.path.abspath(args.d.name), '-a']) + + rocotostat_output = rocotostat(output=str) + rocotostat_output = rocotostat_output.splitlines()[1:] + rocotostat_output = [line.split()[0:2] for line in rocotostat_output] + + rocotostat_output_all = rocotostat_all(output=str) + rocotostat_output_all = rocotostat_output_all.splitlines()[1:] + rocotostat_output_all = [line.split()[0:4] for line in rocotostat_output_all] + rocotostat_output_all = [line for line in rocotostat_output_all if len(line) != 1] + + rocoto_status = { + 'CYCLES_TOTAL': len(rocotostat_output), + 'CYCLES_DONE': sum([sublist.count('Done') for sublist in rocotostat_output]) + } + + status_cases = ['SUCCEEDED', 'FAIL', 'DEAD', 'RUNNING', 'SUBMITTING', 'QUEUED'] + for case in status_cases: + rocoto_status[case] = sum([sublist.count(case) for sublist in rocotostat_output_all]) + + return rocoto_status + + +if __name__ == '__main__': + + args = input_args() + + error_return = 0 + rocoto_status = rocoto_statcount() + + if rocoto_status['CYCLES_TOTAL'] == rocoto_status['CYCLES_DONE']: + if not args.export: + print(f"All {rocoto_status['CYCLES_TOTAL']} Cycles are Done") + rocoto_state = 'DONE' + elif rocoto_status['DEAD'] > 0: + error_return = rocoto_status['FAIL'] + rocoto_status['DEAD'] + rocoto_state = 'FAIL' + elif 'UNKNOWN' in rocoto_status: + error_return = rocoto_status['UNKNOWN'] + rocoto_state = 'UNKNOWN' + elif rocoto_status['RUNNING'] + rocoto_status['SUBMITTING'] + rocoto_status['QUEUED'] == 0: + error_return = -3 + rocoto_state = 'STALLED' + else: + rocoto_state = 'RUNNING' + + rocoto_status['ROCOTO_STATE'] = rocoto_state + + if args.verbose or args.v: + for status in rocoto_status: + if args.v: + print(f'{status}:{rocoto_status[status]}') + else: + print(f'Number of {status} : {rocoto_status[status]}') + + if args.export: + for status in rocoto_status: + print(f'export {status}={rocoto_status[status]}') + else: + print(rocoto_state) + + sys.exit(error_return) diff --git a/workflow/rocoto_viewer.py b/workflow/rocoto_viewer.py index 95dd9e76dd8..459381f601d 100755 --- a/workflow/rocoto_viewer.py +++ b/workflow/rocoto_viewer.py @@ -1360,7 +1360,7 @@ def main(screen): screen.refresh() curses.mousemask(1) curses.noecho() - for i in range(0, curses.COLORS): + for i in range(0, curses.COLORS - 1): curses.init_pair(i + 1, i, curses.COLOR_BLACK) if i == 4: curses.init_pair(i + 1, i, curses.COLOR_WHITE)