From ddd31885bb8896a054eb35f78fdcfcecbfa61447 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 27 Apr 2026 13:37:35 +0000 Subject: [PATCH 1/5] Initial plan From 9bffe43054a79ba826fd629458aff6ef160af468 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 27 Apr 2026 13:44:39 +0000 Subject: [PATCH 2/5] Add master scrontab runner script to cycle through CI tests sequentially MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When generate_workflows.sh is used on SLURM-managed scron systems (e.g. Gaea), running all rocoto instances simultaneously can exhaust head-node memory and cause OOM errors. Changes: - Collect individual .scron.sh paths in an array during experiment creation instead of immediately writing per-experiment entries to tests.cron - After all experiments are created, generate a single master runner script (rocoto_master_run.sh) in ${RUNTESTS}/EXPDIR/ that calls each experiment's .scron.sh sequentially - Place a single scrontab entry pointing to the master script, with: - partition/account pulled from the first experiment's .crontab - wall-time computed as 10 min × number of experiments - --dependency=singleton to prevent overlapping runs - Email notification behavior is unchanged: each individual .scron.sh continues to send failure emails via its own monitoring logic Agent-Logs-Url: https://github.com/DavidHuber-NOAA/global-workflow/sessions/051bf98c-4777-4976-8c4e-a69d634ab5a1 Co-authored-by: DavidHuber-NOAA <69919478+DavidHuber-NOAA@users.noreply.github.com> --- dev/workflow/generate_workflows.sh | 57 +++++++++++++++++++++++++++--- 1 file changed, 52 insertions(+), 5 deletions(-) diff --git a/dev/workflow/generate_workflows.sh b/dev/workflow/generate_workflows.sh index 63fbf77f800..872e5ecfa97 100755 --- a/dev/workflow/generate_workflows.sh +++ b/dev/workflow/generate_workflows.sh @@ -121,6 +121,8 @@ _cwd=$(pwd) _runtests="${RUNTESTS:-${_runtests:-}}" _auto_del=false _nonflag_option_count=0 +_use_scron=false +declare -a _scron_sh_files=() # --------------------------------------------------------------------------- # # Argument Parsing # --------------------------------------------------------------------------- # @@ -696,17 +698,62 @@ for _case in "${_yaml_list[@]}"; do fi if [[ "${_use_scron}" == true ]]; then - { - grep "^####" "${cron_file}" - grep "^#SCRON" "${cron_file}" - grep "${scron_sh_file}" "${cron_file}" - } >> tests.cron + # Collect this experiment's scron script path; the master runner script + # will call them all sequentially to reduce simultaneous rocoto instances. + _scron_sh_files+=("${scron_sh_file}") else grep "${_pslot}" "${_runtests}/EXPDIR/${_pslot}/${_pslot}.crontab" >> tests.cron fi done echo +# --------------------------------------------------------------------------- # +# Build Master Runner Script for scrontab (if using scron) +# --------------------------------------------------------------------------- # + +# When running on a SLURM-managed scron system (e.g. Gaea), running all rocoto +# instances simultaneously can exhaust head-node memory. Instead, generate a +# single master script that cycles through every experiment scron script +# sequentially, and place only that one entry in the scrontab. +if [[ "${_use_scron}" == true && ${#_scron_sh_files[@]} -gt 0 ]]; then + _master_script="${_runtests}/EXPDIR/rocoto_master_run.sh" + { + printf '%s\n' '#!/usr/bin/env bash' + printf '%s\n' '# Master runner script - cycles through all experiments sequentially' + printf '%s\n' '# to reduce simultaneous rocoto instances on the head node.' + for _scron_sh in "${_scron_sh_files[@]}"; do + printf 'if [[ -x "%s" ]]; then\n' "${_scron_sh}" + printf ' "%s"\n' "${_scron_sh}" + printf '%s\n' 'fi' + done + } > "${_master_script}" + chmod +x "${_master_script}" + + # Compute wall time: allow 10 minutes per experiment + _num_expts=${#_scron_sh_files[@]} + _wall_minutes=$(( _num_expts * 10 )) + _wall_time=$(printf "%02d:%02d:00" $(( _wall_minutes / 60 )) $(( _wall_minutes % 60 ))) + + # Pull partition and account from the first experiment's crontab + _first_pslot="${_yaml_list[0]}${_tag}" + _first_cron_file="${_runtests}/EXPDIR/${_first_pslot}/${_first_pslot}.crontab" + _master_log="${_runtests}/EXPDIR/rocoto_master_run.log" + + { + printf "\n" + printf "#################### master_run ####################\n" + grep "^#SCRON --partition=" "${_first_cron_file}" | head -1 + grep "^#SCRON --account=" "${_first_cron_file}" | head -1 + printf "#SCRON --job-name=master_scron\n" + printf "#SCRON --output=%s\n" "${_master_log}" + printf "#SCRON --time=%s\n" "${_wall_time}" + printf "#SCRON --dependency=singleton\n" + printf "*/5 * * * * %s\n" "${_master_script}" + printf "#################################################################\n" + printf "\n" + } >> tests.cron +fi + # --------------------------------------------------------------------------- # # Configure Mail Behavior # --------------------------------------------------------------------------- # From 9af64a8dc2ba55aa9eebfb06cc99959460721da4 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 27 Apr 2026 13:45:57 +0000 Subject: [PATCH 3/5] Address code review: add guards, error checks, configurable wall-time per experiment - Add explicit guard that _yaml_list is non-empty before accessing _yaml_list[0] - Add error check that #SCRON --partition= and #SCRON --account= are present in the first experiment's crontab before writing master scrontab entry - Make minutes-per-experiment configurable via SCRON_MINUTES_PER_EXPERIMENT environment variable (default: 10) instead of hardcoding the value Agent-Logs-Url: https://github.com/DavidHuber-NOAA/global-workflow/sessions/051bf98c-4777-4976-8c4e-a69d634ab5a1 Co-authored-by: DavidHuber-NOAA <69919478+DavidHuber-NOAA@users.noreply.github.com> --- dev/workflow/generate_workflows.sh | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/dev/workflow/generate_workflows.sh b/dev/workflow/generate_workflows.sh index 872e5ecfa97..5d9858b6e72 100755 --- a/dev/workflow/generate_workflows.sh +++ b/dev/workflow/generate_workflows.sh @@ -729,21 +729,37 @@ if [[ "${_use_scron}" == true && ${#_scron_sh_files[@]} -gt 0 ]]; then } > "${_master_script}" chmod +x "${_master_script}" - # Compute wall time: allow 10 minutes per experiment + # Compute wall time: allow SCRON_MINUTES_PER_EXPERIMENT (default 10) minutes + # per experiment so the master job has enough time to run all experiments. + _scron_min_per_expt="${SCRON_MINUTES_PER_EXPERIMENT:-10}" _num_expts=${#_scron_sh_files[@]} - _wall_minutes=$(( _num_expts * 10 )) + _wall_minutes=$(( _num_expts * _scron_min_per_expt )) _wall_time=$(printf "%02d:%02d:00" $(( _wall_minutes / 60 )) $(( _wall_minutes % 60 ))) + # Guard: _yaml_list must be non-empty if _scron_sh_files is non-empty, + # but verify explicitly to surface any unexpected state. + if [[ ${#_yaml_list[@]} -eq 0 ]]; then + echo "ERROR: _yaml_list is empty but scron scripts were collected. This is unexpected." + exit 14 + fi + # Pull partition and account from the first experiment's crontab _first_pslot="${_yaml_list[0]}${_tag}" _first_cron_file="${_runtests}/EXPDIR/${_first_pslot}/${_first_pslot}.crontab" _master_log="${_runtests}/EXPDIR/rocoto_master_run.log" + _scron_partition=$(grep "^#SCRON --partition=" "${_first_cron_file}" | head -1) + _scron_account=$(grep "^#SCRON --account=" "${_first_cron_file}" | head -1) + if [[ -z "${_scron_partition}" || -z "${_scron_account}" ]]; then + echo "ERROR: Could not find #SCRON --partition= or #SCRON --account= in ${_first_cron_file}" + exit 15 + fi + { printf "\n" printf "#################### master_run ####################\n" - grep "^#SCRON --partition=" "${_first_cron_file}" | head -1 - grep "^#SCRON --account=" "${_first_cron_file}" | head -1 + printf "%s\n" "${_scron_partition}" + printf "%s\n" "${_scron_account}" printf "#SCRON --job-name=master_scron\n" printf "#SCRON --output=%s\n" "${_master_log}" printf "#SCRON --time=%s\n" "${_wall_time}" From c4fa738589534410b3235a4ec941f060f0d58bb9 Mon Sep 17 00:00:00 2001 From: David Huber <69919478+DavidHuber-NOAA@users.noreply.github.com> Date: Mon, 27 Apr 2026 09:48:57 -0400 Subject: [PATCH 4/5] Update dev/workflow/generate_workflows.sh Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- dev/workflow/generate_workflows.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dev/workflow/generate_workflows.sh b/dev/workflow/generate_workflows.sh index 5d9858b6e72..371e17afe5f 100755 --- a/dev/workflow/generate_workflows.sh +++ b/dev/workflow/generate_workflows.sh @@ -733,8 +733,8 @@ if [[ "${_use_scron}" == true && ${#_scron_sh_files[@]} -gt 0 ]]; then # per experiment so the master job has enough time to run all experiments. _scron_min_per_expt="${SCRON_MINUTES_PER_EXPERIMENT:-10}" _num_expts=${#_scron_sh_files[@]} - _wall_minutes=$(( _num_expts * _scron_min_per_expt )) - _wall_time=$(printf "%02d:%02d:00" $(( _wall_minutes / 60 )) $(( _wall_minutes % 60 ))) + _wall_minutes=$((_num_expts * _scron_min_per_expt)) + _wall_time=$(printf "%02d:%02d:00" $((_wall_minutes / 60)) $((_wall_minutes % 60))) # Guard: _yaml_list must be non-empty if _scron_sh_files is non-empty, # but verify explicitly to surface any unexpected state. From 6c6a339d71ce48d2be1c455535efb64bfbfbe84a Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 27 Apr 2026 13:55:42 +0000 Subject: [PATCH 5/5] Use static 10-minute wall time for master scrontab entry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per reviewer request, replace the dynamic per-experiment wall-time calculation with a fixed 00:10:00 — 10 minutes total is enough time to run rocotorun sequentially across all experiments. Agent-Logs-Url: https://github.com/DavidHuber-NOAA/global-workflow/sessions/d5d8659b-071a-44bc-ad67-5c6464622085 Co-authored-by: DavidHuber-NOAA <69919478+DavidHuber-NOAA@users.noreply.github.com> --- dev/workflow/generate_workflows.sh | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/dev/workflow/generate_workflows.sh b/dev/workflow/generate_workflows.sh index 371e17afe5f..758e2761095 100755 --- a/dev/workflow/generate_workflows.sh +++ b/dev/workflow/generate_workflows.sh @@ -729,13 +729,6 @@ if [[ "${_use_scron}" == true && ${#_scron_sh_files[@]} -gt 0 ]]; then } > "${_master_script}" chmod +x "${_master_script}" - # Compute wall time: allow SCRON_MINUTES_PER_EXPERIMENT (default 10) minutes - # per experiment so the master job has enough time to run all experiments. - _scron_min_per_expt="${SCRON_MINUTES_PER_EXPERIMENT:-10}" - _num_expts=${#_scron_sh_files[@]} - _wall_minutes=$((_num_expts * _scron_min_per_expt)) - _wall_time=$(printf "%02d:%02d:00" $((_wall_minutes / 60)) $((_wall_minutes % 60))) - # Guard: _yaml_list must be non-empty if _scron_sh_files is non-empty, # but verify explicitly to surface any unexpected state. if [[ ${#_yaml_list[@]} -eq 0 ]]; then @@ -762,7 +755,7 @@ if [[ "${_use_scron}" == true && ${#_scron_sh_files[@]} -gt 0 ]]; then printf "%s\n" "${_scron_account}" printf "#SCRON --job-name=master_scron\n" printf "#SCRON --output=%s\n" "${_master_log}" - printf "#SCRON --time=%s\n" "${_wall_time}" + printf "#SCRON --time=00:10:00\n" printf "#SCRON --dependency=singleton\n" printf "*/5 * * * * %s\n" "${_master_script}" printf "#################################################################\n"