From ddd31885bb8896a054eb35f78fdcfcecbfa61447 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Mon, 27 Apr 2026 13:37:35 +0000
Subject: [PATCH 1/5] Initial plan


From 9bffe43054a79ba826fd629458aff6ef160af468 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Mon, 27 Apr 2026 13:44:39 +0000
Subject: [PATCH 2/5] Add master scrontab runner script to cycle through CI
 tests sequentially
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When generate_workflows.sh is used on SLURM-managed scron systems (e.g.
Gaea), running all rocoto instances simultaneously can exhaust head-node
memory and cause OOM errors.

Changes:
- Collect individual .scron.sh paths in an array during experiment creation
  instead of immediately writing per-experiment entries to tests.cron
- After all experiments are created, generate a single master runner script
  (rocoto_master_run.sh) in ${RUNTESTS}/EXPDIR/ that calls each experiment's
  .scron.sh sequentially
- Place a single scrontab entry pointing to the master script, with:
  - partition/account pulled from the first experiment's .crontab
  - wall-time computed as 10 min × number of experiments
  - --dependency=singleton to prevent overlapping runs
- Email notification behavior is unchanged: each individual .scron.sh
  continues to send failure emails via its own monitoring logic

Agent-Logs-Url: https://github.com/DavidHuber-NOAA/global-workflow/sessions/051bf98c-4777-4976-8c4e-a69d634ab5a1

Co-authored-by: DavidHuber-NOAA <69919478+DavidHuber-NOAA@users.noreply.github.com>
---
 dev/workflow/generate_workflows.sh | 57 +++++++++++++++++++++++++++---
 1 file changed, 52 insertions(+), 5 deletions(-)

diff --git a/dev/workflow/generate_workflows.sh b/dev/workflow/generate_workflows.sh
index 63fbf77f800..872e5ecfa97 100755
--- a/dev/workflow/generate_workflows.sh
+++ b/dev/workflow/generate_workflows.sh
@@ -121,6 +121,8 @@ _cwd=$(pwd)
 _runtests="${RUNTESTS:-${_runtests:-}}"
 _auto_del=false
 _nonflag_option_count=0
+_use_scron=false
+declare -a _scron_sh_files=()
 # --------------------------------------------------------------------------- #
 # Argument Parsing
 # --------------------------------------------------------------------------- #
@@ -696,17 +698,62 @@ for _case in "${_yaml_list[@]}"; do
     fi
 
     if [[ "${_use_scron}" == true ]]; then
-        {
-            grep "^####" "${cron_file}"
-            grep "^#SCRON" "${cron_file}"
-            grep "${scron_sh_file}" "${cron_file}"
-        } >> tests.cron
+        # Collect this experiment's scron script path; the master runner script
+        # will call them all sequentially to reduce simultaneous rocoto instances.
+        _scron_sh_files+=("${scron_sh_file}")
     else
         grep "${_pslot}" "${_runtests}/EXPDIR/${_pslot}/${_pslot}.crontab" >> tests.cron
     fi
 done
 echo
 
+# --------------------------------------------------------------------------- #
+# Build Master Runner Script for scrontab (if using scron)
+# --------------------------------------------------------------------------- #
+
+# When running on a SLURM-managed scron system (e.g. Gaea), running all rocoto
+# instances simultaneously can exhaust head-node memory.  Instead, generate a
+# single master script that cycles through every experiment scron script
+# sequentially, and place only that one entry in the scrontab.
+if [[ "${_use_scron}" == true && ${#_scron_sh_files[@]} -gt 0 ]]; then
+    _master_script="${_runtests}/EXPDIR/rocoto_master_run.sh"
+    {
+        printf '%s\n' '#!/usr/bin/env bash'
+        printf '%s\n' '# Master runner script - cycles through all experiments sequentially'
+        printf '%s\n' '# to reduce simultaneous rocoto instances on the head node.'
+        for _scron_sh in "${_scron_sh_files[@]}"; do
+            printf 'if [[ -x "%s" ]]; then\n' "${_scron_sh}"
+            printf '    "%s"\n' "${_scron_sh}"
+            printf '%s\n' 'fi'
+        done
+    } > "${_master_script}"
+    chmod +x "${_master_script}"
+
+    # Compute wall time: allow 10 minutes per experiment
+    _num_expts=${#_scron_sh_files[@]}
+    _wall_minutes=$(( _num_expts * 10 ))
+    _wall_time=$(printf "%02d:%02d:00" $(( _wall_minutes / 60 )) $(( _wall_minutes % 60 )))
+
+    # Pull partition and account from the first experiment's crontab
+    _first_pslot="${_yaml_list[0]}${_tag}"
+    _first_cron_file="${_runtests}/EXPDIR/${_first_pslot}/${_first_pslot}.crontab"
+    _master_log="${_runtests}/EXPDIR/rocoto_master_run.log"
+
+    {
+        printf "\n"
+        printf "#################### master_run ####################\n"
+        grep "^#SCRON --partition=" "${_first_cron_file}" | head -1
+        grep "^#SCRON --account=" "${_first_cron_file}" | head -1
+        printf "#SCRON --job-name=master_scron\n"
+        printf "#SCRON --output=%s\n" "${_master_log}"
+        printf "#SCRON --time=%s\n" "${_wall_time}"
+        printf "#SCRON --dependency=singleton\n"
+        printf "*/5 * * * * %s\n" "${_master_script}"
+        printf "#################################################################\n"
+        printf "\n"
+    } >> tests.cron
+fi
+
 # --------------------------------------------------------------------------- #
 # Configure Mail Behavior
 # --------------------------------------------------------------------------- #

From 9af64a8dc2ba55aa9eebfb06cc99959460721da4 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Mon, 27 Apr 2026 13:45:57 +0000
Subject: [PATCH 3/5] Address code review: add guards, error checks,
 configurable wall-time per experiment

- Add explicit guard that _yaml_list is non-empty before accessing _yaml_list[0]
- Add error check that #SCRON --partition= and #SCRON --account= are present
  in the first experiment's crontab before writing master scrontab entry
- Make minutes-per-experiment configurable via SCRON_MINUTES_PER_EXPERIMENT
  environment variable (default: 10) instead of hardcoding the value

Agent-Logs-Url: https://github.com/DavidHuber-NOAA/global-workflow/sessions/051bf98c-4777-4976-8c4e-a69d634ab5a1

Co-authored-by: DavidHuber-NOAA <69919478+DavidHuber-NOAA@users.noreply.github.com>
---
 dev/workflow/generate_workflows.sh | 24 ++++++++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/dev/workflow/generate_workflows.sh b/dev/workflow/generate_workflows.sh
index 872e5ecfa97..5d9858b6e72 100755
--- a/dev/workflow/generate_workflows.sh
+++ b/dev/workflow/generate_workflows.sh
@@ -729,21 +729,37 @@ if [[ "${_use_scron}" == true && ${#_scron_sh_files[@]} -gt 0 ]]; then
     } > "${_master_script}"
     chmod +x "${_master_script}"
 
-    # Compute wall time: allow 10 minutes per experiment
+    # Compute wall time: allow SCRON_MINUTES_PER_EXPERIMENT (default 10) minutes
+    # per experiment so the master job has enough time to run all experiments.
+    _scron_min_per_expt="${SCRON_MINUTES_PER_EXPERIMENT:-10}"
     _num_expts=${#_scron_sh_files[@]}
-    _wall_minutes=$(( _num_expts * 10 ))
+    _wall_minutes=$(( _num_expts * _scron_min_per_expt ))
     _wall_time=$(printf "%02d:%02d:00" $(( _wall_minutes / 60 )) $(( _wall_minutes % 60 )))
 
+    # Guard: _yaml_list must be non-empty if _scron_sh_files is non-empty,
+    # but verify explicitly to surface any unexpected state.
+    if [[ ${#_yaml_list[@]} -eq 0 ]]; then
+        echo "ERROR: _yaml_list is empty but scron scripts were collected. This is unexpected."
+        exit 14
+    fi
+
     # Pull partition and account from the first experiment's crontab
     _first_pslot="${_yaml_list[0]}${_tag}"
     _first_cron_file="${_runtests}/EXPDIR/${_first_pslot}/${_first_pslot}.crontab"
     _master_log="${_runtests}/EXPDIR/rocoto_master_run.log"
 
+    _scron_partition=$(grep "^#SCRON --partition=" "${_first_cron_file}" | head -1)
+    _scron_account=$(grep "^#SCRON --account=" "${_first_cron_file}" | head -1)
+    if [[ -z "${_scron_partition}" || -z "${_scron_account}" ]]; then
+        echo "ERROR: Could not find #SCRON --partition= or #SCRON --account= in ${_first_cron_file}"
+        exit 15
+    fi
+
     {
         printf "\n"
         printf "#################### master_run ####################\n"
-        grep "^#SCRON --partition=" "${_first_cron_file}" | head -1
-        grep "^#SCRON --account=" "${_first_cron_file}" | head -1
+        printf "%s\n" "${_scron_partition}"
+        printf "%s\n" "${_scron_account}"
         printf "#SCRON --job-name=master_scron\n"
         printf "#SCRON --output=%s\n" "${_master_log}"
         printf "#SCRON --time=%s\n" "${_wall_time}"

From c4fa738589534410b3235a4ec941f060f0d58bb9 Mon Sep 17 00:00:00 2001
From: David Huber <69919478+DavidHuber-NOAA@users.noreply.github.com>
Date: Mon, 27 Apr 2026 09:48:57 -0400
Subject: [PATCH 4/5] Remove padding spaces inside arithmetic expansions in
 dev/workflow/generate_workflows.sh

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
---
 dev/workflow/generate_workflows.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dev/workflow/generate_workflows.sh b/dev/workflow/generate_workflows.sh
index 5d9858b6e72..371e17afe5f 100755
--- a/dev/workflow/generate_workflows.sh
+++ b/dev/workflow/generate_workflows.sh
@@ -733,8 +733,8 @@ if [[ "${_use_scron}" == true && ${#_scron_sh_files[@]} -gt 0 ]]; then
     # per experiment so the master job has enough time to run all experiments.
     _scron_min_per_expt="${SCRON_MINUTES_PER_EXPERIMENT:-10}"
     _num_expts=${#_scron_sh_files[@]}
-    _wall_minutes=$(( _num_expts * _scron_min_per_expt ))
-    _wall_time=$(printf "%02d:%02d:00" $(( _wall_minutes / 60 )) $(( _wall_minutes % 60 )))
+    _wall_minutes=$((_num_expts * _scron_min_per_expt))
+    _wall_time=$(printf "%02d:%02d:00" $((_wall_minutes / 60)) $((_wall_minutes % 60)))
 
     # Guard: _yaml_list must be non-empty if _scron_sh_files is non-empty,
     # but verify explicitly to surface any unexpected state.

From 6c6a339d71ce48d2be1c455535efb64bfbfbe84a Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Mon, 27 Apr 2026 13:55:42 +0000
Subject: [PATCH 5/5] Use static 10-minute wall time for master scrontab entry
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

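Per reviewer request, replace the dynamic per-experiment wall-time
calculation with a fixed 00:10:00 — 10 minutes total is enough time
to run rocotorun sequentially across all experiments. This also removes
the SCRON_MINUTES_PER_EXPERIMENT override introduced in PATCH 3/5, so
the master entry's wall time is no longer configurable.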

Agent-Logs-Url: https://github.com/DavidHuber-NOAA/global-workflow/sessions/d5d8659b-071a-44bc-ad67-5c6464622085

Co-authored-by: DavidHuber-NOAA <69919478+DavidHuber-NOAA@users.noreply.github.com>
---
 dev/workflow/generate_workflows.sh | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/dev/workflow/generate_workflows.sh b/dev/workflow/generate_workflows.sh
index 371e17afe5f..758e2761095 100755
--- a/dev/workflow/generate_workflows.sh
+++ b/dev/workflow/generate_workflows.sh
@@ -729,13 +729,6 @@ if [[ "${_use_scron}" == true && ${#_scron_sh_files[@]} -gt 0 ]]; then
     } > "${_master_script}"
     chmod +x "${_master_script}"
 
-    # Compute wall time: allow SCRON_MINUTES_PER_EXPERIMENT (default 10) minutes
-    # per experiment so the master job has enough time to run all experiments.
-    _scron_min_per_expt="${SCRON_MINUTES_PER_EXPERIMENT:-10}"
-    _num_expts=${#_scron_sh_files[@]}
-    _wall_minutes=$((_num_expts * _scron_min_per_expt))
-    _wall_time=$(printf "%02d:%02d:00" $((_wall_minutes / 60)) $((_wall_minutes % 60)))
-
     # Guard: _yaml_list must be non-empty if _scron_sh_files is non-empty,
     # but verify explicitly to surface any unexpected state.
     if [[ ${#_yaml_list[@]} -eq 0 ]]; then
@@ -762,7 +755,7 @@ if [[ "${_use_scron}" == true && ${#_scron_sh_files[@]} -gt 0 ]]; then
         printf "%s\n" "${_scron_account}"
         printf "#SCRON --job-name=master_scron\n"
         printf "#SCRON --output=%s\n" "${_master_log}"
-        printf "#SCRON --time=%s\n" "${_wall_time}"
+        printf "#SCRON --time=00:10:00\n"
         printf "#SCRON --dependency=singleton\n"
         printf "*/5 * * * * %s\n" "${_master_script}"
         printf "#################################################################\n"