From 3f1fee3109c675a9f99de19e9b5a39c2424797f7 Mon Sep 17 00:00:00 2001 From: David Huber Date: Tue, 8 Apr 2025 14:47:41 -0400 Subject: [PATCH 01/15] Add a gaeac6_build host --- sorc/build_compute.sh | 15 ++++++++++++++- workflow/build_compute.py | 4 +++- workflow/hosts.py | 3 ++- workflow/hosts/gaeac6_build.yaml | 6 ++++++ 4 files changed, 25 insertions(+), 3 deletions(-) create mode 100644 workflow/hosts/gaeac6_build.yaml diff --git a/sorc/build_compute.sh b/sorc/build_compute.sh index 0bc21317b4a..bf794649062 100755 --- a/sorc/build_compute.sh +++ b/sorc/build_compute.sh @@ -77,10 +77,23 @@ rm -f "${build_xml}" "${build_db}" "${build_lock_db}" echo "Sourcing global-workflow modules ..." source "${HOMEgfs}/workflow/gw_setup.sh" +# Determine yaml and "host" to use +case ${MACHINE_ID} in + "gaeac6") + # C6 does not allow builds on compute nodes, so submit to the login nodes + yaml="${HOMEgfs}/workflow/build_service_opts.yaml" + host="${MACHINE_ID^^}_BUILD" + ;; + *) + yaml="${HOMEgfs}/workflow/build_opts.yaml" + host="${MACHINE_ID^^}" + ;; +esac + echo "Generating build.xml for building global-workflow programs on compute nodes ..." # Catch errors manually from here out set +e -"${HOMEgfs}/workflow/build_compute.py" --account "${HPC_ACCOUNT}" --yaml "${HOMEgfs}/workflow/build_opts.yaml" --systems "${systems}" +"${HOMEgfs}/workflow/build_compute.py" --account "${HPC_ACCOUNT}" --yaml "${yaml}" --systems "${systems}" --host "${host}" rc=$? 
if [[ "${rc}" -ne 0 ]]; then msg="FATAL ERROR: ${BASH_SOURCE[0]} failed to create 'build.xml' with error code ${rc}" diff --git a/workflow/build_compute.py b/workflow/build_compute.py index f281a899bd4..5945e8afd2c 100755 --- a/workflow/build_compute.py +++ b/workflow/build_compute.py @@ -34,6 +34,8 @@ def input_args(*argv): parser.add_argument('--yaml', help='Input YAML file', type=str, required=False, default='build_opts.yaml') parser.add_argument('--systems', help='System(s) to build (options: gfs, gefs, sfs, gsi, gdas, or all)', required=False, default='gfs') + parser.add_argument('--host', help='Input host name (used to determine which host yaml to load)', + type=str, required=False, default=None) inputs = parser.parse_args(list(*argv) if len(argv) else None) @@ -127,7 +129,7 @@ def main(*argv): # Gather host specs and place the user supplied account # into the host_specs dict - host_specs = get_host_specs(Host()) + host_specs = get_host_specs(Host(user_inputs.host)) host_specs.account = user_inputs.account # Retrieve build specificatiosn from user provided yaml diff --git a/workflow/hosts.py b/workflow/hosts.py index 834027beedc..de0e09f69d5 100644 --- a/workflow/hosts.py +++ b/workflow/hosts.py @@ -17,7 +17,8 @@ class Host: SUPPORTED_HOSTS = ['HERA', 'ORION', 'JET', 'HERCULES', 'WCOSS2', 'S4', 'CONTAINER', 'GAEAC5', - 'GAEAC6', 'AWSPW', 'AZUREPW', 'GOOGLEPW'] + 'GAEAC6', 'AWSPW', 'AZUREPW', 'GOOGLEPW', + 'GAEAC6_BUILD'] def __init__(self, host=None): diff --git a/workflow/hosts/gaeac6_build.yaml b/workflow/hosts/gaeac6_build.yaml new file mode 100644 index 00000000000..7ebf1434012 --- /dev/null +++ b/workflow/hosts/gaeac6_build.yaml @@ -0,0 +1,6 @@ +# BQS properties +SCHEDULER: slurm +QUEUE: normal +CLUSTERS: 'es' +PARTITION_BATCH: 'eslogin_c6' +USE_SCRONTAB: 'YES' From c8c07105ec1e6d429e0cf1e8c0dee1d6a9b46982 Mon Sep 17 00:00:00 2001 From: David Huber Date: Tue, 8 Apr 2025 14:50:01 -0400 Subject: [PATCH 02/15] Add definitions for service node builds --- 
workflow/build_service_opts.yaml | 82 ++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) create mode 100644 workflow/build_service_opts.yaml diff --git a/workflow/build_service_opts.yaml b/workflow/build_service_opts.yaml new file mode 100644 index 00000000000..b426df0a8f1 --- /dev/null +++ b/workflow/build_service_opts.yaml @@ -0,0 +1,82 @@ +systems: + common: + - "ufs_utils" + - "gfs_utils" + - "upp" + gfs: + - "gfs_model" + - "gfs_ww3prepost" + gsi: + - "gsi_enkf" + - "gsi_utils" + - "gsi_monitor" + gdas: + - "gdas" + - "gsi_utils" + - "gsi_monitor" + gefs: + - "gefs_model" + - "gefs_ww3_prepost" + sfs: + - "sfs_model" + - "gefs_ww3_prepost" +build: + gfs_model: + command: "./build_ufs.sh -e gfs_model.x" + cores: 8 + walltime: "00:45:00" + + gfs_ww3prepost: + command: "./build_ww3prepost.sh" + cores: 4 + walltime: "00:15:00" + + gefs_model: + command: "./build_ufs.sh -w -e gefs_model.x" + cores: 8 + walltime: "00:45:00" + + gefs_ww3_prepost: + command: "./build_ww3prepost.sh -w" + cores: 4 + walltime: "00:15:00" + + sfs_model: + command: "./build_ufs.sh -y -e sfs_model.x" + cores: 8 + walltime: "00:45:00" + + upp: + command: "./build_upp.sh" + cores: 8 + walltime: "00:15:00" + + gsi_enkf: + command: "./build_gsi_enkf.sh" + cores: 8 + walltime: "00:15:00" + + gsi_monitor: + command: "./build_gsi_monitor.sh" + cores: 4 + walltime: "00:15:00" + + gsi_utils: + command: "./build_gsi_utils.sh" + cores: 6 + walltime: "00:15:00" + + ufs_utils: + command: "./build_ufs_utils.sh" + cores: 8 + walltime: "00:15:00" + + gfs_utils: + command: "./build_gfs_utils.sh" + cores: 6 + walltime: "00:15:00" + + gdas: + command: "./build_gdas.sh" + cores: 8 + walltime: "02:30:00" From fbddaa125e88eaa6b1a50ff8375880216053cb71 Mon Sep 17 00:00:00 2001 From: David Huber Date: Tue, 8 Apr 2025 14:55:33 -0400 Subject: [PATCH 03/15] Increase gsi and ww3 build times --- workflow/build_service_opts.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git 
a/workflow/build_service_opts.yaml b/workflow/build_service_opts.yaml index b426df0a8f1..5bbb4280c4d 100644 --- a/workflow/build_service_opts.yaml +++ b/workflow/build_service_opts.yaml @@ -29,7 +29,7 @@ build: gfs_ww3prepost: command: "./build_ww3prepost.sh" cores: 4 - walltime: "00:15:00" + walltime: "00:30:00" gefs_model: command: "./build_ufs.sh -w -e gefs_model.x" @@ -39,7 +39,7 @@ build: gefs_ww3_prepost: command: "./build_ww3prepost.sh -w" cores: 4 - walltime: "00:15:00" + walltime: "00:30:00" sfs_model: command: "./build_ufs.sh -y -e sfs_model.x" @@ -54,7 +54,7 @@ build: gsi_enkf: command: "./build_gsi_enkf.sh" cores: 8 - walltime: "00:15:00" + walltime: "00:30:00" gsi_monitor: command: "./build_gsi_monitor.sh" From 0b6b6521b9b6f775d9a78ae666e411283ff645e5 Mon Sep 17 00:00:00 2001 From: David Huber Date: Tue, 8 Apr 2025 15:00:53 -0400 Subject: [PATCH 04/15] Update CODEOWNERS for new yaml file --- .github/CODEOWNERS | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 7e5c4b8bcf3..0a7c0b3e2aa 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -212,3 +212,4 @@ ush/python/pygfs/utils/marine_da_utils.py @guillaumevernieres @AndrewEichmann-NO # Specific workflow scripts workflow/generate_workflows.sh @DavidHuber-NOAA workflow/build_compute.py @DavidHuber-NOAA @aerorahul +workflow/build_*opts.yaml @DavidHuber-NOAA @aerorahul From 6d2b6d10ce24013ee7dbee039cc915e7ebb60119 Mon Sep 17 00:00:00 2001 From: David Huber Date: Wed, 9 Apr 2025 13:35:59 -0400 Subject: [PATCH 05/15] Add overrides for C6 --- sorc/build_compute.sh | 16 +------ workflow/build_compute.py | 54 +++++++++++++++++---- workflow/build_opts.yaml | 8 ++++ workflow/build_service_opts.yaml | 82 -------------------------------- workflow/hosts/gaeac6.yaml | 2 + 5 files changed, 58 insertions(+), 104 deletions(-) delete mode 100644 workflow/build_service_opts.yaml diff --git a/sorc/build_compute.sh b/sorc/build_compute.sh index bf794649062..0fb24aa1c4f 
100755 --- a/sorc/build_compute.sh +++ b/sorc/build_compute.sh @@ -77,23 +77,11 @@ rm -f "${build_xml}" "${build_db}" "${build_lock_db}" echo "Sourcing global-workflow modules ..." source "${HOMEgfs}/workflow/gw_setup.sh" -# Determine yaml and "host" to use -case ${MACHINE_ID} in - "gaeac6") - # C6 does not allow builds on compute nodes, so submit to the login nodes - yaml="${HOMEgfs}/workflow/build_service_opts.yaml" - host="${MACHINE_ID^^}_BUILD" - ;; - *) - yaml="${HOMEgfs}/workflow/build_opts.yaml" - host="${MACHINE_ID^^}" - ;; -esac - +yaml="${HOMEgfs}/workflow/build_opts.yaml" echo "Generating build.xml for building global-workflow programs on compute nodes ..." # Catch errors manually from here out set +e -"${HOMEgfs}/workflow/build_compute.py" --account "${HPC_ACCOUNT}" --yaml "${yaml}" --systems "${systems}" --host "${host}" +"${HOMEgfs}/workflow/build_compute.py" --account "${HPC_ACCOUNT}" --yaml "${yaml}" --systems "${systems}" rc=$? if [[ "${rc}" -ne 0 ]]; then msg="FATAL ERROR: ${BASH_SOURCE[0]} failed to create 'build.xml' with error code ${rc}" diff --git a/workflow/build_compute.py b/workflow/build_compute.py index 5945e8afd2c..7bd7b6958f5 100755 --- a/workflow/build_compute.py +++ b/workflow/build_compute.py @@ -8,7 +8,7 @@ from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter from typing import Dict -from wxflow import parse_yaml, AttrDict +from wxflow import parse_yaml, AttrDict, to_timedelta, timedelta_to_HMS from hosts import Host import rocoto.rocoto as rocoto @@ -34,8 +34,6 @@ def input_args(*argv): parser.add_argument('--yaml', help='Input YAML file', type=str, required=False, default='build_opts.yaml') parser.add_argument('--systems', help='System(s) to build (options: gfs, gefs, sfs, gsi, gdas, or all)', required=False, default='gfs') - parser.add_argument('--host', help='Input host name (used to determine which host yaml to load)', - type=str, required=False, default=None) inputs = parser.parse_args(list(*argv) if len(argv) 
else None) @@ -75,7 +73,7 @@ def get_task_spec(task_name: str, task_spec: Dict, host_spec: Dict) -> Dict: task_dict.resources.partition = host_spec.partition task_dict.resources.walltime = task_spec.walltime task_dict.resources.native = host_spec.native - task_dict.resources.memory = None + task_dict.resources.memory = task_spec.get("memory", None) task_dict.resources.nodes = 1 task_dict.resources.ntasks = task_spec.cores task_dict.resources.ppn = task_spec.cores @@ -84,6 +82,41 @@ def get_task_spec(task_name: str, task_spec: Dict, host_spec: Dict) -> Dict: return task_dict +def get_build_specs(build_specs: Dict, host_spec: Dict) -> Dict: + """Generate build specifications from a rendered yaml AttrDict + + Parameters + ---------- + build_specs : Dict + Build specifications and any host-specific overrides + host_spec: Dict + The specification of the host, containing account, queue, partition, and native. + + Returns + ------- + build_specs: Dict + Overridden build specifications + """ + + # Get host overrides, if present + if build_specs.get("host_override") is not None: + if build_specs.host_override.get(host_spec.machine, None) is not None: + override = build_specs.host_override[host_spec.machine] + for key in build_specs.build: + if build_specs.build[key].cores > override.max_cores: + # Adjust the walltime based on the ratio of max_cores/build.walltime + in_walltime = to_timedelta(build_specs.build[key].walltime) + override_walltime = in_walltime * (override.max_cores / build_specs.build[key].cores) + build_specs.build[key].cores = override.max_cores + build_specs.build[key].walltime = timedelta_to_HMS(override_walltime) + + # Adjust build walltime by the walltime_ratio + build_specs.build[key].walltime = timedelta_to_HMS( + to_timedelta(build_specs.build[key].walltime) * override.walltime_ratio) + + return build_specs + + def get_host_specs(host: Dict) -> Dict: """Generate host specs for the build.xml file based on Host() info @@ -105,13 +138,17 @@ def 
get_host_specs(host: Dict) -> Dict: native = '-l place=vscatter' elif host.info.SCHEDULER in ['slurm']: native = '--export=NONE' - if host.info.get("PARTITION_BATCH", "") != "": + if host.info.get("PARTITION_BUILD", "") != "": + partition = host.info.PARTITION_BUILD + elif host.info.get("PARTITION_BATCH", "") != "": partition = host.info.PARTITION_BATCH if host.info.get("RESERVATION", "") != "": native += f' --reservation={host.info.RESERVATION}' - if host.info.get("CLUSTERS", "") != "": + if host.info.get("CLUSTERS_BUILD", None) is not None: + native += f' --clusters={host.info.CLUSTERS_BUILD}' + elif host.info.get("CLUSTERS", None) is not None: native += f' --clusters={host.info.CLUSTERS}' specs = AttrDict() @@ -119,6 +156,7 @@ def get_host_specs(host: Dict) -> Dict: specs.queue = host.info.QUEUE specs.partition = partition specs.native = native + specs.machine = host.machine return specs @@ -129,11 +167,11 @@ def main(*argv): # Gather host specs and place the user supplied account # into the host_specs dict - host_specs = get_host_specs(Host(user_inputs.host)) + host_specs = get_host_specs(Host()) host_specs.account = user_inputs.account # Retrieve build specificatiosn from user provided yaml - build_specs = AttrDict(parse_yaml(user_inputs.yaml)) + build_specs = get_build_specs(AttrDict(parse_yaml(user_inputs.yaml)), host_specs) systems = user_inputs.systems.split() if "all" not in user_inputs.systems else ["all"] diff --git a/workflow/build_opts.yaml b/workflow/build_opts.yaml index 736789ba591..db08a4fc2bd 100644 --- a/workflow/build_opts.yaml +++ b/workflow/build_opts.yaml @@ -1,3 +1,10 @@ +host_override: # over-ride options for host + # Gaea must use the login nodes for builds + GAEAC6: + PARTITION_BATCH: 'eslogin_c6' + CLUSTERS: 'es' + max_cores: 8 + walltime_ratio: 1.5 # Uniformly adjust the wallclock by this factor (builds are slower on head nodes) systems: common: - "ufs_utils" @@ -80,3 +87,4 @@ build: command: "./build_gdas.sh" cores: 40 walltime: 
"01:30:00" + memory: "30G" diff --git a/workflow/build_service_opts.yaml b/workflow/build_service_opts.yaml deleted file mode 100644 index 5bbb4280c4d..00000000000 --- a/workflow/build_service_opts.yaml +++ /dev/null @@ -1,82 +0,0 @@ -systems: - common: - - "ufs_utils" - - "gfs_utils" - - "upp" - gfs: - - "gfs_model" - - "gfs_ww3prepost" - gsi: - - "gsi_enkf" - - "gsi_utils" - - "gsi_monitor" - gdas: - - "gdas" - - "gsi_utils" - - "gsi_monitor" - gefs: - - "gefs_model" - - "gefs_ww3_prepost" - sfs: - - "sfs_model" - - "gefs_ww3_prepost" -build: - gfs_model: - command: "./build_ufs.sh -e gfs_model.x" - cores: 8 - walltime: "00:45:00" - - gfs_ww3prepost: - command: "./build_ww3prepost.sh" - cores: 4 - walltime: "00:30:00" - - gefs_model: - command: "./build_ufs.sh -w -e gefs_model.x" - cores: 8 - walltime: "00:45:00" - - gefs_ww3_prepost: - command: "./build_ww3prepost.sh -w" - cores: 4 - walltime: "00:30:00" - - sfs_model: - command: "./build_ufs.sh -y -e sfs_model.x" - cores: 8 - walltime: "00:45:00" - - upp: - command: "./build_upp.sh" - cores: 8 - walltime: "00:15:00" - - gsi_enkf: - command: "./build_gsi_enkf.sh" - cores: 8 - walltime: "00:30:00" - - gsi_monitor: - command: "./build_gsi_monitor.sh" - cores: 4 - walltime: "00:15:00" - - gsi_utils: - command: "./build_gsi_utils.sh" - cores: 6 - walltime: "00:15:00" - - ufs_utils: - command: "./build_ufs_utils.sh" - cores: 8 - walltime: "00:15:00" - - gfs_utils: - command: "./build_gfs_utils.sh" - cores: 6 - walltime: "00:15:00" - - gdas: - command: "./build_gdas.sh" - cores: 8 - walltime: "02:30:00" diff --git a/workflow/hosts/gaeac6.yaml b/workflow/hosts/gaeac6.yaml index 5a1dc11af4e..507be5ada25 100644 --- a/workflow/hosts/gaeac6.yaml +++ b/workflow/hosts/gaeac6.yaml @@ -15,11 +15,13 @@ SCHEDULER: slurm QUEUE: normal QUEUE_SERVICE: normal QUEUE_DTN: 'hpss' +PARTITION_BUILD: eslogin_c6 PARTITION_BATCH: batch PARTITION_SERVICE: batch PARTITION_DTN: 'dtn_f5_f6' CLUSTERS: 'c6' CLUSTERS_DTN: 'es' +CLUSTERS_BUILD: 
'es' CONSTRAINT_DTN: 'f6' RESERVATION: '' PARTITION_CRON: 'cron_c6' From 53bedf92a77fde325f50435023fbcd5dd45c497b Mon Sep 17 00:00:00 2001 From: David Huber Date: Wed, 9 Apr 2025 13:41:19 -0400 Subject: [PATCH 06/15] Remove unused partition_batch and clusters from build_opts.yaml --- workflow/build_opts.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/workflow/build_opts.yaml b/workflow/build_opts.yaml index db08a4fc2bd..5747f871eb6 100644 --- a/workflow/build_opts.yaml +++ b/workflow/build_opts.yaml @@ -1,8 +1,6 @@ host_override: # over-ride options for host # Gaea must use the login nodes for builds GAEAC6: - PARTITION_BATCH: 'eslogin_c6' - CLUSTERS: 'es' max_cores: 8 walltime_ratio: 1.5 # Uniformly adjust the wallclock by this factor (builds are slower on head nodes) systems: From bda152235e6f9090305d2f0d1e1158b5246dac53 Mon Sep 17 00:00:00 2001 From: David Huber Date: Wed, 9 Apr 2025 13:42:43 -0400 Subject: [PATCH 07/15] Remove unused gaeac6_build.yaml --- workflow/hosts.py | 3 +-- workflow/hosts/gaeac6_build.yaml | 6 ------ 2 files changed, 1 insertion(+), 8 deletions(-) delete mode 100644 workflow/hosts/gaeac6_build.yaml diff --git a/workflow/hosts.py b/workflow/hosts.py index de0e09f69d5..834027beedc 100644 --- a/workflow/hosts.py +++ b/workflow/hosts.py @@ -17,8 +17,7 @@ class Host: SUPPORTED_HOSTS = ['HERA', 'ORION', 'JET', 'HERCULES', 'WCOSS2', 'S4', 'CONTAINER', 'GAEAC5', - 'GAEAC6', 'AWSPW', 'AZUREPW', 'GOOGLEPW', - 'GAEAC6_BUILD'] + 'GAEAC6', 'AWSPW', 'AZUREPW', 'GOOGLEPW'] def __init__(self, host=None): diff --git a/workflow/hosts/gaeac6_build.yaml b/workflow/hosts/gaeac6_build.yaml deleted file mode 100644 index 7ebf1434012..00000000000 --- a/workflow/hosts/gaeac6_build.yaml +++ /dev/null @@ -1,6 +0,0 @@ -# BQS properties -SCHEDULER: slurm -QUEUE: normal -CLUSTERS: 'es' -PARTITION_BATCH: 'eslogin_c6' -USE_SCRONTAB: 'YES' From a526d6ea212236e3d7af0c311aacebb387f37970 Mon Sep 17 00:00:00 2001 From: David Huber Date: Wed, 9 Apr 2025 14:25:24 
-0400 Subject: [PATCH 08/15] Fix time calculation --- workflow/build_compute.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/build_compute.py b/workflow/build_compute.py index 7bd7b6958f5..d6c09bdfb43 100755 --- a/workflow/build_compute.py +++ b/workflow/build_compute.py @@ -106,7 +106,7 @@ def get_build_specs(build_specs: Dict, host_spec: Dict) -> Dict: if build_specs.build[key].cores > override.max_cores: # Adjust the walltime based on the ratio of max_cores/build.walltime in_walltime = to_timedelta(build_specs.build[key].walltime) - override_walltime = in_walltime * (override.max_cores / build_specs.build[key].cores) + override_walltime = in_walltime * (build_specs.build[key].cores / override.max_cores) build_specs.build[key].cores = override.max_cores build_specs.build[key].walltime = timedelta_to_HMS(override_walltime) From df9038191ed0e268591fec11abac88bb54275c0b Mon Sep 17 00:00:00 2001 From: David Huber <69919478+DavidHuber-NOAA@users.noreply.github.com> Date: Wed, 9 Apr 2025 14:55:45 -0400 Subject: [PATCH 09/15] Construct build_specs in two steps Co-authored-by: Rahul Mahajan --- workflow/build_compute.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/workflow/build_compute.py b/workflow/build_compute.py index d6c09bdfb43..f0bbc4497b1 100755 --- a/workflow/build_compute.py +++ b/workflow/build_compute.py @@ -171,7 +171,8 @@ def main(*argv): host_specs.account = user_inputs.account # Retrieve build specificatiosn from user provided yaml - build_specs = get_build_specs(AttrDict(parse_yaml(user_inputs.yaml)), host_specs) + user_yaml_dict = AttrDict(parse_yaml(user_inputs.yaml)) + build_specs = get_build_specs(user_yaml_dict, host_specs) systems = user_inputs.systems.split() if "all" not in user_inputs.systems else ["all"] From 09b0408a4730ce3dd9bd79206a5d37760d35bee4 Mon Sep 17 00:00:00 2001 From: David Huber Date: Wed, 9 Apr 2025 15:06:23 -0400 Subject: [PATCH 10/15] Simplify if override checking --- 
workflow/build_compute.py | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/workflow/build_compute.py b/workflow/build_compute.py index f0bbc4497b1..982fe51eb13 100755 --- a/workflow/build_compute.py +++ b/workflow/build_compute.py @@ -99,20 +99,21 @@ def get_build_specs(build_specs: Dict, host_spec: Dict) -> Dict: """ # Get host overrides, if present - if build_specs.get("host_override") is not None: - if build_specs.host_override.get(host_spec.machine, None) is not None: - override = build_specs.host_override[host_spec.machine] - for key in build_specs.build: - if build_specs.build[key].cores > override.max_cores: - # Adjust the walltime based on the ratio of max_cores/build.walltime - in_walltime = to_timedelta(build_specs.build[key].walltime) - override_walltime = in_walltime * (build_specs.build[key].cores / override.max_cores) - build_specs.build[key].cores = override.max_cores - build_specs.build[key].walltime = timedelta_to_HMS(override_walltime) - - # Adjust build walltime by the walltime_ratio - build_specs.build[key].walltime = timedelta_to_HMS( - to_timedelta(build_specs.build[key].walltime) * override.walltime_ratio) + if build_specs.get("host_override", None) is None or build_specs.host_override.get(host_spec.machine, None) is None: + return build_specs + + override = build_specs.host_override[host_spec.machine] + for key in build_specs.build: + if build_specs.build[key].cores > override.max_cores: + # Adjust the walltime based on the ratio of max_cores/build.walltime + in_walltime = to_timedelta(build_specs.build[key].walltime) + override_walltime = in_walltime * (build_specs.build[key].cores / override.max_cores) + build_specs.build[key].cores = override.max_cores + build_specs.build[key].walltime = timedelta_to_HMS(override_walltime) + + # Adjust build walltime by the walltime_ratio + build_specs.build[key].walltime = timedelta_to_HMS( + to_timedelta(build_specs.build[key].walltime) * 
override.walltime_ratio) return build_specs From 9155e0271f110bf96a81845a97ea3325d48da709 Mon Sep 17 00:00:00 2001 From: David Huber Date: Wed, 9 Apr 2025 15:07:45 -0400 Subject: [PATCH 11/15] Add comments --- workflow/build_compute.py | 1 + 1 file changed, 1 insertion(+) diff --git a/workflow/build_compute.py b/workflow/build_compute.py index 982fe51eb13..28d0ffae53b 100755 --- a/workflow/build_compute.py +++ b/workflow/build_compute.py @@ -100,6 +100,7 @@ def get_build_specs(build_specs: Dict, host_spec: Dict) -> Dict: # Get host overrides, if present if build_specs.get("host_override", None) is None or build_specs.host_override.get(host_spec.machine, None) is None: + # Nothing to override, return with original build_specs return build_specs override = build_specs.host_override[host_spec.machine] From 4f7231dacd03e72704ebd86c262deed9cbcfd757 Mon Sep 17 00:00:00 2001 From: David Huber Date: Wed, 9 Apr 2025 15:48:25 -0400 Subject: [PATCH 12/15] Create build-specific changes for gaea --- workflow/build_compute.py | 28 ++++++++++++++++++---------- workflow/build_opts.yaml | 6 +++++- 2 files changed, 23 insertions(+), 11 deletions(-) diff --git a/workflow/build_compute.py b/workflow/build_compute.py index 28d0ffae53b..089cb740121 100755 --- a/workflow/build_compute.py +++ b/workflow/build_compute.py @@ -104,17 +104,25 @@ def get_build_specs(build_specs: Dict, host_spec: Dict) -> Dict: return build_specs override = build_specs.host_override[host_spec.machine] + override_build = override.get("build", {}) for key in build_specs.build: - if build_specs.build[key].cores > override.max_cores: - # Adjust the walltime based on the ratio of max_cores/build.walltime - in_walltime = to_timedelta(build_specs.build[key].walltime) - override_walltime = in_walltime * (build_specs.build[key].cores / override.max_cores) - build_specs.build[key].cores = override.max_cores - build_specs.build[key].walltime = timedelta_to_HMS(override_walltime) - - # Adjust build walltime by the 
walltime_ratio - build_specs.build[key].walltime = timedelta_to_HMS( - to_timedelta(build_specs.build[key].walltime) * override.walltime_ratio) + # Override the specific build specs if the key and spec is present in the host override + if key in override_build: + for spec in override_build[key]: + build_specs.build[key][spec] = override_build[key][spec] + + # Otherwise, take the blanket exceptions and apply to the job + else: + if build_specs.build[key].cores > override.max_cores: + # Adjust the walltime based on the ratio of build.cores/max_cores + in_walltime = to_timedelta(build_specs.build[key].walltime) + override_walltime = in_walltime * (build_specs.build[key].cores / override.max_cores) + build_specs.build[key].cores = override.max_cores + build_specs.build[key].walltime = timedelta_to_HMS(override_walltime) + + # Adjust build walltime by the walltime_ratio + build_specs.build[key].walltime = timedelta_to_HMS( + to_timedelta(build_specs.build[key].walltime) * override.walltime_ratio) return build_specs diff --git a/workflow/build_opts.yaml b/workflow/build_opts.yaml index 5747f871eb6..c44cb3b5d37 100644 --- a/workflow/build_opts.yaml +++ b/workflow/build_opts.yaml @@ -3,6 +3,11 @@ host_override: # over-ride options for host GAEAC6: max_cores: 8 walltime_ratio: 1.5 # Uniformly adjust the wallclock by this factor (builds are slower on head nodes) + build: + gdas: + memory: "30G" + cores: 8 + walltime: "02:30:00" systems: common: - "ufs_utils" @@ -85,4 +90,3 @@ build: command: "./build_gdas.sh" cores: 40 walltime: "01:30:00" - memory: "30G" diff --git a/ci/scripts/utils/ci_utils.sh b/ci/scripts/utils/ci_utils.sh index b808d2f6d29..5122ec68fc0 100755 --- a/ci/scripts/utils/ci_utils.sh +++ 
b/ci/scripts/utils/ci_utils.sh @@ -190,11 +190,6 @@ function cleanup_experiment() { function build_compute () { source "${HOMEgfs}/ci/platforms/config.${MACHINE_ID}" - # TODO: when it's safe to build on C6 compute nodes again, do so - if [[ "${MACHINE_ID}" == "gaeac6" ]]; then - "${HOMEgfs}/sorc/build_all.sh" -v -k all - else - "${HOMEgfs}/sorc/build_compute.sh" -A "${HPC_ACCOUNT}" -v all - fi + "${HOMEgfs}/sorc/build_compute.sh" -A "${HPC_ACCOUNT}" -v all } From 5f2daeb7be93e886fa467969b6a0ec0ac37f6540 Mon Sep 17 00:00:00 2001 From: David Huber Date: Wed, 9 Apr 2025 20:23:36 +0000 Subject: [PATCH 14/15] bogus change --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 3311f540a5c..feaeecd09a3 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,7 @@ ![Custom badge](https://img.shields.io/endpoint?url=https://gist.githubusercontent.com/emcbot/e35aa2904a54deae6bbb1fdc2d960c71/raw/hercules.json) ![Custom badge](https://img.shields.io/endpoint?url=https://gist.githubusercontent.com/emcbot/e35aa2904a54deae6bbb1fdc2d960c71/raw/wcoss2.json) + The Global Workflow supporting the Global Forecast System (GFS), the Global Ensemble Forecasting System (GEFS), and the Seasonal Forecast System (SFS) with the [UFS-weather-model](https://github.com/ufs-community/ufs-weather-model). Data assimilation, currently only available for the GFS, is provides by both the [GSI](https://github.com/NOAA-EMC/GSI)- and [GDASApp (JEDI)](https://github.com/NOAA-EMC/GDASApp)-based Data Assimilation systems. In progress [documentation](https://global-workflow.readthedocs.io/en/latest/) is available. 
From e0b061281c1cd302a9a2e5b8283d16c9de737b48 Mon Sep 17 00:00:00 2001 From: David Huber Date: Wed, 9 Apr 2025 20:23:53 +0000 Subject: [PATCH 15/15] Revert bogus change --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index feaeecd09a3..3311f540a5c 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,6 @@ ![Custom badge](https://img.shields.io/endpoint?url=https://gist.githubusercontent.com/emcbot/e35aa2904a54deae6bbb1fdc2d960c71/raw/hercules.json) ![Custom badge](https://img.shields.io/endpoint?url=https://gist.githubusercontent.com/emcbot/e35aa2904a54deae6bbb1fdc2d960c71/raw/wcoss2.json) - The Global Workflow supporting the Global Forecast System (GFS), the Global Ensemble Forecasting System (GEFS), and the Seasonal Forecast System (SFS) with the [UFS-weather-model](https://github.com/ufs-community/ufs-weather-model). Data assimilation, currently only available for the GFS, is provides by both the [GSI](https://github.com/NOAA-EMC/GSI)- and [GDASApp (JEDI)](https://github.com/NOAA-EMC/GDASApp)-based Data Assimilation systems. In progress [documentation](https://global-workflow.readthedocs.io/en/latest/) is available.