diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 7cb78c106de..9dc6871f47b 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -215,3 +215,4 @@ ush/python/pygfs/utils/marine_da_utils.py @guillaumevernieres @AndrewEichmann-NO # Specific workflow scripts workflow/generate_workflows.sh @DavidHuber-NOAA workflow/build_compute.py @DavidHuber-NOAA @aerorahul +workflow/build_*opts.yaml @DavidHuber-NOAA @aerorahul diff --git a/ci/scripts/utils/ci_utils.sh b/ci/scripts/utils/ci_utils.sh index b808d2f6d29..5122ec68fc0 100755 --- a/ci/scripts/utils/ci_utils.sh +++ b/ci/scripts/utils/ci_utils.sh @@ -190,11 +190,6 @@ function cleanup_experiment() { function build_compute () { source "${HOMEgfs}/ci/platforms/config.${MACHINE_ID}" - # TODO: when it's safe to build on C6 compute nodes again, do so - if [[ "${MACHINE_ID}" == "gaeac6" ]]; then - "${HOMEgfs}/sorc/build_all.sh" -v -k all - else - "${HOMEgfs}/sorc/build_compute.sh" -A "${HPC_ACCOUNT}" -v all - fi + "${HOMEgfs}/sorc/build_compute.sh" -A "${HPC_ACCOUNT}" -v all } diff --git a/sorc/build_compute.sh b/sorc/build_compute.sh index 0bc21317b4a..0fb24aa1c4f 100755 --- a/sorc/build_compute.sh +++ b/sorc/build_compute.sh @@ -77,10 +77,11 @@ rm -f "${build_xml}" "${build_db}" "${build_lock_db}" echo "Sourcing global-workflow modules ..." source "${HOMEgfs}/workflow/gw_setup.sh" +yaml="${HOMEgfs}/workflow/build_opts.yaml" echo "Generating build.xml for building global-workflow programs on compute nodes ..." # Catch errors manually from here out set +e -"${HOMEgfs}/workflow/build_compute.py" --account "${HPC_ACCOUNT}" --yaml "${HOMEgfs}/workflow/build_opts.yaml" --systems "${systems}" +"${HOMEgfs}/workflow/build_compute.py" --account "${HPC_ACCOUNT}" --yaml "${yaml}" --systems "${systems}" rc=$? 
def get_build_specs(build_specs: Dict, host_spec: Dict) -> Dict:
    """Apply host-specific overrides to the build specifications.

    The input mapping is modified in place and also returned.

    Parameters
    ----------
    build_specs : Dict
        Build specifications.  Must contain a ``build`` mapping of
        build name -> spec (each spec providing ``cores`` and ``walltime``).
        May contain a ``host_override`` mapping keyed by machine name; each
        entry may define any of:

        * ``build``: per-build spec values that replace the defaults verbatim
        * ``max_cores``: cap on cores for builds without an explicit override
        * ``walltime_ratio``: factor applied to the walltime of builds
          without an explicit override
    host_spec : Dict
        The specification of the host.  Only ``machine`` is read here.

    Returns
    -------
    build_specs : Dict
        The same object that was passed in, with any overrides applied.
    """

    # Get host overrides, if present
    host_overrides = build_specs.get("host_override", None)
    if host_overrides is None:
        # Nothing to override, return with original build_specs
        return build_specs

    override = host_overrides.get(host_spec["machine"], None)
    if override is None:
        # No overrides for this particular machine
        return build_specs

    # Every override setting is optional; a missing or null entry is a no-op
    override_build = override.get("build", None) or {}
    max_cores = override.get("max_cores", None)
    walltime_ratio = override.get("walltime_ratio", None)

    for name, spec in build_specs["build"].items():
        # Explicit per-build overrides are taken verbatim; the blanket
        # settings below are not applied on top of them.
        if name in override_build:
            for option, value in override_build[name].items():
                spec[option] = value
            continue

        # Otherwise, apply the blanket host settings to the build
        if max_cores is not None and spec["cores"] > max_cores:
            # Fewer cores means a longer build: scale the walltime by the
            # ratio of requested cores to allowed cores.
            scaled_walltime = to_timedelta(spec["walltime"]) * (spec["cores"] / max_cores)
            spec["cores"] = max_cores
            spec["walltime"] = timedelta_to_HMS(scaled_walltime)

        if walltime_ratio is not None:
            # Uniformly adjust the walltime by the host's walltime_ratio
            spec["walltime"] = timedelta_to_HMS(
                to_timedelta(spec["walltime"]) * walltime_ratio)

    return build_specs
None) is not None: + native += f' --clusters={host.info.CLUSTERS_BUILD}' + elif host.info.get("CLUSTERS", None) is not None: native += f' --clusters={host.info.CLUSTERS}' specs = AttrDict() @@ -117,6 +166,7 @@ def get_host_specs(host: Dict) -> Dict: specs.queue = host.info.QUEUE specs.partition = partition specs.native = native + specs.machine = host.machine return specs @@ -131,7 +181,8 @@ def main(*argv): host_specs.account = user_inputs.account # Retrieve build specificatiosn from user provided yaml - build_specs = AttrDict(parse_yaml(user_inputs.yaml)) + user_yaml_dict = AttrDict(parse_yaml(user_inputs.yaml)) + build_specs = get_build_specs(user_yaml_dict, host_specs) systems = user_inputs.systems.split() if "all" not in user_inputs.systems else ["all"] diff --git a/workflow/build_opts.yaml b/workflow/build_opts.yaml index 736789ba591..c44cb3b5d37 100644 --- a/workflow/build_opts.yaml +++ b/workflow/build_opts.yaml @@ -1,3 +1,13 @@ +host_override: # over-ride options for host + # Gaea must use the login nodes for builds + GAEAC6: + max_cores: 8 + walltime_ratio: 1.5 # Uniformly adjust the wallclock by this factor (builds are slower on head nodes) + build: + gdas: + memory: "30G" + cores: 8 + walltime: "02:30:00" systems: common: - "ufs_utils" diff --git a/workflow/hosts/gaeac6.yaml b/workflow/hosts/gaeac6.yaml index 5a1dc11af4e..507be5ada25 100644 --- a/workflow/hosts/gaeac6.yaml +++ b/workflow/hosts/gaeac6.yaml @@ -15,11 +15,13 @@ SCHEDULER: slurm QUEUE: normal QUEUE_SERVICE: normal QUEUE_DTN: 'hpss' +PARTITION_BUILD: eslogin_c6 PARTITION_BATCH: batch PARTITION_SERVICE: batch PARTITION_DTN: 'dtn_f5_f6' CLUSTERS: 'c6' CLUSTERS_DTN: 'es' +CLUSTERS_BUILD: 'es' CONSTRAINT_DTN: 'f6' RESERVATION: '' PARTITION_CRON: 'cron_c6'