Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/CODEOWNERS
Original file line number Diff line number Diff line change
Expand Up @@ -215,3 +215,4 @@ ush/python/pygfs/utils/marine_da_utils.py @guillaumevernieres @AndrewEichmann-NO
# Specific workflow scripts
workflow/generate_workflows.sh @DavidHuber-NOAA
workflow/build_compute.py @DavidHuber-NOAA @aerorahul
workflow/build_*opts.yaml @DavidHuber-NOAA @aerorahul
7 changes: 1 addition & 6 deletions ci/scripts/utils/ci_utils.sh
Original file line number Diff line number Diff line change
Expand Up @@ -190,11 +190,6 @@ function cleanup_experiment() {
function build_compute () {

source "${HOMEgfs}/ci/platforms/config.${MACHINE_ID}"
# TODO: when it's safe to build on C6 compute nodes again, do so
if [[ "${MACHINE_ID}" == "gaeac6" ]]; then
"${HOMEgfs}/sorc/build_all.sh" -v -k all
else
"${HOMEgfs}/sorc/build_compute.sh" -A "${HPC_ACCOUNT}" -v all
fi
"${HOMEgfs}/sorc/build_compute.sh" -A "${HPC_ACCOUNT}" -v all

}
3 changes: 2 additions & 1 deletion sorc/build_compute.sh
Original file line number Diff line number Diff line change
Expand Up @@ -77,10 +77,11 @@ rm -f "${build_xml}" "${build_db}" "${build_lock_db}"
echo "Sourcing global-workflow modules ..."
source "${HOMEgfs}/workflow/gw_setup.sh"

yaml="${HOMEgfs}/workflow/build_opts.yaml"
echo "Generating build.xml for building global-workflow programs on compute nodes ..."
# Catch errors manually from here out
set +e
"${HOMEgfs}/workflow/build_compute.py" --account "${HPC_ACCOUNT}" --yaml "${HOMEgfs}/workflow/build_opts.yaml" --systems "${systems}"
"${HOMEgfs}/workflow/build_compute.py" --account "${HPC_ACCOUNT}" --yaml "${yaml}" --systems "${systems}"
rc=$?
if [[ "${rc}" -ne 0 ]]; then
msg="FATAL ERROR: ${BASH_SOURCE[0]} failed to create 'build.xml' with error code ${rc}"
Expand Down
61 changes: 56 additions & 5 deletions workflow/build_compute.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
from typing import Dict

from wxflow import parse_yaml, AttrDict
from wxflow import parse_yaml, AttrDict, to_timedelta, timedelta_to_HMS

from hosts import Host
import rocoto.rocoto as rocoto
Expand Down Expand Up @@ -73,7 +73,7 @@ def get_task_spec(task_name: str, task_spec: Dict, host_spec: Dict) -> Dict:
task_dict.resources.partition = host_spec.partition
task_dict.resources.walltime = task_spec.walltime
task_dict.resources.native = host_spec.native
task_dict.resources.memory = None
task_dict.resources.memory = task_spec.get("memory", None)
task_dict.resources.nodes = 1
task_dict.resources.ntasks = task_spec.cores
task_dict.resources.ppn = task_spec.cores
Expand All @@ -82,6 +82,51 @@ def get_task_spec(task_name: str, task_spec: Dict, host_spec: Dict) -> Dict:
return task_dict


def get_build_specs(build_specs: Dict, host_spec: Dict) -> Dict:
    """Apply host-specific overrides to the build specifications.

    Parameters
    ----------
    build_specs : Dict
        Build specifications. The ``build`` entry maps each build name to its
        spec (``cores``, ``walltime``, ...). An optional ``host_override``
        entry maps machine names to the overrides for that machine.
    host_spec : Dict
        The specification of the host. Only ``machine`` is read here.

    Returns
    -------
    build_specs : Dict
        The same object that was passed in, with the overrides for
        ``host_spec["machine"]`` applied in place. It is returned untouched
        when there is no ``host_override`` entry for this machine.

    Notes
    -----
    A machine override may contain:

    * ``build``: per-build settings that replace the matching keys of the
      named build verbatim. No blanket adjustment is applied to such builds.
    * ``max_cores`` (optional): builds asking for more cores are capped at
      this value and their walltime is stretched by ``cores / max_cores``.
    * ``walltime_ratio`` (optional): factor applied to the walltime of every
      build that has no explicit ``build`` entry.
    """

    # Get host overrides, if present
    host_overrides = build_specs.get("host_override", None)
    if host_overrides is None:
        # Nothing to override, return with original build_specs
        return build_specs

    override = host_overrides.get(host_spec["machine"], None)
    if override is None:
        # No overrides registered for this machine
        return build_specs

    # ``or {}`` also covers a ``build:`` key that is present but empty (None)
    override_build = override.get("build", None) or {}

    # Both blanket settings are optional; a missing one skips its adjustment
    max_cores = override.get("max_cores", None)
    walltime_ratio = override.get("walltime_ratio", None)

    for key in build_specs["build"]:
        spec = build_specs["build"][key]

        # Explicit per-build override: copy each provided setting verbatim
        if key in override_build:
            for name in override_build[key]:
                spec[name] = override_build[key][name]
            continue

        # Otherwise, take the blanket exceptions and apply them to the job
        if max_cores is not None and spec["cores"] > max_cores:
            # Fewer cores means a longer build: scale the walltime by the
            # ratio of requested cores to max_cores before capping the cores
            in_walltime = to_timedelta(spec["walltime"])
            override_walltime = in_walltime * (spec["cores"] / max_cores)
            spec["cores"] = max_cores
            spec["walltime"] = timedelta_to_HMS(override_walltime)

        # Adjust build walltime by the walltime_ratio
        if walltime_ratio is not None:
            spec["walltime"] = timedelta_to_HMS(
                to_timedelta(spec["walltime"]) * walltime_ratio)

    return build_specs


def get_host_specs(host: Dict) -> Dict:
"""Generate host specs for the build.xml file based on Host() info

Expand All @@ -103,20 +148,25 @@ def get_host_specs(host: Dict) -> Dict:
native = '-l place=vscatter'
elif host.info.SCHEDULER in ['slurm']:
native = '--export=NONE'
if host.info.get("PARTITION_BATCH", "") != "":
if host.info.get("PARTITION_BUILD", "") != "":
partition = host.info.PARTITION_BUILD
elif host.info.get("PARTITION_BATCH", "") != "":
partition = host.info.PARTITION_BATCH

if host.info.get("RESERVATION", "") != "":
native += f' --reservation={host.info.RESERVATION}'

if host.info.get("CLUSTERS", "") != "":
if host.info.get("CLUSTERS_BUILD", None) is not None:
native += f' --clusters={host.info.CLUSTERS_BUILD}'
elif host.info.get("CLUSTERS", None) is not None:
native += f' --clusters={host.info.CLUSTERS}'

specs = AttrDict()
specs.scheduler = host.info.SCHEDULER
specs.queue = host.info.QUEUE
specs.partition = partition
specs.native = native
specs.machine = host.machine

return specs

Expand All @@ -131,7 +181,8 @@ def main(*argv):
host_specs.account = user_inputs.account

# Retrieve build specifications from user provided yaml
build_specs = AttrDict(parse_yaml(user_inputs.yaml))
user_yaml_dict = AttrDict(parse_yaml(user_inputs.yaml))
build_specs = get_build_specs(user_yaml_dict, host_specs)

systems = user_inputs.systems.split() if "all" not in user_inputs.systems else ["all"]

Expand Down
10 changes: 10 additions & 0 deletions workflow/build_opts.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,13 @@
host_override: # over-ride options for host
# Gaea must use the login nodes for builds
GAEAC6:
max_cores: 8
walltime_ratio: 1.5 # Uniformly adjust the wallclock by this factor (builds are slower on head nodes)
build:
gdas:
memory: "30G"
cores: 8
walltime: "02:30:00"
systems:
common:
- "ufs_utils"
Expand Down
2 changes: 2 additions & 0 deletions workflow/hosts/gaeac6.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,13 @@ SCHEDULER: slurm
QUEUE: normal
QUEUE_SERVICE: normal
QUEUE_DTN: 'hpss'
PARTITION_BUILD: eslogin_c6
PARTITION_BATCH: batch
PARTITION_SERVICE: batch
PARTITION_DTN: 'dtn_f5_f6'
CLUSTERS: 'c6'
CLUSTERS_DTN: 'es'
CLUSTERS_BUILD: 'es'
CONSTRAINT_DTN: 'f6'
RESERVATION: ''
PARTITION_CRON: 'cron_c6'
Expand Down