Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
56 commits
Select commit Hold shift + click to select a range
f071994
change noaacloud NodeName
weihuang-jedi Feb 25, 2025
11accd8
Merge branch 'develop' of github.com:NOAA-EPIC/global-workflow-cloud …
weihuang-jedi Feb 26, 2025
a4771df
add download fix subset data, also serve as a test PR to trigger CI t…
weihuang-jedi Feb 26, 2025
4f1e679
fix pynorm error
weihuang-jedi Feb 26, 2025
2241227
fix pynorm error 2
weihuang-jedi Feb 26, 2025
9679617
fix pynorm error3
weihuang-jedi Feb 26, 2025
fe88751
fix pynorm error 4
weihuang-jedi Feb 26, 2025
0284fb7
fix pynorm error 5
weihuang-jedi Feb 26, 2025
27ca2f1
fix pynorm error 6
weihuang-jedi Feb 26, 2025
f39bfd9
fix pynorm error 7
weihuang-jedi Feb 27, 2025
8f7fc23
fix pynorm error 8
weihuang-jedi Feb 27, 2025
c248e12
fix pynorm error 9
weihuang-jedi Feb 27, 2025
85120f3
fix pynorm error 10
weihuang-jedi Feb 27, 2025
2a7f1c9
fix a syntax error
weihuang-jedi Feb 27, 2025
9f8ee2d
use just /lutre/jenkins for CI testing work directory
weihuang-jedi Feb 27, 2025
fbeb220
Merge branch 'develop' of github.com:NOAA-EPIC/global-workflow-cloud …
weihuang-jedi Feb 28, 2025
df4179c
using argparse and logging instead of getopt and print
weihuang-jedi Mar 3, 2025
6a168ae
using argparse and logging instead of getopt and print
weihuang-jedi Mar 3, 2025
a23662f
Updating with proper nomenclature.
kbooker79 Mar 3, 2025
3c223cf
Merge pull request #5 from NOAA-EPIC/download-subset-fix-data
kbooker79 Mar 3, 2025
6b60421
consist with Terry's code
weihuang-jedi Mar 3, 2025
35f35f2
add a ls command to make sure code in cloned
weihuang-jedi Mar 3, 2025
9e53aa4
try clone the code directly
weihuang-jedi Mar 3, 2025
98c50d9
use https to clone
weihuang-jedi Mar 3, 2025
b4c507f
Merge branch 'NOAA-EMC:develop' into develop
weihuang-jedi Mar 4, 2025
3089f00
add more debug ls
weihuang-jedi Mar 4, 2025
57b2a51
fix a typo
weihuang-jedi Mar 4, 2025
8fefe2a
remove 2 comments
weihuang-jedi Mar 4, 2025
8f6db95
reset HOMEgfs
weihuang-jedi Mar 4, 2025
1f05a51
compile for gfs only for now
weihuang-jedi Mar 4, 2025
4640b6a
comment gh pr eidt for now
weihuang-jedi Mar 5, 2025
c906547
skip CI on AWS
weihuang-jedi Mar 5, 2025
eac272e
trying to fix runtime bug
weihuang-jedi Mar 5, 2025
35a6cd0
still trying to figure out HOEgfs issue
weihuang-jedi Mar 5, 2025
b55f4a1
add compile gefs
weihuang-jedi Mar 5, 2025
e1724cb
add compile gefs
weihuang-jedi Mar 5, 2025
4264c15
add compile gefs
weihuang-jedi Mar 5, 2025
20c1211
switch back to 'checkout scm instead of git clone'
weihuang-jedi Mar 11, 2025
f8a6a6d
using checkout scm
weihuang-jedi Mar 12, 2025
c952a73
using checkout scm
weihuang-jedi Mar 12, 2025
12b09df
remove memory from resource if on AWS
weihuang-jedi Mar 13, 2025
98570b7
remove memory requirement for AWS
weihuang-jedi Mar 13, 2025
2039132
Merge branch 'NOAA-EMC:develop' into develop
weihuang-jedi Mar 16, 2025
e4fe644
sync
weihuang-jedi Mar 18, 2025
fd63bdb
fix a crontab shell issue, and make sure fcst use compute partition, …
weihuang-jedi Mar 18, 2025
17e9047
fix a pynorms error
weihuang-jedi Mar 18, 2025
b861bda
remove a ls command, and combine compile to one command
weihuang-jedi Mar 20, 2025
a1a10d0
remove for this branch
weihuang-jedi Mar 20, 2025
2dbbbdd
Merge branch 'develop' of github.com:NOAA-EPIC/global-workflow-cloud …
weihuang-jedi Mar 20, 2025
2dc2fe6
Merge branch 'develop' of github.com:NOAA-EPIC/global-workflow-cloud …
weihuang-jedi Mar 24, 2025
0970ecb
SHELL="/bin/bash"
weihuang-jedi Mar 27, 2025
c08311e
Merge branch 'develop' of github.com:NOAA-EPIC/global-workflow-cloud …
weihuang-jedi Mar 27, 2025
408085c
sync with emc repo
weihuang-jedi Mar 28, 2025
3a3f195
Now build_compute.sh needs account name
weihuang-jedi Mar 28, 2025
69ca5d9
Merge branch 'NOAA-EMC:develop' into csp-sync
weihuang-jedi Mar 29, 2025
0b89c11
Merge branch 'NOAA-EMC:develop' into csp-sync
weihuang-jedi Apr 1, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 18 additions & 17 deletions ci/Jenkinsfile4AWS
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@ pipeline {
stages { // This initial stage is used to get the Machine name from the GitHub labels on the PR
// which is used to designate the Nodes in the Jenkins Controller by the agent label
// Each Jenkins Node is connected to said machine via a Java agent over an ssh tunnel
// no op 2

stage('1. Get Machine') {
agent { label 'built-in' }
Expand All @@ -37,39 +36,41 @@ pipeline {
}

def run_nodes = []
def machine_names = []
if (isSpawnedFromAnotherJob) {
echo "machine being set to value passed to this spawned job"
echo "passed machine: ${params.machine}"
machine = params.machine
} else {
echo "This is parent job so getting list of nodes matching labels:"
for (label in pullRequest.labels) {
echo "Checking label: ${label}"
echo "label in pullReqest: ${label}"
if (label.matches("CI-(.*?)-Ready")) {
echo "Found Ready Label: ${label}"
def machine_name = label.split('-')[1].toString().toLowerCase()
print machine_name
jenkins.model.Jenkins.get().computers.each { c ->
print c.node.selfLabel.name
if (c.node.selfLabel.name == NodeName[machine_name]) {
run_nodes.add(c.node.selfLabel.name)
machine_names.add(machine_name) // record machine name alongside node
}
}
}
}
// Spawning all the jobs on the nodes matching the labels
// Spawning jobs using both run_nodes and machine_names arrays
if (run_nodes.size() > 1) {
run_nodes.init().each { node ->
def machine_name = node.split('-')[0].toLowerCase()
for (int i = 0; i < run_nodes.size() - 1; i++) {
def node = run_nodes[i]
def machine_name = machine_names[i] // use the corresponding machine name
echo "Spawning job on node: ${node} with machine name: ${machine_name}"
build job: "/${aws_gw_name}/AWS-EPIC-Global-Workflow-Pipeline/PR-${env.CHANGE_ID}", parameters: [
string(name: 'machine', value: machine_name),
string(name: 'Node', value: node) ],
wait: false
string(name: 'Node', value: node)
], wait: false
}
machine = run_nodes.last().split('-')[0].toLowerCase()
machine = machine_names[run_nodes.size() - 1]
echo "Running parent job: ${machine}"
} else {
machine = run_nodes[0].split('-')[0].toLowerCase()
machine = machine_names[0]
echo "Running only the parent job: ${machine}"
}
}
Expand All @@ -81,10 +82,11 @@ pipeline {
agent { label NodeName[machine].toLowerCase() }
steps {
script {
// Capitalize the first letter of the machine name and use it for labels
Machine = machine[0].toUpperCase() + machine.substring(1)
echo "Getting Common Workspace for ${Machine}"
ws("${custom_workspace[machine]}/${env.CHANGE_ID}") {
properties([parameters([[$class: 'NodeParameterDefinition', allowedSlaves: ['built-in', 'Hercules-EMC', 'Hera-EMC', 'Orion-EMC', 'Gaea', 'Awsepicglobalworkflow'], defaultSlaves: ['built-in'], name: '', nodeEligibility: [$class: 'AllNodeEligibility'], triggerIfResult: 'allCases']])])
properties([parameters([[$class: 'NodeParameterDefinition', allowedSlaves: ['built-in', 'Hercules-EMC', 'Hera-EMC', 'Orion-EMC', 'GaeaC5', 'GaeaC6-EMC', 'Awsepicglobalworkflow'], defaultSlaves: ['built-in'], name: '', nodeEligibility: [$class: 'AllNodeEligibility'], triggerIfResult: 'allCases']])])
GH = sh(script: "which gh || echo '~/bin/gh'", returnStdout: true).trim()
CUSTOM_WORKSPACE = "${custom_workspace[machine]}/${env.CHANGE_ID}/${aws_gw_name}"
HOMEgfs = "${CUSTOM_WORKSPACE}/${aws_gw_name}"
Expand Down Expand Up @@ -132,9 +134,8 @@ pipeline {
def error_logs_message = ""
dir("${HOMEgfs}/sorc") {
try {
sh(script: 'ls ./build_compute.sh') // list files here to make sure all files exist.
sh(script: './build_compute.sh gfs') // build the global-workflow executables
sh(script: './build_compute.sh gefs') // build the global-workflow executables
// sh(script: "${HOMEgfs}/ci/scripts/utils/ci_utils_wrapper.sh build_compute") // build the global-workflow executables
sh(script: './build_compute.sh -A ${USER} gfs gefs sfs') // build the global-workflow executables
} catch (Exception error_build) {
echo "Failed to build global-workflow: ${error_build.getMessage()}"
if ( fileExists("logs/error.logs") ) {
Expand All @@ -154,13 +155,13 @@ pipeline {
try {
sh(script: """
source ${HOMEgfs}/workflow/gw_setup.sh
${HOMEgfs}/ci/scripts/utils/publish_logs.py --file ${error_logs} --repo PR_BUILD_${env.CHANGE_ID}
${HOMEgfs}/ci/scripts/utils/publish_logs.py --file ${error_logs} --gist PR_BUILD_${env.CHANGE_ID} | tail -n 1
""")
gist_url=sh(script: """
source ${HOMEgfs}/workflow/gw_setup.sh
${HOMEgfs}/ci/scripts/utils/publish_logs.py --file ${error_logs} --gist PR_BUILD_${env.CHANGE_ID}
""", returnStdout: true).trim()
sh(script: """${GH} pr comment ${env.CHANGE_ID} --repo ${repo_url} --body "Build **FAILED** on **${Machine}** in Build# ${env.BUILD_NUMBER} with error logs:\n\\`\\`\\`\n${error_logs_message}\\`\\`\\`\n\nFollow link here to view the contents of the above file(s): [(link)](${gist_url})" """)
sh(script: """${GH} pr comment ${env.CHANGE_ID} --repo ${repo_url} --body 'Build **FAILED** on **${Machine}** in Build# ${env.BUILD_NUMBER} with error logs:\n```\n${error_logs_message}```\n\nFollow link here to view the contents of the above file(s): [(link)](${gist_url})' """)
} catch (Exception error_comment) {
echo "Failed to comment on PR: ${error_comment.getMessage()}"
}
Expand Down
54 changes: 7 additions & 47 deletions parm/config/gefs/config.resources.AWSPW
Original file line number Diff line number Diff line change
Expand Up @@ -4,66 +4,26 @@

export is_exclusive="True"
unset memory

# shellcheck disable=SC2312
for mem_var in $(env | grep '^memory_' | cut -d= -f1); do
unset "${mem_var}"
done
unset "memory_${RUN}"

step=$1

case ${step} in
"fcst" | "efcs")
export PARTITION_BATCH="compute"
max_tasks_per_node=48
;;

"arch_vrfy" | "arch_tars")
export PARTITION_BATCH="process"
max_tasks_per_node=24
;;

"prep_emissions")
export PARTITION_BATCH="process"
max_tasks_per_node=24
export ntasks=1
export threads_per_task=1
export tasks_per_node=$(( max_tasks_per_node / threads_per_task ))
;;

"waveinit")
export PARTITION_BATCH="process"
max_tasks_per_node=24
export ntasks=12
export threads_per_task=1
export tasks_per_node=$(( max_tasks_per_node / threads_per_task ))
export NTASKS=${ntasks}
;;

"wavepostpnt")
"fcst" | "efcs" | "wavepostbndpnt" | "wavepostpnt")
export PARTITION_BATCH="compute"
unset PARTITION_SERVICE
max_tasks_per_node=48
export ntasks=240
export threads_per_task=1
export tasks_per_node=$(( max_tasks_per_node / threads_per_task ))
export NTASKS=${ntasks}
;;

"wavepostsbs" | "wavepostbndpnt" | "wavepostbndpntbll")
export PARTITION_BATCH="process"
max_tasks_per_node=24
export ntasks=24
export threads_per_task=1
export tasks_per_node=$(( max_tasks_per_node / threads_per_task ))
export NTASKS=${ntasks}
tasks_per_node=48
;;

*)
export PARTITION_BATCH="process"
unset PARTITION_SERVICE
max_tasks_per_node=24
tasks_per_node=24
;;

esac

export max_tasks_per_node

export tasks_per_node
22 changes: 8 additions & 14 deletions parm/config/gfs/config.resources.AWSPW
Original file line number Diff line number Diff line change
Expand Up @@ -6,30 +6,24 @@ export is_exclusive="True"
unset memory
unset "memory_${RUN}"

step=$1

case ${step} in
"fcst" | "efcs")
"fcst" | "efcs" | "wavepostpnt")
export PARTITION_BATCH="compute"
unset PARTITION_SERVICE
max_tasks_per_node=48
tasks_per_node=48
;;

"arch_vrfy" | "arch_tars")
export PARTITION_BATCH="process"
max_tasks_per_node=24
;;


"atmos_products" | "oceanice_products" | "wavepostsbs" )
export PARTITION_BATCH="process"
max_tasks_per_node=24
;;


*)
export PARTITION_BATCH="process"
unset PARTITION_SERVICE
max_tasks_per_node=24
tasks_per_node=24
;;

esac

export max_tasks_per_node

export tasks_per_node
3 changes: 3 additions & 0 deletions workflow/hosts/awspw.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@ COMINsyn: '' #TODO: This does not yet exist.
SCHEDULER: slurm
QUEUE: batch
PARTITION_BATCH: compute
PARTITION_SERVICE: process
CHGRP_RSTPROD: 'YES'
CHGRP_CMD: 'chgrp rstprod' # TODO: This is not yet supported.
# HPSS properties
HPSS_PROJECT: emc-global #TODO: See `ATARDIR` below.
ARCHCOM_TO: 'local'
Expand Down
4 changes: 4 additions & 0 deletions workflow/rocoto/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -408,6 +408,10 @@ def get_resource(self, task_name):

else: # This is a batch task
task_partition = self.partition_batch
# On CSPs, PARTITION_BATCH is "compute" for fcst/efcs/wavepostbndpnt and
# "process" for the other tasks, so use the task's own value when it differs.
if (task_config['PARTITION_BATCH'] != self.partition_batch):
task_partition = task_config['PARTITION_BATCH']
Comment thread
weihuang-jedi marked this conversation as resolved.
task_queue = self.queue_batch
task_clusters = self.clusters_batch
task_constraint = self.constraint_batch
Expand Down
8 changes: 1 addition & 7 deletions workflow/rocoto/workflow_xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,7 @@ def _write_crontab(self, crontab_file: str = None, cronint: int = 5) -> None:
else:
cron_cmd = rocotorunstr
crontab_strings.extend([
'SHELL="/bin/bash"',
f'MAILTO="{replyto}"'
])

Expand All @@ -218,13 +219,6 @@ def _write_crontab(self, crontab_file: str = None, cronint: int = 5) -> None:
''
])

# AWS needs 'SHELL' and 'BASH_ENV' defined, or the crontab job won't start.
if os.environ.get('PW_CSP', None) in ['aws', 'azure', 'google']:
crontab_strings.extend([
'SHELL="/bin/bash"',
'BASH_ENV="/etc/bashrc"'
])

if crontab_file is None:
crontab_file = f"{self.expdir}/{self.pslot}.crontab"

Expand Down