diff --git a/ci/Jenkinsfile4AWS b/ci/Jenkinsfile4AWS index d17fce36e1b..899558087cb 100644 --- a/ci/Jenkinsfile4AWS +++ b/ci/Jenkinsfile4AWS @@ -24,7 +24,6 @@ pipeline { stages { // This initial stage is used to get the Machine name from the GitHub labels on the PR // which is used to designate the Nodes in the Jenkins Controller by the agent label // Each Jenkins Node is connected to said machine via an JAVA agent via an ssh tunnel - // no op 2 stage('1. Get Machine') { agent { label 'built-in' } @@ -37,6 +36,7 @@ pipeline { } def run_nodes = [] + def machine_names = [] if (isSpawnedFromAnotherJob) { echo "machine being set to value passed to this spawned job" echo "passed machine: ${params.machine}" @@ -44,32 +44,33 @@ pipeline { } else { echo "This is parent job so getting list of nodes matching labels:" for (label in pullRequest.labels) { - echo "Checking label: ${label}" + echo "label in pullReqest: ${label}" if (label.matches("CI-(.*?)-Ready")) { + echo "Found Ready Label: ${label}" def machine_name = label.split('-')[1].toString().toLowerCase() - print machine_name jenkins.model.Jenkins.get().computers.each { c -> - print c.node.selfLabel.name if (c.node.selfLabel.name == NodeName[machine_name]) { run_nodes.add(c.node.selfLabel.name) + machine_names.add(machine_name) // record machine name alongside node } } } } - // Spawning all the jobs on the nodes matching the labels + // Spawning jobs using both run_nodes and machine_names arrays if (run_nodes.size() > 1) { - run_nodes.init().each { node -> - def machine_name = node.split('-')[0].toLowerCase() + for (int i = 0; i < run_nodes.size() - 1; i++) { + def node = run_nodes[i] + def machine_name = machine_names[i] // use the corresponding machine name echo "Spawning job on node: ${node} with machine name: ${machine_name}" build job: "/${aws_gw_name}/AWS-EPIC-Global-Workflow-Pipeline/PR-${env.CHANGE_ID}", parameters: [ string(name: 'machine', value: machine_name), - string(name: 'Node', value: node) ], - wait: 
false + string(name: 'Node', value: node) + ], wait: false } - machine = run_nodes.last().split('-')[0].toLowerCase() + machine = machine_names[run_nodes.size() - 1] echo "Running parent job: ${machine}" } else { - machine = run_nodes[0].split('-')[0].toLowerCase() + machine = machine_names[0] echo "Running only the parent job: ${machine}" } } @@ -81,10 +82,11 @@ pipeline { agent { label NodeName[machine].toLowerCase() } steps { script { + // Capitalize the first letter of the machine name and use it for labels Machine = machine[0].toUpperCase() + machine.substring(1) echo "Getting Common Workspace for ${Machine}" ws("${custom_workspace[machine]}/${env.CHANGE_ID}") { - properties([parameters([[$class: 'NodeParameterDefinition', allowedSlaves: ['built-in', 'Hercules-EMC', 'Hera-EMC', 'Orion-EMC', 'Gaea', 'Awsepicglobalworkflow'], defaultSlaves: ['built-in'], name: '', nodeEligibility: [$class: 'AllNodeEligibility'], triggerIfResult: 'allCases']])]) + properties([parameters([[$class: 'NodeParameterDefinition', allowedSlaves: ['built-in', 'Hercules-EMC', 'Hera-EMC', 'Orion-EMC', 'GaeaC5', 'GaeaC6-EMC', 'Awsepicglobalworkflow'], defaultSlaves: ['built-in'], name: '', nodeEligibility: [$class: 'AllNodeEligibility'], triggerIfResult: 'allCases']])]) GH = sh(script: "which gh || echo '~/bin/gh'", returnStdout: true).trim() CUSTOM_WORKSPACE = "${custom_workspace[machine]}/${env.CHANGE_ID}/${aws_gw_name}" HOMEgfs = "${CUSTOM_WORKSPACE}/${aws_gw_name}" @@ -132,9 +134,8 @@ pipeline { def error_logs_message = "" dir("${HOMEgfs}/sorc") { try { - sh(script: 'ls ./build_compute.sh') // list files here to make sure all files exist. 
- sh(script: './build_compute.sh gfs') // build the global-workflow executables - sh(script: './build_compute.sh gefs') // build the global-workflow executables + // sh(script: "${HOMEgfs}/ci/scripts/utils/ci_utils_wrapper.sh build_compute") // build the global-workflow executables + sh(script: './build_compute.sh -A ${USER} gfs gefs sfs') // build the global-workflow executables } catch (Exception error_build) { echo "Failed to build global-workflow: ${error_build.getMessage()}" if ( fileExists("logs/error.logs") ) { @@ -154,13 +155,13 @@ pipeline { try { sh(script: """ source ${HOMEgfs}/workflow/gw_setup.sh - ${HOMEgfs}/ci/scripts/utils/publish_logs.py --file ${error_logs} --repo PR_BUILD_${env.CHANGE_ID} + ${HOMEgfs}/ci/scripts/utils/publish_logs.py --file ${error_logs} --gist PR_BUILD_${env.CHANGE_ID} | tail -n 1 """) gist_url=sh(script: """ source ${HOMEgfs}/workflow/gw_setup.sh ${HOMEgfs}/ci/scripts/utils/publish_logs.py --file ${error_logs} --gist PR_BUILD_${env.CHANGE_ID} """, returnStdout: true).trim() - sh(script: """${GH} pr comment ${env.CHANGE_ID} --repo ${repo_url} --body "Build **FAILED** on **${Machine}** in Build# ${env.BUILD_NUMBER} with error logs:\n\\`\\`\\`\n${error_logs_message}\\`\\`\\`\n\nFollow link here to view the contents of the above file(s): [(link)](${gist_url})" """) + sh(script: """${GH} pr comment ${env.CHANGE_ID} --repo ${repo_url} --body 'Build **FAILED** on **${Machine}** in Build# ${env.BUILD_NUMBER} with error logs:\n```\n${error_logs_message}```\n\nFollow link here to view the contents of the above file(s): [(link)](${gist_url})' """) } catch (Exception error_comment) { echo "Failed to comment on PR: ${error_comment.getMessage()}" } diff --git a/parm/config/gefs/config.resources.AWSPW b/parm/config/gefs/config.resources.AWSPW index 43cfcf56cce..071c102e3af 100644 --- a/parm/config/gefs/config.resources.AWSPW +++ b/parm/config/gefs/config.resources.AWSPW @@ -4,66 +4,26 @@ export is_exclusive="True" unset memory - -# shellcheck 
disable=SC2312 -for mem_var in $(env | grep '^memory_' | cut -d= -f1); do - unset "${mem_var}" -done +unset "memory_${RUN}" step=$1 case ${step} in - "fcst" | "efcs") - export PARTITION_BATCH="compute" - max_tasks_per_node=48 - ;; - - "arch_vrfy" | "arch_tars") - export PARTITION_BATCH="process" - max_tasks_per_node=24 - ;; - - "prep_emissions") - export PARTITION_BATCH="process" - max_tasks_per_node=24 - export ntasks=1 - export threads_per_task=1 - export tasks_per_node=$(( max_tasks_per_node / threads_per_task )) - ;; - - "waveinit") - export PARTITION_BATCH="process" - max_tasks_per_node=24 - export ntasks=12 - export threads_per_task=1 - export tasks_per_node=$(( max_tasks_per_node / threads_per_task )) - export NTASKS=${ntasks} - ;; - - "wavepostpnt") + "fcst" | "efcs" | "wavepostbndpnt" | "wavepostpnt") export PARTITION_BATCH="compute" + unset PARTITION_SERVICE max_tasks_per_node=48 - export ntasks=240 - export threads_per_task=1 - export tasks_per_node=$(( max_tasks_per_node / threads_per_task )) - export NTASKS=${ntasks} - ;; - - "wavepostsbs" | "wavepostbndpnt" | "wavepostbndpntbll") - export PARTITION_BATCH="process" - max_tasks_per_node=24 - export ntasks=24 - export threads_per_task=1 - export tasks_per_node=$(( max_tasks_per_node / threads_per_task )) - export NTASKS=${ntasks} + tasks_per_node=48 ;; *) export PARTITION_BATCH="process" + unset PARTITION_SERVICE max_tasks_per_node=24 + tasks_per_node=24 ;; esac export max_tasks_per_node - +export tasks_per_node diff --git a/parm/config/gfs/config.resources.AWSPW b/parm/config/gfs/config.resources.AWSPW index 0255a9adad3..17a3924d29c 100644 --- a/parm/config/gfs/config.resources.AWSPW +++ b/parm/config/gfs/config.resources.AWSPW @@ -6,30 +6,24 @@ export is_exclusive="True" unset memory unset "memory_${RUN}" +step=$1 + case ${step} in - "fcst" | "efcs") + "fcst" | "efcs" | "wavepostpnt") export PARTITION_BATCH="compute" + unset PARTITION_SERVICE max_tasks_per_node=48 + tasks_per_node=48 ;; - "arch_vrfy" | 
"arch_tars") - export PARTITION_BATCH="process" - max_tasks_per_node=24 - ;; - - - "atmos_products" | "oceanice_products" | "wavepostsbs" ) - export PARTITION_BATCH="process" - max_tasks_per_node=24 - ;; - - *) export PARTITION_BATCH="process" + unset PARTITION_SERVICE max_tasks_per_node=24 + tasks_per_node=24 ;; esac export max_tasks_per_node - +export tasks_per_node diff --git a/workflow/hosts/awspw.yaml b/workflow/hosts/awspw.yaml index 105a35fd464..b353f002730 100644 --- a/workflow/hosts/awspw.yaml +++ b/workflow/hosts/awspw.yaml @@ -14,6 +14,9 @@ COMINsyn: '' #TODO: This does not yet exist. SCHEDULER: slurm QUEUE: batch PARTITION_BATCH: compute +PARTITION_SERVICE: process +CHGRP_RSTPROD: 'YES' +CHGRP_CMD: 'chgrp rstprod' # TODO: This is not yet supported. # HPSS properties HPSS_PROJECT: emc-global #TODO: See `ATARDIR` below. ARCHCOM_TO: 'local' diff --git a/workflow/rocoto/tasks.py b/workflow/rocoto/tasks.py index d8ef32627fd..02ac5c3b2ed 100644 --- a/workflow/rocoto/tasks.py +++ b/workflow/rocoto/tasks.py @@ -408,6 +408,10 @@ def get_resource(self, task_name): else: # This is a batch task task_partition = self.partition_batch + # on CSPs, partition_batch for fcst/efcs/wavepostbndpnt is "compute", + # others are "process". So need to modify task_partition here. 
+ if (task_config['PARTITION_BATCH'] != self.partition_batch): + task_partition = task_config['PARTITION_BATCH'] task_queue = self.queue_batch task_clusters = self.clusters_batch task_constraint = self.constraint_batch diff --git a/workflow/rocoto/workflow_xml.py b/workflow/rocoto/workflow_xml.py index fa546b0c9e4..18170862859 100644 --- a/workflow/rocoto/workflow_xml.py +++ b/workflow/rocoto/workflow_xml.py @@ -209,6 +209,7 @@ def _write_crontab(self, crontab_file: str = None, cronint: int = 5) -> None: else: cron_cmd = rocotorunstr crontab_strings.extend([ + 'SHELL="/bin/bash"', f'MAILTO="{replyto}"' ]) @@ -218,13 +219,6 @@ def _write_crontab(self, crontab_file: str = None, cronint: int = 5) -> None: '' ]) - # AWS need 'SHELL', and 'BASH_ENV' defined, or, the crontab job won't start. - if os.environ.get('PW_CSP', None) in ['aws', 'azure', 'google']: - crontab_strings.extend([ - 'SHELL="/bin/bash"', - 'BASH_ENV="/etc/bashrc"' - ]) - if crontab_file is None: crontab_file = f"{self.expdir}/{self.pslot}.crontab"