diff --git a/scripts/exregional_make_ics.sh b/scripts/exregional_make_ics.sh index ec0fee623..1ee77918c 100755 --- a/scripts/exregional_make_ics.sh +++ b/scripts/exregional_make_ics.sh @@ -131,6 +131,7 @@ case "$MACHINE" in ;; "LINUX") + ulimit -s unlimited APRUN=$RUN_CMD_UTILS ;; diff --git a/scripts/exregional_make_lbcs.sh b/scripts/exregional_make_lbcs.sh index 1b1f84ff8..01aeaed96 100755 --- a/scripts/exregional_make_lbcs.sh +++ b/scripts/exregional_make_lbcs.sh @@ -131,6 +131,7 @@ case "$MACHINE" in ;; "LINUX") + ulimit -s unlimited APRUN=$RUN_CMD_UTILS ;; diff --git a/scripts/exregional_make_orog.sh b/scripts/exregional_make_orog.sh index e90b3b762..afd40d74d 100755 --- a/scripts/exregional_make_orog.sh +++ b/scripts/exregional_make_orog.sh @@ -153,6 +153,8 @@ case "$MACHINE" in "LINUX") APRUN=time + ulimit -s unlimited + ulimit -a ;; *) diff --git a/scripts/exregional_run_fcst.sh b/scripts/exregional_run_fcst.sh index 4b16f79d9..aae18a749 100755 --- a/scripts/exregional_run_fcst.sh +++ b/scripts/exregional_run_fcst.sh @@ -159,6 +159,8 @@ case "$MACHINE" in ;; "LINUX") + ulimit -s unlimited + ulimit -a APRUN=$RUN_CMD_FCST ;; diff --git a/ush/config_defaults.sh b/ush/config_defaults.sh index dd6a2968c..a12f5119f 100644 --- a/ush/config_defaults.sh +++ b/ush/config_defaults.sh @@ -41,7 +41,12 @@ RUN_ENVIR="nco" # Set machine and queue parameters. Definitions: # # MACHINE: -# Machine on which the workflow will run. +# Machine on which the workflow will run. If you are NOT on a named, +# supported platform, and you want to use the Rocoto workflow manager, +# you will need to set MACHINE="linux" and WORKFLOW_MANAGER="rocoto". This +# combination will assume a Slurm batch manager when generating the XML. +# Please see ush/valid_param_vals.sh for a full list of supported +# platforms. # # ACCOUNT: # The account under which to submit jobs to the queue. @@ -49,7 +54,27 @@ RUN_ENVIR="nco" # WORKFLOW_MANAGER: # The workflow manager to use (e.g. rocoto). 
This is set to "none" by # default, but if the machine name is set to a platform that supports -# rocoto, this will be overwritten and set to "rocoto". +# rocoto, this will be overwritten and set to "rocoto". If set +# explicitly to rocoto along with the use of the MACHINE=linux target, +# the configuration layer assumes a Slurm batch manager when generating +# the XML. Valid options: "rocoto" or "none" +# +# NCORES_PER_NODE: +# The number of cores available per node on the compute platform. Set +# for supported platforms in setup.sh, but is now also configurable for +# all platforms. +# +# LMOD_PATH: +# Path to the LMOD sh file on your Linux system. Is set automatically +# for supported machines. +# +# BUILD_ENV_FN: +# Name of alternative build environment file to use if using an +# unsupported platform. Is set automatically for supported machines. +# +# WFLOW_ENV_FN: +# Name of alternative workflow environment file to use if using an +# unsupported platform. Is set automatically for supported machines. # # SCHED: # The job scheduler to use (e.g. slurm). Set this to an empty string in @@ -109,6 +134,10 @@ RUN_ENVIR="nco" MACHINE="BIG_COMPUTER" ACCOUNT="project_name" WORKFLOW_MANAGER="none" +NCORES_PER_NODE="" +LMOD_PATH="" +BUILD_ENV_FN="" +WFLOW_ENV_FN="" SCHED="" PARTITION_DEFAULT="" QUEUE_DEFAULT="" diff --git a/ush/launch_FV3LAM_wflow.sh b/ush/launch_FV3LAM_wflow.sh index 72cdd74c7..f92e5e076 100755 --- a/ush/launch_FV3LAM_wflow.sh +++ b/ush/launch_FV3LAM_wflow.sh @@ -84,6 +84,7 @@ fi #----------------------------------------------------------------------- # . $exptdir/var_defns.sh +. 
${USHDIR}/source_util_funcs.sh # #----------------------------------------------------------------------- # @@ -101,13 +102,7 @@ expt_name="${EXPT_SUBDIR}" # #----------------------------------------------------------------------- # -if [ "$MACHINE" = "CHEYENNE" ]; then - module use -a /glade/p/ral/jntp/UFS_SRW_app/modules/ - module load rocoto -elif [ "$MACHINE" = "ORION" ]; then - module purge - module load contrib rocoto -elif [ "$MACHINE" = "WCOSS_DELL_P3" ]; then +if [ "$MACHINE" = "WCOSS_DELL_P3" ]; then module purge module load lsf/10.1 module use /gpfs/dell3/usrx/local/dev/emc_rocoto/modulefiles/ @@ -118,8 +113,14 @@ elif [ "$MACHINE" = "WCOSS_CRAY" ]; then module use -a /usrx/local/emc_rocoto/modulefiles module load rocoto/1.3.0rc2 else + machine=$(echo_lowercase $MACHINE) + env_fn=${WFLOW_ENV_FN:-"wflow_${machine}.env"} + env_fp="${SR_WX_APP_TOP_DIR}/env/${env_fn}" module purge - module load rocoto + source "${env_fp}" || print_err_msg_exit "\ + Sourcing platform-specific environment file (env_fp) for + the workflow task failed : + env_fp = \"${env_fp}\"" fi # #----------------------------------------------------------------------- @@ -162,35 +163,7 @@ cd "$exptdir" #----------------------------------------------------------------------- # -#rocotorun_output=$( ls -alF ) -#echo -#echo "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" -#echo "${rocotorun_output}" -#echo "BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB" - -#rocotorun_output=$( \ -#rocotorun -w "${WFLOW_XML_FN}" -d "${rocoto_database_fn}" -v 10 \ -#) -#rocotorun_output=$( (rocotorun -w "${WFLOW_XML_FN}" -d "${rocoto_database_fn}" -v 10) 2>&1 ) # This freezes the script. -#rocotorun_output=$( (rocotorun -w "${WFLOW_XML_FN}" -d "${rocoto_database_fn}" -v 10) 1>&2 ) # This leaves rocotorun_output empty. 
-#rocotorun_output=$( rocotorun -w "${WFLOW_XML_FN}" -d "${rocoto_database_fn}" -v 10 ) -#{ error=$(command 2>&1 1>&$out); } {out}>&1 -#{ rocotorun_output=$( rocotorun -w "${WFLOW_XML_FN}" -d "${rocoto_database_fn}" -v 10 2>&1 1>&$out); } {out}>&1 # This freezes the script. - -# -# Ideally, the following two lines should work, but for some reason the -# output of rocotorun cannot be captured in a variable using the $(...) -# notation. Maybe it's not being written to stdout, although I tried -# redirecting stderr to stdout and other tricks but nothing seemed to -# work. For this reason, below we first redirect the output of rocoto- -# run to a temporary file and then read in the contents of that file in- -# to the rocotorun_output variable using the cat command. -# -#rocotorun_cmd="rocotorun -w \"${WFLOW_XML_FN}\" -d \"${rocoto_database_fn}\" -v 10" -#rocotorun_output=$( eval ${rocotorun_cmd} 2>&1 ) -# tmp_fn="rocotorun_output.txt" -#rocotorun_cmd="rocotorun -w \"${WFLOW_XML_FN}\" -d \"${rocoto_database_fn}\" -v 10 > ${tmp_fn}" rocotorun_cmd="rocotorun -w \"${WFLOW_XML_FN}\" -d \"${rocoto_database_fn}\" -v 10" eval ${rocotorun_cmd} > ${tmp_fn} 2>&1 rocotorun_output=$( cat "${tmp_fn}" ) @@ -217,18 +190,9 @@ done <<< "${rocotorun_output}" # #----------------------------------------------------------------------- # -#rocotostat_cmd="{ pwd; rocotostat -w \"${WFLOW_XML_FN}\" -d \"${rocoto_database_fn}\" -v 10; }" -#rocotostat_cmd="{ pwd; ls -alF; rocotostat -w ${WFLOW_XML_FN} -d ${rocoto_database_fn} -v 10; }" -#rocotostat_cmd="{ pwd; ls -alF; rocotostat -w \"${WFLOW_XML_FN}\" -d \"${rocoto_database_fn}\" -v 10; }" -#rocotostat_cmd="{ pwd; rocotostat -w \"${WFLOW_XML_FN}\" -d \"${rocoto_database_fn}\" -v 10; }" -#rocotostat_cmd="{ rocotostat -w \"${WFLOW_XML_FN}\" -d \"${rocoto_database_fn}\" -v 10; }" rocotostat_cmd="rocotostat -w \"${WFLOW_XML_FN}\" -d \"${rocoto_database_fn}\" -v 10" -#rocotostat_output=$( pwd; rocotostat -w "${WFLOW_XML_FN}" -d 
"${rocoto_database_fn}" -v 10 2>&1 ) -#rocotostat_output=$( rocotostat -w "${WFLOW_XML_FN}" -d "${rocoto_database_fn}" -v 10 2>&1 ) rocotostat_output=$( eval ${rocotostat_cmd} 2>&1 ) -#rocotostat_output=$( ${rocotostat_cmd} 2>&1 ) -#rocotostat_output=$( { pwd; ls -alF; } 2>&1 ) error_msg="DEAD" while read -r line; do grep_output=$( printf "$line" | grep "${error_msg}" ) diff --git a/ush/load_modules_run_task.sh b/ush/load_modules_run_task.sh index 411f0bdb1..1f0d88126 100755 --- a/ush/load_modules_run_task.sh +++ b/ush/load_modules_run_task.sh @@ -71,30 +71,6 @@ fi # ..." and "module load ..." calls later below that are used to load the # appropriate module file for the specified task. # -# Note that the build of the FV3 forecast model code generates the shell -# script at -# -# ${UFS_WTHR_MDL_DIR}/NEMS/src/conf/module-setup.sh -# -# that can be used to initialize the Lmod (Lua-based module) system/ -# software for handling modules. This script: -# -# 1) Detects the shell in which it is being invoked (i.e. the shell of -# the "parent" script in which it is being sourced). -# 2) Detects the machine it is running on and and calls the appropriate -# (shell- and machine-dependent) initalization script to initialize -# Lmod. -# 3) Purges all modules. -# 4) Uses the "module use ..." command to prepend or append paths to -# Lmod's search path (MODULEPATH). -# -# We could use this module-setup.sh script to initialize Lmod, but since -# it is only found in the forecast model's directory tree, here we pre- -# fer to perform our own initialization. Ideally, there should be one -# module-setup.sh script that is used by all external repos/codes, but -# such a script does not exist. If/when it does, we will consider -# switching to it instead of using the case-statement below. 
-# #----------------------------------------------------------------------- # print_info_msg "$VERBOSE" " @@ -128,10 +104,14 @@ case "$MACHINE" in ;; # *) - print_err_msg_exit "\ -The script to source to initialize lmod (module loads) has not yet been -specified for the current machine (MACHINE): - MACHINE = \"$MACHINE\"" + if [[ -n ${LMOD_PATH:-""} && -f ${LMOD_PATH:-""} ]] ; then + . ${LMOD_PATH} + else + print_err_msg_exit "\ + The script to source to initialize lmod (module loads) has not yet been + specified for the current machine (MACHINE): + MACHINE = \"$MACHINE\"" + fi ;; # esac @@ -147,13 +127,15 @@ jjob_fp="$2" # #----------------------------------------------------------------------- # -# Sourcing ufs-srweather-app README file (in directory specified by mod- -# ules_dir) for the specified task +# Sourcing ufs-srweather-app build env file # #----------------------------------------------------------------------- # + +module purge + machine=$(echo_lowercase $MACHINE) -env_fn="build_${machine}_${COMPILER}.env" +env_fn=${BUILD_ENV_FN:-"build_${machine}_${COMPILER}.env"} env_fp="${SR_WX_APP_TOP_DIR}/env/${env_fn}" source "${env_fp}" || print_err_msg_exit "\ Sourcing platform- and compiler-specific environment file (env_fp) for the @@ -172,25 +154,23 @@ workflow task specified by task_name failed: # sets environment variables (including prepending/appending to paths) # and loads modules. # -# The regional_workflow repository contains module files for all the +# The regional_workflow repository contains module files for the # workflow tasks in the template rocoto XML file for the FV3-LAM work- -# flow. The full path to a module file for a given task is +# flow that need modules not loaded in the env_fn above. 
+# +# The full path to a module file for a given task is # -# $HOMErrfs/modulefiles/$machine/${task_name} +# $HOMErrfs/modulefiles/tasks/$machine/${task_name}.local # # where HOMErrfs is the base directory of the workflow, machine is the # name of the machine that we're running on (in lowercase), and task_- -# name is the name of the current task (an input to this script). The -# collection of modulefiles is staged by the generate_workflow.sh -# script. Please see that script for information on their creation. +# name is the name of the current task (an input to this script). # #----------------------------------------------------------------------- # modules_dir="$HOMErrfs/modulefiles/tasks/$machine" modulefile_name="${task_name}" default_modules_dir="$HOMErrfs/modulefiles" -default_modulefile_name="${machine}.default" -use_default_modulefile=0 # #----------------------------------------------------------------------- # @@ -198,46 +178,28 @@ use_default_modulefile=0 # #----------------------------------------------------------------------- # - print_info_msg "$VERBOSE" " + +print_info_msg "$VERBOSE" " Loading modules for task \"${task_name}\" ..." - module use "${modules_dir}" || print_err_msg_exit "\ +module use "${modules_dir}" || print_err_msg_exit "\ Call to \"module use\" command failed." - # - # If NOT using the default modulefile... - # -# if [ ${use_default_modulefile} -eq 0 ]; then -# -# module use -a "${modules_dir}" || print_err_msg_exit "\ -#Call to \"module use\" command failed." 
-# - # - # Load the .local module file if available for the given task - # - modulefile_local="${task_name}.local" - if [ -f ${modules_dir}/${modulefile_local} ]; then - module load "${modulefile_local}" || print_err_msg_exit "\ -Loading .local module file (in directory specified by mod- -ules_dir) for the specified task (task_name) failed: - task_name = \"${task_name}\" - modulefile_local = \"${modulefile_local}\" - modules_dir = \"${modules_dir}\"" - fi - -# else # using default modulefile # -# module load "${default_modulefile_name}" || print_err_msg_exit "\ -#Loading of default module file failed: -# task_name = \"${task_name}\" -# default_modulefile_name = \"${default_modulefile_name}\" -# default_modules_dir = \"${default_modules_dir}\"" +# Load the .local module file if available for the given task # -# fi +modulefile_local="${task_name}.local" +if [ -f ${modules_dir}/${modulefile_local} ]; then + module load "${modulefile_local}" || print_err_msg_exit "\ + Loading .local module file (in directory specified by mod- + ules_dir) for the specified task (task_name) failed: + task_name = \"${task_name}\" + modulefile_local = \"${modulefile_local}\" + modules_dir = \"${modules_dir}\"" +fi - module list +module list -#fi #End if statement for tasks that load no modules # Modules that use conda and need an environment activated will set the # SRW_ENV variable to the name of the environment to be activated. That diff --git a/ush/setup.sh b/ush/setup.sh index 96b1ef071..cd60d5962 100755 --- a/ush/setup.sh +++ b/ush/setup.sh @@ -1,3 +1,4 @@ +#!/bin/bash # #----------------------------------------------------------------------- # @@ -512,7 +513,7 @@ check_var_valid_value "MACHINE" "valid_vals_MACHINE" # several queues. These queues are defined in the default and local # workflow/experiment configuration script. 
# -# Also, set the machine-dependent flag RELAITVE_OR_NULL that specifies +# Also, set the machine-dependent flag RELATIVE_LINK_FLAG that specifies # the flag to pass to the link creation command (ln_vrfy) when attempting # to create relative symlinks. On machines that don't support relative # symlinks, it should be set to a null string. @@ -520,12 +521,11 @@ check_var_valid_value "MACHINE" "valid_vals_MACHINE" #----------------------------------------------------------------------- # RELATIVE_LINK_FLAG="" -NCORES_PER_NODE="2" # Need some arbitrary default value to avoid division by zero errors case $MACHINE in "WCOSS_CRAY") WORKFLOW_MANAGER="rocoto" - NCORES_PER_NODE="24" + NCORES_PER_NODE="${NCORES_PER_NODE:-24}" SCHED="lsfcray" QUEUE_DEFAULT=${QUEUE_DEFAULT:-"dev"} QUEUE_HPSS=${QUEUE_HPSS:-"dev_transfer"} @@ -536,7 +536,7 @@ case $MACHINE in "WCOSS_DELL_P3") WORKFLOW_MANAGER="rocoto" - NCORES_PER_NODE=24 + NCORES_PER_NODE="${NCORES_PER_NODE:-24}" SCHED="lsf" QUEUE_DEFAULT=${QUEUE_DEFAULT:-"dev"} QUEUE_HPSS=${QUEUE_HPSS:-"dev_transfer"} @@ -547,7 +547,7 @@ case $MACHINE in "HERA") WORKFLOW_MANAGER="rocoto" - NCORES_PER_NODE=40 + NCORES_PER_NODE="${NCORES_PER_NODE:-40}" SCHED=${SCHED:-"slurm"} PARTITION_DEFAULT=${PARTITION_DEFAULT:-"hera"} QUEUE_DEFAULT=${QUEUE_DEFAULT:-"batch"} @@ -561,7 +561,7 @@ case $MACHINE in "ORION") WORKFLOW_MANAGER="rocoto" - NCORES_PER_NODE=40 + NCORES_PER_NODE="${NCORES_PER_NODE:-40}" SCHED=${SCHED:-"slurm"} PARTITION_DEFAULT=${PARTITION_DEFAULT:-"orion"} QUEUE_DEFAULT=${QUEUE_DEFAULT:-"batch"} @@ -575,7 +575,7 @@ case $MACHINE in "JET") WORKFLOW_MANAGER="rocoto" - NCORES_PER_NODE=24 + NCORES_PER_NODE="${NCORES_PER_NODE:-24}" SCHED=${SCHED:-"slurm"} PARTITION_DEFAULT=${PARTITION_DEFAULT:-"sjet,vjet,kjet,xjet"} QUEUE_DEFAULT=${QUEUE_DEFAULT:-"batch"} @@ -589,7 +589,7 @@ case $MACHINE in "ODIN") WORKFLOW_MANAGER="rocoto" - NCORES_PER_NODE=24 + NCORES_PER_NODE="${NCORES_PER_NODE:-24}" SCHED=${SCHED:-"slurm"} 
PARTITION_DEFAULT=${PARTITION_DEFAULT:-"workq"} QUEUE_DEFAULT=${QUEUE_DEFAULT:-"workq"} @@ -603,7 +603,7 @@ case $MACHINE in "CHEYENNE") WORKFLOW_MANAGER="rocoto" - NCORES_PER_NODE=36 + NCORES_PER_NODE="${NCORES_PER_NODE:-36}" SCHED=${SCHED:-"pbspro"} QUEUE_DEFAULT=${QUEUE_DEFAULT:-"regular"} QUEUE_HPSS=${QUEUE_HPSS:-"regular"} @@ -614,7 +614,7 @@ case $MACHINE in "STAMPEDE") WORKFLOW_MANAGER="rocoto" - NCORES_PER_NODE=68 + NCORES_PER_NODE="${NCORES_PER_NODE:-68}" SCHED="slurm" PARTITION_DEFAULT=${PARTITION_DEFAULT:-"normal"} QUEUE_DEFAULT=${QUEUE_DEFAULT:-"normal"} @@ -632,11 +632,26 @@ case $MACHINE in ;; "LINUX") - WORKFLOW_MANAGER="none" - SCHED="none" + WORKFLOW_MANAGER=${WORKFLOW_MANAGER:-"none"} + SCHED=${SCHED:-"none"} + ;; + + *) + NCORES_PER_NODE="2" # Need some arbitrary default value to avoid division by zero errors + + print_err_msg_exit "\ + You are running on an unknown platform! MACHINE=${MACHINE} is not a valid + choice." ;; esac + +if [ -z "$NCORES_PER_NODE" ]; then + print_err_msg_exit "\ + NCORES_PER_NODE is a required setting for your platform! Please + set it in config.sh. + MACHINE = ${MACHINE}" +fi # #----------------------------------------------------------------------- # @@ -658,9 +673,8 @@ check_var_valid_value "SCHED" "valid_vals_SCHED" # #----------------------------------------------------------------------- # -# If we are using a workflow manager, run some checks. First, -# verify that the ACCOUNT variable is not empty. Second, ensure that the -# custom RUN_CMD variables are not set. +# If we are using a workflow manager, check that the ACCOUNT variable is +# not empty. 
# #----------------------------------------------------------------------- # @@ -671,9 +685,6 @@ The variable ACCOUNT cannot be empty if you are using a workflow manager: ACCOUNT = \"$ACCOUNT\" WORKFLOW_MANAGER = \"$WORKFLOW_MANAGER\"" fi - RUN_CMD_UTILS="" - RUN_CMD_FCST="" - RUN_CMD_POST="" fi # #----------------------------------------------------------------------- @@ -2942,6 +2953,15 @@ fi # #----------------------------------------------------------------------- # +# Because RUN_CMD_FCST can include PE_MEMBER01 (and theoretically other +# variables calculated in this script), delete any earlier definition of +# it from the var_defns file, and write it again at the end. +# +#----------------------------------------------------------------------- +$SED -i '/^RUN_CMD_FCST=/d' "$GLOBAL_VAR_DEFNS_FP" +# +#----------------------------------------------------------------------- +# # Continue appending variable definitions to the variable definitions # file. # @@ -3040,7 +3060,7 @@ FVCOM_FILE="${FVCOM_FILE}" # NCORES_PER_NODE="${NCORES_PER_NODE}" PE_MEMBER01="${PE_MEMBER01}" -RUN_CMD_FCST="${RUN_CMD_FCST}" +RUN_CMD_FCST="$(eval echo ${RUN_CMD_FCST})" # #----------------------------------------------------------------------- # diff --git a/ush/templates/FV3LAM_wflow.xml b/ush/templates/FV3LAM_wflow.xml index f7e7b9471..cc77b5ad4 100644 --- a/ush/templates/FV3LAM_wflow.xml +++ b/ush/templates/FV3LAM_wflow.xml @@ -414,7 +414,7 @@ MODULES_RUN_TASK_FP script. &RSRV_FCST; &LOAD_MODULES_RUN_TASK_FP; "&RUN_FCST_TN;" "&JOBSDIR;/JREGIONAL_RUN_FCST" - {%- if machine in ["JET", "HERA"] %} + {%- if machine in ["JET", "HERA", "LINUX"] %} {{ ncores_run_fcst }} {{ native_run_fcst }} {%- else %}