diff --git a/.gitignore b/.gitignore index 7524452d89f..7e1d7a90447 100644 --- a/.gitignore +++ b/.gitignore @@ -175,6 +175,7 @@ ush/bufr_snocvr_snomad.py ush/atparse.bash ush/run_bufr2ioda.py ush/bufr2ioda_insitu* +ush/container ush/python/ufsda ush/python/soca ush/python/gen_bufr2ioda_json.py diff --git a/dev/container/gen-run-cases.sh b/dev/container/gen-run-cases.sh new file mode 100755 index 00000000000..99d22c86915 --- /dev/null +++ b/dev/container/gen-run-cases.sh @@ -0,0 +1,86 @@ +#!/bin/bash + +set -x + +HOMEgfs="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." >/dev/null 2>&1 && pwd )" +source "${HOMEgfs}/ush/detect_machine.sh" + +run_with_container="YES" +#run_with_container="NO" + + casetype="pr" +#yamllist="C48_ATM" + yamllist="C48_S2SW" +#yamllist="C48_S2SWA_gefs" +#yamllist="C96mx100_S2S" + +#casetype=hires +#yamllist="C768_S2SW" + +HOMEDIR=${HOMEgfs} +img=ubuntu22.04-intel-ufs-env-v1.9.2.img +if [[ ${MACHINE_ID} = ursa* ]] ; then + container=/scratch3/NCEPDEV/nems/role.epic/containers/${img} + rundir=/scratch3/NAGAPE/epic/${USER}/run + bindings="-B /scratch3 -B /scratch4" + HPC_ACCOUNT=epic + + module load rocoto/1.3.7 + rocotocmd=$(command -v rocotorun) +elif [[ ${MACHINE_ID} = gaea* ]] ; then + container=/gpfs/f6/scratch/Wei.Huang/container/${img} + rundir=/gpfs/f6/scratch/${USER}/run + bindings="-B /gpfs/f6/scratch -B /ncrc/home1/${USER}" + HPC_ACCOUNT=bil-fire8 + + rocotocmd=/autofs/ncrc-svm1_home2/Christopher.W.Harrop/rocoto-1.3.7/bin/rocotorun +elif [[ ${MACHINE_ID} = noaacloud* ]] ; then + TOPICDIR=/bucket/global-workflow-shared-data/ICSDIR + container=/contrib/containers/${img} + rundir=/lustre/${USER}/run + bindings="--env \"I_MPI_FABRICS=shm:ofi,I_MPI_DEBUG=6\" -B /apps/slurm/default/lib/libpmi2.so -B /contrib -B /lustre -B /bucket" + #bindings="-B /apps/slurm/default/lib/libpmi2.so -B /contrib -B /lustre -B /bucket" + HPC_ACCOUNT=${USER} + + module load rocoto/1.3.7 + rocotocmd=$(command -v rocotorun) +fi + +set -x + +mkdir -p "${rundir}" 
+mkdir -p "${HOMEDIR}"/exec +mkdir -p "${HOMEDIR}"/ush/container + +cd "${HOMEDIR}/dev/workflow" || exit 1 + +if [[ "${run_with_container}" == "YES" ]]; then + "${HOMEDIR}/dev/container/utils/gen-wrapper.sh" -H "${HOMEDIR}" -c "${container}" -b "${bindings}" -v + + TOPICDIR=${TOPICDIR} \ + RUNTESTS=${rundir} \ + RUNDIRS=${rundir} \ + ./generate_workflows.sh \ + -H "${HOMEDIR}" \ + -y "${yamllist}" \ + -Y "${HOMEDIR}/dev/ci/cases/${casetype}" \ + -A "${HPC_ACCOUNT}" \ + -e "${USER}@noaa.gov" \ + -r "${rocotocmd}" \ + -v -R + + "${HOMEDIR}/dev/container/utils/create-atmos-products.sh" -H "${HOMEDIR}" -c "${container}" -b "${bindings}" + "${HOMEDIR}/dev/container/utils/create-container-links.sh" -H "${HOMEDIR}" -c "${container}" -b "${bindings}" -M "${MACHINE_ID}" +else + TOPICDIR=${TOPICDIR} \ + RUNTESTS=${rundir} \ + RUNDIRS=${rundir} \ + ./generate_workflows.sh \ + -H "${HOMEDIR}" \ + -y "${yamllist}" \ + -Y "${HOMEDIR}/dev/ci/cases/${casetype}" \ + -A "${HPC_ACCOUNT}" \ + -e "${USER}@noaa.gov" \ + -v +fi + diff --git a/dev/container/shell-in-container.sh b/dev/container/shell-in-container.sh new file mode 100755 index 00000000000..9d2a0a4d1f3 --- /dev/null +++ b/dev/container/shell-in-container.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +set -x + +HOMEgfs="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." 
>/dev/null 2>&1 && pwd )" +source "${HOMEgfs}/ush/detect_machine.sh" +sif=ubuntu22.04-intel-ufs-env-v1.9.2.img + +if [[ ${MACHINE_ID} = ursa* ]] ; then + img=/scratch3/NCEPDEV/nems/role.epic/containers/${sif} + bindings="-B /scratch3 -B /scratch4" +elif [[ ${MACHINE_ID} = gaea* ]] ; then + img=/gpfs/f6/scratch/${USER}/container/${sif} + bindings="-B /gpfs/f6/scratch -B /ncrc/home1/${USER}" +elif [[ ${MACHINE_ID} = noaacloud* ]] ; then + img=/contrib/containers/${sif} + bindings="-B /contrib -B /lustre -B /bucket" +fi + +singularity shell -e ${bindings} "${img}" + diff --git a/dev/container/utils/create-atmos-products.sh b/dev/container/utils/create-atmos-products.sh new file mode 100755 index 00000000000..27d902ed79b --- /dev/null +++ b/dev/container/utils/create-atmos-products.sh @@ -0,0 +1,55 @@ +#!/bin/bash + +verbose=false + +while [[ "$#" -gt 0 ]]; do + case "$1" in + -H|--HOMEgfs) + HOMEgfs="$2" + shift 2 + ;; + -c|--container) + container="$2" + shift 2 + ;; + -b|--bindings) + bindings="$2" + shift 2 + ;; + -v|--verbose) + verbose=true + shift + ;; + *) + echo "Unknown option: $1" + exit 1 + ;; + esac +done + +if [[ ! -v HOMEgfs || ! -v container || ! 
-v bindings ]]; then + echo "Usage: create-atmos-products.sh -H/--HOMEgfs gw-home-dir -c/--container container-fullpath -b/--bindings list-of-binding-dirs [-v]" + exit 11 +fi + +if [[ "${verbose}" == "true" ]]; then + echo "Verbose: ${verbose}" + echo "HOMEgfs: ${HOMEgfs}" + echo "container: ${container}" + echo "bindings: ${bindings}" +fi + +eap_script="${HOMEgfs}"/exec/exglobal_atmos_products.sh +cat > "${eap_script}" << EOF_ATMOS_PRODUCTS +#!/bin/bash + LD_LIBRARY_PATH=\$(dirname ${HOMEgfs}) + export LD_LIBRARY_PATH + + singularity exec \\ + ${bindings} \\ + ${container} \\ + ${HOMEgfs}/scripts/exglobal_atmos_products.sh "\$@" +EOF_ATMOS_PRODUCTS + +chmod +x "${eap_script}" + diff --git a/dev/container/utils/create-container-links.sh b/dev/container/utils/create-container-links.sh new file mode 100755 index 00000000000..808d1902b1c --- /dev/null +++ b/dev/container/utils/create-container-links.sh @@ -0,0 +1,56 @@ +#!/bin/bash + +verbose=false + +while [[ "$#" -gt 0 ]]; do + case "$1" in + -H|--HOMEgfs) + HOMEgfs="$2" + shift 2 + ;; + -c|--container) + container="$2" + shift 2 + ;; + -b|--bindings) + bindings="$2" + shift 2 + ;; + -v|--verbose) + verbose=true + shift + ;; + -M|--MACHINE_ID) + machineid="$2" + shift 2 + ;; + *) + echo "Unknown option: $1" + exit 1 + ;; + esac +done + +if [[ ! -v HOMEgfs || ! -v container || ! -v bindings || ! 
-v machineid ]]; then + echo "Usage: create-container-links.sh -H/--HOMEgfs gw-home-dir -c/--container container-fullpath -b/--bindings list-of-binding-dirs -M/--MACHINE_ID machine-id [-v]" + exit 11 +fi + +if [[ "${verbose}" == "true" ]]; then + echo "HOMEgfs: ${HOMEgfs}" + echo "container: ${container}" + echo "bindings: ${bindings}" + echo "Verbose: ${verbose}" +fi + +"${HOMEgfs}/dev/container/utils/link_ww3.sh" -H "${HOMEgfs}" -c "${container}" -b "${bindings}" -t gfs +"${HOMEgfs}/dev/container/utils/link_ww3.sh" -H "${HOMEgfs}" -c "${container}" -b "${bindings}" -t sfs +"${HOMEgfs}/dev/container/utils/link_ww3.sh" -H "${HOMEgfs}" -c "${container}" -b "${bindings}" -t gefs + +"${HOMEgfs}/dev/container/utils/link_model.sh" -H "${HOMEgfs}" -c "${container}" -m gfs_model -b "${bindings}" -M "${machineid}" +"${HOMEgfs}/dev/container/utils/link_model.sh" -H "${HOMEgfs}" -c "${container}" -m sfs_model -b "${bindings}" -M "${machineid}" +"${HOMEgfs}/dev/container/utils/link_model.sh" -H "${HOMEgfs}" -c "${container}" -m gefs_model -b "${bindings}" -M "${machineid}" + +"${HOMEgfs}/dev/container/utils/link_gfs_utils.sh" -H "${HOMEgfs}" -c "${container}" -b "${bindings}" +"${HOMEgfs}/dev/container/utils/link_ufs_utils.sh" -H "${HOMEgfs}" -c "${container}" -b "${bindings}" + diff --git a/dev/container/utils/gen-wrapper.sh b/dev/container/utils/gen-wrapper.sh new file mode 100755 index 00000000000..ad9a5fbe842 --- /dev/null +++ b/dev/container/utils/gen-wrapper.sh @@ -0,0 +1,103 @@ +#!/bin/bash + +verbose=false + +while [[ "$#" -gt 0 ]]; do + case "$1" in + -H|--HOMEgfs) + HOMEgfs="$2" + shift 2 + ;; + -b|--bindings) + bindings="$2" + shift 2 + ;; + -c|--container) + container="$2" + shift 2 + ;; + -v|--verbose) + verbose=true + shift + ;; + *) + echo "Unknown option: $1" + exit 1 + ;; + esac +done + +if [[ ! -v HOMEgfs || ! 
-v container ]]; then + echo "Usage: gen-wrapper.sh -H/--HOMEgfs gw-home-dir -c/--container full-path-container-image -b/--bindings -B dirname [-B dirname1 [...]] [-v]" + exit 11 +fi + +if [[ "${verbose}" == "true" ]]; then + set -x +fi + +exec_python_script="${HOMEgfs}"/exec/run_python.sh + +cat > "${exec_python_script}" << EOF_EXEC_PYTHON +#!/bin/bash + LD_LIBRARY_PATH=\$(dirname "${container}") + export LD_LIBRARY_PATH + + singularity exec \\ + ${bindings} \\ + ${container} \\ + ${HOMEgfs}/ush/container/run_python.sh "\$@" +EOF_EXEC_PYTHON + +run_python_script="${HOMEgfs}"/ush/container/run_python.sh + +cat > "${run_python_script}" << EOF_RUN_PYTHON +#!/bin/bash + +source /usr/lmod/lmod/init/bash +module purge +module use "${HOMEgfs}"/sorc/gfs_utils.fd/modulefiles +module load gfsutils_container.intel +module load python +module load py-netcdf4 +module load py-xarray +module load py-f90nml +module load py-numpy +module load py-jinja2 +module load py-pyyaml + +wxflowPATH=${HOMEgfs}/ush/python:${HOMEgfs}/sorc/wxflow/src +export PYTHONPATH=\${PYTHONPATH:+\${PYTHONPATH}:}${HOMEgfs}/ush:\${wxflowPATH} + +python "\$@" +EOF_RUN_PYTHON + +sed -i 's/RUN_WITH_CONTAINER=NO/RUN_WITH_CONTAINER=YES/g' "${HOMEgfs}/ush/preamble.sh" +chmod +x "${exec_python_script}" +chmod +x "${run_python_script}" + +for item in JGLOBAL_WAVE_INIT +do + exec_script="${HOMEgfs}"/exec/"${item}" + +cat > "${exec_script}" << EOF_SCRIPT +#!/bin/bash +#Need these lines on AWS to run more than one node. 
+#export I_MPI_DEBUG=10 +#export I_MPI_FABRICS=shm:ofi +#export I_MPI_OFI_PROVIDER=tcp +#export FI_PROVIDER=tcp +#export FI_TCP_IFACE=eth0 + + LD_LIBRARY_PATH=\$(dirname "${container}") + export LD_LIBRARY_PATH + + singularity exec \\ + ${bindings} \\ + ${container} \\ + ${HOMEgfs}/jobs/${item} +EOF_SCRIPT + + chmod +x "${exec_script}" +done + diff --git a/dev/container/utils/link_gfs_utils.sh b/dev/container/utils/link_gfs_utils.sh new file mode 100755 index 00000000000..362c6a802bc --- /dev/null +++ b/dev/container/utils/link_gfs_utils.sh @@ -0,0 +1,95 @@ +#!/bin/bash + +verbose=false + +while [[ "$#" -gt 0 ]]; do + case "$1" in + -H|--HOMEgfs) + HOMEgfs="$2" + shift 2 + ;; + -b|--bindings) + bindings="$2" + shift 2 + ;; + -c|--container) + container="$2" + shift 2 + ;; + -v|--verbose) + verbose=true + shift + ;; + *) + echo "Unknown option: $1" + exit 1 + ;; + esac +done + +if [[ ! -v HOMEgfs || ! -v container ]]; then + echo "Usage: link_gfs_utils.sh -H/--HOMEgfs gw-home-dir -c/--container full-path-container-image -b/--bindings -B dirname [-B dirname1 [...]] [-v]" + exit 11 +fi + +if [[ "${verbose}" == "true" ]]; then + set -x +fi + +for nm in enkf_chgres_recenter_nc ensadd ensppf ensstat fbwndgfs \ + gaussian_sfcanl gefs_6h_ave_1mem gfs_bufr \ + mkgfsawps ocnicepost overgridid reg2grb2 supvit \ + syndat_getjtbul syndat_maksynrc syndat_qctropcy \ + tave tocsbufr vint wave_stat webtitle rdbfmsua +do + model=${nm} + # echo "model: $model" + + run_model_script=${HOMEgfs}/ush/container/run_${model}.sh + rm -f "${run_model_script}" + + cat > "${run_model_script}" << EOF_MODEL +#!/bin/bash + +source /usr/lmod/lmod/init/bash +module use "${HOMEgfs}/sorc/gfs_utils.fd/modulefiles" +module load gfsutils_container.intel +module load wgrib2/3.6.0 + +${HOMEgfs}/sorc/gfs_utils.fd/install/bin/${model}.x "\$@" +EOF_MODEL + + chmod 755 "${run_model_script}" + + #link_model_script=${HOMEgfs}/exec/${model} + #rm -f ${link_model_script} + + link_model_script=${HOMEgfs}/exec/${model}.x 
+ rm -f "${link_model_script}" + + cat > "${link_model_script}" << EOF_LINK +#!/bin/bash + LD_LIBRARY_PATH=$(dirname "${container}") + export LD_LIBRARY_PATH + singularity exec ${bindings} "${container}" "${run_model_script}" "\$@" +EOF_LINK + + chmod 755 "${link_model_script}" +done + +direct_model_script=${HOMEgfs}/exec/ocnicepost.x +rm -f "${direct_model_script}" + +cat > "${direct_model_script}" << EOF_DIRECT +#!/bin/bash + +source /usr/lmod/lmod/init/bash +module use "${HOMEgfs}/sorc/gfs_utils.fd/modulefiles" +module load gfsutils_container.intel +module load wgrib2/3.6.0 + +${HOMEgfs}/sorc/gfs_utils.fd/install/bin/ocnicepost.x "\$@" +EOF_DIRECT + +chmod 755 "${direct_model_script}" + diff --git a/dev/container/utils/link_model.sh b/dev/container/utils/link_model.sh new file mode 100755 index 00000000000..9d0b383217a --- /dev/null +++ b/dev/container/utils/link_model.sh @@ -0,0 +1,147 @@ +#!/bin/bash + +verbose=false +bindings="-B /scratch3 -B /scratch4" +machineid="ursa" + +while [[ "$#" -gt 0 ]]; do + case "$1" in + -H|--HOMEgfs) + HOMEgfs="$2" + shift 2 + ;; + -b|--bindings) + bindings="$2" + shift 2 + ;; + -c|--container) + container="$2" + shift 2 + ;; + -m|--model) + model="$2" + shift 2 + ;; + -M|--MACHINE_ID) + machineid="$2" + shift 2 + ;; + -v|--verbose) + verbose=true + shift + ;; + *) + echo "Unknown option: $1" + exit 1 + ;; + esac +done + +if [[ ! -v HOMEgfs || ! -v container || ! -v model || ! 
-v machineid ]]; then + echo "Usage: link_model.sh -H/--HOMEgfs gw-home-dir -c/--container full-path-container-image \\" + echo " -m/--model name_model -M/--MACHINE_ID MACHINE_ID -b/--bindings [...]] [-v]" + exit 11 +fi + +if [[ "${verbose}" == "true" ]]; then + set -x +fi + +run_model_script=${HOMEgfs}/ush/container/run_${model}.sh +rm -f "${run_model_script}" + +cat > "${run_model_script}" << EOF_MODEL +#!/bin/bash + +source /usr/lmod/lmod/init/bash +module use "${HOMEgfs}/sorc/ufs_model.fd/modulefiles" +module load ufs_container.intel + +${HOMEgfs}/sorc/ufs_model.fd/tests/${model}.x "\$@" +EOF_MODEL + +link_model_script=${HOMEgfs}/exec/${model}.x +rm -f "${link_model_script}" + +case "${machineid}" in + ursa*) +cat > "${link_model_script}" << EOF_URSA +#!/bin/bash + +# --- MPI and Fabric Configuration --- +# 1. Force Intel MPI to use Slurm's PMI2 library for job startup +# for Ursa +export I_MPI_PMI_LIBRARY=/apps/slurm/default/lib/libpmi2.so + +HOST_SLURM_PATH=/apps/slurm/default +HOST_MPI_PATH=/apps/spack-2024-12/linux-rocky9-x86_64/gcc-11.4.1/intel-oneapi-compilers-2024.2.1-oqhstbmawnrsdw472p4pjsopj547o6xs/compiler/2024.2/opt/compiler + + LD_LIBRARY_PATH=$(dirname "${container}") + export LD_LIBRARY_PATH + singularity exec \\ + --bind \${HOST_SLURM_PATH}:\${HOST_SLURM_PATH} \\ + --bind \${HOST_MPI_PATH}:\${HOST_MPI_PATH} \\ + ${bindings} \\ + ${container} \\ + ${run_model_script} "\$@" +EOF_URSA + ;; + + gaea*) +cat > "${link_model_script}" << EOF_GAEA +#!/bin/bash +#export SINGULARITY_ENABLE_OVERLAY=try +#export SINGULARITY_DISABLE_OVERLAY=yes +#export SINGULARITY_DEBUG=10 +#export SINGULARITY_DEBUG=0 +#unset SINGULARITY_DEBUG + + LD_LIBRARY_PATH=$(dirname "${container}") + export LD_LIBRARY_PATH + set +x + singularity exec \\ + ${bindings} \\ + ${container} \\ + ${run_model_script} "\$@" +EOF_GAEA + ;; + + noaacloud*) +cat > "${link_model_script}" << EOF_NOAACLOUD +#!/bin/bash + +#Need these lines on AWS to run more than one node. 
+#export I_MPI_DEBUG=10 + export I_MPI_FABRICS=shm:ofi + export I_MPI_OFI_PROVIDER=tcp + export FI_PROVIDER=tcp + export FI_TCP_IFACE=eth0 + + LD_LIBRARY_PATH=$(dirname "${container}") + export LD_LIBRARY_PATH + set +x + singularity exec \\ + ${bindings} \\ + ${container} \\ + ${run_model_script} "\$@" +EOF_NOAACLOUD + ;; + + *) +cat > "${link_model_script}" << EOF_LINK +#!/bin/bash + LD_LIBRARY_PATH=$(dirname "${container}") + export LD_LIBRARY_PATH + set +x + singularity exec \\ + ${bindings} \\ + ${container} \\ + ${run_model_script} "\$@" +EOF_LINK + ;; + +esac + +chmod 755 "${run_model_script}" +chmod 755 "${link_model_script}" + diff --git a/dev/container/utils/link_ufs_utils.sh b/dev/container/utils/link_ufs_utils.sh new file mode 100755 index 00000000000..e7b5be9fd1e --- /dev/null +++ b/dev/container/utils/link_ufs_utils.sh @@ -0,0 +1,74 @@ +#!/bin/bash + +verbose=false + +while [[ "$#" -gt 0 ]]; do + case "$1" in + -H|--HOMEgfs) + HOMEgfs="$2" + shift 2 + ;; + -b|--bindings) + bindings="$2" + shift 2 + ;; + -c|--container) + container="$2" + shift 2 + ;; + -v|--verbose) + verbose=true + shift + ;; + *) + echo "Unknown option: $1" + exit 1 + ;; + esac +done + +if [[ ! -v HOMEgfs || ! 
-v container ]]; then + echo "Usage: link_ufs_utils.sh -H/--HOMEgfs gw-home-dir -c/--container full-path-container-image -b/--bindings [-v]" + exit 11 +fi + +if [[ "${verbose}" == "true" ]]; then + set -x +fi + +for nm in emcsfc_ice_blend emcsfc_snow2mdl fregrid global_cycle regridStates.x +do + model=${nm} + + run_model_script=${HOMEgfs}/ush/container/run_${model}.sh + rm -f "${run_model_script}" + + cat > "${run_model_script}" << EOF_MODEL +#!/bin/bash + +source /usr/lmod/lmod/init/bash +module purge +module use ${HOMEgfs}/sorc/ufs_utils.fd/modulefiles +module load build.container.intel + +${HOMEgfs}/sorc/ufs_utils.fd/exec/${model} "\$@" +EOF_MODEL + + chmod 755 "${run_model_script}" + + #link_model_script=${HOMEgfs}/exec/${model} + #rm -f ${link_model_script} + + link_model_script=${HOMEgfs}/exec/${model} + rm -f "${link_model_script}" + + cat > "${link_model_script}" << EOF_LINK +#!/bin/bash + LD_LIBRARY_PATH=$(dirname "${container}") + export LD_LIBRARY_PATH + singularity exec ${bindings} ${container} ${run_model_script} "\$@" +EOF_LINK + + chmod 755 "${link_model_script}" +done + diff --git a/dev/container/utils/link_ww3.sh b/dev/container/utils/link_ww3.sh new file mode 100755 index 00000000000..1950ab0de14 --- /dev/null +++ b/dev/container/utils/link_ww3.sh @@ -0,0 +1,87 @@ +#!/bin/bash + +verbose=false + +while [[ "$#" -gt 0 ]]; do + case "$1" in + -H|--HOMEgfs) + HOMEgfs="$2" + shift 2 + ;; + -b|--bindings) + bindings="$2" + shift 2 + ;; + -c|--container) + container="$2" + shift 2 + ;; + -t|--type) + type="$2" + shift 2 + ;; + -v|--verbose) + verbose=true + shift + ;; + *) + echo "Unknown option: $1" + exit 1 + ;; + esac +done + +if [[ ! -v HOMEgfs || ! -v container || ! 
-v type ]]; then + echo "Usage: link_ww3.sh -H/--HOMEgfs gw-home-dir -c/--container full-path-container-image \\" + echo " -b/--bindings -B dirname [-B dirname1 [...]] -t/--type [gfs|sfs|gefs] [-v]" + exit 11 +fi + +if [[ "${verbose}" == "true" ]]; then + set -x +fi + +#if [[ "$type" == "gfs" ]]; then + pdlib=pdlib_ON +#else +# pdlib=pdlib_OFF +#fi + +#for nm in gint grib grid ounf ounp outf outp prep prnc +for nm in gint grib ounf ounp outf outp prep prnc +do + model=ww3_${nm} + #echo "model: $model" + + run_model_script=${HOMEgfs}/ush/container/run_${type}_${model}.sh + rm -f "${run_model_script}" + + cat > "${run_model_script}" << EOF_MODEL +#!/bin/bash + +# Set OMP_NUM_THREADS to 1 to avoid oversubscription when doing MPMD +export OMP_NUM_THREADS=1 + +source /usr/lmod/lmod/init/bash +module purge +module use ${HOMEgfs}/sorc/gfs_utils.fd/modulefiles +module load gfsutils_container.intel + +${HOMEgfs}/sorc/ufs_model.fd/WW3/install/${pdlib}/bin/${model} "\$@" +EOF_MODEL + + chmod 755 "${run_model_script}" + + link_model_script=${HOMEgfs}/exec/${type}_${model}.x + rm -f "${link_model_script}" + + cat > "${link_model_script}" << EOF_LINK +#!/bin/bash + LD_LIBRARY_PATH=$(dirname "${container}") + export LD_LIBRARY_PATH + singularity exec ${bindings} ${container} ${run_model_script} "\$@" +EOF_LINK + + chmod 755 "${link_model_script}" +done + diff --git a/dev/jobs/waveinit.sh b/dev/jobs/waveinit.sh index c04a50298ec..5ee6b42ea43 100755 --- a/dev/jobs/waveinit.sh +++ b/dev/jobs/waveinit.sh @@ -4,7 +4,8 @@ set -x ############################################################### #source "${HOMEgfs}/dev/ush/load_modules.sh" run -source "${HOMEgfs}/dev/ush/load_modules.sh" ufswm +#source "${HOMEgfs}/dev/ush/load_modules.sh" ufswm +source "${HOMEgfs}/ush/preamble.sh" status=$? 
if [[ ${status} -ne 0 ]]; then exit "${status}" @@ -15,7 +16,11 @@ export jobid="${job}.$$" ############################################################### # Execute the JJOB +if [[ "${RUN_WITH_CONTAINER}" == "YES" ]]; then +"${HOMEgfs}/exec/JGLOBAL_WAVE_INIT" +else "${HOMEgfs}/jobs/JGLOBAL_WAVE_INIT" +fi status=$? exit "${status}" diff --git a/dev/parm/config/gefs/config.resources.AWSPW b/dev/parm/config/gefs/config.resources.AWSPW deleted file mode 100644 index f572a5621fb..00000000000 --- a/dev/parm/config/gefs/config.resources.AWSPW +++ /dev/null @@ -1,30 +0,0 @@ -#! /usr/bin/env bash -# shellcheck disable=SC2034 - -# AWS-specific job resources - -export is_exclusive="True" -unset memory -unset "memory_${RUN}" - -step=$1 - -case ${step} in - "fcst" | "efcs" | "wavepostbndpnt" | "wavepostpnt") - export PARTITION_BATCH="compute" - unset PARTITION_SERVICE - max_tasks_per_node=48 - tasks_per_node=48 - ;; - - *) - export PARTITION_BATCH="process" - unset PARTITION_SERVICE - max_tasks_per_node=24 - tasks_per_node=24 - ;; - -esac - -export max_tasks_per_node -export tasks_per_node diff --git a/dev/parm/config/gfs/config.resources b/dev/parm/config/gfs/config.resources index d42ea8ca632..bf287f87725 100644 --- a/dev/parm/config/gfs/config.resources +++ b/dev/parm/config/gfs/config.resources @@ -86,11 +86,6 @@ case ${machine} in # TODO Supply a max mem/node value for GOOGLE mem_node_max="" ;; - "CONTAINER") - max_tasks_per_node=1 - # TODO Supply a max mem/node value for a container - mem_node_max="" - ;; *) echo "FATAL ERROR: Unknown machine encountered by ${BASH_SOURCE[0]}" exit 2 diff --git a/dev/parm/config/gfs/config.resources.AWSPW b/dev/parm/config/gfs/config.resources.AWSPW index c76f50bf0ce..dd59cf46a09 100644 --- a/dev/parm/config/gfs/config.resources.AWSPW +++ b/dev/parm/config/gfs/config.resources.AWSPW @@ -91,6 +91,14 @@ case ${step} in esac ;; + "atmos_products") + export PARTITION_BATCH="highmemory" + #export PARTITION_BATCH="process" + unset 
PARTITION_SERVICE + max_tasks_per_node=24 + tasks_per_node=24 + ;; + *) export PARTITION_BATCH="process" unset PARTITION_SERVICE diff --git a/dev/parm/config/gfs/config.resources.URSA b/dev/parm/config/gfs/config.resources.URSA index 39e6df015a5..8aeb513b7b7 100644 --- a/dev/parm/config/gfs/config.resources.URSA +++ b/dev/parm/config/gfs/config.resources.URSA @@ -1,6 +1,6 @@ #! /usr/bin/env bash -# Hera-specific job resources +# Ursa-specific job resources case ${step} in "fcst" | "efcs") diff --git a/dev/ush/load_modules.sh b/dev/ush/load_modules.sh index b222308afbd..9d29a4709f6 100644 --- a/dev/ush/load_modules.sh +++ b/dev/ush/load_modules.sh @@ -62,6 +62,8 @@ fi source "${HOMEgfs}/ush/detect_machine.sh" source "${HOMEgfs}/ush/module-setup.sh" +echo "MACHINE_ID: ${MACHINE_ID}" + # Handle different module types case "${MODULE_TYPE}" in "ufswm") @@ -82,6 +84,15 @@ case "${MODULE_TYPE}" in module load cray-mpich-ucx module load python/3.8.6 module load wgrib2 + elif [[ "${MACHINE_ID}" = "container" ]]; then + source /usr/lmod/lmod/init/bash + module purge + module use "${HOMEgfs}/sorc/gfs_utils.fd/modulefiles" + module load gfsutils_container.intel + module load wgrib2 + module load gettext + module load prod_util + export UTILROOT=${prod_util_ROOT} else export UTILROOT=${prod_util_ROOT} source "${HOMEgfs}/versions/run.ver" @@ -166,6 +177,17 @@ case "${MODULE_TYPE}" in exit 1 fi + if [[ "${MACHINE_ID}" = "container" ]]; then + source /usr/lmod/lmod/init/bash + module purge + module use "${HOMEgfs}/sorc/gfs_utils.fd/modulefiles" + module load gfsutils_container.intel + module load wgrib2 + module load gettext + module load prod_util + export UTILROOT=${prod_util_ROOT} + else + # Load our modules: module use "${HOMEgfs}/modulefiles" @@ -191,6 +213,7 @@ case "${MODULE_TYPE}" in echo "FATAL ERROR: Could not determine target module for MODULE_TYPE='${MODULE_TYPE}' and MACHINE_ID='${MACHINE_ID}'" exit 1 fi + fi module list diff --git a/dev/workflow/create_experiment.py 
b/dev/workflow/create_experiment.py index f3e502f455a..156d4e5d73d 100755 --- a/dev/workflow/create_experiment.py +++ b/dev/workflow/create_experiment.py @@ -76,6 +76,8 @@ def input_args(): '-o', '--overwrite', help='overwrite previously created experiment', action="store_true", required=False) parser.add_argument('--force', help='raise warnings instead of errors when possible', action='store_true', dest="force") + parser.add_argument('-r', '--rocotorun', help='rocotorun fullpath', + default=None, required=False) return parser.parse_args() @@ -131,6 +133,10 @@ def input_args(): if user_inputs.force: setup_workflow_args.append("--force") + if user_inputs.rocotorun is not None: + setup_workflow_args.append("--rocotorun") + setup_workflow_args.append(user_inputs.rocotorun) + logger.info(f"Call: setup_workflow.main()") logger.debug(f"setup_workflow.py {' '.join(setup_workflow_args)}") setup_workflow.main(setup_workflow_args) diff --git a/dev/workflow/generate_workflows.sh b/dev/workflow/generate_workflows.sh index e4f4886d7f1..11537bb4791 100755 --- a/dev/workflow/generate_workflows.sh +++ b/dev/workflow/generate_workflows.sh @@ -65,9 +65,13 @@ function _usage() { If this option is not specified, then the existing email address in the crontab will be preserved. + -r specify rocotorun fullpath (mainly work with container) + -t Add a 'tag' to the end of the case names in the pslots to distinguish pslots between multiple sets of tests. + -R Run with Container + -v Verbose mode. Prints output of all commands to stdout. -V Very verbose mode. Passes -v to all commands and prints to stdout. 
@@ -93,11 +97,14 @@ _specified_yaml_dir=false _run_all_gfs=false _run_all_gefs=false _run_all_sfs=false +_run_with_container=false _run_all_gcafs=false _hpc_account="" _set_account=false _update_cron=false _email="" +_has_rocotorun=false +_rocotorun_fullpath="" _tag="" _set_email=false _verbose=false @@ -110,7 +117,7 @@ _auto_del=false _nonflag_option_count=0 while [[ $# -gt 0 && "$1" != "--" ]]; do - while getopts ":H:bDuy:Y:GESCA:ce:t:vVdh" option; do + while getopts ":H:bDuy:Y:GESCA:ce:t:r:vVdhR" option; do case "${option}" in H) HOMEgfs="${OPTARG}" @@ -141,6 +148,8 @@ while [[ $# -gt 0 && "$1" != "--" ]]; do t) _tag="_${OPTARG}" ;; v) _verbose=true ;; V) _very_verbose=true && _verbose=true && _verbose_flag="-v" ;; + R) _run_with_container=true ;; + r) _rocotorun_fullpath="${OPTARG}" && _has_rocotorun=true ;; A) _set_account=true && _hpc_account="${OPTARG}" ;; d) _debug=true && _very_verbose=true && _verbose=true && _verbose_flag="-v" && PS4='${LINENO}: ' ;; h) _usage && exit 0 ;; @@ -271,6 +280,17 @@ if [[ "${_specified_home}" == "false" ]]; then fi fi +if [[ "${_verbose}" == "true" ]]; then + echo "_run_with_container: ${_run_with_container}" +fi + +# Set RUN_WITH_CONTAINER if it is set by the user +if [[ "${_run_with_container}" == "true" ]]; then + sed -i "s?RUN_WITH_CONTAINER=NO?RUN_WITH_CONTAINER=YES?g" ../../ush/preamble.sh +else + sed -i "s?RUN_WITH_CONTAINER=YES?RUN_WITH_CONTAINER=NO?g" ../../ush/preamble.sh +fi + # Set the _yaml_dir to HOMEgfs/dev/ci/cases/pr if not explicitly set if [[ "${_specified_yaml_dir}" == false ]]; then _yaml_dir="${HOMEgfs}/dev/ci/cases/pr" @@ -452,16 +472,17 @@ if [[ "${_verbose}" == true ]]; then printf "Linking the workflow\n\n" fi if ! "${HOMEgfs}/sorc/link_workflow.sh" >& stdout; then - cat stdout - echo "link_workflow.sh failed!" 
- if [[ "${_set_email}" == true ]]; then - _stdout=$(cat stdout) - send_email "link_workflow.sh failed with the message"$'\n'"${_stdout}" - fi - rm -f stdout - exit 9 + cat stdout + echo "link_workflow.sh failed!" + if [[ "${_set_email}" == true ]]; then + _stdout=$(cat stdout) + send_email "link_workflow.sh failed with the message"$'\n'"${_stdout}" + fi + rm -f stdout + exit 9 fi rm -f stdout +unset HOMEgfs # Configure the environment for running create_experiment.py if [[ "${_verbose}" == true ]]; then @@ -522,7 +543,15 @@ for _case in "${_yaml_list[@]}"; do echo "${_case}" fi _pslot="${_case}${_tag}" - _create_exp_cmd="./create_experiment.py -y ${_yaml_dir}/${_case}.yaml --overwrite" + if [[ "${_run_with_container}" == "true" ]]; then + if [[ "${_has_rocotorun}" == "true" ]]; then + _create_exp_cmd="../../exec/run_python.sh ./create_experiment.py -y ${_yaml_dir}/${_case}.yaml -r ${_rocotorun_fullpath} --overwrite" + else + _create_exp_cmd="../../exec/run_python.sh ./create_experiment.py -y ${_yaml_dir}/${_case}.yaml --overwrite" + fi + else + _create_exp_cmd="./create_experiment.py -y ${_yaml_dir}/${_case}.yaml --overwrite" + fi if [[ "${_verbose}" == true ]]; then pslot=${_pslot} RUNTESTS=${_runtests} ${_create_exp_cmd} else diff --git a/dev/workflow/hosts.py b/dev/workflow/hosts.py index 0093674df89..25d21513e02 100644 --- a/dev/workflow/hosts.py +++ b/dev/workflow/hosts.py @@ -15,7 +15,7 @@ class Host: Gather Host specific information. 
""" - SUPPORTED_HOSTS = ['HERA', 'URSA', 'ORION', 'HERCULES', 'WCOSS2', 'CONTAINER', + SUPPORTED_HOSTS = ['HERA', 'URSA', 'ORION', 'HERCULES', 'WCOSS2', 'GAEAC5', 'GAEAC6', 'AWSPW', 'AZUREPW', 'GOOGLEPW'] def __init__(self, host=None): @@ -45,7 +45,6 @@ def detect(self) -> None: machine_id = os.getenv('MACHINE_ID', 'UNKNOWN') pw_csp = os.getenv('PW_CSP', 'UNKNOWN') - container = os.getenv('SINGULARITY_NAME', None) # Detect the machine since MACHINE_ID is set, # Additionaly, if PW_CSP is set, then the machine is a cloud machine @@ -65,11 +64,11 @@ def detect(self) -> None: for line in f: fields = line.strip().split() mount_point = fields[4] - if mount_point == "/apps": - mount_source = fields[9] - if "ursa" in mount_source.lower(): + if mount_point.find("/home") >= 0: + mount_source = fields[9].lower() + if mount_source.find("ursa") >= 0: self.machine = "URSA" - elif "hera" in mount_source.lower(): + elif mount_source.find("hera") >= 0: self.machine = "HERA" # TODO: When Hera is no longer used, remove this check and switch to Ursa. @@ -77,7 +76,7 @@ def detect(self) -> None: if self.machine != 'HERA' and self.machine != 'URSA': machine = socket.gethostname().upper() print(f'Detected host {machine}; assuming this is a GitHub runner.') - self.machine = 'HERA' + self.machine = 'URSA' elif os.path.exists('/work/noaa'): # Orion or Hercules @@ -88,8 +87,6 @@ def detect(self) -> None: self.machine = 'GAEAC5' elif os.path.exists('/gpfs/f6'): self.machine = 'GAEAC6' - elif container is not None: - self.machine = 'CONTAINER' elif pw_csp != "UNKNOWN": if pw_csp.lower() not in ['azure', 'aws', 'google']: raise ValueError( diff --git a/dev/workflow/rocoto/rocoto_xml.py b/dev/workflow/rocoto/rocoto_xml.py index 176a6d644f1..a3a243a3b86 100644 --- a/dev/workflow/rocoto/rocoto_xml.py +++ b/dev/workflow/rocoto/rocoto_xml.py @@ -139,10 +139,20 @@ def _write_crontab(self, crontab_file: str = None, cronint: int = 5) -> None: # No point creating a crontab if rocotorun is not available. 
rocotorun = which('rocotorun') if rocotorun is None: - print('Failed to find rocotorun, crontab will not be created') - return - - rocotoruncmd = rocotorun.command + try: + if ('rocotorun' in self.rocoto_config.keys()): + rocotoruncmd = self.rocoto_config['rocotorun'] + else: + rocotoruncmd = '/apps/rocoto/default/bin/rocotorun' + os.path.exists(rocotoruncmd) + except Exception as ee: + raise Exception("Failed to find rocotorun, crontab will not be created: ") from ee + return + + version = rocotoruncmd.split('/')[-3] + else: + version = rocotorun("--version", output=str, error=str).split()[-1].strip() + rocotoruncmd = rocotorun.command rocotorunstr = f'{rocotoruncmd} -d {self.expdir}/{self.pslot}.db -w {self.expdir}/{self.pslot}.xml' cronintstr = f'*/{cronint} * * * *' @@ -253,9 +263,18 @@ def _check_rocotorc(self): rocotorun = which("rocotorun") if rocotorun is None: - raise FileNotFoundError("Could not find the rocotorun executable. Make sure you have the module loaded!") - - version = rocotorun("--version", output=str, error=str).split()[-1].strip() + try: + if ('rocotorun' in self.rocoto_config.keys()): + rocotorun = self.rocoto_config['rocotorun'] + else: + rocotorun = '/apps/rocoto/default/bin/rocotorun' + os.path.exists(rocotorun) + except Exception as ee: + raise Exception("Could not find the rocotorun executable. 
Make sure you have the module loaded!: ") from ee + + version = rocotorun.split('/')[-3] + else: + version = rocotorun("--version", output=str, error=str).split()[-1].strip() homedir = os.path.expanduser("~") rocotorc_file = os.path.join(homedir, ".rocoto", version, "rocotorc") diff --git a/dev/workflow/setup_workflow.py b/dev/workflow/setup_workflow.py index f7b5958e527..47d45cb58ff 100755 --- a/dev/workflow/setup_workflow.py +++ b/dev/workflow/setup_workflow.py @@ -36,6 +36,8 @@ def input_args(*argv): type=str, default=os.environ['PWD']) parser.add_argument('--force', help='raise warnings instead of errors when possible', action='store_true', dest="force") + parser.add_argument('--rocotorun', help='rocotorun fullpath', type=str, + default=None, required=False) # Create subparsers for workflow engines subparsers = parser.add_subparsers(dest='workflow', required=True, @@ -53,6 +55,8 @@ def input_args(*argv): default=25, required=False) rocoto_parser.add_argument('--verbosity', help='verbosity level of Rocoto', type=int, default=10, required=False) + rocoto_parser.add_argument('--rocotorun', help='rocotorun fullpath', type=str, + default=None, required=False) # EcFlow subparser ecflow_parser = subparsers.add_parser('ecflow', diff --git a/env/CONTAINER.env b/env/CONTAINER.env deleted file mode 100755 index ba01fcf0dd9..00000000000 --- a/env/CONTAINER.env +++ /dev/null @@ -1,32 +0,0 @@ -#! /usr/bin/env bash - -if [[ $# -ne 1 ]]; then - - echo "Must specify an input argument to set runtime environment variables!" 
- exit 1 - -fi - -step=$1 - -export launcher="mpirun" -export mpmd_opt="--multi-prog" - -# Configure MPI environment -export MPI_BUFS_PER_PROC=2048 -export MPI_BUFS_PER_HOST=2048 -export MPI_GROUP_MAX=256 -export MPI_MEMMAP_OFF=1 -export MP_STDOUTMODE="ORDERED" -export KMP_AFFINITY=scatter -export OMP_STACKSIZE=2048000 -export NTHSTACK=1024000000 - -ulimit -s unlimited -ulimit -a - - -if [ "${step}" = "marineanlvar" ]; then - export NTHREADS_OCNANAL=1 - export APRUN_MARINEANLVAR="${launcher} -n 2" -fi diff --git a/env/URSA.env b/env/URSA.env index d4dc0161877..1b01b877421 100644 --- a/env/URSA.env +++ b/env/URSA.env @@ -9,7 +9,7 @@ fi step=$1 -export launcher="srun -l --export=ALL --hint=nomultithread --distribution=block:block" +export launcher="srun --mpi=pmi2 -l --export=ALL --hint=nomultithread --distribution=block:block" export mpmd_opt="--multi-prog --output=mpmd.%j.%t.out" #export POSTAMBLE_CMD='report-mem' diff --git a/jobs/JGLOBAL_ARCHIVE_VRFY b/jobs/JGLOBAL_ARCHIVE_VRFY index e5965c22706..ca266c4cd89 100755 --- a/jobs/JGLOBAL_ARCHIVE_VRFY +++ b/jobs/JGLOBAL_ARCHIVE_VRFY @@ -23,8 +23,16 @@ done ############################################################### # Run archive script ############################################################### - -${GLOBALARCHIVESH:-${SCRgfs}/exglobal_archive_vrfy.py} +#if [[ "${RUN_WITH_CONTAINER}" == "YES" ]]; then +# "${HOMEgfs}/exec/run_python.sh" "${GLOBALARCHIVESH:-${SCRgfs}/exglobal_archive_vrfy.py}" -c -v +#else +# ${GLOBALARCHIVESH:-${SCRgfs}/exglobal_archive_vrfy.py} +#fi +if [[ -v GLOBALARCHIVESH ]]; then + ${GLOBALARCHIVESH} +else + "${PYCMD}" "${SCRgfs}"/exglobal_archive_vrfy.py ${PYEXTRAARGS} +fi err=$? if [[ ${err} -ne 0 ]]; then exit "${err}" diff --git a/jobs/JGLOBAL_ATMOS_PRODUCTS b/jobs/JGLOBAL_ATMOS_PRODUCTS index 8625d8e9a3d..9ee87131bdd 100755 --- a/jobs/JGLOBAL_ATMOS_PRODUCTS +++ b/jobs/JGLOBAL_ATMOS_PRODUCTS @@ -1,6 +1,8 @@ #! 
/usr/bin/env bash source "${HOMEgfs}/ush/jjob_header.sh" -e "atmos_products" -c "base atmos_products" +#source "${HOMEgfs}/ush/preamble.sh" +echo "RUN_WITH_CONTAINER: ${RUN_WITH_CONTAINER}" ############################################## # Begin JOB SPECIFIC work @@ -23,7 +25,11 @@ export PREFIX="${RUN}.t${cyc}z." ############################################################### # Run exglobal script -"${SCRgfs}/exglobal_atmos_products.sh" && true +if [[ "${RUN_WITH_CONTAINER}" == "YES" ]]; then + "${HOMEgfs}/exec/exglobal_atmos_products.sh" && true +else + "${SCRgfs}/exglobal_atmos_products.sh" && true +fi export err=$? if [[ ${err} -ne 0 ]]; then err_exit diff --git a/jobs/JGLOBAL_OCEANICE_PRODUCTS b/jobs/JGLOBAL_OCEANICE_PRODUCTS index 2d7718017ca..393bfcbb0ce 100755 --- a/jobs/JGLOBAL_OCEANICE_PRODUCTS +++ b/jobs/JGLOBAL_OCEANICE_PRODUCTS @@ -13,7 +13,12 @@ YMD="${PDY}" HH="${cyc}" declare_from_tmpl -rx "COMOUT_${COMPONENT^^}_NETCDF":"C ############################################################### # Run exglobal script -"${SCRgfs}/exglobal_oceanice_products.py" && true +#if [[ "${RUN_WITH_CONTAINER}" == "YES" ]]; then +# "${HOMEgfs}/exec/run_python.sh" "${SCRgfs}/exglobal_oceanice_products.py" -c -v +#else +# "${SCRgfs}/exglobal_oceanice_products.py" && true +#fi +"${PYCMD}" "${SCRgfs}"/exglobal_oceanice_products.py ${PYEXTRAARGS} && true export err=$? 
if [[ ${err} -ne 0 ]]; then err_exit diff --git a/jobs/JGLOBAL_PREP_EMISSIONS b/jobs/JGLOBAL_PREP_EMISSIONS index 72c9d40604b..6ce23f06796 100755 --- a/jobs/JGLOBAL_PREP_EMISSIONS +++ b/jobs/JGLOBAL_PREP_EMISSIONS @@ -16,7 +16,14 @@ source "${HOMEgfs}/ush/jjob_header.sh" -e "prep_emissions" -c "base prep_emissio ############################################################### # Run relevant script EXSCRIPT=${PREP_EMISSIONS_PY:-${SCRgfs}/exglobal_prep_emissions.py} -${EXSCRIPT} && true + +# Execute the prep emissions script +#if [[ "${RUN_WITH_CONTAINER}" == "YES" ]]; then +# "${HOMEgfs}/exec/run_python.sh" "${EXSCRIPT}" && true +#else +# ${EXSCRIPT} && true +#fi +"${PYCMD}" "${EXSCRIPT}" && true export err=$? if [[ ${err} -ne 0 ]]; then err_exit "Error executing ${EXSCRIPT}, ABORT!" diff --git a/jobs/JGLOBAL_STAGE_IC b/jobs/JGLOBAL_STAGE_IC index 6a2f9571166..baf8cb66903 100755 --- a/jobs/JGLOBAL_STAGE_IC +++ b/jobs/JGLOBAL_STAGE_IC @@ -3,7 +3,12 @@ source "${HOMEgfs}/ush/jjob_header.sh" -e "stage_ic" -c "base stage_ic" # Execute staging -"${SCRgfs}/exglobal_stage_ic.py" +#if [[ "${RUN_WITH_CONTAINER}" == "YES" ]]; then +# "${HOMEgfs}/exec/run_python.sh" "${SCRgfs}/exglobal_stage_ic.py" +#else +# "${SCRgfs}/exglobal_stage_ic.py" +#fi +"${PYCMD}" "${SCRgfs}"/exglobal_stage_ic.py err=$? 
############################################################### diff --git a/modulefiles/gw_run.common.lua b/modulefiles/gw_run.common.lua index 997466b5748..f60a98234b3 100644 --- a/modulefiles/gw_run.common.lua +++ b/modulefiles/gw_run.common.lua @@ -38,7 +38,7 @@ local common_modules = { "gsi-ncdiag", "crtm", "bufr", - --"wgrib2", temporarily disable wgrib2 until it is installed with ipolates + "wgrib2", "py-f90nml", "py-netcdf4", "py-pyyaml", diff --git a/modulefiles/gw_run.noaacloud.lua b/modulefiles/gw_run.noaacloud.lua index 4f7da7cd202..6329ea4468c 100644 --- a/modulefiles/gw_run.noaacloud.lua +++ b/modulefiles/gw_run.noaacloud.lua @@ -16,13 +16,13 @@ load(pathJoin("mkl", (os.getenv("mkl_ver") or "None"))) load("gw_run.common") load(pathJoin("wgrib2", (os.getenv("wgrib2_ver") or "None"))) -- TODO: Remove once wgrib2 is loaded in gw_run.common -prepend_path("MODULEPATH", pathJoin("/contrib/git/prepobs/v" .. (os.getenv("prepobs_run_ver") or "None"), "modulefiles")) -load(pathJoin("prepobs", (os.getenv("prepobs_run_ver") or "None"))) +-- prepend_path("MODULEPATH", pathJoin("/contrib/git/prepobs/v" .. (os.getenv("prepobs_run_ver") or "None"), "modulefiles")) +-- load(pathJoin("prepobs", (os.getenv("prepobs_run_ver") or "None"))) -prepend_path("MODULEPATH", pathJoin("/contrib/git/Fit2Obs/v" .. (os.getenv("fit2obs_ver") or "None"), "modulefiles")) -load(pathJoin("fit2obs", (os.getenv("fit2obs_ver") or "None"))) +-- prepend_path("MODULEPATH", pathJoin("/contrib/git/Fit2Obs/v" .. 
(os.getenv("fit2obs_ver") or "None"), "modulefiles")) +-- load(pathJoin("fit2obs", (os.getenv("fit2obs_ver") or "None"))) -load(pathJoin("imagemagick", (os.getenv("imagemagick_ver") or "None"))) +-- load(pathJoin("imagemagick", (os.getenv("imagemagick_ver") or "None"))) setenv("CRTM_FIX", "/contrib/global-workflow-shared-data/fix/crtm/v2.4.0.2") diff --git a/scripts/exglobal_atmos_products.sh b/scripts/exglobal_atmos_products.sh index 0c34247bb3c..d98d2997a2a 100755 --- a/scripts/exglobal_atmos_products.sh +++ b/scripts/exglobal_atmos_products.sh @@ -1,5 +1,7 @@ #! /usr/bin/env bash +source "${HOMEgfs}/dev/ush/load_modules.sh" + # Scripts used INTERP_ATMOS_MASTERSH=${INTERP_ATMOS_MASTERSH:-"${USHgfs}/interp_atmos_master.sh"} INTERP_ATMOS_SFLUXSH=${INTERP_ATMOS_SFLUXSH:-"${USHgfs}/interp_atmos_sflux.sh"} diff --git a/scripts/exglobal_oceanice_products.py b/scripts/exglobal_oceanice_products.py index bb03840842a..7e973c9e768 100755 --- a/scripts/exglobal_oceanice_products.py +++ b/scripts/exglobal_oceanice_products.py @@ -5,6 +5,8 @@ from wxflow import AttrDict, Logger, logit, cast_strdict_as_dtypedict from pygfs.task.oceanice_products import OceanIceProducts +import argparse + # initialize root logger logger = Logger(level=os.environ.get("LOGGING_LEVEL", "DEBUG"), colored_log=True) @@ -12,6 +14,16 @@ @logit(logger) def main(): + parser = argparse.ArgumentParser() + parser.add_argument("-v", "--verbose", action="store_true", + help="increase output verbosity") + parser.add_argument("-c", "--container", action="store_true", + help="use container") + args = parser.parse_args() + + if args.verbose: + logger.info(f"use contaier {args.container}") + config = cast_strdict_as_dtypedict(os.environ) # Instantiate the OceanIce object @@ -39,7 +51,7 @@ def main(): oceanice.configure(oceanice_dict, grid) # Run the oceanice post executable to interpolate and create grib2 files - oceanice.execute(oceanice_dict, grid) + oceanice.execute(oceanice_dict, grid, 
run_with_container=args.container) # Subset raw model data to create netCDF products oceanice.subset(oceanice_dict) diff --git a/ush/detect_machine.sh b/ush/detect_machine.sh index ee6c2c2c79b..bb49f43b4a7 100755 --- a/ush/detect_machine.sh +++ b/ush/detect_machine.sh @@ -8,6 +8,12 @@ # # Thank you for your contribution +# Override MACHINE_ID when running inside a container +if [[ -v SINGULARITY_CONTAINER ]]; then + # We are in a container + MACHINE_ID=container +fi + # If the MACHINE_ID variable is set, skip this script. if [[ -n "${MACHINE_ID:-}" ]]; then return diff --git a/ush/forecast_postdet.sh b/ush/forecast_postdet.sh index 8f3bd9a1307..293f2850502 100755 --- a/ush/forecast_postdet.sh +++ b/ush/forecast_postdet.sh @@ -108,7 +108,7 @@ FV3_postdet() { # Check for consistency # TODO: the checker has a --fatal option, which is not used here. This needs to be decided how to handle. if [[ "${CHECK_LAND_RESTART_OROG:-NO}" == "YES" ]]; then - "${USHgfs}/check_land_input_orography.py" \ + "${PYCMD}" "${USHgfs}"/check_land_input_orography.py \ --input_dir "${DATA}/INPUT" --orog_dir "${DATA}/INPUT" err=$? if [[ ${err} -ne 0 ]]; then diff --git a/ush/preamble.sh b/ush/preamble.sh index d9c95225e69..4c88032e89c 100755 --- a/ush/preamble.sh +++ b/ush/preamble.sh @@ -176,6 +176,19 @@ trap "postamble ${_calling_script} ${start_time} \$?" EXIT source "${HOMEgfs}/ush/bash_utils.sh" +# Define whether to run with a container; the default is NO. +export RUN_WITH_CONTAINER=NO + +if [[ "${RUN_WITH_CONTAINER}" == "YES" ]]; then + # When running with a container, run the Python executable inside the container; + # some scripts must also be told that they run in a container, via the "-c" option. 
+ export PYCMD="${HOMEgfs}"/exec/run_python.sh + export PYEXTRAARGS=" -c -v" +else + export PYCMD=python + export PYEXTRAARGS="" +fi + # Turn on our settings export SHELLOPTS declare -xf set_strict diff --git a/ush/python/pygfs/task/oceanice_products.py b/ush/python/pygfs/task/oceanice_products.py index d319608ad14..4a21aca5b69 100644 --- a/ush/python/pygfs/task/oceanice_products.py +++ b/ush/python/pygfs/task/oceanice_products.py @@ -155,7 +155,7 @@ def configure(config: Dict, product_grid: str) -> None: @staticmethod @logit(logger) - def execute(config: Dict, product_grid: str) -> None: + def execute(config: Dict, product_grid: str, run_with_container=False) -> None: """Run the ocnicepost.x executable to interpolate and convert to grib2 Parameters @@ -171,14 +171,15 @@ def execute(config: Dict, product_grid: str) -> None: """ # Run the ocnicepost.x executable - OceanIceProducts.interp(config.DATA, config.APRUN_OCNICEPOST, exec_name="ocnicepost.x") + OceanIceProducts.interp(config.DATA, config.APRUN_OCNICEPOST, + exec_name="ocnicepost.x", run_with_container=run_with_container) # Index the interpolated grib2 file OceanIceProducts.index(config, product_grid) @staticmethod @logit(logger) - def interp(workdir: str, aprun_cmd: str, exec_name: str = "ocnicepost.x") -> None: + def interp(workdir: str, aprun_cmd: str, exec_name: str = "ocnicepost.x", run_with_container=False) -> None: """ Run the interpolation executable to generate interpolated file @@ -200,7 +201,11 @@ def interp(workdir: str, aprun_cmd: str, exec_name: str = "ocnicepost.x") -> Non os.chdir(workdir) logger.debug(f"Current working directory: {os.getcwd()}") - exec_cmd = Executable(aprun_cmd) + print(f'aprun_cmd: {aprun_cmd}') + if run_with_container: + exec_cmd = Executable('time') + else: + exec_cmd = Executable(aprun_cmd) exec_cmd.add_default_arg(os.path.join(workdir, exec_name)) try: exec_cmd() diff --git a/ush/run_mpmd.sh b/ush/run_mpmd.sh index 6ffbd3106d1..e95e295a260 100755 --- a/ush/run_mpmd.sh 
+++ b/ush/run_mpmd.sh @@ -28,8 +28,6 @@ # ################################################################################ -source "${USHgfs}/preamble.sh" - cmdfile=${1:?"run_mpmd requires an input file containing commands to execute in MPMD/serial mode"} # If USE_CFP is not set, run in serial mode @@ -45,63 +43,22 @@ fi # Set OMP_NUM_THREADS to 1 to avoid oversubscription when doing MPMD export OMP_NUM_THREADS=1 -# Determine the number of MPMD processes from incoming ${cmdfile} -nprocs=$(wc -l < "${cmdfile}") - -# Local MPMD file containing instructions to run in CFP -mpmd_cmdfile="${DATA:-}/mpmd_cmdfile" -if [[ -s "${mpmd_cmdfile}" ]]; then rm -f "${mpmd_cmdfile}"; fi - -cat << EOF - INFO: Executing MPMD job, STDOUT redirected for each process separately - INFO: On failure, logs for each job will be available in ${DATA}/mpmd.proc_num.out - INFO: The proc_num corresponds to the line in '${mpmd_cmdfile}' -EOF - -if [[ "${launcher:-}" =~ ^srun.* ]]; then # srun-based system e.g. Hera, Orion, etc. - - # Slurm requires a counter in front of each line in the script - # Read the incoming cmdfile and create srun usable cmdfile - nm=0 - # shellcheck disable=SC2312 - while IFS= read -r line; do - echo "${nm} ${line}" >> "${mpmd_cmdfile}" - ((nm=nm+1)) - done < "${cmdfile}" - - set +e - # shellcheck disable=SC2086 - ${launcher:-} ${mpmd_opt:-} -n ${nprocs} "${mpmd_cmdfile}" - err=$? 
- set_strict +# Redirect stdout of each command to its own mpmd.<n>.out file +# Read the incoming cmdfile and launch each line as a background job +nm=0 +# shellcheck disable=SC2312 +while IFS= read -r line; do + echo "Line ${nm}: ${line}" + ${line} > "mpmd.${nm}.out" & + ((nm=nm+1)) +done < "${cmdfile}" +wait -elif [[ "${launcher:-}" =~ ^mpiexec.* ]]; then # mpiexec - - # Redirect output from each process to its own stdout - # Read the incoming cmdfile and create mpiexec usable cmdfile - nm=0 - echo "#!/bin/bash" >> "${mpmd_cmdfile}" - # shellcheck disable=SC2312 - while IFS= read -r line; do - echo "${line} > mpmd.${nm}.out" >> "${mpmd_cmdfile}" - ((nm=nm+1)) - done < "${cmdfile}" - chmod 755 "${mpmd_cmdfile}" - - # shellcheck disable=SC2086 - ${launcher:-} -np ${nprocs} ${mpmd_opt:-} "${mpmd_cmdfile}" - err=$? - -else - - echo "FATAL ERROR: CFP is not usable with launcher: '${launcher:-}'" - err=1 - -fi +err=$? +set_strict # On success concatenate processor specific output into a single mpmd.out if [[ ${err} -eq 0 ]]; then - rm -f "${mpmd_cmdfile}" out_files=$(find . -name 'mpmd.*.out') for file in ${out_files}; do cat "${file}" >> mpmd.out