From 9a41fb85078cb04eeb6d857cbef674b40fbe5b68 Mon Sep 17 00:00:00 2001 From: David Huber Date: Fri, 10 May 2024 06:46:02 -0500 Subject: [PATCH 1/3] Limit wavepostpnt PEs to 40/node --- parm/config/gefs/config.resources | 8 +++++++- parm/config/gfs/config.resources | 8 +++++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/parm/config/gefs/config.resources b/parm/config/gefs/config.resources index d98e437359c..0fa562ade04 100644 --- a/parm/config/gefs/config.resources +++ b/parm/config/gefs/config.resources @@ -275,8 +275,14 @@ case ${step} in export npe_wavepostpnt=200 export nth_wavepostpnt=1 export npe_node_wavepostpnt=$(( npe_node_max / nth_wavepostpnt )) - export NTASKS=${npe_wavepostpnt} export is_exclusive=True + # This MPMD job is very I/O heavy and does not scale well. Limit jobs/node to 40. + # For now this affects only Hercules, but will also affect Gaea. + if [[ ${npe_node_wavepostpnt} -gt 40 ]]; then + export npe_node_wavepostpnt=40 + export is_exclusive=False + fi + export NTASKS=${npe_wavepostpnt} ;; *) diff --git a/parm/config/gfs/config.resources b/parm/config/gfs/config.resources index 3c6ccfff6fa..0dbfbe94acf 100644 --- a/parm/config/gfs/config.resources +++ b/parm/config/gfs/config.resources @@ -163,8 +163,14 @@ case ${step} in export npe_wavepostpnt=200 export nth_wavepostpnt=1 export npe_node_wavepostpnt=$(( npe_node_max / nth_wavepostpnt )) - export NTASKS=${npe_wavepostpnt} export is_exclusive=True + # This MPMD job is very I/O heavy and does not scale well. Limit jobs/node to 40. + # For now this affects only Hercules, but will also affect Gaea. + if [[ ${npe_node_wavepostpnt} -gt 40 ]]; then + export npe_node_wavepostpnt=40 + export is_exclusive=False + fi + export NTASKS=${npe_wavepostpnt} ;; "wavegempak") From 76ad800d69664c85f3e495617d4ab5252d8ff941 Mon Sep 17 00:00:00 2001 From: David Huber Date: Fri, 10 May 2024 07:47:58 -0500 Subject: [PATCH 2/3] Expand task/node limits to other wavepost*pnt jobs --- parm/config/gfs/config.resources | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/parm/config/gfs/config.resources b/parm/config/gfs/config.resources index 0dbfbe94acf..e2893d63377 100644 --- a/parm/config/gfs/config.resources +++ b/parm/config/gfs/config.resources @@ -140,13 +140,19 @@ case ${step} in export memory_wavepostsbs_gfs="10GB" ;; + # The wavepost*pnt* jobs are I/O heavy and do not scale well to large nodes. + # Limit the number of tasks/node to 40. "wavepostbndpnt") export wtime_wavepostbndpnt="01:00:00" export npe_wavepostbndpnt=240 export nth_wavepostbndpnt=1 export npe_node_wavepostbndpnt=$(( npe_node_max / nth_wavepostbndpnt )) - export NTASKS=${npe_wavepostbndpnt} export is_exclusive=True + if [[ ${npe_node_wavepostbndpnt} -gt 40 ]]; then + export npe_node_wavepostbndpnt=40 + export is_exclusive=False + fi + export NTASKS=${npe_wavepostbndpnt} ;; "wavepostbndpntbll") @@ -154,8 +160,12 @@ case ${step} in export npe_wavepostbndpntbll=448 export nth_wavepostbndpntbll=1 export npe_node_wavepostbndpntbll=$(( npe_node_max / nth_wavepostbndpntbll )) - export NTASKS=${npe_wavepostbndpntbll} export is_exclusive=True + if [[ ${npe_node_wavepostbndpntbll} -gt 40 ]]; then + export npe_node_wavepostbndpntbll=40 + export is_exclusive=False + fi + export NTASKS=${npe_wavepostbndpntbll} ;; "wavepostpnt") @@ -164,8 +174,6 @@ case ${step} in export nth_wavepostpnt=1 export npe_node_wavepostpnt=$(( npe_node_max / nth_wavepostpnt )) export is_exclusive=True - # This MPMD job is very I/O heavy and does not scale well. Limit jobs/node to 40. - # For now this affects only Hercules, but will also affect Gaea. if [[ ${npe_node_wavepostpnt} -gt 40 ]]; then export npe_node_wavepostpnt=40 export is_exclusive=False From ef56339aa303822ca14a1149078080f750a98169 Mon Sep 17 00:00:00 2001 From: David Huber Date: Fri, 10 May 2024 07:50:48 -0500 Subject: [PATCH 3/3] Expand to gefs as well. --- parm/config/gefs/config.resources | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/parm/config/gefs/config.resources b/parm/config/gefs/config.resources index 0fa562ade04..04d55ae082d 100644 --- a/parm/config/gefs/config.resources +++ b/parm/config/gefs/config.resources @@ -252,13 +252,19 @@ case ${step} in export memory_wavepostsbs="10GB" ;; + # The wavepost*pnt* jobs are I/O heavy and do not scale well to large nodes. + # Limit the number of tasks/node to 40. "wavepostbndpnt") export wtime_wavepostbndpnt="01:00:00" export npe_wavepostbndpnt=240 export nth_wavepostbndpnt=1 export npe_node_wavepostbndpnt=$(( npe_node_max / nth_wavepostbndpnt )) - export NTASKS=${npe_wavepostbndpnt} export is_exclusive=True + if [[ ${npe_node_wavepostbndpnt} -gt 40 ]]; then + export npe_node_wavepostbndpnt=40 + export is_exclusive=False + fi + export NTASKS=${npe_wavepostbndpnt} ;; "wavepostbndpntbll") @@ -266,8 +272,12 @@ case ${step} in export npe_wavepostbndpntbll=448 export nth_wavepostbndpntbll=1 export npe_node_wavepostbndpntbll=$(( npe_node_max / nth_wavepostbndpntbll )) - export NTASKS=${npe_wavepostbndpntbll} export is_exclusive=True + if [[ ${npe_node_wavepostbndpntbll} -gt 40 ]]; then + export npe_node_wavepostbndpntbll=40 + export is_exclusive=False + fi + export NTASKS=${npe_wavepostbndpntbll} ;; "wavepostpnt") @@ -276,8 +286,6 @@ case ${step} in export nth_wavepostpnt=1 export npe_node_wavepostpnt=$(( npe_node_max / nth_wavepostpnt )) export is_exclusive=True - # This MPMD job is very I/O heavy and does not scale well. Limit jobs/node to 40. - # For now this affects only Hercules, but will also affect Gaea. if [[ ${npe_node_wavepostpnt} -gt 40 ]]; then export npe_node_wavepostpnt=40 export is_exclusive=False