AthenaPK scaling instructions
- Assumes a Power9 node with 4x V100
- Recommended environment: Spectrum MPI and a GCC host compiler
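How these are made available is machine specific; on systems that use environment modules, the setup before configuring might look roughly like the sketch below (the module names are assumptions and will differ from site to site):
# hypothetical module setup for a Power9 + V100 node; adjust names/versions to the site
module load gcc            # GCC host compiler (as recommended above)
module load spectrum-mpi   # CUDA-aware Spectrum MPI
module load cuda           # CUDA toolkit for the V100s
module load cmake          # CMake for the configure step below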
# get source
git clone https://gitlab.com/theias/hpc/jmstone/athena-parthenon/athenapk.git athenaPK
cd athenaPK
# change to branch for scaling test
git checkout pgrete/pack-in-one
# get submodules (mainly Kokkos and Parthenon)
git submodule init
git submodule update
# Configure and build. Reusing Summit machine file (same architecture)
mkdir build-cuda-mpi && cd build-cuda-mpi
cmake -DMACHINE_CFG=$(pwd)/../external/parthenon/cmake/machinecfg/Summit.cmake ..
make -j8 athenaPK
# Alternative: configure with the RZAnsel machine file
# get source
git clone https://gitlab.com/theias/hpc/jmstone/athena-parthenon/athenapk.git athenaPK
cd athenaPK
# change to branch for scaling test
git checkout pgrete/pack-in-one
# get submodules (mainly Kokkos and Parthenon)
git submodule init
git submodule update
cmake -S. -B build -DCMAKE_TOOLCHAIN_FILE=$(pwd)/external/parthenon/cmake/machinecfg/RZAnsel.cmake
cmake --build build
- For static meshes we'll use a workload of 256^3 cells per GPU
- Adjust the launch command as needed (e.g., use the -M "-gpu" parameter of lrun or the --smpiargs=-gpu parameter of jsrun instead of the MY_SPECTRUM_OPTIONS environment variable); a jsrun sketch follows below
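As an illustration only, a single-node launch through jsrun that passes the Spectrum MPI GPU flag via the launcher instead of the environment variable might look like this (the resource-set layout of one task and one GPU per resource set is an assumption; adapt it to the machine):
# sketch: pass "-gpu" via --smpiargs instead of MY_SPECTRUM_OPTIONS;
# adjust resource-set layout, node count, and run parameters as needed
jsrun --nrs 4 --tasks_per_rs 1 --gpu_per_rs 1 --rs_per_host 4 --smpiargs="-gpu" \
    ./src/athenaPK -i ../inputs/advection_3d.in parthenon/time/nlim=10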
# enable CUDA-aware MPI
export MY_SPECTRUM_OPTIONS="--gpu"
# make Kokkos pick GPUs round robin
export KOKKOS_NUM_DEVICES=4
cd build-cuda-mpi
# mesh dimensions
export MB=256
export MX=256
export MY=256
export MZ=256
ibrun -n 1 ./src/athenaPK -i ../inputs/advection_3d.in parthenon/meshblock/nx1=$MB parthenon/meshblock/nx2=$MB parthenon/meshblock/nx3=$MB parthenon/time/nlim=10 parthenon/mesh/nx1=$MX parthenon/mesh/nx2=$MY parthenon/mesh/nx3=$MZ parthenon/mesh/refinement=none
# should be about 2.2e8 zone-cycles/wsec_step
export MX=512
ibrun -n 2 ./src/athenaPK -i ../inputs/advection_3d.in parthenon/meshblock/nx1=$MB parthenon/meshblock/nx2=$MB parthenon/meshblock/nx3=$MB parthenon/time/nlim=10 parthenon/mesh/nx1=$MX parthenon/mesh/nx2=$MY parthenon/mesh/nx3=$MZ parthenon/mesh/refinement=none
# should be about 4.4e8 zone-cycles/wsec_step
export MY=512
ibrun -n 4 ./src/athenaPK -i ../inputs/advection_3d.in parthenon/meshblock/nx1=$MB parthenon/meshblock/nx2=$MB parthenon/meshblock/nx3=$MB parthenon/time/nlim=10 parthenon/mesh/nx1=$MX parthenon/mesh/nx2=$MY parthenon/mesh/nx3=$MZ parthenon/mesh/refinement=none
# should be about 8.6e8 zone-cycles/wsec_step
# Test with overdecomposition
export MB=128
ibrun -n 4 ./src/athenaPK -i ../inputs/advection_3d.in parthenon/meshblock/nx1=$MB parthenon/meshblock/nx2=$MB parthenon/meshblock/nx3=$MB parthenon/time/nlim=10 parthenon/mesh/nx1=$MX parthenon/mesh/nx2=$MY parthenon/mesh/nx3=$MZ parthenon/mesh/refinement=none
# should be about 9.5e8 zone-cycles/wsec_step
# And much more overdecomposition
export MB=32
ibrun -n 4 ./src/athenaPK -i ../inputs/advection_3d.in parthenon/meshblock/nx1=$MB parthenon/meshblock/nx2=$MB parthenon/meshblock/nx3=$MB parthenon/time/nlim=10 parthenon/mesh/nx1=$MX parthenon/mesh/nx2=$MY parthenon/mesh/nx3=$MZ parthenon/mesh/refinement=none
# should be about 2.2e8 zone-cycles/wsec_step
# And now with process<->GPU overdecomposition (requires MPS): using 32 processes on a single host for 4 GPUs (8 processes per GPU)
ibrun -n 32 ./src/athenaPK -i ../inputs/advection_3d.in parthenon/meshblock/nx1=$MB parthenon/meshblock/nx2=$MB parthenon/meshblock/nx3=$MB parthenon/time/nlim=10 parthenon/mesh/nx1=$MX parthenon/mesh/nx2=$MY parthenon/mesh/nx3=$MZ parthenon/mesh/refinement=none
# should be about 3.2e8 zone-cycles/wsec_step
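The quoted throughput figures can be sanity-checked by hand, assuming the metric counts zones updated per wall-clock second of a step: a 256^3 mesh updates about 1.68e7 zones per cycle, so ~2.2e8 zone-cycles/s corresponds to roughly 0.08 s of wall time per step on one GPU, and the 2-GPU and 4-GPU runs above roughly double and quadruple that rate:
# back-of-the-envelope check of the single-GPU number quoted above
awk 'BEGIN { zones = 256*256*256; printf "zones/cycle = %d, approx s/step = %.3f\n", zones, zones / 2.2e8 }'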
# Python helper to generate the weak-scaling launch commands for different machines
# default parameters
mb = 256 # MeshBlock size, used for x,y,z
mx = 256 # x Mesh size
my = 256 # y Mesh size
mz = 256 # z Mesh size
nlim = 10 # max number of cycles for sim
refinement = 'none' # static, uniform mesh
nodes = 1 # number of nodes to be used
max_nodes = 2048 # scale up to this number of nodes
cmd_str = " ./src/athenaPK "
input_str = " -i ../inputs/advection_3d.in "
param_str = lambda: f' parthenon/meshblock/nx1={mb:d} parthenon/meshblock/nx2={mb:d} parthenon/meshblock/nx3={mb:d} parthenon/time/nlim={nlim:d} parthenon/mesh/nx1={mx:d} parthenon/mesh/nx2={my:d} parthenon/mesh/nx3={mz:d} parthenon/mesh/refinement={refinement:s} '
machine = 'RZAnsel'
if machine == 'Summit_4_GPUs_per_node':
    print("Configuration for Summit using 4 GPUs per node to mirror Sierra")
    print("############## ENVIRONMENT ###############")
    print("export KOKKOS_NUM_DEVICES=2")
    tasks_per_gpu = 1
    init_mx = 512
    init_my = 512
    init_mz = 256
    gpus_per_node = 4
    launch_str = lambda: f"jsrun --nrs {2*nodes} --tasks_per_rs {2*tasks_per_gpu} --cpu_per_rs 21 --gpu_per_rs 3 --rs_per_host 2 --smpiargs=-gpu"
elif machine == 'RZAnsel':
    print("Configuration for RZAnsel using 4 GPUs per node")
    print("############## ENVIRONMENT ###############")
    print("export KOKKOS_NUM_DEVICES=1")
    init_mx = 512
    init_my = 512
    init_mz = 256
    gpus_per_node = 4
    launch_str = lambda: f"lrun -N {nodes} -T {gpus_per_node*tasks_per_gpu} -g 1 -M \"-gpu\""
for tasks_per_gpu in [1]:
    for mb in [128, 256]:
        mx = init_mx
        my = init_my
        mz = init_mz
        nodes = 1
        # skip configurations without at least one meshblock per task
        if tasks_per_gpu*gpus_per_node > (init_mx/mb)*(init_my/mb)*(init_mz/mb):
            continue
        print("##########################################")
        print(f"\n# Weak static uniform grid scaling with one {mb}^3 meshblock per device. MPS oversub. = {tasks_per_gpu}.")
        log_str = lambda: f' |tee weak_static.out.nodes_{nodes}-mb_{mb}-mps_{tasks_per_gpu}\n'
        while True:
            print(launch_str() + cmd_str + input_str + param_str() + log_str())
            mx *= 2
            nodes *= 2
            if nodes > max_nodes:
                break
            print(launch_str() + cmd_str + input_str + param_str() + log_str())
            my *= 2
            nodes *= 2
            if nodes > max_nodes:
                break
            print(launch_str() + cmd_str + input_str + param_str() + log_str())
            mz *= 2
            nodes *= 2
            if nodes > max_nodes:
                break
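To use the generator, save it to a file and capture its output; the filename below is arbitrary. Note that the printed "Configuration ..." and "export KOKKOS_NUM_DEVICES=..." lines are informational, so the output should be copied into a batch script rather than executed directly:
# hypothetical usage of the generator script above
python3 gen_weak_scaling.py | tee weak_scaling_cmds.txt
# then add the printed export line to the job environment and copy the launch
# commands for the desired node counts into the batch script for the target machine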