Skip to content

Commit 36e49f1

Browse files
committed
Better error handling
Signed-off-by: Kaiyu Xie <[email protected]>
1 parent 9890896 commit 36e49f1

File tree

4 files changed: 15 additions and 7 deletions.

examples/disaggregated/slurm/benchmark/disaggr_torch.slurm

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,10 @@
77
#SBATCH --job-name=${job_name} # add your job name here or specify in the sbatch command
88
#SBATCH --time=02:00:00
99

10+
set -u
11+
set -e
12+
set -x
13+
1014
# Context servers arguments
1115
num_ctx_servers=${1}
1216
ctx_tp_size=${2}
@@ -227,9 +231,10 @@ srun -l --container-name=${container_name} \
227231

228232
# start benchmarking
229233
srun -l --container-name=${container_name} \
230-
--container-mounts=${mounts} \
231-
--mpi=pmix --overlap -N 1 -n 1 \
232-
bash ${workdir}/run_benchmark.sh ${isl} ${osl} ${multi_round} ${model_dir} "${concurrency}" ${streaming} ${full_logdir} > ${full_logdir}/benchmark.log 2>&1
234+
--container-mounts=${mounts} \
235+
--mpi=pmix --overlap -N 1 -n 1 \
236+
bash ${workdir}/run_benchmark.sh ${isl} ${osl} ${multi_round} ${model_dir} "${concurrency}" ${streaming} ${full_logdir} \
237+
&> ${full_logdir}/benchmark.log 2>&1
233238

234239
# try to kill the server and workers
235240
srun -l --container-name=${container_name} \

examples/disaggregated/slurm/benchmark/run_benchmark.sh

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,7 @@
11
#!/bin/bash
2-
3-
# Add error handling
4-
set -e
52
set -u
3+
set -e
4+
set -x
65
trap 'echo "Error occurred at line $LINENO"; exit 1' ERR
76

87
# Add parameter validation
@@ -26,7 +25,6 @@ if [[ ${SLURM_PROCID} != "0" ]]; then
2625
exit 0
2726
fi
2827

29-
set -x
3028
config_file=${log_path}/server_config.yaml
3129

3230
# check if the config file exists every 10 seconds timeout 1800 seconds

examples/disaggregated/slurm/benchmark/start_server.sh

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,7 @@
11
#!/bin/bash
2+
set -u
3+
set -e
4+
set -x
25

36
num_ctx_servers=$1
47
num_gen_servers=$2

examples/disaggregated/slurm/benchmark/start_worker.sh

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
#! /bin/bash
2+
set -u
3+
set -e
24
set -x
35

46
role=$1

0 commit comments

Comments
 (0)