Commit e5cead1

[TRTLLM-6295][test] Exit as early as possible and propagate exit status correctly for multi-node testing (#7739)
Signed-off-by: Yanchao Lu <[email protected]>
1 parent c076a02 commit e5cead1

File tree

3 files changed: +52, -138 lines

jenkins/scripts/slurm_run.sh

Lines changed: 1 addition & 2 deletions

@@ -45,8 +45,7 @@ export LLM_ROOT=$llmSrcNode
 export LLM_MODELS_ROOT=$MODEL_CACHE_DIR
 export UCX_TLS=^gdr_copy
 
-# TODO: Move back to tensorrt_llm/llmapi/trtllm-llmapi-launch later
-llmapiLaunchScript="$llmSrcNode/jenkins/scripts/trtllm-llmapi-launch"
+llmapiLaunchScript="$llmSrcNode/tensorrt_llm/llmapi/trtllm-llmapi-launch"
 chmod +x $llmapiLaunchScript
 cd $llmSrcNode/tests/integration/defs
 testCmdLines=(
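For context on how this path is consumed: the surrounding slurm_run.sh logic (outside this hunk) composes a test command and hands it to the launcher, which re-executes it as "$@". A minimal sketch of that pattern, assuming llmSrcNode is provided by the surrounding CI environment and with the pytest selector invented purely for illustration:

    #!/bin/bash
    # Hypothetical illustration of how a launcher wrapper like this is invoked;
    # the real testCmdLines contents live outside the hunk shown above.
    # Assumption: $llmSrcNode is exported by the surrounding Jenkins/SLURM job.
    llmapiLaunchScript="$llmSrcNode/tensorrt_llm/llmapi/trtllm-llmapi-launch"
    chmod +x "$llmapiLaunchScript"

    cd "$llmSrcNode/tests/integration/defs"

    # Build the test command as an array, then prefix it with the launcher,
    # which wraps the command with its own rank-aware startup logic.
    testCmdLines=(
        "$llmapiLaunchScript"
        python3 -m pytest -v "some_multi_node_test"   # placeholder selector
    )
    "${testCmdLines[@]}"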

jenkins/scripts/trtllm-llmapi-launch

Lines changed: 0 additions & 129 deletions
This file was deleted.

tensorrt_llm/llmapi/trtllm-llmapi-launch

Lines changed: 51 additions & 7 deletions
@@ -1,5 +1,5 @@
 #!/bin/bash
-set -e
+set -Eeo pipefail
 
 task_with_command=("$@")
 native_mpi_rank=$OMPI_COMM_WORLD_RANK
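The stricter shell options matter because a failure hidden inside a pipeline, a function, or a subshell would otherwise be swallowed. A standalone sketch (not part of the launcher) of what each option adds:

    #!/bin/bash
    # Standalone sketch of the error-handling options used above.
    #
    # -e          : exit immediately when a command fails
    # -E          : make ERR traps fire inside functions and subshells too
    # -o pipefail : a pipeline fails if ANY stage fails, not just the last one
    set -Eeo pipefail

    trap 'echo "ERR trap: command failed near line $LINENO" >&2' ERR

    # With plain `set -e`, this pipeline would NOT abort the script, because
    # `sort` (the last stage) succeeds even though `false` fails.
    # With `pipefail`, the pipeline's status is non-zero and the script exits.
    false | sort
    echo "never reached"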
@@ -47,7 +47,7 @@ log_stderr "tllm_mpi_size: $tllm_mpi_size"
 export_free_tcp_addr_for_spawn_proxy_process
 
 if [ -z "$mpi_rank" ] || [ "$mpi_rank" -eq 0 ]; then
-    log_stderr "rank${mpi_rank} run ${task_with_command[@]} in background"
+    log_stderr "Rank${mpi_rank} run ${task_with_command[@]} in background"
 
     # MPI doesn't allow spawn a process sharing the MPI environment in a MPI
     # process, or duplicate MPI_Init in the child process will cause undefined
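The rank test above selects the leader path for rank 0, and also for runs started outside MPI where no rank is set at all. A minimal sketch of that branching, assuming the rank comes from Open MPI's OMPI_COMM_WORLD_RANK as seen earlier in this script (other launchers may expose SLURM_PROCID or PMI_RANK instead):

    #!/bin/bash
    # Standalone sketch of leader/worker selection by MPI rank.
    mpi_rank=$OMPI_COMM_WORLD_RANK

    if [ -z "$mpi_rank" ] || [ "$mpi_rank" -eq 0 ]; then
        # Empty rank: started outside MPI, so this is the only process.
        # Rank 0: this process coordinates, so it runs the user task.
        echo "leader path (rank: ${mpi_rank:-none})"
    else
        # Every other rank acts as a worker.
        echo "worker path (rank: $mpi_rank)"
    fi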
@@ -70,16 +70,60 @@ if [ -z "$mpi_rank" ] || [ "$mpi_rank" -eq 0 ]; then
             done
         done
 
+        # Turn off "exit on error" so the following lines always run
+        set +e
+
         # Execute the task with cleaned environment
-        "${task_with_command[@]}"
-        # stop the MPI Comm server
+        "${task_with_command[@]}"
+        task_exit_code=$?
+        log_stderr "Rank${mpi_rank} Task exit code: $task_exit_code"
+
+        # Stop the MPI Comm server
         python3 -m tensorrt_llm.llmapi.mgmn_leader_node --action stop
+        mpi_exit_code=$?
+        log_stderr "Rank${mpi_rank} MPI Comm server exit code: $mpi_exit_code"
+
+        # Propagate task exit status
+        if [ $task_exit_code -ne 0 ]; then
+            exit $task_exit_code
+        else
+            exit $mpi_exit_code
+        fi
     ) &
 
-    log_stderr "rank${mpi_rank} run mgmn leader node with mpi_world_size: $(mpi_world_size) ..."
-    log_stderr "rank0 host: $HOSTNAME"
+    # Turn off "exit on error" so the following lines always run
+    set +e
+
+    # Capture subshell PID
+    subshell_pid=$!
+    log_stderr "Rank${mpi_rank} Subshell PID: $subshell_pid"
+
+    log_stderr "Rank${mpi_rank} run mgmn leader node with mpi_world_size: $(mpi_world_size) ..."
+    log_stderr "Rank0 host: $HOSTNAME"
     python3 -m tensorrt_llm.llmapi.mgmn_leader_node
+    mgmn_leader_node_exit_code=$?
+    log_stderr "Rank${mpi_rank} MGMN leader node exit code: $mgmn_leader_node_exit_code"
+
+    # Wait for subshell
+    wait $subshell_pid
+    # This is subshell's exit code
+    subshell_exit_code=$?
+    log_stderr "Rank${mpi_rank} Subshell exit code: $subshell_exit_code"
+
+    # Propagate subshell exit status
+    if [ $subshell_exit_code -ne 0 ]; then
+        exit $subshell_exit_code
+    else
+        exit $mgmn_leader_node_exit_code
+    fi
 else
-    log_stderr "rank${mpi_rank} run mgmn worker node with mpi_world_size: $(mpi_world_size) ..."
+    # Turn off "exit on error" so the following lines always run
+    set +e
+
+    log_stderr "Rank${mpi_rank} run mgmn worker node with mpi_world_size: $(mpi_world_size) ..."
     python3 -m tensorrt_llm.llmapi.mgmn_worker_node
+    mgmn_worker_node_exit_code=$?
+    log_stderr "Rank${mpi_rank} MGMN worker node exit code: $mgmn_worker_node_exit_code"
+
+    exit $mgmn_worker_node_exit_code
 fi
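Stripped of the project-specific commands, the control flow this hunk introduces is: drop errexit around the commands whose status must be inspected, capture each exit code explicitly, wait on the backgrounded subshell, and exit with the first meaningful failure. A standalone sketch of that pattern, with run_task and run_leader as placeholders for the wrapped test command and the blocking leader process:

    #!/bin/bash
    # Standalone sketch of the "capture and propagate exit codes" pattern above.
    set -Eeo pipefail

    run_task()   { sleep 1; return 3; }   # placeholder: the wrapped test command
    run_leader() { sleep 2; }             # placeholder: the blocking foreground process

    (
        set +e                  # keep going even if the task fails
        run_task
        task_exit_code=$?
        echo "task exit code: $task_exit_code" >&2
        # ...a cleanup step would run here; in this sketch the task status
        # is propagated directly.
        exit $task_exit_code
    ) &
    subshell_pid=$!             # remember the background subshell

    set +e                      # the foreground command may also fail
    run_leader
    leader_exit_code=$?

    wait "$subshell_pid"        # collect the subshell's exit status
    subshell_exit_code=$?

    # Propagate the first meaningful failure: the background task wins,
    # otherwise report the foreground process's status.
    if [ "$subshell_exit_code" -ne 0 ]; then
        exit "$subshell_exit_code"
    else
        exit "$leader_exit_code"
    fi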

0 commit comments
