11#! /bin/bash
2- set -e
2+ set -Eeo pipefail
33
44task_with_command=(" $@ " )
55native_mpi_rank=$OMPI_COMM_WORLD_RANK
@@ -47,7 +47,7 @@ log_stderr "tllm_mpi_size: $tllm_mpi_size"
4747export_free_tcp_addr_for_spawn_proxy_process
4848
4949if [ -z " $mpi_rank " ] || [ " $mpi_rank " -eq 0 ]; then
50- log_stderr " rank ${mpi_rank} run ${task_with_command[@]} in background"
50+ log_stderr " Rank ${mpi_rank} run ${task_with_command[@]} in background"
5151
5252 # MPI doesn't allow spawning a process that shares the MPI environment from
5353 # within an MPI process; a duplicate MPI_Init in the child process will cause undefined
@@ -70,16 +70,60 @@ if [ -z "$mpi_rank" ] || [ "$mpi_rank" -eq 0 ]; then
7070 done
7171 done
7272
73+ # Turn off "exit on error" so the following lines always run
74+ set +e
75+
7376 # Execute the task with cleaned environment
74- " ${task_with_command[@]} "
75- # stop the MPI Comm server
77+ " ${task_with_command[@]} "
78+ task_exit_code=$?
79+ log_stderr " Rank${mpi_rank} Task exit code: $task_exit_code "
80+
81+ # Stop the MPI Comm server
7682 python3 -m tensorrt_llm.llmapi.mgmn_leader_node --action stop
83+ mpi_exit_code=$?
84+ log_stderr " Rank${mpi_rank} MPI Comm server exit code: $mpi_exit_code "
85+
86+ # Propagate task exit status
87+ if [ $task_exit_code -ne 0 ]; then
88+ exit $task_exit_code
89+ else
90+ exit $mpi_exit_code
91+ fi
7792 ) &
7893
79- log_stderr " rank${mpi_rank} run mgmn leader node with mpi_world_size: $( mpi_world_size) ..."
80- log_stderr " rank0 host: $HOSTNAME "
94+ # Turn off "exit on error" so the following lines always run
95+ set +e
96+
97+ # Capture subshell PID
98+ subshell_pid=$!
99+ log_stderr " Rank${mpi_rank} Subshell PID: $subshell_pid "
100+
101+ log_stderr " Rank${mpi_rank} run mgmn leader node with mpi_world_size: $( mpi_world_size) ..."
102+ log_stderr " Rank0 host: $HOSTNAME "
81103 python3 -m tensorrt_llm.llmapi.mgmn_leader_node
104+ mgmn_leader_node_exit_code=$?
105+ log_stderr " Rank${mpi_rank} MGMN leader node exit code: $mgmn_leader_node_exit_code "
106+
107+ # Wait for subshell
108+ wait $subshell_pid
109+ # This is the subshell's exit code
110+ subshell_exit_code=$?
111+ log_stderr " Rank${mpi_rank} Subshell exit code: $subshell_exit_code "
112+
113+ # Propagate subshell exit status
114+ if [ $subshell_exit_code -ne 0 ]; then
115+ exit $subshell_exit_code
116+ else
117+ exit $mgmn_leader_node_exit_code
118+ fi
82119else
83- log_stderr " rank${mpi_rank} run mgmn worker node with mpi_world_size: $( mpi_world_size) ..."
120+ # Turn off "exit on error" so the following lines always run
121+ set +e
122+
123+ log_stderr " Rank${mpi_rank} run mgmn worker node with mpi_world_size: $( mpi_world_size) ..."
84124 python3 -m tensorrt_llm.llmapi.mgmn_worker_node
125+ mgmn_worker_node_exit_code=$?
126+ log_stderr " Rank${mpi_rank} MGMN worker node exit code: $mgmn_worker_node_exit_code "
127+
128+ exit $mgmn_worker_node_exit_code
85129fi
0 commit comments