diff --git a/python/sglang/srt/entrypoints/engine.py b/python/sglang/srt/entrypoints/engine.py index 57762532ccc9..50ebacddc8b8 100644 --- a/python/sglang/srt/entrypoints/engine.py +++ b/python/sglang/srt/entrypoints/engine.py @@ -1229,24 +1229,49 @@ def _wait_for_scheduler_ready( scheduler_pipe_readers: List, scheduler_procs: List, ) -> List[Dict]: - """Wait for the model to finish loading and return scheduler infos.""" + """Wait for the model to finish loading and return scheduler infos. + + Uses polling to detect child process death quickly, rather than blocking + indefinitely on pipe recv(). This prevents the launch from hanging when + a child process is killed (e.g. by OOM killer via SIGKILL) before it can + send any data through the pipe. + + On each poll timeout, checks ALL processes (not just the current one) so that + a death in any rank is detected promptly regardless of iteration order. + """ scheduler_infos = [] for i in range(len(scheduler_pipe_readers)): - try: - data = scheduler_pipe_readers[i].recv() - except EOFError: - logger.error( - f"Rank {i} scheduler is dead. Please check if there are relevant logs." - ) - scheduler_procs[i].join() - logger.error(f"Exit code: {scheduler_procs[i].exitcode}") - raise + while True: + if scheduler_pipe_readers[i].poll(timeout=5.0): + try: + data = scheduler_pipe_readers[i].recv() + except EOFError: + scheduler_procs[i].join(timeout=10) + raise RuntimeError( + f"Rank {i} scheduler died during initialization " + f"(exit code: {scheduler_procs[i].exitcode}). " + f"If exit code is -9 (SIGKILL), a common cause is the OS OOM killer. " + f"Run `dmesg -T | grep -i oom` to check." + ) + scheduler_infos.append(data) + break + else: + # Check ALL processes, not just the current one + for j in range(len(scheduler_procs)): + if not scheduler_procs[j].is_alive(): + scheduler_procs[j].join(timeout=10) + raise RuntimeError( + f"Rank {j} scheduler died during initialization " + f"(exit code: {scheduler_procs[j].exitcode}). " + f"If exit code is -9 (SIGKILL), a common cause is the OS OOM killer. " + f"Run `dmesg -T | grep -i oom` to check." + ) + for data in scheduler_infos: if data["status"] != "ready": raise RuntimeError( "Initialization failed. Please see the error messages above." ) - scheduler_infos.append(data) return scheduler_infos