Skip to content
Merged
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
13a1c49
fix: scheduler launch hang when non-current rank dies
alphabetc1 Mar 10, 2026
dee36dd
Merge branch 'main' into fix/scheduler_wait_polling
alphabetc1 Mar 11, 2026
172fc18
Merge branch 'main' into fix/scheduler_wait_polling
alphabetc1 Mar 13, 2026
6610bf2
fix
alphabetc1 Mar 13, 2026
e06e239
fix
alphabetc1 Mar 13, 2026
1814856
Merge branch 'main' into fix/scheduler_wait_polling
alphabetc1 Mar 18, 2026
b816dac
Merge branch 'main' into fix/scheduler_wait_polling
alphabetc1 Mar 20, 2026
fbd4fe0
Merge branch 'main' into fix/scheduler_wait_polling
alphabetc1 Mar 21, 2026
13cfeb4
Merge branch 'main' into fix/scheduler_wait_polling
alphabetc1 Mar 22, 2026
7951f88
Merge branch 'main' into fix/scheduler_wait_polling
alphabetc1 Mar 24, 2026
81becc4
Merge branch 'main' into fix/scheduler_wait_polling
alphabetc1 Mar 25, 2026
1d3fe08
Merge branch 'main' into fix/scheduler_wait_polling
alphabetc1 Mar 26, 2026
15b1c46
Merge branch 'main' into fix/scheduler_wait_polling
alphabetc1 Mar 26, 2026
babc90d
Merge branch 'main' into fix/scheduler_wait_polling
alphabetc1 Mar 26, 2026
066ffe6
Merge branch 'main' into fix/scheduler_wait_polling
alphabetc1 Mar 26, 2026
e1c7dc7
Merge branch 'main' into fix/scheduler_wait_polling
alphabetc1 Mar 26, 2026
a9bcdcf
Merge branch 'main' into fix/scheduler_wait_polling
alphabetc1 Mar 27, 2026
5409e77
Merge branch 'main' into fix/scheduler_wait_polling
alphabetc1 Mar 27, 2026
235060f
Merge branch 'main' into fix/scheduler_wait_polling
hnyls2002 Mar 29, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 36 additions & 11 deletions python/sglang/srt/entrypoints/engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -1229,24 +1229,49 @@ def _wait_for_scheduler_ready(
scheduler_pipe_readers: List,
scheduler_procs: List,
) -> List[Dict]:
"""Wait for the model to finish loading and return scheduler infos."""
"""Wait for the model to finish loading and return scheduler infos.

Uses polling to detect child process death quickly, rather than blocking
indefinitely on pipe recv(). This prevents the launch from hanging when
a child process is killed (e.g. by OOM killer via SIGKILL) before it can
send any data through the pipe.

On each poll timeout, checks ALL processes (not just the current one) so that
a death in any rank is detected promptly regardless of iteration order.
"""
scheduler_infos = []
for i in range(len(scheduler_pipe_readers)):
try:
data = scheduler_pipe_readers[i].recv()
except EOFError:
logger.error(
f"Rank {i} scheduler is dead. Please check if there are relevant logs."
)
scheduler_procs[i].join()
logger.error(f"Exit code: {scheduler_procs[i].exitcode}")
raise
while True:
if scheduler_pipe_readers[i].poll(timeout=5.0):
try:
data = scheduler_pipe_readers[i].recv()
except EOFError:
scheduler_procs[i].join(timeout=10)
raise RuntimeError(
f"Rank {i} scheduler died during initialization "
f"(exit code: {scheduler_procs[i].exitcode}). "
f"If exit code is -9 (SIGKILL), a common cause is the OS OOM killer. "
f"Run `dmesg -T | grep -i oom` to check."
)
scheduler_infos.append(data)
break
else:
# Check ALL processes, not just the current one
for j in range(len(scheduler_procs)):
if not scheduler_procs[j].is_alive():
scheduler_procs[j].join(timeout=10)
raise RuntimeError(
f"Rank {j} scheduler died during initialization "
f"(exit code: {scheduler_procs[j].exitcode}). "
f"If exit code is -9 (SIGKILL), a common cause is the OS OOM killer. "
f"Run `dmesg -T | grep -i oom` to check."
)

for data in scheduler_infos:
if data["status"] != "ready":
raise RuntimeError(
"Initialization failed. Please see the error messages above."
)
scheduler_infos.append(data)
return scheduler_infos


Expand Down
Loading