diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py index 7740362687e0..29a86eec20ef 100644 --- a/python/sglang/srt/managers/scheduler.py +++ b/python/sglang/srt/managers/scheduler.py @@ -2074,14 +2074,21 @@ def get_next_batch_to_run(self) -> Optional[ScheduleBatch]: self.running_batch.batch_is_full = False # Merge the new batch into the running batch. - # For prefill-only batch, we can avoid going through decoding step. - if not self.last_batch.is_empty() and not self.last_batch.is_prefill_only: + if not self.last_batch.is_empty(): if self.running_batch.is_empty(): self.running_batch = self.last_batch else: # Merge running_batch with prefill batch self.running_batch.merge_batch(self.last_batch) + # For prefill-only batch, filter out finished requests since they + # won't go through the decode step. This keeps running_batch accurate + # for load reporting (num_running_reqs via /get_load). + # Runs outside the last_batch block so stale requests are cleaned + # even when no new batches arrive (e.g. traffic stops). + if self.running_batch.is_prefill_only: + self.running_batch.filter_batch() + if self.dllm_config is not None: new_batch = self.get_new_batch_dllm() else: @@ -2100,8 +2107,11 @@ def get_next_batch_to_run(self) -> Optional[ScheduleBatch]: # Run prefill first if possible ret = new_batch else: - # Run decode - if not self.running_batch.is_empty(): + # Run decode (skip for prefill-only batches) + if ( + not self.running_batch.is_empty() + and not self.running_batch.is_prefill_only + ): self.running_batch = self.update_running_batch(self.running_batch) ret = self.running_batch if not self.running_batch.is_empty() else None else: