From de3859646b9041ad3c3e87f13f6b8c96cd937b64 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Wed, 13 May 2026 18:43:21 +0800 Subject: [PATCH 01/52] Prep: abort_request dedup for chunked-resume dual-queue holding MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When chunked-resume reqs are held in both waiting_queue and batch.reqs (stateless-scheduler refactor), abort_request would otherwise process them twice (queue pop + to_finish), causing duplicate send_output and double release_kv_cache. Build batch_rids upfront and skip waiting_queue removal for reqs already in batch — let to_finish path handle them. Pre-flight for stateless-scheduler v2. --- python/sglang/srt/managers/scheduler.py | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py index 10df1914af20..40b027cbdf8b 100644 --- a/python/sglang/srt/managers/scheduler.py +++ b/python/sglang/srt/managers/scheduler.py @@ -3550,10 +3550,22 @@ def handle_rpc_request(self, recv_req: RpcReqInput): def abort_request(self, recv_req: AbortReq): # todo hisparse, release resources for abort requests in hisparse coordinator + # Build batch rid set: chunked-resume reqs may live in both waiting_queue + # and batch.reqs simultaneously (stateless-scheduler refactor). Skip the + # waiting_queue removal for those — let the to_finish path below handle + # them, otherwise we send_output / release_kv_cache twice. 
+ if self.cur_batch is self.running_batch or self.cur_batch is None: + batch_reqs = self.running_batch.reqs + else: + batch_reqs = self.running_batch.reqs + self.cur_batch.reqs + batch_rids = {r.rid for r in batch_reqs} + # Delete requests in the waiting queue to_del = [] for i, req in enumerate(self.waiting_queue): - if recv_req.abort_all or req.rid.startswith(recv_req.rid): + if (recv_req.abort_all or req.rid.startswith(recv_req.rid)) and ( + req.rid not in batch_rids + ): to_del.append(i) # Sort in reverse order to avoid index issues when deleting @@ -3632,13 +3644,8 @@ def abort_request(self, recv_req: AbortReq): remaining_retracted.append(decode_req) self.disagg_decode_prealloc_queue.retracted_queue = remaining_retracted - # Delete requests in the running batch - if self.cur_batch is self.running_batch or self.cur_batch is None: - reqs = self.running_batch.reqs - else: - reqs = self.running_batch.reqs + self.cur_batch.reqs - - for req in reqs: + # Delete requests in the running batch (reuse batch_reqs built above) + for req in batch_reqs: if not req.finished() and ( recv_req.abort_all or req.rid.startswith(recv_req.rid) ): From c79a73bec4ae0d007dd80529882f3b59545b8e78 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Wed, 13 May 2026 18:44:35 +0800 Subject: [PATCH 02/52] Prep: subtract prefix_indices from waiting_queue pending tokens sum For chunked-resume reqs (after the upcoming stateless-scheduler switch) that live in waiting_queue with non-empty prefix_indices, summing req.seqlen overcounts the committed prefix. Switch to seqlen - prefix for waiting reqs; keep the chunked_req block until that field is removed. Today's behavior is unchanged for fresh waiting reqs whose prefix_indices is empty. Pre-flight for stateless-scheduler v2. 
--- python/sglang/srt/observability/scheduler_metrics_mixin.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/sglang/srt/observability/scheduler_metrics_mixin.py b/python/sglang/srt/observability/scheduler_metrics_mixin.py index 050895373542..86cd5bfb1e81 100644 --- a/python/sglang/srt/observability/scheduler_metrics_mixin.py +++ b/python/sglang/srt/observability/scheduler_metrics_mixin.py @@ -973,7 +973,9 @@ def _get_num_pending_tokens(self: Scheduler, chunk_deduct: int = 0) -> int: time ``prefix_indices`` is already up-to-date, so the default 0 is correct. """ - num_pending_tokens = sum(req.seqlen for req in self.waiting_queue) + num_pending_tokens = sum( + req.seqlen - len(req.prefix_indices) for req in self.waiting_queue + ) if self.chunked_req is not None: req = self.chunked_req num_pending_tokens += req.seqlen - len(req.prefix_indices) - chunk_deduct From a5915a193fa21dfd525e667bbd8783442288baea Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Wed, 13 May 2026 18:45:07 +0800 Subject: [PATCH 03/52] Prep: document filter_batch chunked-exclusion invariant Explicit comment that reqs still doing prefill (chunked-resume or DLLM staging) must not be merged into running_batch. Today enforced via chunked_req_to_exclude param; stateless-scheduler v2 will move to a per-req predicate. Pre-flight for v2. 
--- python/sglang/srt/managers/schedule_batch.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/python/sglang/srt/managers/schedule_batch.py b/python/sglang/srt/managers/schedule_batch.py index 75ed09458364..97deb27b03f2 100755 --- a/python/sglang/srt/managers/schedule_batch.py +++ b/python/sglang/srt/managers/schedule_batch.py @@ -2403,6 +2403,12 @@ def filter_batch( # FIXME(lsyin): deprecate this API after spec v1 is deprecated v1_spec_info_filtered: Optional[bool] = False, ): + # Invariant: reqs still doing prefill (chunked-resume or DLLM staging) + # must never be merged into running_batch via this filter — running_batch + # runs decode forward, and admitting a mid-prefill req there causes + # shape mismatch + double KV accounting. Today the invariant is enforced + # by callers passing chunked_req_to_exclude; the stateless-scheduler v2 + # refactor will move this to a per-req predicate. # FIXME(lsyin): used here to get the correct seq_lens # The batch has been launched but we need it verified to get correct next batch info self.maybe_wait_verify_done() From 1c3bf8e7dbc7f6a099a4af6b079209f48eed7c1a Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Wed, 13 May 2026 18:46:47 +0800 Subject: [PATCH 04/52] Bound cache_unfinished_req row read by kv_committed_len init_next_round_input resets req.fill_ids to length len(origin_input_ids) + len(output_ids) before stash, but the req_to_token row only holds valid KV indices up to kv_committed_len. Under SWA early-return (and other paths where admission backs off after init_next_round_input), reading [req_pool_idx, :len(fill_ids)] yields garbage beyond kv_committed_len, which then gets inserted into the radix tree as a prefix entry, causing prefix-hit corruption. Bound the read to req.kv_committed_len in all 6 cache impls. Add assert kv_committed_len >= cache_protected_len at each entry except chunk_cache (which only gets the bounded read) to surface state-machine violations as crashes rather than silent slice underflow. 
Touches: radix_cache, swa_radix_cache, unified_radix_cache, mamba_radix_cache, radix_cache_cpp, chunk_cache. Pre-flight for stateless-scheduler v2. --- python/sglang/srt/mem_cache/chunk_cache.py | 3 ++- python/sglang/srt/mem_cache/mamba_radix_cache.py | 9 ++++++--- python/sglang/srt/mem_cache/radix_cache.py | 11 +++++++++-- python/sglang/srt/mem_cache/radix_cache_cpp.py | 6 ++++-- python/sglang/srt/mem_cache/swa_radix_cache.py | 11 ++++++++--- python/sglang/srt/mem_cache/unified_radix_cache.py | 9 ++++++--- 6 files changed, 35 insertions(+), 14 deletions(-) diff --git a/python/sglang/srt/mem_cache/chunk_cache.py b/python/sglang/srt/mem_cache/chunk_cache.py index 6d34a3aa1fc2..8a970b4bedcb 100644 --- a/python/sglang/srt/mem_cache/chunk_cache.py +++ b/python/sglang/srt/mem_cache/chunk_cache.py @@ -84,8 +84,9 @@ def cache_finished_req(self, req: Req, is_insert: bool = True): self.token_to_kv_pool_allocator.free(kv_indices) def cache_unfinished_req(self, req: Req, chunked=False): + # Bound row read by kv_committed_len; see radix_cache.py for rationale. kv_indices = self.req_to_token_pool.req_to_token[ - req.req_pool_idx, : len(req.fill_ids) + req.req_pool_idx, : req.kv_committed_len ] # `req.prefix_indices` will be used in `PrefillAdder::add_chunked_req` later req.prefix_indices = kv_indices.to(dtype=torch.int64, copy=True) diff --git a/python/sglang/srt/mem_cache/mamba_radix_cache.py b/python/sglang/srt/mem_cache/mamba_radix_cache.py index 55ee7983e953..325b3aa08aae 100644 --- a/python/sglang/srt/mem_cache/mamba_radix_cache.py +++ b/python/sglang/srt/mem_cache/mamba_radix_cache.py @@ -599,17 +599,20 @@ def cache_finished_req(self, req: Req, is_insert: bool = True) -> None: def cache_unfinished_req(self, req: Req, chunked=False) -> None: """Cache request when it is unfinished.""" + # Bound row read by kv_committed_len; see radix_cache.py for rationale. 
+ assert req.kv_committed_len >= req.cache_protected_len + read_len = req.kv_committed_len def _skip_cache_unfinished_req(req: Req) -> None: kv_indices = self.req_to_token_pool.req_to_token[ - req.req_pool_idx, : len(req.fill_ids) + req.req_pool_idx, :read_len ] # `req.prefix_indices` will be used in `PrefillAdder::add_chunked_req` later req.prefix_indices = kv_indices.to(dtype=torch.int64, copy=True) return - token_ids = req.fill_ids + token_ids = req.fill_ids[:read_len] cache_len = ( req.mamba_last_track_seqlen if self.enable_mamba_extra_buffer @@ -619,7 +622,7 @@ def _skip_cache_unfinished_req(req: Req) -> None: return _skip_cache_unfinished_req(req) kv_indices_orig = self.req_to_token_pool.req_to_token[ - req.req_pool_idx, : len(token_ids) + req.req_pool_idx, :read_len ] # kv_indices is the kv indices to be cached kv_indices = kv_indices_orig[:cache_len] diff --git a/python/sglang/srt/mem_cache/radix_cache.py b/python/sglang/srt/mem_cache/radix_cache.py index 8a24c5e15926..9e1e93e48a79 100644 --- a/python/sglang/srt/mem_cache/radix_cache.py +++ b/python/sglang/srt/mem_cache/radix_cache.py @@ -487,9 +487,16 @@ def cache_unfinished_req(self, req: Req, chunked=False): if self.disable: return - token_ids = req.fill_ids + # Bound the row read by kv_committed_len (the actually-written prefix + # length on the row), not by len(fill_ids). They are equal in the + # common path, but init_next_round_input resets fill_ids to the full + # origin + output length while the row only holds KV up to + # kv_committed_len — reading beyond that yields garbage slot indices. 
+ assert req.kv_committed_len >= req.cache_protected_len + read_len = req.kv_committed_len + token_ids = req.fill_ids[:read_len] kv_indices = self.req_to_token_pool.req_to_token[ - req.req_pool_idx, : len(token_ids) + req.req_pool_idx, :read_len ] radix_key = RadixKey( diff --git a/python/sglang/srt/mem_cache/radix_cache_cpp.py b/python/sglang/srt/mem_cache/radix_cache_cpp.py index 66f9fad96ad7..834654e8e8f8 100644 --- a/python/sglang/srt/mem_cache/radix_cache_cpp.py +++ b/python/sglang/srt/mem_cache/radix_cache_cpp.py @@ -209,8 +209,10 @@ def cache_finished_req(self, req: Req, is_insert: bool = True): def cache_unfinished_req(self, req: Req, chunked=False): """Cache request when it is unfinished.""" assert req.req_pool_idx is not None - token_ids = req.fill_ids - prefill_len = len(token_ids) # prefill only (maybe chunked) + # Bound row read by kv_committed_len; see radix_cache.py for rationale. + assert req.kv_committed_len >= req.cache_protected_len + prefill_len = req.kv_committed_len + token_ids = req.fill_ids[:prefill_len] kv_indices = self.req_to_token_pool.req_to_token[ req.req_pool_idx, :prefill_len ].to(dtype=torch.int64, copy=True) diff --git a/python/sglang/srt/mem_cache/swa_radix_cache.py b/python/sglang/srt/mem_cache/swa_radix_cache.py index af2d99e96e6d..2457ec817446 100644 --- a/python/sglang/srt/mem_cache/swa_radix_cache.py +++ b/python/sglang/srt/mem_cache/swa_radix_cache.py @@ -482,18 +482,23 @@ def cache_finished_req(self, req: Req, is_insert: bool = True) -> None: def cache_unfinished_req(self, req: Req, chunked=False) -> None: """Cache request when it is unfinished.""" + # Bound the row read by kv_committed_len, not len(fill_ids); see + # radix_cache.py:cache_unfinished_req for the rationale (SWA early- + # return + init_next_round_input leaves fill_ids longer than the row). 
+ assert req.kv_committed_len >= req.cache_protected_len + read_len = req.kv_committed_len if self.disable: kv_indices = self.req_to_token_pool.req_to_token[ - req.req_pool_idx, : len(req.fill_ids) + req.req_pool_idx, :read_len ] # `req.prefix_indices` will be used in `PrefillAdder::add_chunked_req` later req.prefix_indices = kv_indices return - token_ids = req.fill_ids + token_ids = req.fill_ids[:read_len] kv_indices = self.req_to_token_pool.req_to_token[ - req.req_pool_idx, : len(token_ids) + req.req_pool_idx, :read_len ] radix_key = RadixKey( diff --git a/python/sglang/srt/mem_cache/unified_radix_cache.py b/python/sglang/srt/mem_cache/unified_radix_cache.py index 80a3da5bb190..ad0bff3b80b6 100644 --- a/python/sglang/srt/mem_cache/unified_radix_cache.py +++ b/python/sglang/srt/mem_cache/unified_radix_cache.py @@ -490,17 +490,20 @@ def cache_unfinished_req(self, req: Req, chunked=False, **kwargs) -> None: if self.session.try_cache_unfinished_req(req, chunked=chunked, **kwargs): return - token_ids = req.fill_ids + # Bound row read by kv_committed_len; see radix_cache.py for rationale. + assert req.kv_committed_len >= req.cache_protected_len + read_len = req.kv_committed_len + token_ids = req.fill_ids[:read_len] if self.disable: kv_indices = self.req_to_token_pool.req_to_token[ - req.req_pool_idx, : len(token_ids) + req.req_pool_idx, :read_len ] req.prefix_indices = kv_indices return kv_indices_orig = self.req_to_token_pool.req_to_token[ - req.req_pool_idx, : len(token_ids) + req.req_pool_idx, :read_len ] # components prepare insert data + return effective cache_len From 9b361aef46ae6af35700f0b1cf3c7aa59a4fa38a Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Wed, 13 May 2026 18:47:22 +0800 Subject: [PATCH 05/52] Drop is_chunked from req_to_token_pool alloc assert MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Row-pool reuse should only check 'this row has committed KV' — not whether the req is chunked. 
kv_committed_len > 0 covers chunked-resume, DLLM staging, and any other reuse case. Step in decoupling chunked from req_pool_idx. Same simplification applied to disaggregation/decode.py. Pre-flight for stateless-scheduler v2. --- python/sglang/srt/disaggregation/decode.py | 4 ++-- python/sglang/srt/mem_cache/memory_pool.py | 13 +++++-------- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/python/sglang/srt/disaggregation/decode.py b/python/sglang/srt/disaggregation/decode.py index b9f8e5994381..03c70f8ee84b 100644 --- a/python/sglang/srt/disaggregation/decode.py +++ b/python/sglang/srt/disaggregation/decode.py @@ -152,8 +152,8 @@ def alloc(self, reqs: List["Req"]) -> Optional[List[int]]: len(reusing) <= 1 ), "only one chunked request may reuse req_pool_idx in a batch" assert all( - reqs[i].is_chunked > 0 or reqs[i].kv_committed_len > 0 for i in reusing - ), "reusing request must be chunked or have committed KV" + reqs[i].kv_committed_len > 0 for i in reusing + ), "reusing request must have committed KV" need_size = len(reqs) - len(reusing) if need_size > len(self.free_slots): diff --git a/python/sglang/srt/mem_cache/memory_pool.py b/python/sglang/srt/mem_cache/memory_pool.py index 23d15f71ac4d..43f76c112c34 100644 --- a/python/sglang/srt/mem_cache/memory_pool.py +++ b/python/sglang/srt/mem_cache/memory_pool.py @@ -160,15 +160,12 @@ def alloc(self, reqs: list[Req]) -> Optional[List[int]]: # Indices of reqs that already have a req_pool_idx and will reuse # their existing slot (e.g. chunked prefill continuing across chunks). 
reusing = [i for i, r in enumerate(reqs) if r.req_pool_idx is not None] - # NOTE: this check is relaxed temporarily - # https://github.com/sgl-project/sglang/pull/20476 - # if not any(r.is_dllm() for r in reqs): - # assert ( - # sum(1 for i in reusing if reqs[i].is_chunked > 0) <= 1 - # ), "only one chunked request may reuse req_pool_idx in a batch" + # The row pool only cares whether the row has committed KV — it does + # not need to know whether the req is chunked. kv_committed_len > 0 + # naturally covers chunked-resume + DLLM staging + any reuse case. assert all( - reqs[i].is_chunked > 0 or reqs[i].kv_committed_len > 0 for i in reusing - ), "reusing request must be chunked or have committed KV" + reqs[i].kv_committed_len > 0 for i in reusing + ), "reusing request must have committed KV" need_size = len(reqs) - len(reusing) if need_size > len(self.free_slots): From 74f1d8bbabba6b9395d7612e4e826c0978d7d10e Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Wed, 13 May 2026 18:51:40 +0800 Subject: [PATCH 06/52] Unify chunked admission via add_one_req reuse branch + add has_pending_chunk MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the dedicated add_chunked_req method with a reuse branch inside add_one_req, gated on (req.kv_committed_len > 0 and not req.is_dllm()). The reuse branch: - skips _req_inc_lock_ref (lock already held by previous stash) - skips init_load_back (host_hit_length naturally 0 after reset in prepare_for_extend post-metric-computation) - passes 0 as prefix budget (already counted by previous stash) Add Req.has_pending_chunk: bool — persistent cross-iter flag set by admission when truncated=True, cleared on last-chunk admit or retract. Used (here) to mirror Scheduler.chunked_req; future commits will use it to drive Stage A stash and filter_batch predicates. 
Delete: - PrefillAdder.add_chunked_req method - PrefillAdder.new_chunked_req field - has_chunked_req= parameter on add_one_req (unused, removed call sites) Scheduler.chunked_req is retained at this commit and synchronized via has_pending_chunk after admission (single-flight invariant asserted). Add host_hit_length reset inside prepare_for_extend right after the cached_tokens_host metric is recorded — required so chunked-resume reqs don't re-trigger init_load_back on subsequent admissions (preflight 7). Part of stateless-scheduler v2. --- python/sglang/srt/dllm/mixin/scheduler.py | 1 - python/sglang/srt/managers/schedule_batch.py | 13 +++ python/sglang/srt/managers/schedule_policy.py | 83 +++++++++---------- python/sglang/srt/managers/scheduler.py | 36 ++++++-- 4 files changed, 78 insertions(+), 55 deletions(-) diff --git a/python/sglang/srt/dllm/mixin/scheduler.py b/python/sglang/srt/dllm/mixin/scheduler.py index 157ab219276b..e8a563703811 100644 --- a/python/sglang/srt/dllm/mixin/scheduler.py +++ b/python/sglang/srt/dllm/mixin/scheduler.py @@ -256,7 +256,6 @@ def process_dllm_incoming_reqs( req.init_next_round_input(self.tree_cache) res = adder.add_one_req( req, - has_chunked_req=True, truncation_align_size=self.truncation_align_size, ) diff --git a/python/sglang/srt/managers/schedule_batch.py b/python/sglang/srt/managers/schedule_batch.py index 97deb27b03f2..2d1733d9f4b2 100755 --- a/python/sglang/srt/managers/schedule_batch.py +++ b/python/sglang/srt/managers/schedule_batch.py @@ -757,6 +757,14 @@ def __init__( # processed. self.is_chunked = 0 + # Persistent (cross-iter) flag set by admission when this req's + # current admission was truncated (more chunks remain). Cleared + # when last chunk is admitted (truncated=False) or on retract. + # Used by Stage A stash detection, filter_batch exclusion, and + # add_one_req's reuse-vs-fresh branch. Independent of is_chunked + # counter (transient) and kv_committed_len (derived). 
+ self.has_pending_chunk = False + # For retraction self.is_retracted = False # Indicates if the req has ever been retracted. @@ -1258,6 +1266,7 @@ def reset_for_retract(self): self.temp_input_top_logprobs_idx = None self.extend_logprob_start_len = 0 self.is_chunked = 0 + self.has_pending_chunk = False self.mamba_pool_idx = None self.mamba_ping_pong_track_buffer = None self.mamba_next_track_idx = None @@ -1835,6 +1844,10 @@ def prepare_for_extend(self): req.cached_tokens_host = host_portion req.cached_tokens_storage = storage_portion req._cache_breakdown_computed = True + # Reset host_hit_length after metric is computed so that + # subsequent chunks' admission paths see host_hit_length == 0 + # and naturally skip init_load_back (host KV already loaded). + req.host_hit_length = 0 req.already_computed = seq_len req.is_retracted = False diff --git a/python/sglang/srt/managers/schedule_policy.py b/python/sglang/srt/managers/schedule_policy.py index 29b90038ad26..b924592fc1f1 100644 --- a/python/sglang/srt/managers/schedule_policy.py +++ b/python/sglang/srt/managers/schedule_policy.py @@ -441,7 +441,6 @@ def __init__( self.req_states = None self.can_run_list = [] self.preempt_list = [] - self.new_chunked_req = None self.log_hit_tokens = 0 # TODO(lsyin): report the real input tokens excluding page alignment self.log_input_tokens = 0 @@ -663,41 +662,6 @@ def add_dllm_staging_req(self, req: Req): else AddReqResult.CONTINUE ) - def add_chunked_req(self, req: Req): - if self.dllm_config is not None: - _rem_tokens = self._get_dllm_remain_tokens() - else: - _rem_tokens = min(self.rem_chunk_tokens, int(self.rem_total_tokens)) - if self.is_hybrid_swa: - # alloc_extend needs extend_num_tokens + page_size per request, - # so reserve one page here to avoid OOM - _rem_tokens = min( - _rem_tokens, int(self.rem_swa_tokens) - self.page_size - ) - # The chunked_req must be added to the list; otherwise, it will cause a memory leak. 
- # Therefore, in certain cases where _rem_tokens <= 0, it should be replaced with rem_chunk_tokens. - if _rem_tokens <= 0: - if self.is_hybrid_swa: - return req - _rem_tokens = self.rem_chunk_tokens - - truncated = req.extend_input_len > _rem_tokens - req.set_extend_input_len(min(req.extend_input_len, _rem_tokens)) - req.fill_ids = req.fill_ids[: len(req.prefix_indices) + req.extend_input_len] - self.can_run_list.append(req) - self._update_prefill_budget( - 0, - req.extend_input_len, - ( - min(req.sampling_params.max_new_tokens, CLIP_MAX_NEW_TOKENS) - if not truncated - else 0 - ), - ) - - # Return if chunked prefill not finished - return req if truncated else None - @contextmanager def _lock_node(self, last_node: TreeNode): dec_lock_params = None @@ -784,6 +748,7 @@ def add_req_state(r, insert_sort=False): return AddReqResult.OTHER self._add_dllm_req(req, 0) + truncated = False elif ( self.rem_chunk_tokens is None # chunked prefill is disabled or req.extend_input_len <= self.rem_chunk_tokens # it is the last chunk @@ -795,6 +760,7 @@ def add_req_state(r, insert_sort=False): req.extend_input_len, min(req.sampling_params.max_new_tokens, CLIP_MAX_NEW_TOKENS), ) + truncated = False else: if self.rem_chunk_tokens <= 0: return AddReqResult.OTHER @@ -805,14 +771,24 @@ def add_req_state(r, insert_sort=False): req.set_extend_input_len(trunc_len) req.fill_ids = req.fill_ids[:trunc_len] self.can_run_list.append(req) - self.new_chunked_req = req self._update_prefill_budget(0, trunc_len, 0) + truncated = True + + if not req.is_dllm(): + req.has_pending_chunk = truncated return self.budget_state() def add_one_req( - self, req: Req, has_chunked_req: bool, truncation_align_size: Optional[int] + self, req: Req, truncation_align_size: Optional[int] ): + # Reuse path: this req was admitted in a previous iter, has a row + # with committed KV (kv_committed_len > 0), and is mid-prefill. 
Skip + # fresh-req setup (lock_ref already held by previous stash; + # init_load_back already ran on first admission; prefix already + # counted in tree). DLLM has its own path and never takes reuse here. + is_resume = req.kv_committed_len > 0 and not req.is_dllm() + if (self.prefill_delayer_single_pass is not None) and ( not self.prefill_delayer_single_pass.negotiate_should_allow_prefill( local_prefillable=True, @@ -874,6 +850,10 @@ def add_one_req( if swa_needed >= self.rem_swa_tokens: return AddReqResult.NO_TOKEN + # Fresh-only init_load_back. For reuse, host_hit_length was set + # on first admission and reset by prepare_for_extend after the + # cache-breakdown metric was computed, so the predicate naturally + # short-circuits here for reuse. if req.host_hit_length > 0: new_indices, req.last_node = self.tree_cache.init_load_back( InitLoadBackParams( @@ -892,6 +872,10 @@ def add_one_req( if input_tokens >= self.rem_input_tokens and len(self.can_run_list) != 0: return AddReqResult.OTHER + # Budget prefix_len: 0 for reuse (already counted by previous + # admission's stash into tree); actual prefix_len for fresh. + budget_prefix = 0 if is_resume else prefix_len + if self.dllm_config is not None: if self.rem_dllm_tokens <= 0: return AddReqResult.OTHER @@ -902,20 +886,24 @@ def add_one_req( self._add_dllm_req(req, prefix_len) self._req_inc_lock_ref(req) + truncated = False elif self.rem_chunk_tokens is None or input_tokens <= self.rem_chunk_tokens: - # Non-chunked prefill + # Non-chunked prefill (or last chunk of a chunked-resume req). self.can_run_list.append(req) - self._req_inc_lock_ref(req) + if not is_resume: + self._req_inc_lock_ref(req) self._update_prefill_budget( - prefix_len, + budget_prefix, input_tokens, min( req.sampling_params.max_new_tokens, CLIP_MAX_NEW_TOKENS, ), ) + truncated = False else: + # Chunked prefill: this admission doesn't complete the prefill. 
# Make sure at least one page is available trunc_len = self.rem_chunk_tokens // self.page_size * self.page_size @@ -940,15 +928,20 @@ def add_one_req( if trunc_len <= 0: return AddReqResult.OTHER - # Chunked prefill req.set_extend_input_len(trunc_len) req.fill_ids = req.fill_ids[: len(req.prefix_indices) + trunc_len] self.can_run_list.append(req) - self.new_chunked_req = req - self._req_inc_lock_ref(req) - self._update_prefill_budget(prefix_len, trunc_len, 0) + if not is_resume: + self._req_inc_lock_ref(req) + self._update_prefill_budget(budget_prefix, trunc_len, 0) + truncated = True + + # has_pending_chunk: persistent flag carrying chunked-resume state + # across iters. DLLM uses its own staging_queue + is_chunked counter. + if not req.is_dllm(): + req.has_pending_chunk = truncated return self.budget_state() diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py index 40b027cbdf8b..79d680e57d15 100644 --- a/python/sglang/srt/managers/scheduler.py +++ b/python/sglang/srt/managers/scheduler.py @@ -2698,12 +2698,23 @@ def _get_new_batch_prefill_raw( waiting_queue_len=len(self.waiting_queue), ) + # Re-admit the in-flight chunked req via the unified add_one_req + # entry. add_one_req's reuse branch (gated on kv_committed_len > 0) + # mirrors the old add_chunked_req's behavior: skip lock_ref inc, + # init_load_back, and prefix budget. Sets req.has_pending_chunk to + # truncated. if self.chunked_req is not None: self.chunked_req.init_next_round_input() - self.chunked_req = adder.add_chunked_req(self.chunked_req) - self._chunked_req_scheduled_last_iter = ( - self.chunked_req in adder.can_run_list + adder.add_one_req( + self.chunked_req, + truncation_align_size=self.truncation_align_size, ) + # After admit, has_pending_chunk reflects whether more chunks + # remain. Mirror it into self.chunked_req for the existing + # Stage A stash path (deleted in a later commit). 
+ if not self.chunked_req.has_pending_chunk: + self.chunked_req = None + self._chunked_req_scheduled_last_iter = self.chunked_req is not None else: self._chunked_req_scheduled_last_iter = False @@ -2750,7 +2761,6 @@ def _get_new_batch_prefill_raw( req.init_next_round_input(self.tree_cache) res = adder.add_one_req( req, - has_chunked_req=(self.chunked_req is not None), truncation_align_size=self.truncation_align_size, ) @@ -2793,12 +2803,20 @@ def _get_new_batch_prefill_raw( for req in adder.preempt_list: self._add_request_to_queue(req) - if adder.new_chunked_req is not None: - # Update chunked prefill + # Identify newly-truncated chunked-resume reqs admitted this iter via + # add_one_req's reuse/chunked branch. has_pending_chunk is set by + # add_one_req when truncated=True. The "newly chunked" set excludes + # self.chunked_req which was already tracked from previous iter. + new_chunked = [ + r for r in can_run_list if r.has_pending_chunk and r is not self.chunked_req + ] + assert ( + len(new_chunked) <= 1 + ), "single-flight invariant: at most one new chunked req per iter" + if new_chunked: assert self.chunked_req is None - self.chunked_req = adder.new_chunked_req - # new_chunked_req is added to can_run_list by add_one_req, - # so it will be scheduled this iter -> stash is needed next iter. + self.chunked_req = new_chunked[0] + # The chunked req is scheduled this iter -> stash needed next iter. self._chunked_req_scheduled_last_iter = True if self.chunked_req is not None: From c445a82cf5738f6165456e3ae4541596e8fa4d72 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Wed, 13 May 2026 18:56:56 +0800 Subject: [PATCH 07/52] Switch chunked-resume to waiting_queue holding; delete chunked_req fields Core semantic switch of the stateless-scheduler refactor. Scheduler / Batch no longer maintain any chunked-aware state. Chunked- resume reqs sit in self.waiting_queue across iters with priority + the new Req.has_pending_chunk flag from commit 6. 
Deletes: - Scheduler.chunked_req field - Scheduler._chunked_req_scheduled_last_iter field - Scheduler.stash_chunked_request method - ScheduleBatch.chunked_req field - filter_batch's chunked_req_to_exclude= parameter - ScheduleBatch.init_new's chunked_req= parameter Changes: - Stage A stash: replaced 'if self.chunked_req: stash(self.chunked_req)' with 'for req in waiting_queue: if req.has_pending_chunk and not is_dllm: cache_unfinished_req(...)'. DLLM staging stash kept as separate sub-loop (DLLM reqs live in dllm_manager.staging_queue, not waiting_queue). - filter_batch predicate: not finished AND not has_pending_chunk AND not is_chunked > 0 AND not is_dllm. The is_chunked > 0 clause covers the PP window where mb_a's last-chunk admit cleared has_pending_chunk but mb_b's middle chunk is still in-flight (preflight 2). - merge_batch asserts no req has has_pending_chunk (downstream invariant). - get_new_batch_prefill: admission loop now handles chunked-resume naturally via priority + reuse branch in add_one_req. Removed dedicated pre-loop block. Dynamic chunking + chunk_deduct now derived from the single chunked-resume req in waiting_queue. - waiting_queue removal at end of admission: keeps reqs with has_pending_chunk so they stay for next iter. - Use init_next_round_input() without tree_cache for chunked-resume in the admission loop (preserves last_node + lock_ref pairing). Disaggregation: - prefill.py process_prefill_chunk: per-req stash for waiting_queue chunked-resume; filter_batch uses internal predicate. - decode.py prebuilt path: assert reframed against has_pending_chunk. Metrics: - _get_num_pending_tokens: drop the chunked_req block (already counted via waiting_queue sum). Part of stateless-scheduler v2. 
--- python/sglang/srt/disaggregation/decode.py | 2 +- python/sglang/srt/disaggregation/prefill.py | 38 ++-- python/sglang/srt/managers/schedule_batch.py | 30 +-- python/sglang/srt/managers/scheduler.py | 177 ++++++++---------- python/sglang/srt/mem_cache/chunk_cache.py | 2 +- .../sglang/srt/mem_cache/mamba_radix_cache.py | 4 +- python/sglang/srt/mem_cache/radix_cache.py | 2 +- .../sglang/srt/mem_cache/swa_radix_cache.py | 4 +- .../observability/scheduler_metrics_mixin.py | 9 +- 9 files changed, 118 insertions(+), 150 deletions(-) diff --git a/python/sglang/srt/disaggregation/decode.py b/python/sglang/srt/disaggregation/decode.py index 03c70f8ee84b..797071794d68 100644 --- a/python/sglang/srt/disaggregation/decode.py +++ b/python/sglang/srt/disaggregation/decode.py @@ -1646,7 +1646,7 @@ def get_next_disagg_decode_batch_to_run( # Process pending prebuilt batch: output processing + filter + merge new_prebuilt_batch = self.get_new_prebuilt_batch() if new_prebuilt_batch: - assert self.chunked_req is None + assert not any(r.has_pending_chunk for r in self.waiting_queue) self.process_batch_result_prebuilt(new_prebuilt_batch) new_prebuilt_batch.filter_batch() if not new_prebuilt_batch.is_empty(): diff --git a/python/sglang/srt/disaggregation/prefill.py b/python/sglang/srt/disaggregation/prefill.py index 7ddcbe169d7d..715b7739ccbf 100644 --- a/python/sglang/srt/disaggregation/prefill.py +++ b/python/sglang/srt/disaggregation/prefill.py @@ -715,30 +715,26 @@ def get_transferred_rids(self: Scheduler) -> List[str]: return transferred_rids def process_prefill_chunk(self: Scheduler) -> None: - chunked_req_to_exclude = set() - if self.chunked_req: - chunked_req_to_exclude.add(self.chunked_req) - maybe_cache_unfinished_req(self.chunked_req, self.tree_cache, chunked=True) - if self.enable_overlap: - # Delay KV transfer to process_batch_result_disagg_prefill when overlap is enabled to ensure results are resolved - self.chunked_req.tmp_end_idx = min( - len(self.chunked_req.fill_ids), 
- len(self.chunked_req.origin_input_ids), - ) - else: - self.send_kv_chunk(self.chunked_req) - self.running_batch.batch_is_full = False + # Per-req stash for any in-flight chunked-resume reqs (now sitting in + # the waiting_queue with has_pending_chunk == True). + for req in self.waiting_queue: + if req.has_pending_chunk and not req.is_dllm(): + maybe_cache_unfinished_req(req, self.tree_cache, chunked=True) + if self.enable_overlap: + # Delay KV transfer to process_batch_result_disagg_prefill + # when overlap is enabled to ensure results are resolved. + req.tmp_end_idx = min( + len(req.fill_ids), + len(req.origin_input_ids), + ) + else: + self.send_kv_chunk(req) + self.running_batch.batch_is_full = False if self.last_batch and self.last_batch.forward_mode.is_extend(): - if self.last_batch.chunked_req: - # In the context pipeline parallelism, after the last chunk, the current microbatch still track outdated chunked_req. - # We need to discard it. - chunked_req_to_exclude.add(self.last_batch.chunked_req) - + # filter_batch's internal predicate excludes still-prefilling reqs. last_bs = self.last_batch.batch_size() - self.last_batch.filter_batch( - chunked_req_to_exclude=list(chunked_req_to_exclude) - ) + self.last_batch.filter_batch() if self.last_batch.batch_size() < last_bs: self.running_batch.batch_is_full = False diff --git a/python/sglang/srt/managers/schedule_batch.py b/python/sglang/srt/managers/schedule_batch.py index 2d1733d9f4b2..83254cb50e9f 100755 --- a/python/sglang/srt/managers/schedule_batch.py +++ b/python/sglang/srt/managers/schedule_batch.py @@ -1402,9 +1402,6 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin): # This is an optimization to reduce the overhead of the prefill check. 
batch_is_full: bool = False - # For chunked prefill in PP - chunked_req: Optional[Req] = None - # Sampling info sampling_info: SamplingBatchInfo = None @@ -1538,7 +1535,6 @@ def init_new( model_config: ModelConfig, enable_overlap: bool, spec_algorithm: SpeculativeAlgorithm, - chunked_req: Optional[Req] = None, dllm_config: Optional[DllmConfig] = None, ): return_logprob = any(req.return_logprob for req in reqs) @@ -1564,7 +1560,6 @@ def init_new( return_routed_experts=any(req.return_routed_experts for req in reqs), return_indexer_topk=any(req.return_indexer_topk for req in reqs), is_prefill_only=all(req.is_prefill_only for req in reqs), - chunked_req=chunked_req, dllm_config=dllm_config, ) return batch @@ -2411,31 +2406,29 @@ def maybe_wait_verify_done(self): def filter_batch( self, - chunked_req_to_exclude: Optional[Union[Req, List[Req]]] = None, keep_indices: Optional[List[int]] = None, # FIXME(lsyin): deprecate this API after spec v1 is deprecated v1_spec_info_filtered: Optional[bool] = False, ): - # Invariant: reqs still doing prefill (chunked-resume or DLLM staging) + # Invariant: reqs still doing prefill (chunked-resume or DLLM-managed) # must never be merged into running_batch via this filter — running_batch # runs decode forward, and admitting a mid-prefill req there causes - # shape mismatch + double KV accounting. Today the invariant is enforced - # by callers passing chunked_req_to_exclude; the stateless-scheduler v2 - # refactor will move this to a per-req predicate. + # shape mismatch + double KV accounting. 
Enforced per-req: + # - has_pending_chunk: chunked-resume scheduled to continue + # - is_chunked > 0: PP in-flight middle chunk for this req + # - is_dllm(): DllmManager-managed (separate staging queue) # FIXME(lsyin): used here to get the correct seq_lens # The batch has been launched but we need it verified to get correct next batch info self.maybe_wait_verify_done() if keep_indices is None: - if isinstance(chunked_req_to_exclude, Req): - chunked_req_to_exclude = [chunked_req_to_exclude] - elif chunked_req_to_exclude is None: - chunked_req_to_exclude = [] keep_indices = [ i for i in range(len(self.reqs)) if not self.reqs[i].finished() - and self.reqs[i] not in chunked_req_to_exclude + and not self.reqs[i].has_pending_chunk + and not self.reqs[i].is_chunked > 0 + and not self.reqs[i].is_dllm() ] if keep_indices is None or len(keep_indices) == 0: @@ -2506,6 +2499,13 @@ def merge_batch(self, other: "ScheduleBatch"): # future. Synchronize here to avoid a cross-stream data race. self.maybe_wait_verify_done() + # Invariant: chunked-resume / mid-prefill reqs must never reach + # running_batch via merge — running_batch runs decode forward and + # admitting a prefill-in-progress req there breaks shape + KV accounting. + # filter_batch's predicate is responsible for excluding these from + # last_batch before this merge call. + assert not any(r.has_pending_chunk for r in other.reqs) + # Penalizer orchestrator must be merged before Batch.reqs is merged. This is because # orchestrator.merge() depends on Batch.reqs during preparation of each penalizers, so it # needs to be called with pre-merged Batch.reqs. 
diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py index 79d680e57d15..644171ec888a 100644 --- a/python/sglang/srt/managers/scheduler.py +++ b/python/sglang/srt/managers/scheduler.py @@ -1082,16 +1082,11 @@ def init_chunked_prefill(self): self.chunked_prefill_size = None elif self.chunked_prefill_size is not None and self.chunked_prefill_size <= 0: self.chunked_prefill_size = None - self.chunked_req = None - # Tracks whether the current self.chunked_req was actually scheduled - # into last iteration's batch (i.e., in can_run_list -> got a fresh - # req_pool_idx from prepare_for_extend). Used to gate the - # stash_chunked_request call at the top of get_next_batch_to_run: - # if add_chunked_req early-returned under hybrid-SWA pressure, - # the req_pool_idx was already freed and fill_ids was reset by - # init_next_round_input, so running stash would double-free and - # corrupt prefix_indices. - self._chunked_req_scheduled_last_iter = False + # Chunked-resume tracking is now per-req (Req.has_pending_chunk + + # is_chunked counter); the scheduler no longer holds a global pointer. + # Stage A stashes any waiting_queue req with has_pending_chunk; cache + # impls bound row reads by kv_committed_len so a stash after + # init_next_round_input is safe without the old gate. 
self.is_mixed_chunk = ( self.chunked_prefill_size is not None and self.server_args.enable_mixed_chunk @@ -2443,9 +2438,6 @@ def handle_batch_embedding_request( for tokenized_req in recv_req: self.handle_embedding_request(tokenized_req) - def stash_chunked_request(self, req: Req): - maybe_cache_unfinished_req(req, self.tree_cache, chunked=True) - def _build_hisparse_decode_batch(self, reqs): """Build a ScheduleBatch for hisparse requests transitioning from staging to decode.""" device = self.device @@ -2490,21 +2482,17 @@ def get_next_batch_to_run(self) -> Optional[ScheduleBatch]: if self.dllm_config is not None: self.dllm_manager.filter_finished_reqs() - # Merge the prefill batch into the running batch - chunked_req_to_exclude = set() + # Stage A: stash any in-flight chunked prefill KV into radix tree. + # Per-req loop over waiting_queue covers chunked-resume; DLLM staging + # reqs are owned by DllmManager (not in waiting_queue), handled + # separately below. + for req in self.waiting_queue: + if req.has_pending_chunk and not req.is_dllm(): + maybe_cache_unfinished_req(req, self.tree_cache, chunked=True) if self.dllm_config is not None and self.dllm_manager.any_staging_reqs(): - chunked_req_to_exclude.update(self.dllm_manager.staging_queue) for req in self.dllm_manager.staging_queue: - self.stash_chunked_request(req) - - if self.chunked_req is not None: - # Move the chunked request out of the batch so that we can merge - # only finished requests to running_batch. - chunked_req_to_exclude.add(self.chunked_req) - - if self._chunked_req_scheduled_last_iter: - self.stash_chunked_request(self.chunked_req) + maybe_cache_unfinished_req(req, self.tree_cache, chunked=True) # HiSparse has its own prefill-to-decode transition; skip last_batch merge. 
if self.enable_hisparse: @@ -2524,19 +2512,10 @@ def get_next_batch_to_run(self) -> Optional[ScheduleBatch]: and self.last_batch and self.last_batch.forward_mode.is_extend() ): - if self.last_batch.chunked_req is not None: - # In the context pipeline parallelism, after the last chunk, the current microbatch still track outdated chunked_req. - # We need to discard it. - chunked_req_to_exclude.add(self.last_batch.chunked_req) - - if self.dllm_config is not None and self.last_batch.reqs: - chunked_req_to_exclude.update(self.last_batch.reqs) - - # Filter batch + # filter_batch's internal predicate excludes still-prefilling reqs + # (has_pending_chunk / is_chunked > 0 / is_dllm) from merge. last_bs = self.last_batch.batch_size() - self.last_batch.filter_batch( - chunked_req_to_exclude=list(chunked_req_to_exclude) - ) + self.last_batch.filter_batch() if self.last_batch.batch_size() < last_bs: self.running_batch.batch_is_full = False @@ -2642,21 +2621,26 @@ def _get_new_batch_prefill_raw( # Reset batch_is_full to try preemption with a prefill adder. self.running_batch.batch_is_full = False + # Identify any in-flight chunked-resume req held in waiting_queue — + # priority + has_pending_chunk make it sit at the head, but its + # presence relaxes the "is queue empty / pool full" early exits below + # (we must keep scheduling it to make progress, or memory leaks). + has_chunked_resume = any(r.has_pending_chunk for r in self.waiting_queue) + if ( self.running_batch.batch_is_full or len(self.waiting_queue) == 0 - ) and self.chunked_req is None: + ) and not has_chunked_resume: return None running_bs = len(self.running_batch.reqs) - # Ignore the check if self.chunked_req is not None. - # In the non-PP case, when self.chunked_req is not None, num_allocatable_reqs should always be greater than 0, - # as the space for the chunked requests has just been released. 
- # In PP case, chunked requests (or dllm requests) can start in one microbatch and end in another microbatch, so the max_running_requests per microbatch should not be strict. - # Instead, we should always allow chunked requests to be added, otherwise, there will be a memory leak. + # Ignore the check if there is a chunked-resume in flight. + # In the non-PP case the row was just released so the count is fine; + # in PP case, chunked reqs span microbatches so the per-mb max_running + # check should not block them. if ( self.get_num_allocatable_reqs(running_bs) <= 0 - and self.chunked_req is None + and not has_chunked_resume and not self.enable_priority_preemption ): self.running_batch.batch_is_full = True @@ -2673,11 +2657,17 @@ def _get_new_batch_prefill_raw( # Determine chunked_prefill_size for this batch chunked_prefill_size = self.chunked_prefill_size - if self.chunked_req is not None and self.enable_dynamic_chunking: - history_len = len(self.chunked_req.prefix_indices) - dynamic_size = self.predict_next_chunk_size(history_len) - if dynamic_size is not None: - chunked_prefill_size = dynamic_size + if self.enable_dynamic_chunking: + # Single-flight invariant: at most one chunked-resume req in the + # queue at any time (priority + budget enforce this naturally). + chunked_resume = next( + (r for r in self.waiting_queue if r.has_pending_chunk), None + ) + if chunked_resume is not None: + history_len = len(chunked_resume.prefix_indices) + dynamic_size = self.predict_next_chunk_size(history_len) + if dynamic_size is not None: + chunked_prefill_size = dynamic_size # Prefill policy adder = PrefillAdder( @@ -2698,26 +2688,6 @@ def _get_new_batch_prefill_raw( waiting_queue_len=len(self.waiting_queue), ) - # Re-admit the in-flight chunked req via the unified add_one_req - # entry. add_one_req's reuse branch (gated on kv_committed_len > 0) - # mirrors the old add_chunked_req's behavior: skip lock_ref inc, - # init_load_back, and prefix budget. 
Sets req.has_pending_chunk to - # truncated. - if self.chunked_req is not None: - self.chunked_req.init_next_round_input() - adder.add_one_req( - self.chunked_req, - truncation_align_size=self.truncation_align_size, - ) - # After admit, has_pending_chunk reflects whether more chunks - # remain. Mirror it into self.chunked_req for the existing - # Stage A stash path (deleted in a later commit). - if not self.chunked_req.has_pending_chunk: - self.chunked_req = None - self._chunked_req_scheduled_last_iter = self.chunked_req is not None - else: - self._chunked_req_scheduled_last_iter = False - if self.enable_lora: running_loras = {req.lora_id for req in self.running_batch.reqs} @@ -2758,7 +2728,14 @@ def _get_new_batch_prefill_raw( req.rid ) - req.init_next_round_input(self.tree_cache) + # Chunked-resume reqs must NOT re-match prefix at admission + # (would re-assign req.last_node without rebalancing lock_ref, + # corrupting cache_unfinished_req's dec_lock_ref/inc_lock_ref + # pairing). They keep last_node from previous stash. + if req.has_pending_chunk: + req.init_next_round_input() + else: + req.init_next_round_input(self.tree_cache) res = adder.add_one_req( req, truncation_align_size=self.truncation_align_size, @@ -2797,30 +2774,32 @@ def _get_new_batch_prefill_raw( if len(can_run_list) == 0: return None + # Drop admitted reqs from waiting_queue, but KEEP chunked-resume reqs + # (has_pending_chunk == True after admission) so they stay at the head + # for the next iter's stash + admission. Single-flight is preserved + # naturally by budget + priority. can_run_set = set(can_run_list) - self.waiting_queue = [x for x in self.waiting_queue if x not in can_run_set] + self.waiting_queue = [ + x + for x in self.waiting_queue + if x not in can_run_set or x.has_pending_chunk + ] if adder.preempt_list: for req in adder.preempt_list: self._add_request_to_queue(req) - # Identify newly-truncated chunked-resume reqs admitted this iter via - # add_one_req's reuse/chunked branch. 
has_pending_chunk is set by - # add_one_req when truncated=True. The "newly chunked" set excludes - # self.chunked_req which was already tracked from previous iter. - new_chunked = [ - r for r in can_run_list if r.has_pending_chunk and r is not self.chunked_req - ] + # Bump pending_middle_outputs (the is_chunked counter) for every + # admitted req that's still mid-prefill — output processor uses this + # to know its forward's sample is garbage. Counter semantics needed + # for PP, where multiple microbatches may admit the same req. + chunked_in_batch = [r for r in can_run_list if r.has_pending_chunk] assert ( - len(new_chunked) <= 1 - ), "single-flight invariant: at most one new chunked req per iter" - if new_chunked: - assert self.chunked_req is None - self.chunked_req = new_chunked[0] - # The chunked req is scheduled this iter -> stash needed next iter. - self._chunked_req_scheduled_last_iter = True - - if self.chunked_req is not None: - self.chunked_req.is_chunked += 1 + len(chunked_in_batch) <= 1 + ), "single-flight invariant: at most one chunked-resume req per batch" + chunk_deduct = 0 + for r in chunked_in_batch: + r.is_chunked += 1 + chunk_deduct = r.extend_input_len # Record for logging prefill stats after forward self.adder = adder @@ -2838,7 +2817,6 @@ def _get_new_batch_prefill_raw( self.model_config, self.enable_overlap, self.spec_algorithm, - chunked_req=self.chunked_req, ) self.max_prefill_bs = max(self.max_prefill_bs, len(can_run_list)) if self.enable_hierarchical_cache: @@ -2855,11 +2833,7 @@ def _get_new_batch_prefill_raw( self.running_batch.reqs, self.enable_priority_scheduling, num_pending_tokens=self._get_num_pending_tokens( - chunk_deduct=( - self.chunked_req.extend_input_len - if self.chunked_req is not None - else 0 - ) + chunk_deduct=chunk_deduct ), ) @@ -3313,7 +3287,6 @@ def is_fully_idle(self, for_health_check=False) -> bool: # Batch running status idle = ( self.running_batch.is_empty() - and self.chunked_req is None and not 
self.dllm_manager.any_staging_reqs() and (self.last_batch is None or self.last_batch.is_empty()) and (self.cur_batch is None or self.cur_batch.is_empty()) @@ -3681,11 +3654,11 @@ def pause_generation(self, recv_req: PauseGenerationReqInput): if recv_req.mode == "in_place": # In-place pause: just set the flag and return immediately. - # All scheduler state (running_batch, last_batch, chunked_req, + # All scheduler state (running_batch, last_batch, waiting_queue, # result_queue) is left untouched. On resume, the normal event # loop (get_next_batch_to_run) handles last_batch merge, - # chunked_req cleanup, and overlap result processing through - # the standard code paths. This avoids duplicating batch + # chunked-resume re-admission, and overlap result processing + # through the standard code paths. This avoids duplicating batch # manipulation logic and the accounting bugs that come with it. return @@ -3695,10 +3668,9 @@ def pause_generation(self, recv_req: PauseGenerationReqInput): self.process_batch_result(tmp_batch, tmp_result) if self.last_batch and self.last_batch.forward_mode.is_extend(): - chunked_req_to_exclude = set() - self.last_batch.filter_batch( - chunked_req_to_exclude=list(chunked_req_to_exclude) - ) + # filter_batch's internal predicate excludes still-prefilling reqs + # (has_pending_chunk / is_chunked > 0 / is_dllm). + self.last_batch.filter_batch() # Skip merge for disagg prefill: completed prefill requests are # already in disagg_prefill_inflight_queue. 
Merging them into # running_batch leaks them, since the prefill event loop never @@ -3723,7 +3695,6 @@ def pause_generation(self, recv_req: PauseGenerationReqInput): self._add_request_to_queue(req) self.running_batch.batch_is_full = False - self.chunked_req = None def continue_generation(self, recv_req: ContinueGenerationReqInput): if recv_req.torch_empty_cache: diff --git a/python/sglang/srt/mem_cache/chunk_cache.py b/python/sglang/srt/mem_cache/chunk_cache.py index 8a970b4bedcb..facccabff45d 100644 --- a/python/sglang/srt/mem_cache/chunk_cache.py +++ b/python/sglang/srt/mem_cache/chunk_cache.py @@ -88,7 +88,7 @@ def cache_unfinished_req(self, req: Req, chunked=False): kv_indices = self.req_to_token_pool.req_to_token[ req.req_pool_idx, : req.kv_committed_len ] - # `req.prefix_indices` will be used in `PrefillAdder::add_chunked_req` later + # `req.prefix_indices` will be used by add_one_req reuse branch next iter req.prefix_indices = kv_indices.to(dtype=torch.int64, copy=True) def evict(self, params: EvictParams) -> EvictResult: diff --git a/python/sglang/srt/mem_cache/mamba_radix_cache.py b/python/sglang/srt/mem_cache/mamba_radix_cache.py index 325b3aa08aae..00d4c165e50b 100644 --- a/python/sglang/srt/mem_cache/mamba_radix_cache.py +++ b/python/sglang/srt/mem_cache/mamba_radix_cache.py @@ -608,7 +608,7 @@ def _skip_cache_unfinished_req(req: Req) -> None: req.req_pool_idx, :read_len ] - # `req.prefix_indices` will be used in `PrefillAdder::add_chunked_req` later + # `req.prefix_indices` will be used by add_one_req reuse branch next iter req.prefix_indices = kv_indices.to(dtype=torch.int64, copy=True) return @@ -708,7 +708,7 @@ def _skip_cache_unfinished_req(req: Req) -> None: self.dec_lock_ref(req.last_node) self.inc_lock_ref(new_last_node) - # `req.prefix_indices` will be used in `PrefillAdder::add_chunked_req` later + # `req.prefix_indices` will be used by add_one_req reuse branch next iter # NOTE: this is needed for both page_size == 1 and page_size > 1 
req.prefix_indices = torch.cat( [new_indices, kv_indices_orig[len(new_indices) :]] diff --git a/python/sglang/srt/mem_cache/radix_cache.py b/python/sglang/srt/mem_cache/radix_cache.py index 9e1e93e48a79..f4b193c73965 100644 --- a/python/sglang/srt/mem_cache/radix_cache.py +++ b/python/sglang/srt/mem_cache/radix_cache.py @@ -543,7 +543,7 @@ def cache_unfinished_req(self, req: Req, chunked=False): self.dec_lock_ref(req.last_node) self.inc_lock_ref(new_last_node) - # `req.prefix_indices` will be used in `PrefillAdder::add_chunked_req` later + # `req.prefix_indices` will be used by add_one_req reuse branch next iter # - page_size != 1: there is a partial page at the end, keep the full kv_indices # - eagle case: bigram keys will only cache len - 1 kv indices if len(new_indices) < len(kv_indices): diff --git a/python/sglang/srt/mem_cache/swa_radix_cache.py b/python/sglang/srt/mem_cache/swa_radix_cache.py index 2457ec817446..a3936683e16f 100644 --- a/python/sglang/srt/mem_cache/swa_radix_cache.py +++ b/python/sglang/srt/mem_cache/swa_radix_cache.py @@ -492,7 +492,7 @@ def cache_unfinished_req(self, req: Req, chunked=False) -> None: req.req_pool_idx, :read_len ] - # `req.prefix_indices` will be used in `PrefillAdder::add_chunked_req` later + # `req.prefix_indices` will be used by add_one_req reuse branch next iter req.prefix_indices = kv_indices return @@ -543,7 +543,7 @@ def cache_unfinished_req(self, req: Req, chunked=False) -> None: result = self.inc_lock_ref(new_last_node) swa_uuid_for_lock = result.swa_uuid_for_lock - # `req.prefix_indices` will be used in `PrefillAdder::add_chunked_req` later + # `req.prefix_indices` will be used by add_one_req reuse branch next iter if len(new_indices) < len(kv_indices): req.prefix_indices = torch.cat( [new_indices, kv_indices[len(new_indices) :]] diff --git a/python/sglang/srt/observability/scheduler_metrics_mixin.py b/python/sglang/srt/observability/scheduler_metrics_mixin.py index 86cd5bfb1e81..e4590c0ba6c4 100644 --- 
a/python/sglang/srt/observability/scheduler_metrics_mixin.py +++ b/python/sglang/srt/observability/scheduler_metrics_mixin.py @@ -976,10 +976,11 @@ def _get_num_pending_tokens(self: Scheduler, chunk_deduct: int = 0) -> int: num_pending_tokens = sum( req.seqlen - len(req.prefix_indices) for req in self.waiting_queue ) - if self.chunked_req is not None: - req = self.chunked_req - num_pending_tokens += req.seqlen - len(req.prefix_indices) - chunk_deduct - return num_pending_tokens + # The chunked-resume req (if any) is now in self.waiting_queue, so + # it's already counted in the sum above. chunk_deduct subtracts the + # current chunk's extend that has been planned but not yet reflected + # in prefix_indices. + return num_pending_tokens - chunk_deduct def get_loads(self: Scheduler, req: GetLoadsReqInput = None) -> GetLoadsReqOutput: """ From b9d5d6ed5fddd3e360468da06e8f344e79f160a8 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Wed, 13 May 2026 18:58:05 +0800 Subject: [PATCH 08/52] refactor: rename Req.is_chunked -> Req.pending_middle_outputs Pure rename. The field is an int counter ("how many middle-block prefill forwards are admitted but not yet output-processed"), not a boolean. The old name suggested an 'is this req chunked?' boolean and made call sites like 'is_chunked += 1' and 'if is_chunked <= 0' read confusingly. Also renames the DLLM mixin helper increment_chunked_count() to increment_pending_middle_outputs() for symmetry. Updated the field's comment to describe its counter semantics + PP behavior. No semantic changes. 
--- python/sglang/srt/disaggregation/prefill.py | 4 ++-- python/sglang/srt/dllm/mixin/scheduler.py | 6 +++--- python/sglang/srt/managers/schedule_batch.py | 20 +++++++++++-------- python/sglang/srt/managers/schedule_policy.py | 2 +- python/sglang/srt/managers/scheduler.py | 10 +++++----- .../scheduler_output_processor_mixin.py | 8 ++++---- 6 files changed, 27 insertions(+), 23 deletions(-) diff --git a/python/sglang/srt/disaggregation/prefill.py b/python/sglang/srt/disaggregation/prefill.py index 715b7739ccbf..0c65f97f5da0 100644 --- a/python/sglang/srt/disaggregation/prefill.py +++ b/python/sglang/srt/disaggregation/prefill.py @@ -504,7 +504,7 @@ def process_batch_result_disagg_prefill( for i, (req, next_token_id) in enumerate( zip(batch.reqs, next_token_ids, strict=True) ): - if req.is_chunked <= 0: + if req.pending_middle_outputs <= 0: req.time_stats.set_prefill_finished_time() # There is no output_ids for prefill @@ -554,7 +554,7 @@ def process_batch_result_disagg_prefill( req.grammar.finished = req.finished() else: # being chunked reqs' prefill is not finished - req.is_chunked -= 1 + req.pending_middle_outputs -= 1 if req.return_logprob: extend_logprob_start_len = extend_logprob_start_len_per_req[i] diff --git a/python/sglang/srt/dllm/mixin/scheduler.py b/python/sglang/srt/dllm/mixin/scheduler.py index e8a563703811..4246822c9b6e 100644 --- a/python/sglang/srt/dllm/mixin/scheduler.py +++ b/python/sglang/srt/dllm/mixin/scheduler.py @@ -200,7 +200,7 @@ def _update_state_for_batch( if can_run_list: self.dllm_manager.add_staging_reqs(can_run_list) - self.dllm_manager.increment_chunked_count() + self.dllm_manager.increment_pending_middle_outputs() self.adder = adder self.can_run_list = can_run_list @@ -335,10 +335,10 @@ def is_empty(self) -> bool: return True return len(self.waiting_queue) == 0 - def increment_chunked_count(self) -> None: + def increment_pending_middle_outputs(self) -> None: """Increment chunked count for all staging requests.""" for req in 
self.staging_queue: - req.is_chunked += 1 + req.pending_middle_outputs += 1 def filter_finished_reqs(self) -> None: """Remove finished requests from both queues.""" diff --git a/python/sglang/srt/managers/schedule_batch.py b/python/sglang/srt/managers/schedule_batch.py index 83254cb50e9f..51781d8d28a2 100755 --- a/python/sglang/srt/managers/schedule_batch.py +++ b/python/sglang/srt/managers/schedule_batch.py @@ -752,16 +752,20 @@ def __init__( # The prefix length that is inserted into the tree cache self.cache_protected_len: int = 0 - # Whether or not if it is chunked. It increments whenever - # it is chunked, and decrement whenever chunked request is - # processed. - self.is_chunked = 0 + # Counter of middle-block prefill forwards that have been admitted + # but not yet output-processed for this req. Increments at admission + # for non-last chunks; decrements at output_processor. In PP, can + # exceed 1 because multiple microbatches may hold the same chunked + # req in flight concurrently. In non-PP, oscillates 0/1 within each + # iter. Used by output_processor to know whether this forward's + # sample is real (==0) or garbage (>0). + self.pending_middle_outputs = 0 # Persistent (cross-iter) flag set by admission when this req's # current admission was truncated (more chunks remain). Cleared # when last chunk is admitted (truncated=False) or on retract. # Used by Stage A stash detection, filter_batch exclusion, and - # add_one_req's reuse-vs-fresh branch. Independent of is_chunked + # add_one_req's reuse-vs-fresh branch. Independent of pending_middle_outputs # counter (transient) and kv_committed_len (derived). 
self.has_pending_chunk = False @@ -1265,7 +1269,7 @@ def reset_for_retract(self): self.temp_input_top_logprobs_val = None self.temp_input_top_logprobs_idx = None self.extend_logprob_start_len = 0 - self.is_chunked = 0 + self.pending_middle_outputs = 0 self.has_pending_chunk = False self.mamba_pool_idx = None self.mamba_ping_pong_track_buffer = None @@ -2415,7 +2419,7 @@ def filter_batch( # runs decode forward, and admitting a mid-prefill req there causes # shape mismatch + double KV accounting. Enforced per-req: # - has_pending_chunk: chunked-resume scheduled to continue - # - is_chunked > 0: PP in-flight middle chunk for this req + # - pending_middle_outputs > 0: PP in-flight middle chunk for this req # - is_dllm(): DllmManager-managed (separate staging queue) # FIXME(lsyin): used here to get the correct seq_lens # The batch has been launched but we need it verified to get correct next batch info @@ -2427,7 +2431,7 @@ def filter_batch( for i in range(len(self.reqs)) if not self.reqs[i].finished() and not self.reqs[i].has_pending_chunk - and not self.reqs[i].is_chunked > 0 + and not self.reqs[i].pending_middle_outputs > 0 and not self.reqs[i].is_dllm() ] diff --git a/python/sglang/srt/managers/schedule_policy.py b/python/sglang/srt/managers/schedule_policy.py index b924592fc1f1..61f1027a675f 100644 --- a/python/sglang/srt/managers/schedule_policy.py +++ b/python/sglang/srt/managers/schedule_policy.py @@ -939,7 +939,7 @@ def add_one_req( truncated = True # has_pending_chunk: persistent flag carrying chunked-resume state - # across iters. DLLM uses its own staging_queue + is_chunked counter. + # across iters. DLLM uses its own staging_queue + pending_middle_outputs counter. 
if not req.is_dllm(): req.has_pending_chunk = truncated diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py index 644171ec888a..a6fa1c3176f7 100644 --- a/python/sglang/srt/managers/scheduler.py +++ b/python/sglang/srt/managers/scheduler.py @@ -1083,7 +1083,7 @@ def init_chunked_prefill(self): elif self.chunked_prefill_size is not None and self.chunked_prefill_size <= 0: self.chunked_prefill_size = None # Chunked-resume tracking is now per-req (Req.has_pending_chunk + - # is_chunked counter); the scheduler no longer holds a global pointer. + # pending_middle_outputs counter); the scheduler no longer holds a global pointer. # Stage A stashes any waiting_queue req with has_pending_chunk; cache # impls bound row reads by kv_committed_len so a stash after # init_next_round_input is safe without the old gate. @@ -2513,7 +2513,7 @@ def get_next_batch_to_run(self) -> Optional[ScheduleBatch]: and self.last_batch.forward_mode.is_extend() ): # filter_batch's internal predicate excludes still-prefilling reqs - # (has_pending_chunk / is_chunked > 0 / is_dllm) from merge. + # (has_pending_chunk / pending_middle_outputs > 0 / is_dllm) from merge. last_bs = self.last_batch.batch_size() self.last_batch.filter_batch() if self.last_batch.batch_size() < last_bs: @@ -2788,7 +2788,7 @@ def _get_new_batch_prefill_raw( for req in adder.preempt_list: self._add_request_to_queue(req) - # Bump pending_middle_outputs (the is_chunked counter) for every + # Bump pending_middle_outputs (the pending_middle_outputs counter) for every # admitted req that's still mid-prefill — output processor uses this # to know its forward's sample is garbage. Counter semantics needed # for PP, where multiple microbatches may admit the same req. 
@@ -2798,7 +2798,7 @@ def _get_new_batch_prefill_raw( ), "single-flight invariant: at most one chunked-resume req per batch" chunk_deduct = 0 for r in chunked_in_batch: - r.is_chunked += 1 + r.pending_middle_outputs += 1 chunk_deduct = r.extend_input_len # Record for logging prefill stats after forward @@ -3669,7 +3669,7 @@ def pause_generation(self, recv_req: PauseGenerationReqInput): if self.last_batch and self.last_batch.forward_mode.is_extend(): # filter_batch's internal predicate excludes still-prefilling reqs - # (has_pending_chunk / is_chunked > 0 / is_dllm). + # (has_pending_chunk / pending_middle_outputs > 0 / is_dllm). self.last_batch.filter_batch() # Skip merge for disagg prefill: completed prefill requests are # already in disagg_prefill_inflight_queue. Merging them into diff --git a/python/sglang/srt/managers/scheduler_output_processor_mixin.py b/python/sglang/srt/managers/scheduler_output_processor_mixin.py index ae6f732fe934..234b56b78865 100644 --- a/python/sglang/srt/managers/scheduler_output_processor_mixin.py +++ b/python/sglang/srt/managers/scheduler_output_processor_mixin.py @@ -241,7 +241,7 @@ def process_batch_result_prefill( # decode req in mixed batch or retracted req continue - if req.is_chunked <= 0: + if req.pending_middle_outputs <= 0: req.time_stats.set_prefill_finished_time() # req output_ids are set here @@ -314,7 +314,7 @@ def process_batch_result_prefill( else: # being chunked reqs' prefill is not finished - req.is_chunked -= 1 + req.pending_middle_outputs -= 1 # There is only at most one request being currently chunked. # Because this request does not finish prefill, # we don't want to stream the request currently being chunked. 
@@ -380,7 +380,7 @@ def process_batch_result_prefill( req.embedding = embeddings[i] if req.return_pooled_hidden_states and phs is not None: req.pooled_hidden_state = phs[i] - if req.is_chunked <= 0: + if req.pending_middle_outputs <= 0: req.time_stats.set_prefill_finished_time() # Dummy output token for embedding models req.output_ids.append(0) @@ -393,7 +393,7 @@ def process_batch_result_prefill( maybe_cache_unfinished_req(req, self.tree_cache) else: # being chunked reqs' prefill is not finished - req.is_chunked -= 1 + req.pending_middle_outputs -= 1 req.time_stats.set_last_chunked_prefill_finish_time() self.stream_output(batch.reqs, batch.return_logprob, skip_stream_req) From f0388931bf6d3bde95d24fa967bb78bdc24532fd Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Wed, 13 May 2026 19:07:47 +0800 Subject: [PATCH 09/52] Fix retract_all passing List[Req] to filter_batch as keep_indices After 3fd7319a3d removed the chunked_req_to_exclude first-positional parameter from filter_batch, retract_all's existing call `self.filter_batch(retracted_reqs)` silently broke: the new first positional is keep_indices: Optional[List[int]], so we were trying to index reqs by Req objects. Pass keep_indices=[] explicitly to clear all reqs (the original intent). 
--- python/sglang/srt/managers/schedule_batch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/sglang/srt/managers/schedule_batch.py b/python/sglang/srt/managers/schedule_batch.py index 51781d8d28a2..7fa6f054abb2 100755 --- a/python/sglang/srt/managers/schedule_batch.py +++ b/python/sglang/srt/managers/schedule_batch.py @@ -2173,7 +2173,7 @@ def retract_all(self, server_args: ServerArgs): for idx in range(len(self.reqs)): self.release_req(idx, len(self.reqs) - idx, server_args) - self.filter_batch(retracted_reqs) + self.filter_batch(keep_indices=[]) return retracted_reqs def retract_decode( From fd3dcca22fd8f6dca91ffe24f102b79cfbc9d497 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Wed, 13 May 2026 19:09:28 +0800 Subject: [PATCH 10/52] Refactor filter_batch to use explicit exclude_chunked_req flag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous commit added an internal predicate that always excluded chunked-resume / PP middle-chunk / DLLM-staging reqs, with an inline invariant comment explaining what was filtered. Two issues: 1. Implicit behavior — callers had no way to grep for which sites actually rely on the prefill-pending exclusion. 2. Awkward API — retract_all and retract_decode (which pass keep_indices) had no use for the predicate, and the predicate's surface area drifted from the original chunked_req_to_exclude API. Reintroduce caller-supplied opt-in via exclude_chunked_req: bool, matching the spirit of the original chunked_req_to_exclude parameter. All sites that need the exclusion pass True; the few that pass keep_indices remain unchanged. 
--- python/sglang/srt/disaggregation/decode.py | 2 +- python/sglang/srt/disaggregation/prefill.py | 3 +-- python/sglang/srt/managers/schedule_batch.py | 27 +++++++++----------- python/sglang/srt/managers/scheduler.py | 14 +++++----- 4 files changed, 20 insertions(+), 26 deletions(-) diff --git a/python/sglang/srt/disaggregation/decode.py b/python/sglang/srt/disaggregation/decode.py index 797071794d68..b778c7eea712 100644 --- a/python/sglang/srt/disaggregation/decode.py +++ b/python/sglang/srt/disaggregation/decode.py @@ -1648,7 +1648,7 @@ def get_next_disagg_decode_batch_to_run( if new_prebuilt_batch: assert not any(r.has_pending_chunk for r in self.waiting_queue) self.process_batch_result_prebuilt(new_prebuilt_batch) - new_prebuilt_batch.filter_batch() + new_prebuilt_batch.filter_batch(exclude_chunked_req=True) if not new_prebuilt_batch.is_empty(): if self.running_batch.is_empty(): self.running_batch = new_prebuilt_batch diff --git a/python/sglang/srt/disaggregation/prefill.py b/python/sglang/srt/disaggregation/prefill.py index 0c65f97f5da0..24292837e75c 100644 --- a/python/sglang/srt/disaggregation/prefill.py +++ b/python/sglang/srt/disaggregation/prefill.py @@ -732,9 +732,8 @@ def process_prefill_chunk(self: Scheduler) -> None: self.running_batch.batch_is_full = False if self.last_batch and self.last_batch.forward_mode.is_extend(): - # filter_batch's internal predicate excludes still-prefilling reqs. 
last_bs = self.last_batch.batch_size() - self.last_batch.filter_batch() + self.last_batch.filter_batch(exclude_chunked_req=True) if self.last_batch.batch_size() < last_bs: self.running_batch.batch_is_full = False diff --git a/python/sglang/srt/managers/schedule_batch.py b/python/sglang/srt/managers/schedule_batch.py index 7fa6f054abb2..b889d7321efa 100755 --- a/python/sglang/srt/managers/schedule_batch.py +++ b/python/sglang/srt/managers/schedule_batch.py @@ -2413,14 +2413,8 @@ def filter_batch( keep_indices: Optional[List[int]] = None, # FIXME(lsyin): deprecate this API after spec v1 is deprecated v1_spec_info_filtered: Optional[bool] = False, + exclude_chunked_req: bool = False, ): - # Invariant: reqs still doing prefill (chunked-resume or DLLM-managed) - # must never be merged into running_batch via this filter — running_batch - # runs decode forward, and admitting a mid-prefill req there causes - # shape mismatch + double KV accounting. Enforced per-req: - # - has_pending_chunk: chunked-resume scheduled to continue - # - pending_middle_outputs > 0: PP in-flight middle chunk for this req - # - is_dllm(): DllmManager-managed (separate staging queue) # FIXME(lsyin): used here to get the correct seq_lens # The batch has been launched but we need it verified to get correct next batch info self.maybe_wait_verify_done() @@ -2430,9 +2424,14 @@ def filter_batch( i for i in range(len(self.reqs)) if not self.reqs[i].finished() - and not self.reqs[i].has_pending_chunk - and not self.reqs[i].pending_middle_outputs > 0 - and not self.reqs[i].is_dllm() + and not ( + exclude_chunked_req + and ( + self.reqs[i].has_pending_chunk + or self.reqs[i].pending_middle_outputs > 0 + or self.reqs[i].is_dllm() + ) + ) ] if keep_indices is None or len(keep_indices) == 0: @@ -2503,11 +2502,9 @@ def merge_batch(self, other: "ScheduleBatch"): # future. Synchronize here to avoid a cross-stream data race. 
self.maybe_wait_verify_done() - # Invariant: chunked-resume / mid-prefill reqs must never reach - # running_batch via merge — running_batch runs decode forward and - # admitting a prefill-in-progress req there breaks shape + KV accounting. - # filter_batch's predicate is responsible for excluding these from - # last_batch before this merge call. + # Caller must filter_batch(exclude_chunked_req=True) on the other batch + # before merging — running_batch runs decode forward and admitting a + # prefill-in-progress req there breaks shape + KV accounting. assert not any(r.has_pending_chunk for r in other.reqs) # Penalizer orchestrator must be merged before Batch.reqs is merged. This is because diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py index a6fa1c3176f7..12a77a14fbb0 100644 --- a/python/sglang/srt/managers/scheduler.py +++ b/python/sglang/srt/managers/scheduler.py @@ -2512,10 +2512,8 @@ def get_next_batch_to_run(self) -> Optional[ScheduleBatch]: and self.last_batch and self.last_batch.forward_mode.is_extend() ): - # filter_batch's internal predicate excludes still-prefilling reqs - # (has_pending_chunk / pending_middle_outputs > 0 / is_dllm) from merge. last_bs = self.last_batch.batch_size() - self.last_batch.filter_batch() + self.last_batch.filter_batch(exclude_chunked_req=True) if self.last_batch.batch_size() < last_bs: self.running_batch.batch_is_full = False @@ -2533,7 +2531,7 @@ def get_next_batch_to_run(self) -> Optional[ScheduleBatch]: # Runs outside the last_batch block so stale requests are cleaned # even when no new batches arrive (e.g. traffic stops). 
if self.running_batch.is_prefill_only: - self.running_batch.filter_batch() + self.running_batch.filter_batch(exclude_chunked_req=True) if self.running_batch.is_empty(): self.running_batch.batch_is_full = False @@ -2846,7 +2844,9 @@ def _get_new_batch_prefill_raw( and new_batch.input_embeds is None ): # TODO (lianmin): support return_logprob + mixed chunked prefill - self.running_batch.filter_batch(v1_spec_info_filtered=True) + self.running_batch.filter_batch( + v1_spec_info_filtered=True, exclude_chunked_req=True + ) if not self.running_batch.is_empty(): self.running_batch.prepare_for_decode() new_batch.mix_with_running(self.running_batch) @@ -3668,9 +3668,7 @@ def pause_generation(self, recv_req: PauseGenerationReqInput): self.process_batch_result(tmp_batch, tmp_result) if self.last_batch and self.last_batch.forward_mode.is_extend(): - # filter_batch's internal predicate excludes still-prefilling reqs - # (has_pending_chunk / pending_middle_outputs > 0 / is_dllm). - self.last_batch.filter_batch() + self.last_batch.filter_batch(exclude_chunked_req=True) # Skip merge for disagg prefill: completed prefill requests are # already in disagg_prefill_inflight_queue. Merging them into # running_batch leaks them, since the prefill event loop never From a79ba1b2f79c199e19693187cc69e0490fc9cb37 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Wed, 13 May 2026 19:17:55 +0800 Subject: [PATCH 11/52] Tighten add_one_req reuse gate to has_pending_chunk The 'is_resume' predicate previously fired for any req with kv_committed_len > 0, which incorrectly included streaming-session turn N>1 reqs (they inherit kv_committed_len from the session slot but are NOT chunked-resume). The reuse branch skips _req_inc_lock_ref, so those reqs would leave their last_node lock underbalanced. Tighten to the persistent chunked-resume flag (req.has_pending_chunk) so only true mid-prefill reqs take the reuse path. 
--- python/sglang/srt/managers/schedule_policy.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/python/sglang/srt/managers/schedule_policy.py b/python/sglang/srt/managers/schedule_policy.py index 61f1027a675f..a4167e01e05a 100644 --- a/python/sglang/srt/managers/schedule_policy.py +++ b/python/sglang/srt/managers/schedule_policy.py @@ -782,12 +782,13 @@ def add_req_state(r, insert_sort=False): def add_one_req( self, req: Req, truncation_align_size: Optional[int] ): - # Reuse path: this req was admitted in a previous iter, has a row - # with committed KV (kv_committed_len > 0), and is mid-prefill. Skip - # fresh-req setup (lock_ref already held by previous stash; - # init_load_back already ran on first admission; prefix already - # counted in tree). DLLM has its own path and never takes reuse here. - is_resume = req.kv_committed_len > 0 and not req.is_dllm() + # Reuse path: this req's previous chunk left lock_ref held, prefix + # already in tree, and init_load_back already consumed host KV. We + # must skip fresh-req setup. Gate on `has_pending_chunk` (the + # persistent chunked-resume flag) — `kv_committed_len > 0` alone is + # wider (streaming-session turn N>1 also has it without being + # chunked-resume) and would skip _req_inc_lock_ref incorrectly. + is_resume = req.has_pending_chunk and not req.is_dllm() if (self.prefill_delayer_single_pass is not None) and ( not self.prefill_delayer_single_pass.negotiate_should_allow_prefill( From d7fa48baad6c5e3f5221b0f849f65c3ecd56c6e5 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Wed, 13 May 2026 19:18:16 +0800 Subject: [PATCH 12/52] Reset host_hit_length unconditionally in prepare_for_extend MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The reset was nested inside two conditionals: if not req.retracted_stain: # skip after retract ... if not req._cache_breakdown_computed: # skip after first chunk ... 
req.host_hit_length = 0 After a req is retracted (retracted_stain stays True forever) and re-admitted, the outer block is skipped, so the reset never fires. The re-admission's match_prefix sets host_hit_length non-zero, then init_load_back consumes it on chunk 1 — but chunk 2's admission still sees the stale value and runs init_load_back a second time (double-load + lock_ref imbalance). Move the reset out of both conditionals so it runs once per admission. The breakdown metric still computes only on the first chunk via _cache_breakdown_computed. --- python/sglang/srt/managers/schedule_batch.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/python/sglang/srt/managers/schedule_batch.py b/python/sglang/srt/managers/schedule_batch.py index b889d7321efa..83f8ffd62bd6 100755 --- a/python/sglang/srt/managers/schedule_batch.py +++ b/python/sglang/srt/managers/schedule_batch.py @@ -1843,12 +1843,15 @@ def prepare_for_extend(self): req.cached_tokens_host = host_portion req.cached_tokens_storage = storage_portion req._cache_breakdown_computed = True - # Reset host_hit_length after metric is computed so that - # subsequent chunks' admission paths see host_hit_length == 0 - # and naturally skip init_load_back (host KV already loaded). - req.host_hit_length = 0 req.already_computed = seq_len + # Reset host_hit_length after init_load_back consumed it so that + # subsequent chunks' admissions skip init_load_back (host KV + # already loaded). Runs unconditionally: post-retract reqs have + # retracted_stain=True (skipping the outer block) but still + # match_prefix + init_load_back on their re-admission, so the + # reset must apply to them too. 
+ req.host_hit_length = 0 req.is_retracted = False if get_global_server_args().enable_mamba_extra_buffer(): From aaf3752d2b603ff5c0b3e66b36452614cfd5a29d Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Wed, 13 May 2026 19:18:32 +0800 Subject: [PATCH 13/52] Skip chunked-resume reqs in calc_priority prefix matching MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit _compute_prefix_matches runs match_prefix_for_req on every waiting_queue item. match_prefix_for_req unconditionally overwrites req.prefix_indices, req.last_node, req.last_host_node, and req.host_hit_length from the new match result. For a chunked-resume req: - its last_node was inc_lock_ref'd by the prior Stage A stash - overwriting last_node leaves that lock_ref permanently inflated - prefix_indices reset would mislead next chunk's admission (the KV row was written up to kv_committed_len; admission must see that length as the prefix) - host_hit_length would re-trigger init_load_back on next chunk Skip these reqs — their prefix_indices/last_node from the prior stash is already authoritative, and the LPM/DFS_WEIGHT sort uses len(prefix_indices)/last_node, which read correctly from the stashed state. Only triggers under --schedule-policy lpm/dfs-weight; FCFS path is unaffected. --- python/sglang/srt/managers/schedule_policy.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/python/sglang/srt/managers/schedule_policy.py b/python/sglang/srt/managers/schedule_policy.py index a4167e01e05a..f809f76ade94 100644 --- a/python/sglang/srt/managers/schedule_policy.py +++ b/python/sglang/srt/managers/schedule_policy.py @@ -235,6 +235,12 @@ def _compute_prefix_matches( self.waiting_queue_radix_tree.reset() for r in waiting_queue: + if r.has_pending_chunk: + # Chunked-resume reqs already have prefix_indices + last_node + # set by the prior chunk's Stage A stash, plus an inc'd + # lock_ref on last_node. 
Re-running match_prefix here would + # overwrite both, leaving the prior inc unbalanced. + continue prefix_ids = r.origin_input_ids + r.output_ids extra_key = r.extra_key match_result = match_prefix_for_req(self.tree_cache, r, prefix_ids) From 359e5ed7bd075fe70e44f0d344c153475c35c6d9 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Wed, 13 May 2026 19:18:47 +0800 Subject: [PATCH 14/52] Skip chunked-resume reqs in _abort_on_waiting_timeout MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After the v2 refactor, chunked-resume reqs live in waiting_queue across iters while actively prefilling. Their wait_queue_entry_time is set on original arrival and never refreshed, so a sufficiently long prefill (large prompt, many chunks, slow GPU) makes them look 'stuck' to _abort_on_waiting_timeout — which would abort them and leak the held req_to_token row + radix tree lock_ref + committed KV. Skip reqs with has_pending_chunk=True. Only takes effect when SGLANG_REQ_WAITING_TIMEOUT > 0 (env-gated; off by default). --- python/sglang/srt/managers/scheduler.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py index 12a77a14fbb0..4d05a9ab2ff7 100644 --- a/python/sglang/srt/managers/scheduler.py +++ b/python/sglang/srt/managers/scheduler.py @@ -2341,6 +2341,12 @@ def _abort_on_waiting_timeout(self): deleted_reqs = set() deadline = time.perf_counter() - timeout_s for req in self.waiting_queue: + # Chunked-resume reqs sit in waiting_queue across iters while + # actively prefilling — they are not idle. Their entry_time is + # from their original arrival, so a long prefill would falsely + # trigger the timeout and leak KV + row. 
+ if req.has_pending_chunk: + continue entry_time = req.time_stats.wait_queue_entry_time if 0 < entry_time < deadline: if self.enable_hicache_storage: From 5ed4faf0ab66b43e05dced12ab846a15164f8317 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Wed, 13 May 2026 19:18:58 +0800 Subject: [PATCH 15/52] Bypass LoRA scheduling gate for chunked-resume reqs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After the v2 refactor, chunked-resume reqs share the waiting_queue loop with fresh admissions. If _can_schedule_lora_req rejects a chunked-resume req (e.g. its adapter entered the drainer between chunks), the req stays in waiting_queue indefinitely while holding its req_to_token row, tree lock_ref, and committed KV — a deadlock that no other code path clears. The LoRA admission check is meaningful only at first-chunk admission; once chunked-resume is in flight, the adapter is already loaded and the drainer cannot meaningfully reject it. Skip the gate for these reqs. --- python/sglang/srt/managers/scheduler.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py index 4d05a9ab2ff7..08b5c12d08db 100644 --- a/python/sglang/srt/managers/scheduler.py +++ b/python/sglang/srt/managers/scheduler.py @@ -2703,7 +2703,16 @@ def _get_new_batch_prefill_raw( # Get requests from the waiting queue to a new prefill batch for req in self.waiting_queue: - if self.enable_lora and not self._can_schedule_lora_req(req, running_loras): + # Chunked-resume reqs hold a row + tree lock_ref from their prior + # admission. If the LoRA drainer rejects them mid-prefill, they + # stay in waiting_queue forever — deadlock + KV leak. Their LoRA + # adapter was already accepted on the first admission, so the + # drainer/validate check is moot for them. 
+ if ( + self.enable_lora + and not req.has_pending_chunk + and not self._can_schedule_lora_req(req, running_loras) + ): continue running_bs = len(self.running_batch.reqs) From dbdcdde24520401b69b9d1054acc688568109132 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Wed, 13 May 2026 19:19:13 +0800 Subject: [PATCH 16/52] Skip mamba_pool_idx cleanup for chunked-resume on NO_TOKEN The NO_TOKEN failure path in get_new_batch_prefill frees the req's mamba_pool_idx on the assumption that the slot was freshly allocated this iter and the admission was rolled back. For a chunked-resume req that hits NO_TOKEN this iter (budget transiently full), the mamba_pool_idx was actually allocated on its first admission and holds live mamba state needed for the remaining chunks. Add has_pending_chunk to the existing 'don't free' guard alongside the session check, matching the same intent: the slot's lifecycle extends beyond this admission attempt. --- python/sglang/srt/managers/scheduler.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py index 08b5c12d08db..410636344fec 100644 --- a/python/sglang/srt/managers/scheduler.py +++ b/python/sglang/srt/managers/scheduler.py @@ -2770,9 +2770,13 @@ def _get_new_batch_prefill_raw( # Only free if the slot was freshly allocated in this batch (not # pre-existing from a session). Session-held slots have their own # lifecycle and freeing them here causes double-free. + # Chunked-resume reqs inherit mamba_pool_idx from their first + # admission; freeing it on a transient NO_TOKEN this iter would + # discard a live mamba state still needed by subsequent chunks. 
added = len(adder.can_run_list) > 0 and req is adder.can_run_list[-1] if ( not added + and not req.has_pending_chunk and req.mamba_pool_idx is not None and not getattr(req, "session", None) ): From 36ec1d7269a398a39bdd7e2f6e2753b48e767e93 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Wed, 13 May 2026 19:19:25 +0800 Subject: [PATCH 17/52] Widen merge_batch assert to match filter_batch predicate The exclude_chunked_req predicate in filter_batch covers three states (has_pending_chunk, pending_middle_outputs > 0, is_dllm), but the safety assert in merge_batch only checked the first. If a future caller forgets exclude_chunked_req=True or uses an explicit keep_indices that lets a PP middle-chunk or DLLM staging req through, the assert wouldn't catch it. Mirror all three clauses so the assert is a true defense-in-depth for the documented invariant. --- python/sglang/srt/managers/schedule_batch.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/python/sglang/srt/managers/schedule_batch.py b/python/sglang/srt/managers/schedule_batch.py index 83f8ffd62bd6..19a0a202fedd 100755 --- a/python/sglang/srt/managers/schedule_batch.py +++ b/python/sglang/srt/managers/schedule_batch.py @@ -2507,8 +2507,13 @@ def merge_batch(self, other: "ScheduleBatch"): # Caller must filter_batch(exclude_chunked_req=True) on the other batch # before merging — running_batch runs decode forward and admitting a - # prefill-in-progress req there breaks shape + KV accounting. - assert not any(r.has_pending_chunk for r in other.reqs) + # prefill-in-progress req there breaks shape + KV accounting. Mirror + # the full exclude_chunked_req predicate so PP middle-chunk and DLLM + # staging reqs are also caught here. + assert not any( + r.has_pending_chunk or r.pending_middle_outputs > 0 or r.is_dllm() + for r in other.reqs + ) # Penalizer orchestrator must be merged before Batch.reqs is merged. 
This is because # orchestrator.merge() depends on Batch.reqs during preparation of each penalizers, so it From 116584e8faa401bbaaf1d2e376e9592fae95678f Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Wed, 13 May 2026 19:19:43 +0800 Subject: [PATCH 18/52] Bound streaming-session chunked stash by kv_committed_len MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 83e48ec295 fixed cache_unfinished_req in all 6 non-session cache impls to read req_to_token[:req.kv_committed_len] instead of [:len(req.fill_ids)] — required because init_next_round_input restores fill_ids to origin+output (full length) on subsequent admissions, while the row only holds KV up to kv_committed_len. StreamingSession.try_cache_unfinished_req(chunked=True) was missed in that pass. It typically saw len(fill_ids) == kv_committed_len in the success path, but after a SWA early-return: - chunk N succeeds → fill_ids truncated to chunk N end == kv_committed_len - chunk N+1 admission attempt: init_next_round_input() restores fill_ids to full length, then SWA budget rejects → AddReqResult.NO_TOKEN - next iter's Stage A stash reads req_to_token[req_pool_idx, :len(fill_ids) = full_length] which holds garbage for positions [chunk_N_end : full_length] - that garbage gets copied into prefix_indices, corrupting the subsequent admission's view of the cached prefix. Bound by kv_committed_len and add the same protected-len assert as the other cache impls. 
--- python/sglang/srt/session/streaming_session.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/python/sglang/srt/session/streaming_session.py b/python/sglang/srt/session/streaming_session.py index a60b3376c080..e295efe6d39b 100644 --- a/python/sglang/srt/session/streaming_session.py +++ b/python/sglang/srt/session/streaming_session.py @@ -330,8 +330,15 @@ def try_cache_unfinished_req( if not _is_streaming(req): return False if chunked: + # Bound row read by kv_committed_len, NOT len(fill_ids): after + # a SWA early-return the next iter's init_next_round_input + # restores fill_ids to origin+output (full length), but the + # row only holds KV up to kv_committed_len — reading beyond + # that yields garbage slot indices. See radix_cache.py for + # the same fix applied to the non-session caches. + assert req.kv_committed_len >= req.cache_protected_len kv_indices = self.req_to_token_pool.req_to_token[ - req.req_pool_idx, : len(req.fill_ids) + req.req_pool_idx, : req.kv_committed_len ] req.prefix_indices = kv_indices.to(dtype=torch.int64, copy=True) return True From 96d47490947b3444f626f7a81c7682830da43ee8 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Wed, 13 May 2026 20:02:24 +0800 Subject: [PATCH 19/52] Release row + KV + lock_ref when aborting a chunked-resume req from waiting_queue MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The waiting-queue abort path in abort_request only frees disagg-decode KV and mamba state. Before v2, that covered every kind of resource a waiting req could hold — fresh waiters had no row/KV/lock_ref. The stateless-scheduler v2 refactor changed this: chunked-resume reqs now live in waiting_queue across iterations while holding their req_to_token row, committed KV slots, and a radix tree lock_ref on req.last_node from the prior Stage A stash. Aborting such a req while it sits only in waiting_queue (i.e. 
the to_finish dedup keeps it off the batch path) left all three permanently leaked. Extend the existing mamba branch's release_kv_cache(is_insert=False) call to also cover has_pending_chunk + req_pool_idx-holding reqs. Defensively clear has_pending_chunk + pending_middle_outputs after release so any stale reference can't drag the freed row into a subsequent Stage A scan. Confirmed by two independent round-2 audits (Claude Opus 'R2-A', Codex retract 'HIGH #1'). --- python/sglang/srt/managers/scheduler.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py index 410636344fec..cc8a751ec823 100644 --- a/python/sglang/srt/managers/scheduler.py +++ b/python/sglang/srt/managers/scheduler.py @@ -3597,12 +3597,19 @@ def abort_request(self, recv_req: AbortReq): req, self.req_to_metadata_buffer_idx_allocator ) - # For mamba radix cache + # For mamba radix cache, or for chunked-resume reqs whose prior + # admissions already allocated a row + KV + radix lock_ref. Without + # this branch, aborting a chunked-resume req that is currently only + # in waiting_queue (not in any batch's reqs) leaks all three. if ( req.mamba_pool_idx is not None - and self.disaggregation_mode != DisaggregationMode.DECODE - ): + or (req.has_pending_chunk and req.req_pool_idx is not None) + ) and self.disaggregation_mode != DisaggregationMode.DECODE: release_kv_cache(req, self.tree_cache, is_insert=False) + # Defensive: clear pending-chunk flags on the orphaned req so a + # stale reference can't trigger Stage A re-stash of the freed row. + req.has_pending_chunk = False + req.pending_middle_outputs = 0 logger.debug(f"Abort queued request. 
{req.rid=}") # Delete the requests in the grammar queue From bf5b4e9a104f4535023e1e888b79711b400e9a2e Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Wed, 13 May 2026 20:03:25 +0800 Subject: [PATCH 20/52] Give chunked-resume reqs priority in LPM and DFS_WEIGHT sorts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A chunked-resume req's prefix_indices length reflects only its already-prefilled chunks (~ kv_committed_len), not the full prompt prefix it could have matched as a fresh req. Under LPM/DFS_WEIGHT with tight budget, fresh reqs hitting a long cached prefix outrank chunked-resume reqs every iter, starving them. This stuck state is doubly bad because the v2 timeout watchdog skips chunked-resume reqs (commit 83dc7877e0, "Skip chunked-resume reqs in _abort_on_waiting_timeout") — without progress they hold their row + KV + radix lock_ref forever, until user-initiated abort, which (until the previous commit) also leaked those resources. LPM: prepend 'is chunked-resume?' as the primary sort key. DFS_WEIGHT: extract chunked-resume reqs before DFS, prepend them afterwards. (Their last_node points at a mid-chunk stash node whose weight=1 — fold them into DFS and they sink to low priority.) --- python/sglang/srt/managers/schedule_policy.py | 24 +++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/python/sglang/srt/managers/schedule_policy.py b/python/sglang/srt/managers/schedule_policy.py index f809f76ade94..4a59555bc7e6 100644 --- a/python/sglang/srt/managers/schedule_policy.py +++ b/python/sglang/srt/managers/schedule_policy.py @@ -283,11 +283,19 @@ def _sort_by_longest_prefix( waiting_queue: List[Req], temporary_deprioritized: Set[int] ) -> None: """Sorts the waiting queue based on the longest prefix match.""" + # Chunked-resume reqs sort first: their prefix_indices length only + # reflects the chunks already prefilled (kv_committed_len), not the + # full prompt prefix they could have hit had they been fresh.
Without + # this floor, a fresh req with a long cached prefix outranks them + # every iter, starving them under tight budget. waiting_queue.sort( key=lambda r: ( - -len(r.prefix_indices) - if r.rid not in temporary_deprioritized - else float("inf") + 0 if r.has_pending_chunk else 1, + ( + -len(r.prefix_indices) + if r.rid not in temporary_deprioritized + else float("inf") + ), ) ) @@ -296,8 +304,15 @@ def _sort_by_dfs_weight( waiting_queue: List[Req], tree_cache: BasePrefixCache ) -> None: """Sorts the waiting queue based on a depth-first search weighting.""" + # Pull chunked-resume reqs out before DFS — their last_node points at + # a mid-chunk stash node with weight 1 (no siblings share it), which + # otherwise drops them to a low DFS priority and starves them under + # tight budget. They go back to the front of the queue afterwards. + chunked_reqs = [req for req in waiting_queue if req.has_pending_chunk] + non_chunked_reqs = [req for req in waiting_queue if not req.has_pending_chunk] + last_node_to_reqs = defaultdict(list) - for req in waiting_queue: + for req in non_chunked_reqs: last_node_to_reqs[req.last_node].append(req) node_to_weight = defaultdict(int) @@ -306,6 +321,7 @@ def _sort_by_dfs_weight( SchedulePolicy._calc_weight(tree_cache.root_node, node_to_weight) waiting_queue.clear() + waiting_queue.extend(chunked_reqs) SchedulePolicy._get_dfs_priority( tree_cache.root_node, node_to_weight, From f38e69f87dbb7b1eff0808824cd7601bea5846f7 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Wed, 13 May 2026 20:04:07 +0800 Subject: [PATCH 21/52] Extend pause(retract) to waiting chunked-resume reqs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pause_generation(retract)'s documented contract: retract: Pause the scheduler's event loop ... all currently running requests will be retracted back to the waiting_queue. The KV cache can be flushed in this mode and will be automatically recomputed after continue_generation. 
Pre-v2 that contract held: every req holding KV was in running_batch. After v2, chunked-resume reqs live in waiting_queue across iterations while holding their req_to_token row, committed KV slots, and a radix tree lock_ref from prior Stage A stash. pause(retract) only touched running_batch — those waiting chunked-resume resources were never released, so flush_cache silently couldn't free everything (is_fully_idle also stays False because waiting_queue is non-empty). Add an explicit pass that releases each waiting chunked-resume req's resources (release_kv_cache(is_insert=False)) and resets its chunked-prefill state via reset_for_retract, so continue_generation re-prefills the request from origin_input_ids. Also lift the 'running_batch non-empty' guard one level so the new pass runs even when retract is invoked with only waiting chunked-resume present. --- python/sglang/srt/managers/scheduler.py | 26 ++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py index cc8a751ec823..cddfb6ebc2b5 100644 --- a/python/sglang/srt/managers/scheduler.py +++ b/python/sglang/srt/managers/scheduler.py @@ -3711,14 +3711,26 @@ def pause_generation(self, recv_req: PauseGenerationReqInput): self.last_batch = None self.cur_batch = None - if recv_req.mode == "retract" and not self.running_batch.is_empty(): - self.running_batch.filter_batch(v1_spec_info_filtered=True) - if len(self.running_batch.reqs) != 0: - retracted_reqs = self.running_batch.retract_all(self.server_args) - for req in retracted_reqs: - self._add_request_to_queue(req) + if recv_req.mode == "retract": + if not self.running_batch.is_empty(): + self.running_batch.filter_batch(v1_spec_info_filtered=True) + if len(self.running_batch.reqs) != 0: + retracted_reqs = self.running_batch.retract_all(self.server_args) + for req in retracted_reqs: + self._add_request_to_queue(req) - self.running_batch.batch_is_full = 
False + self.running_batch.batch_is_full = False + + # Chunked-resume reqs in waiting_queue still hold their row + KV + + # radix lock_ref from prior admissions. Without explicit release, + # pause(retract)'s 'flush_cache can succeed' contract (see + # PauseGenerationReqInput docstring) is violated. Release in-place + # and reset their chunked state so continue_generation re-prefills + # them from origin_input_ids. + for req in self.waiting_queue: + if req.has_pending_chunk and req.req_pool_idx is not None: + release_kv_cache(req, self.tree_cache, is_insert=False) + req.reset_for_retract() def continue_generation(self, recv_req: ContinueGenerationReqInput): if recv_req.torch_empty_cache: From 414efd4a27e5f0c5affa21436dcc534029c20ee4 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Wed, 13 May 2026 20:23:48 +0800 Subject: [PATCH 22/52] Reset disagg send-side state on chunked-resume retract MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Round-2 commit ecbe732255 added a pause(retract) sweep that calls release_kv_cache + reset_for_retract on waiting chunked-resume reqs. Pre-v2, retract only operated on running_batch reqs, which in disagg-prefill mode is empty for prefilling reqs — so reset_for_retract never had to consider the disagg send-side fields. After ecbe732255, the same path now hits disagg-prefill chunked-resume reqs that carry: - req.start_send_idx > 0 (mid-prompt position already sent) - req.tmp_end_idx (deferred end_idx for overlap) - req.disagg_kv_sender (live sender object bound to the decode peer) reset_for_retract didn't reset start_send_idx / tmp_end_idx. After continue_generation, the same req gets re-admitted with a fresh req_pool_idx and kv_committed_len starting at 0, but start_send_idx still holds the stale value. 
process_batch_result_disagg_prefill then calls send_kv_chunk(start_idx=start_send_idx), which reads req_to_token[new_row, stale_idx:end_idx] — either garbage slots or slots that now belong to a different req. The decode peer gets corrupt KV. Fix: - schedule_batch.py: reset_for_retract now zeros start_send_idx and restores tmp_end_idx to -1. Safe in non-disagg modes because the fields are init-only there. - scheduler.py pause(retract): for disagg-prefill mode, abort the sender protocol and drop our reference so the next admit goes through bootstrap again. Confirmed by round-3 Claude Opus audit (R3-A). --- python/sglang/srt/managers/schedule_batch.py | 8 ++++++++ python/sglang/srt/managers/scheduler.py | 11 +++++++++++ 2 files changed, 19 insertions(+) diff --git a/python/sglang/srt/managers/schedule_batch.py b/python/sglang/srt/managers/schedule_batch.py index 19a0a202fedd..9345d8c8d243 100755 --- a/python/sglang/srt/managers/schedule_batch.py +++ b/python/sglang/srt/managers/schedule_batch.py @@ -1284,6 +1284,14 @@ def reset_for_retract(self): self.swa_evicted_seqlen = 0 self.extend_batch_idx = 0 self.decode_batch_idx = 0 + # Disagg-prefill send-side bookkeeping. The pre-v2 retract path never + # ran against a req that had started sending (retract only touched + # running_batch), so these stayed at init values. After v2 added + # pause(retract) coverage for waiting chunked-resume reqs, a retracted + # disagg-prefill req's stale start_send_idx would index garbage in the + # new row on re-prefill. + self.start_send_idx = 0 + self.tmp_end_idx = -1 # When using input_embeds, we cannot easily mix the original input embeddings # with the newly generated output token IDs during re-prefill of retracted request. 
diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py index cddfb6ebc2b5..4af0a19b544c 100644 --- a/python/sglang/srt/managers/scheduler.py +++ b/python/sglang/srt/managers/scheduler.py @@ -3729,6 +3729,17 @@ def pause_generation(self, recv_req: PauseGenerationReqInput): # them from origin_input_ids. for req in self.waiting_queue: if req.has_pending_chunk and req.req_pool_idx is not None: + # Disagg-prefill: signal the decode side that the send was + # retracted and drop our sender ref so re-prefill rebuilds + # the bootstrap state. start_send_idx / tmp_end_idx are + # reset by reset_for_retract. + if ( + self.disaggregation_mode == DisaggregationMode.PREFILL + and req.disagg_kv_sender is not None + ): + if hasattr(req.disagg_kv_sender, "abort"): + req.disagg_kv_sender.abort() + req.disagg_kv_sender = None release_kv_cache(req, self.tree_cache, is_insert=False) req.reset_for_retract() From b433e1ea351a373aa041098c67ff150fdb1284be Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Wed, 13 May 2026 20:25:02 +0800 Subject: [PATCH 23/52] Count chunked-resume tail in runtime mem check (page_size > 1) self_check_during_busy enforces a strict invariant: available + evictable + protected + session_held + uncached == total After v2's chunked-resume refactor, when a chunked-resume req is in waiting_queue but not in last_batch/running_batch (filter_batch just removed it, admission this iter failed budget), its row still holds: - cache_protected_len worth of tree-protected KV (counted in protected) - kv_committed_len - cache_protected_len unaligned tail in row (< page_size, not in tree, not in any of available/evictable/ protected/session_held buckets) _active_pool_idxs and _get_total_uncached_sizes only iterated batches, so this tail was uncounted on the LHS. With page_size > 1 (DSv4 = 64, paged-attention configs 16/64/128), the invariant fires a false-positive leak assert. 
- _active_pool_idxs: also include chunked-resume req_pool_idx from waiting_queue, so session_held correctly identifies these slots as 'owned by an active req' (not held tokens to subtract). - _get_total_uncached_sizes: add chunked-resume reqs from waiting_queue to the groups iterated for uncached accounting. Dedup by id() in case the same req is in both a batch and the queue (transient state around admission boundaries). Only triggers when SGLANG_ENABLE_STRICT_MEM_CHECK_DURING_BUSY > 0 (off by default, used in dev/debug). Confirmed by round-3 Claude Opus audit (R3-B). --- .../scheduler_runtime_checker_mixin.py | 31 ++++++++++++++++--- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/python/sglang/srt/managers/scheduler_runtime_checker_mixin.py b/python/sglang/srt/managers/scheduler_runtime_checker_mixin.py index ebf929f71251..4fe33e477a81 100644 --- a/python/sglang/srt/managers/scheduler_runtime_checker_mixin.py +++ b/python/sglang/srt/managers/scheduler_runtime_checker_mixin.py @@ -146,7 +146,8 @@ def _streaming_session_count(self: Scheduler) -> int: ) def _active_pool_idxs(self: Scheduler) -> set: - """Pool idxs currently owned by reqs in last_batch / running_batch. + """Pool idxs currently owned by reqs in last_batch / running_batch or + held by chunked-resume reqs sitting in waiting_queue. Used to decide which session slots' KV is owned by batch reqs (and thus counted via uncached_size, not session_held). @@ -158,6 +159,12 @@ def _active_pool_idxs(self: Scheduler) -> set: for req in batch.reqs: if req.req_pool_idx is not None: idxs.add(req.req_pool_idx) + # Chunked-resume reqs in waiting_queue still own their row across iters + # (filter_batch may have just moved them out of last_batch but they + # haven't yet been re-admitted to running_batch). 
+ for req in self.waiting_queue: + if req.has_pending_chunk and req.req_pool_idx is not None: + idxs.add(req.req_pool_idx) return idxs def _session_held_tokens(self: Scheduler) -> int: @@ -393,17 +400,31 @@ def _get_total_uncached_sizes(self: Scheduler) -> Tuple[int, int]: """ # After decode: running_batch IS last_batch (same object), count once. # After prefill: they differ, both hold uncached tokens. - batches = [self.last_batch] + req_groups = [list(self.last_batch.reqs)] if ( self.running_batch not in (None, self.last_batch) and not self.running_batch.is_empty() ): - batches.append(self.running_batch) + req_groups.append(list(self.running_batch.reqs)) + # Chunked-resume reqs in waiting_queue carry uncached tail + # (kv_committed_len - cache_protected_len, < page_size) that + # filter_batch just removed from last_batch but haven't been + # re-admitted to running_batch yet. The leak invariant must count it. + seen_ids = {id(req) for group in req_groups for req in group} + chunked_in_queue = [ + req + for req in self.waiting_queue + if req.has_pending_chunk + and req.req_pool_idx is not None + and id(req) not in seen_ids + ] + if chunked_in_queue: + req_groups.append(chunked_in_queue) full_uncached = 0 swa_uncached = 0 - for batch in batches: - for req in batch.reqs: + for group in req_groups: + for req in group: assert req.kv_committed_freed == req.kv_overallocated_freed if req.kv_committed_freed or req.req_pool_idx is None: continue From f0af5105abd9ea5f6ad63a97ec7110e1297944a6 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Wed, 13 May 2026 20:28:41 +0800 Subject: [PATCH 24/52] Document filter_batch(exclude_chunked_req=True) at every call site MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All 6 sites where filter_batch is invoked with exclude_chunked_req=True drop chunked-resume reqs from a batch. 
The reqs themselves are NOT lost — they persist in self.waiting_queue thanks to the retention filter in get_new_batch_prefill_raw: self.waiting_queue = [ x for x in self.waiting_queue if x not in can_run_set or x.has_pending_chunk ] so the next iter's Stage A scan re-stashes them and admission re-admits. This is the load-bearing invariant the whole stateless-scheduler v2 design rests on, but it was implicit at the call sites. Per-site notes: - scheduler.py:2498 (last_batch before merge): drop is required — running_batch runs decode forward, chunked-resume is mid-prefill. - scheduler.py:2516 (running_batch when is_prefill_only): defensive, the merge step already drops chunked-resume. - scheduler.py:2843 (running_batch before mix_with_running): defensive, same reason. - scheduler.py:3673 (disagg-prefill last_batch): same as 2498. - disaggregation/prefill.py:735 (process_prefill_chunk last_batch): same as 2498. - disaggregation/decode.py:1422 (new_prebuilt_batch): defensive — chunked prefill is prefill-side, decode-side shouldn't see it; an assert above already guards waiting_queue. --- python/sglang/srt/disaggregation/decode.py | 5 +++++ python/sglang/srt/disaggregation/prefill.py | 5 +++++ python/sglang/srt/managers/scheduler.py | 18 ++++++++++++++++++ 3 files changed, 28 insertions(+) diff --git a/python/sglang/srt/disaggregation/decode.py b/python/sglang/srt/disaggregation/decode.py index b778c7eea712..8b9bcecd31fb 100644 --- a/python/sglang/srt/disaggregation/decode.py +++ b/python/sglang/srt/disaggregation/decode.py @@ -1648,6 +1648,11 @@ def get_next_disagg_decode_batch_to_run( if new_prebuilt_batch: assert not any(r.has_pending_chunk for r in self.waiting_queue) self.process_batch_result_prebuilt(new_prebuilt_batch) + # Defensive: chunked prefill is a prefill-side concept; decode-side + # prebuilt batches shouldn't carry has_pending_chunk reqs. 
The + # assert above already guards waiting_queue; this flag protects + # against any future code that would route a chunked req through + # the disagg decode path. new_prebuilt_batch.filter_batch(exclude_chunked_req=True) if not new_prebuilt_batch.is_empty(): if self.running_batch.is_empty(): diff --git a/python/sglang/srt/disaggregation/prefill.py b/python/sglang/srt/disaggregation/prefill.py index 24292837e75c..10fabe87c9a2 100644 --- a/python/sglang/srt/disaggregation/prefill.py +++ b/python/sglang/srt/disaggregation/prefill.py @@ -733,6 +733,11 @@ def process_prefill_chunk(self: Scheduler) -> None: if self.last_batch and self.last_batch.forward_mode.is_extend(): last_bs = self.last_batch.batch_size() + # Drop chunked-resume reqs from last_batch — running_batch runs + # decode forward and admitting a mid-prefill req there breaks + # shape + KV accounting. The dropped reqs stay in + # self.waiting_queue (chunked-resume retention) and re-enter via + # the next iter's Stage A stash + admission cycle. self.last_batch.filter_batch(exclude_chunked_req=True) if self.last_batch.batch_size() < last_bs: self.running_batch.batch_is_full = False diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py index 4af0a19b544c..771051aef4b4 100644 --- a/python/sglang/srt/managers/scheduler.py +++ b/python/sglang/srt/managers/scheduler.py @@ -2519,6 +2519,12 @@ def get_next_batch_to_run(self) -> Optional[ScheduleBatch]: and self.last_batch.forward_mode.is_extend() ): last_bs = self.last_batch.batch_size() + # Drop chunked-resume reqs before merging last_batch into + # running_batch. running_batch runs decode forward and admitting + # a mid-prefill req there breaks shapes + KV accounting. The + # dropped reqs persist in self.waiting_queue (retention at + # ~line 2775: `x not in can_run_set or x.has_pending_chunk`) + # and re-enter via next iter's Stage A stash + admission. 
self.last_batch.filter_batch(exclude_chunked_req=True) if self.last_batch.batch_size() < last_bs: self.running_batch.batch_is_full = False @@ -2537,6 +2543,11 @@ def get_next_batch_to_run(self) -> Optional[ScheduleBatch]: # Runs outside the last_batch block so stale requests are cleaned # even when no new batches arrive (e.g. traffic stops). if self.running_batch.is_prefill_only: + # Defensive exclude_chunked_req: the merge step above already + # drops chunked-resume reqs from last_batch, so running_batch + # shouldn't normally hold one. Keep the flag set so any leak in + # that invariant doesn't survive here; the dropped req still + # has its waiting_queue retention to re-admit next iter. self.running_batch.filter_batch(exclude_chunked_req=True) if self.running_batch.is_empty(): self.running_batch.batch_is_full = False @@ -2863,6 +2874,10 @@ def _get_new_batch_prefill_raw( and new_batch.input_embeds is None ): # TODO (lianmin): support return_logprob + mixed chunked prefill + # exclude_chunked_req here is defensive — by design running_batch + # holds decode reqs only (the last_batch filter+merge step above + # already drops chunked-resume), and any dropped chunked-resume + # would still ride waiting_queue retention to next iter's Stage A. self.running_batch.filter_batch( v1_spec_info_filtered=True, exclude_chunked_req=True ) @@ -3694,6 +3709,9 @@ def pause_generation(self, recv_req: PauseGenerationReqInput): self.process_batch_result(tmp_batch, tmp_result) if self.last_batch and self.last_batch.forward_mode.is_extend(): + # Same invariant as the non-disagg merge path: drop chunked-resume + # reqs before potentially folding last_batch into running_batch. + # They re-enter via waiting_queue retention + Stage A next iter. self.last_batch.filter_batch(exclude_chunked_req=True) # Skip merge for disagg prefill: completed prefill requests are # already in disagg_prefill_inflight_queue. 
Merging them into From b823c16e6048905c5176fd732b5ccc19f347b4db Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Wed, 13 May 2026 20:31:50 +0800 Subject: [PATCH 25/52] Include PP microbatch reqs in abort_request batch_rids dedup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit abort_request builds batch_rids from running_batch + cur_batch to distinguish 'in batch' reqs (need to_finish, output processor releases on the next iter) from 'waiting only' reqs (can be popped and aborted immediately). The v2 round-2 commit ffaae91c79 added a release_kv_cache + has_pending_chunk clear inside the waiting-pop branch so that chunked-resume reqs sitting in waiting_queue get their row + KV + lock_ref properly released on abort. PP breaks this categorization. Under pipeline parallelism, multiple microbatches are in flight at once: self.mbs holds batches that have been launched but not yet processed, self.last_mbs holds prior-iter launches whose results are coming, self.running_mbs is per-mb running state. A chunked-resume req X can be: - in waiting_queue (chunked-resume retention) - in self.mbs[mb_a] (forward launched, result pending) with pending_middle_outputs > 0. abort_request only sees running_batch + cur_batch (one mb), so X falls into batch_rids miss → waiting-pop path → release_kv_cache while mb_a's forward still references the row. When mb_a's delayed output finally lands: - pending_middle_outputs was cleared to 0 by waiting-pop, so output processor takes the full-output branch - req_pool_idx was cleared by release_kv_cache, so maybe_cache_unfinished_req or release_kv_cache crashes / corrupts Fix: extend batch_rids with rids from every non-empty mb across mbs, last_mbs, running_mbs whenever pp_size > 1. Treats in-flight PP reqs as 'in batch', routing them through the to_finish path which the PP output processor drains correctly. Confirmed by round-4 Codex counter audit (HIGH bug). 
--- python/sglang/srt/managers/scheduler.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py index 771051aef4b4..4bbdc7e833a9 100644 --- a/python/sglang/srt/managers/scheduler.py +++ b/python/sglang/srt/managers/scheduler.py @@ -3580,9 +3580,24 @@ def abort_request(self, recv_req: AbortReq): # waiting_queue removal for those — let the to_finish path below handle # them, otherwise we send_output / release_kv_cache twice. if self.cur_batch is self.running_batch or self.cur_batch is None: - batch_reqs = self.running_batch.reqs + batch_reqs = list(self.running_batch.reqs) else: - batch_reqs = self.running_batch.reqs + self.cur_batch.reqs + batch_reqs = list(self.running_batch.reqs) + list(self.cur_batch.reqs) + # PP: rids from every in-flight microbatch must also be treated as + # 'in batch'. Each mb's forward was launched against the req's + # req_pool_idx + KV slots; the output processor on a different mb + # iteration consumes the result later. Without this, a chunked-resume + # req with pending_middle_outputs > 0 sitting in waiting_queue would + # fall into the waiting-only abort path, release_kv_cache would free + # the row + KV underneath the still-launched forward, and the delayed + # output processor would crash on a None req_pool_idx (or, with + # pending_middle_outputs cleared to 0, mistake the middle-chunk + # result for a full output and append garbage tokens). 
+ if self.pp_size > 1 and hasattr(self, "mbs"): + for mb_list in (self.mbs, self.last_mbs, self.running_mbs): + for mb in mb_list: + if mb is not None and not mb.is_empty(): + batch_reqs.extend(mb.reqs) batch_rids = {r.rid for r in batch_reqs} # Delete requests in the waiting queue From 678bba26f097693d2fec35e94d15cdbf12392533 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Wed, 13 May 2026 20:53:21 +0800 Subject: [PATCH 26/52] Document why Stage A chunk-stash runs at iter boundary instead of end-of-prior-iter --- python/sglang/srt/managers/scheduler.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py index 4bbdc7e833a9..24a8ab52fbb0 100644 --- a/python/sglang/srt/managers/scheduler.py +++ b/python/sglang/srt/managers/scheduler.py @@ -2492,6 +2492,16 @@ def get_next_batch_to_run(self) -> Optional[ScheduleBatch]: # Per-req loop over waiting_queue covers chunked-resume; DLLM staging # reqs are owned by DllmManager (not in waiting_queue), handled # separately below. + # + # Why this runs at the iter boundary (not at the end of the prior iter): + # admission inside get_new_batch_prefill_raw reads req.prefix_indices to + # decide extend_input_len. Stashing in the middle of admission would let + # a chunked-resume req "match itself" — the tree would expose KV this + # same req just wrote, double-counting it as cached prefix. Keeping + # stash here means admission only ever sees tree state that is stable + # for the duration of the scheduling pass. vLLM / TokenSpeed do not + # need this because their admission reads a single monotone counter + # (num_computed_tokens / FSM state), not a prefix-indices splice. 
for req in self.waiting_queue: if req.has_pending_chunk and not req.is_dllm(): maybe_cache_unfinished_req(req, self.tree_cache, chunked=True) From 34c02d6a6746a410368d6be14bc0d9a912c78e53 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Wed, 13 May 2026 21:22:21 +0800 Subject: [PATCH 27/52] Filter chunked-resume reqs from split_prefill_batch before pdmux merge The new merge_batch invariant in schedule_batch.py asserts that the source batch holds no has_pending_chunk / pending_middle_outputs / dllm reqs. The pdmux split-prefill path was the only merge site missing the matching filter_batch(exclude_chunked_req=True) before merge. With chunked prefill enabled under pdmux, admitting a non-last chunk would trip the assert. --- .../sglang/srt/multiplex/multiplexing_mixin.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/python/sglang/srt/multiplex/multiplexing_mixin.py b/python/sglang/srt/multiplex/multiplexing_mixin.py index 9902afe5c16f..befc37e8b206 100644 --- a/python/sglang/srt/multiplex/multiplexing_mixin.py +++ b/python/sglang/srt/multiplex/multiplexing_mixin.py @@ -208,10 +208,19 @@ def event_loop_pdmux(self: Scheduler): self.process_batch_result( self.split_prefill_batch, prefill_result ) - if self.running_batch and not self.running_batch.is_empty(): - self.running_batch.merge_batch(self.split_prefill_batch) - else: - self.running_batch = self.split_prefill_batch + # Drop chunked-resume reqs before folding split_prefill_batch + # into running_batch. running_batch runs decode forward and + # admitting a mid-prefill req there breaks shape + KV + # accounting; the dropped reqs persist in self.waiting_queue + # (retention in get_new_batch_prefill) and re-enter via the + # next iter's Stage A stash + admission cycle. Mirrors the + # standard event_loop path at scheduler.py:2514. 
+ self.split_prefill_batch.filter_batch(exclude_chunked_req=True) + if not self.split_prefill_batch.is_empty(): + if self.running_batch and not self.running_batch.is_empty(): + self.running_batch.merge_batch(self.split_prefill_batch) + else: + self.running_batch = self.split_prefill_batch self.split_prefill_batch = None wait_prefill_kernel_done = False From 2868334e3371c2025a3b7f30e0a76465b3d00df3 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Wed, 13 May 2026 21:39:23 +0800 Subject: [PATCH 28/52] Apply black-jupyter formatting (CI lint fixup) --- python/sglang/srt/managers/schedule_policy.py | 4 +--- python/sglang/srt/managers/scheduler.py | 8 ++------ python/sglang/srt/mem_cache/radix_cache.py | 4 +--- python/sglang/srt/mem_cache/swa_radix_cache.py | 4 +--- 4 files changed, 5 insertions(+), 15 deletions(-) diff --git a/python/sglang/srt/managers/schedule_policy.py b/python/sglang/srt/managers/schedule_policy.py index 4a59555bc7e6..325f07d573d7 100644 --- a/python/sglang/srt/managers/schedule_policy.py +++ b/python/sglang/srt/managers/schedule_policy.py @@ -801,9 +801,7 @@ def add_req_state(r, insert_sort=False): return self.budget_state() - def add_one_req( - self, req: Req, truncation_align_size: Optional[int] - ): + def add_one_req(self, req: Req, truncation_align_size: Optional[int]): # Reuse path: this req's previous chunk left lock_ref held, prefix # already in tree, and init_load_back already consumed host KV. We # must skip fresh-req setup. Gate on `has_pending_chunk` (the diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py index 24a8ab52fbb0..2dd0ea94e103 100644 --- a/python/sglang/srt/managers/scheduler.py +++ b/python/sglang/srt/managers/scheduler.py @@ -2818,9 +2818,7 @@ def _get_new_batch_prefill_raw( # naturally by budget + priority. 
can_run_set = set(can_run_list) self.waiting_queue = [ - x - for x in self.waiting_queue - if x not in can_run_set or x.has_pending_chunk + x for x in self.waiting_queue if x not in can_run_set or x.has_pending_chunk ] if adder.preempt_list: for req in adder.preempt_list: @@ -2870,9 +2868,7 @@ def _get_new_batch_prefill_raw( adder, self.running_batch.reqs, self.enable_priority_scheduling, - num_pending_tokens=self._get_num_pending_tokens( - chunk_deduct=chunk_deduct - ), + num_pending_tokens=self._get_num_pending_tokens(chunk_deduct=chunk_deduct), ) # Mixed-style chunked prefill diff --git a/python/sglang/srt/mem_cache/radix_cache.py b/python/sglang/srt/mem_cache/radix_cache.py index f4b193c73965..2a6b0a4ba02d 100644 --- a/python/sglang/srt/mem_cache/radix_cache.py +++ b/python/sglang/srt/mem_cache/radix_cache.py @@ -495,9 +495,7 @@ def cache_unfinished_req(self, req: Req, chunked=False): assert req.kv_committed_len >= req.cache_protected_len read_len = req.kv_committed_len token_ids = req.fill_ids[:read_len] - kv_indices = self.req_to_token_pool.req_to_token[ - req.req_pool_idx, :read_len - ] + kv_indices = self.req_to_token_pool.req_to_token[req.req_pool_idx, :read_len] radix_key = RadixKey( token_ids, req.extra_key, is_bigram=self.is_eagle diff --git a/python/sglang/srt/mem_cache/swa_radix_cache.py b/python/sglang/srt/mem_cache/swa_radix_cache.py index a3936683e16f..bf1b46a58809 100644 --- a/python/sglang/srt/mem_cache/swa_radix_cache.py +++ b/python/sglang/srt/mem_cache/swa_radix_cache.py @@ -497,9 +497,7 @@ def cache_unfinished_req(self, req: Req, chunked=False) -> None: return token_ids = req.fill_ids[:read_len] - kv_indices = self.req_to_token_pool.req_to_token[ - req.req_pool_idx, :read_len - ] + kv_indices = self.req_to_token_pool.req_to_token[req.req_pool_idx, :read_len] radix_key = RadixKey( token_ids, req.extra_key, is_bigram=self.is_eagle From daf9c42f17b8737c1de7b68724915039619f73e1 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Wed, 13 May 2026 
21:40:21 +0800 Subject: [PATCH 29/52] Remove v1 SWA chunked-req stash gate test (gate was deleted in v2) The test exercised _chunked_req_scheduled_last_iter, a v1 gate that prevented spurious stash on deferred chunked_req (#24252). v2 replaces that gate with has_pending_chunk + Stage A scan + page-aligned cache_protected_len, none of which the test instruments. The test sets s.chunked_req and s._chunked_req_scheduled_last_iter on a SimpleNamespace scheduler that lacks self.waiting_queue, so it AttributeErrors in CI under v2's get_next_batch_to_run path. --- .../test_scheduler_chunked_req_gate.py | 161 ------------------ 1 file changed, 161 deletions(-) delete mode 100644 test/registered/unit/managers/test_scheduler_chunked_req_gate.py diff --git a/test/registered/unit/managers/test_scheduler_chunked_req_gate.py b/test/registered/unit/managers/test_scheduler_chunked_req_gate.py deleted file mode 100644 index 87a6daf7e293..000000000000 --- a/test/registered/unit/managers/test_scheduler_chunked_req_gate.py +++ /dev/null @@ -1,161 +0,0 @@ -"""Regression tests for the SWA chunked-req stash gate (#24252).""" - -import unittest -from types import SimpleNamespace -from unittest.mock import MagicMock - -import torch - -from sglang.test.ci.ci_register import register_cpu_ci -from sglang.test.test_utils import CustomTestCase, maybe_stub_sgl_kernel - -maybe_stub_sgl_kernel() - -from sglang.srt.managers.schedule_batch import Req -from sglang.srt.managers.scheduler import Scheduler -from sglang.srt.mem_cache.chunk_cache import ChunkCache - -register_cpu_ci(est_time=6, suite="stage-a-test-cpu") - - -def _make_req( - *, - req_pool_idx: int, - fill_ids: list, - prefix_indices: torch.Tensor, - extend_input_len: int, -) -> Req: - req = Req.__new__(Req) - req.rid = "test-req" - req.origin_input_ids = list(fill_ids) - req.output_ids = [] - req.fill_ids = list(fill_ids) - req.prefix_indices = prefix_indices - req.req_pool_idx = req_pool_idx - req.extend_input_len = extend_input_len - 
req.is_chunked = 0 - req.host_hit_length = 0 - req.cache_protected_len = 0 - req.skip_radix_cache_insert = False - req.last_node = None - req.swa_uuid_for_lock = None - req.session = None - req.return_logprob = False - req.logprob_start_len = -1 - req.positional_embed_overrides = None - req.extra_key = None - req.mamba_pool_idx = None - req.sampling_params = SimpleNamespace(max_new_tokens=128, ignore_eos=False) - return req - - -def _make_req_to_token_pool(num_slots: int, max_context: int) -> SimpleNamespace: - # Slot s contains a recognizable fingerprint [s*1000, s*1000+1, ...] - # so we can tell a corrupted prefix_indices from a healthy one by content. - pool = SimpleNamespace() - pool.req_to_token = ( - torch.arange(max_context, dtype=torch.int32).unsqueeze(0).repeat(num_slots, 1) - + torch.arange(num_slots, dtype=torch.int32).unsqueeze(1) * 1000 - ) - return pool - - -def _make_chunk_cache(req_to_token_pool) -> ChunkCache: - return ChunkCache( - SimpleNamespace( - req_to_token_pool=req_to_token_pool, - token_to_kv_pool_allocator=None, - page_size=1, - ) - ) - - -def _scheduler_for_get_next_batch(*, tree_cache, chunked_req) -> Scheduler: - s = Scheduler.__new__(Scheduler) - s._abort_on_waiting_timeout = MagicMock() - s._abort_on_running_timeout = MagicMock() - s.dllm_config = None - s.dllm_manager = None - s.enable_hisparse = False - s.last_batch = None - s.require_mlp_sync = False - s.spec_algorithm = MagicMock() - s.server_args = MagicMock(speculative_skip_dp_mlp_sync=True) - s.running_batch = MagicMock() - s.running_batch.is_empty.return_value = True - s.running_batch.is_prefill_only = False - s.running_batch.batch_is_full = False - s.running_batch.reqs = [] - s.get_new_batch_prefill = MagicMock(return_value=None) - s.maybe_prepare_mlp_sync_batch = MagicMock(side_effect=lambda batch, **_: batch) - s._maybe_prepare_ngram_embedding = MagicMock(side_effect=lambda batch: batch) - s.update_running_batch = MagicMock(side_effect=lambda batch: batch) - s.tree_cache = 
tree_cache - s.chunked_req = chunked_req - return s - - -class TestStashGatePreservesPrefixIndices(CustomTestCase): - """Consumer side: real ChunkCache.cache_unfinished_req mutates - req.prefix_indices iff stash actually runs, so prefix_indices content - is the bug-detection signal.""" - - POOL_IDX = 4 - INITIAL_PREFIX_LEN = 8 # what was really cached last iter - POST_RESET_FILL_LEN = 32 # length after init_next_round_input - NUM_SLOTS = 8 - MAX_CONTEXT = 64 - - def _build(self, flag: bool): - pool = _make_req_to_token_pool(self.NUM_SLOTS, self.MAX_CONTEXT) - cache = _make_chunk_cache(pool) - initial_prefix = pool.req_to_token[self.POOL_IDX, : self.INITIAL_PREFIX_LEN].to( - dtype=torch.int64, copy=True - ) - req = _make_req( - req_pool_idx=self.POOL_IDX, - fill_ids=list(range(self.POST_RESET_FILL_LEN)), - prefix_indices=initial_prefix, - extend_input_len=0, - ) - s = _scheduler_for_get_next_batch(tree_cache=cache, chunked_req=req) - s._chunked_req_scheduled_last_iter = flag - return s, req, initial_prefix, pool - - def test_deferred_chunked_req_keeps_real_prefix_indices(self): - # The bug case: a spurious stash on a deferred chunked_req - # would extend prefix_indices to len(fill_ids). - s, req, initial_prefix, _ = self._build(flag=False) - - Scheduler.get_next_batch_to_run(s) - - self.assertEqual(req.prefix_indices.shape[0], self.INITIAL_PREFIX_LEN) - self.assertTrue(torch.equal(req.prefix_indices, initial_prefix)) - - def test_scheduled_chunked_req_advances_prefix_indices_via_real_stash(self): - # Symmetric guard against over-gating: when the chunked_req was - # actually scheduled, stash must run and advance prefix_indices. 
- s, req, _, pool = self._build(flag=True) - - Scheduler.get_next_batch_to_run(s) - - expected = pool.req_to_token[self.POOL_IDX, : self.POST_RESET_FILL_LEN].to( - dtype=torch.int64 - ) - self.assertEqual(req.prefix_indices.shape[0], self.POST_RESET_FILL_LEN) - self.assertTrue(torch.equal(req.prefix_indices, expected)) - - def test_no_chunked_req_never_mutates_state_even_with_stale_flag(self): - # Retract path clears chunked_req without resetting the flag; - # the outer `if chunked_req is not None` guard must hold. - pool = _make_req_to_token_pool(self.NUM_SLOTS, self.MAX_CONTEXT) - cache = _make_chunk_cache(pool) - s = _scheduler_for_get_next_batch(tree_cache=cache, chunked_req=None) - s._chunked_req_scheduled_last_iter = True - - Scheduler.get_next_batch_to_run(s) - self.assertIsNone(s.chunked_req) - - -if __name__ == "__main__": - unittest.main() From a94e842611d425656ac9e457e7c2356d6a55eac2 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Wed, 13 May 2026 21:46:45 +0800 Subject: [PATCH 30/52] Drop v1 has_chunked_req kwarg + delete v1 add_chunked_req SWA tests test_prefill_adder.py exercised two v1 APIs that v2 deleted: - add_one_req(req, has_chunked_req=..., truncation_align_size=...) -> v2 signature is add_one_req(req, truncation_align_size). Drop kwarg. - adder.add_chunked_req(req) -> v2 unified into add_one_req's reuse branch (gated on req.has_pending_chunk; see schedule_policy.py:811). The three SWA-reservation tests assert v1's truncate-to-fit semantics (return req with set_extend_input_len(REM_SWA - PAGE_SIZE)), which v2 intentionally replaces: v2's _swa_budget_for_req rejects with AddReqResult.NO_TOKEN when swa_needed >= rem_swa_tokens and lets waiting_queue retention re-admit the chunked-resume req next iter. These tests no longer correspond to live behavior; delete them along with the _build_hybrid_swa_chunked_req helper they share. 
--- .../unit/managers/test_prefill_adder.py | 94 +------------------ 1 file changed, 3 insertions(+), 91 deletions(-) diff --git a/test/registered/unit/managers/test_prefill_adder.py b/test/registered/unit/managers/test_prefill_adder.py index 14d4eab70061..6ee5f921134e 100644 --- a/test/registered/unit/managers/test_prefill_adder.py +++ b/test/registered/unit/managers/test_prefill_adder.py @@ -383,9 +383,7 @@ def test_mixed_chunk_prefill_budgets(self): req1.last_node = MagicMock() req1.sampling_params.ignore_eos = False - result1 = adder.add_one_req( - req1, has_chunked_req=False, truncation_align_size=None - ) + result1 = adder.add_one_req(req1, truncation_align_size=None) self.assertEqual(len(adder.can_run_list), 1) self.assertEqual(adder.rem_chunk_tokens, 0) # 56 - 56 @@ -417,9 +415,7 @@ def test_mixed_chunk_prefill_budgets(self): req2.last_node = MagicMock() req2.sampling_params.ignore_eos = False - result2 = adder2.add_one_req( - req2, has_chunked_req=False, truncation_align_size=None - ) + result2 = adder2.add_one_req(req2, truncation_align_size=None) self.assertEqual(len(adder2.can_run_list), 1) self.assertEqual(adder2.rem_chunk_tokens, 3) # 59 - 56 = 3 remaining @@ -434,78 +430,12 @@ def test_mixed_chunk_prefill_budgets(self): req3.last_node = MagicMock() req3.sampling_params.ignore_eos = False - result3 = adder2.add_one_req( - req3, has_chunked_req=False, truncation_align_size=None - ) + result3 = adder2.add_one_req(req3, truncation_align_size=None) self.assertEqual(len(adder2.can_run_list), 2) self.assertEqual(adder2.rem_chunk_tokens, 0) # 3 - 3 = 0 self.assertEqual(result3, AddReqResult.OTHER) - def _build_hybrid_swa_chunked_req( - self, - *, - page_size, - rem_swa, - rem_chunk=2048, - extend_input_len=500, - is_hybrid_swa=True, - full_available=100_000, - ): - self.mock_token_allocator.swa_available_size.return_value = rem_swa - self.mock_token_allocator.full_available_size.return_value = full_available - 
self.mock_token_allocator.available_size.return_value = full_available - self.mock_tree_cache.sliding_window_size = 128 - adder = self.create_adder( - self.create_running_batch(), - page_size=page_size, - rem_chunk_tokens=rem_chunk, - ) - adder.is_hybrid_swa = is_hybrid_swa - - req = self.create_mock_req("chunked", priority=0, max_new_tokens=128) - req.extend_input_len = extend_input_len - req.prefix_indices = [] - req.fill_ids = list(range(extend_input_len)) - req.set_extend_input_len = MagicMock() - return adder, req - - def test_add_chunked_req_hybrid_swa_reserves_page_for_alloc_extend(self): - # alloc_extend needs extend_num_tokens + page_size per request. If the - # scheduler hands out all of rem_swa_tokens, alloc_extend cannot get its - # extra page and OOMs. With the fix, extend_input_len must cap at - # rem_swa_tokens - page_size so the page is reserved. - PAGE_SIZE = 64 - REM_SWA = 100 - adder, req = self._build_hybrid_swa_chunked_req( - page_size=PAGE_SIZE, rem_swa=REM_SWA - ) - - result = adder.add_chunked_req(req) - - self.assertIs(result, req) # truncated → chunked prefill continues - req.set_extend_input_len.assert_called_once() - new_len = req.set_extend_input_len.call_args.args[0] - self.assertLessEqual(new_len + PAGE_SIZE, REM_SWA) - self.assertEqual(new_len, REM_SWA - PAGE_SIZE) - - def test_add_chunked_req_hybrid_swa_defers_when_swa_below_page(self): - # When rem_swa_tokens <= page_size there is no room to serve even the - # reservation, so the chunked req must be deferred (returned unchanged) - # instead of falling back to rem_chunk_tokens and bypassing SWA budget. 
- PAGE_SIZE = 64 - adder, req = self._build_hybrid_swa_chunked_req( - page_size=PAGE_SIZE, rem_swa=PAGE_SIZE - ) - original_len = req.extend_input_len - - result = adder.add_chunked_req(req) - - self.assertIs(result, req) - req.set_extend_input_len.assert_not_called() - self.assertEqual(req.extend_input_len, original_len) - self.assertEqual(len(adder.can_run_list), 0) - def test_swa_budget_for_req(self): cases = [ # (extend, rem_chunk, window, page, expected, label) @@ -526,24 +456,6 @@ def test_swa_budget_for_req(self): ) self.assertEqual(adder._swa_budget_for_req(extend), expected) - def test_add_chunked_req_non_hybrid_no_swa_reservation(self): - # Non-hybrid path: the SWA-pool reservation must NOT apply, otherwise - # the fix would regress non-SWA models. - PAGE_SIZE = 16 - adder, req = self._build_hybrid_swa_chunked_req( - page_size=PAGE_SIZE, - rem_swa=10, - rem_chunk=500, - extend_input_len=200, - is_hybrid_swa=False, - full_available=300, - ) - - result = adder.add_chunked_req(req) - self.assertIsNone(result) - req.set_extend_input_len.assert_called_once_with(200) - self.assertIn(req, adder.can_run_list) - if __name__ == "__main__": unittest.main() From 02b1785f0a3db25cc21e861f84b2bbedba8159ca Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Wed, 13 May 2026 22:21:39 +0800 Subject: [PATCH 31/52] Guard _handle_finished_req against PP cross-microbatch double-finalize In PP+chunked-prefill, the same Req object can sit in several in-flight mbs[*] batches because chunks are pipelined across microbatch slots. The slot that processes the last chunk's result finalizes the req (release_kv_cache nulls req_pool_idx); a sibling slot's pending result then re-enters _handle_finished_req on the same Req and trips the 'Only MambaRadixCache allow freeing before alloc' assert inside release_kv_cache. Treat 'req_pool_idx is None at finalize' as 'already released' for non-Mamba caches and skip the redundant cleanup. 
The first finalize already ran multimodal_inputs.release_features, the experts/indexer collectors, hisparse request_finished, release_kv_cache, and set completion_time. maybe_collect_customized_info is still called so per-i diagnostic capture isn't dropped. Reproduces under test_pp_long_context_prefill (128k random_input_len, random_output_len=1, --pp-size 2, chunked_prefill_size=8192, fp8 70B): PP0 raises AssertionError in scheduler_output_processor_mixin.py:657 during process_batch_result_decode. --- .../managers/scheduler_output_processor_mixin.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/python/sglang/srt/managers/scheduler_output_processor_mixin.py b/python/sglang/srt/managers/scheduler_output_processor_mixin.py index 234b56b78865..5fa5313678d5 100644 --- a/python/sglang/srt/managers/scheduler_output_processor_mixin.py +++ b/python/sglang/srt/managers/scheduler_output_processor_mixin.py @@ -641,6 +641,19 @@ def _handle_finished_req( self.decode_offload_manager.offload_kv_cache(req) if req.finished(): + # Idempotency guard for PP cross-microbatch races: in PP+chunked + # prefill the same Req object can sit in multiple in-flight + # mbs[*] batches when chunks of one req are pipelined across + # microbatch slots. The slot that processes the last chunk's + # result finalizes the req (release_kv_cache nulls req_pool_idx), + # then a sibling slot's pending result hits the same req again + # here and would trip the assert in release_kv_cache. Treat + # `req_pool_idx is None at finalize` as "already released" and + # skip the redundant cleanup; the first call already collected + # multimodal/experts/indexer/time-stats state. 
+ if req.req_pool_idx is None and not self.tree_cache.supports_mamba(): + self.maybe_collect_customized_info(i, req, logits_output) + return # delete feature to save memory if req.multimodal_inputs is not None and req.session is None: req.multimodal_inputs.release_features() From b0f21388b396b39da085e7652fc3eefb8af8179d Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Wed, 13 May 2026 23:28:27 +0800 Subject: [PATCH 32/52] Seed has_pending_chunk/is_dllm/host_hit_length on test_prefill_adder mocks v2 add_one_req (schedule_policy.py:811) reads req.has_pending_chunk and req.is_dllm() at the reuse-branch gate. MagicMock(spec=Req) only surfaces class-level attributes; has_pending_chunk is set inside Req.__init__ and was therefore unreachable on the mock, raising AttributeError under stage-b-test-1-gpu-small test_mixed_chunk_prefill_budgets. Seed the three attributes that v2 newly touches on the mock so the reuse-branch gate evaluates cleanly with is_resume=False. --- test/registered/unit/managers/test_prefill_adder.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/test/registered/unit/managers/test_prefill_adder.py b/test/registered/unit/managers/test_prefill_adder.py index 6ee5f921134e..b43f3efdf1b8 100644 --- a/test/registered/unit/managers/test_prefill_adder.py +++ b/test/registered/unit/managers/test_prefill_adder.py @@ -77,6 +77,11 @@ def create_mock_req(self, rid, priority, max_new_tokens, output_len=0, wait_time req.sampling_params = SimpleNamespace(max_new_tokens=max_new_tokens) req.time_stats = SimpleNamespace(wait_queue_entry_time=wait_time) req.finished.return_value = False + # v2 add_one_req reads these on the reuse-branch gate; MagicMock(spec=Req) + # doesn't surface attributes set only in Req.__init__, so seed them. 
+ req.has_pending_chunk = False + req.is_dllm.return_value = False + req.host_hit_length = 0 return req def create_adder(self, running_batch, **kwargs): From 33f981ce935489434f0e99e46bf45b2655bbe981 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Thu, 14 May 2026 18:12:56 +0800 Subject: [PATCH 33/52] Re-add ScheduleBatch.chunked_req marker for PP cross-mb filter exclusion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In PP+chunked-prefill, mb_a's last-chunk admit clears req.has_pending_chunk but mb_b still holds the prior middle-chunk batch in its last_batch slot. The dynamic filter_batch predicate (has_pending_chunk OR pending_middle_outputs>0) becomes False for the req before mb_a's last-chunk forward result has been processed by output_processor, so mb_b would merge the still-prefilling req into running_batch and run a decode forward on stale state — wrong logits, wrong output tokens. Restore the OLD-code idiom: stamp the batch's chunked_req at admit time from chunked_in_batch[0] (the req that was admitted as mid-prefill in this batch). filter_batch then excludes any req that is the batch's own chunked_req, not just reqs whose dynamic counters happen to be > 0 at filter time. Mirrors the behavior that existed before c445a82cf5 (Switch chunked-resume to waiting_queue holding; delete chunked_req fields); only the storage moved from Scheduler.chunked_req to per-batch — this brings back the per-batch marker without re-introducing scheduler-level chunked-aware state. 
--- python/sglang/srt/managers/schedule_batch.py | 10 ++++++++++ python/sglang/srt/managers/scheduler.py | 9 +++++++++ 2 files changed, 19 insertions(+) diff --git a/python/sglang/srt/managers/schedule_batch.py b/python/sglang/srt/managers/schedule_batch.py index 9345d8c8d243..e521ca60671a 100755 --- a/python/sglang/srt/managers/schedule_batch.py +++ b/python/sglang/srt/managers/schedule_batch.py @@ -1414,6 +1414,15 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin): # This is an optimization to reduce the overhead of the prefill check. batch_is_full: bool = False + # The chunked-resume req that was admitted into this batch as mid-prefill + # (truncated at admit time -> has_pending_chunk True). Set by the scheduler + # right after init_new; consulted by filter_batch to exclude this req from + # merging into running_batch in subsequent iters. Required for PP, where + # mb_a's last-chunk admit clears has_pending_chunk but mb_b is still holding + # a middle-chunk batch in its last_batch slot — without this per-batch + # marker, mb_b would merge the still-prefilling req into running_batch. + chunked_req: Optional[Req] = None + # Sampling info sampling_info: SamplingBatchInfo = None @@ -2441,6 +2450,7 @@ def filter_batch( self.reqs[i].has_pending_chunk or self.reqs[i].pending_middle_outputs > 0 or self.reqs[i].is_dllm() + or self.reqs[i] is self.chunked_req ) ) ] diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py index 2dd0ea94e103..897a4cc139f1 100644 --- a/python/sglang/srt/managers/scheduler.py +++ b/python/sglang/srt/managers/scheduler.py @@ -2854,6 +2854,15 @@ def _get_new_batch_prefill_raw( self.enable_overlap, self.spec_algorithm, ) + # Stamp the batch's chunked_req at admit time so subsequent filter_batch + # calls (across PP microbatches) can exclude this req from running_batch + # merging until its last chunk's forward result has been processed. 
+ # has_pending_chunk-based filtering alone is insufficient: in PP, when + # mb_a admits the last chunk (has_pending_chunk -> False) but mb_b still + # holds a middle-chunk batch in its last_batch slot, mb_b would merge + # the still-prefilling req into running_batch. + if chunked_in_batch: + new_batch.chunked_req = chunked_in_batch[0] self.max_prefill_bs = max(self.max_prefill_bs, len(can_run_list)) if self.enable_hierarchical_cache: # todo (zhiqiang): disable cuda graph execution if hicache loading triggered From 11db3a4192446432cbe0144b5b627b89024331a3 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Thu, 14 May 2026 18:26:07 +0800 Subject: [PATCH 34/52] Revert "Re-add ScheduleBatch.chunked_req marker for PP cross-mb filter exclusion" This reverts commit 33f981ce935489434f0e99e46bf45b2655bbe981. --- python/sglang/srt/managers/schedule_batch.py | 10 ---------- python/sglang/srt/managers/scheduler.py | 9 --------- 2 files changed, 19 deletions(-) diff --git a/python/sglang/srt/managers/schedule_batch.py b/python/sglang/srt/managers/schedule_batch.py index e521ca60671a..9345d8c8d243 100755 --- a/python/sglang/srt/managers/schedule_batch.py +++ b/python/sglang/srt/managers/schedule_batch.py @@ -1414,15 +1414,6 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin): # This is an optimization to reduce the overhead of the prefill check. batch_is_full: bool = False - # The chunked-resume req that was admitted into this batch as mid-prefill - # (truncated at admit time -> has_pending_chunk True). Set by the scheduler - # right after init_new; consulted by filter_batch to exclude this req from - # merging into running_batch in subsequent iters. Required for PP, where - # mb_a's last-chunk admit clears has_pending_chunk but mb_b is still holding - # a middle-chunk batch in its last_batch slot — without this per-batch - # marker, mb_b would merge the still-prefilling req into running_batch. 
- chunked_req: Optional[Req] = None - # Sampling info sampling_info: SamplingBatchInfo = None @@ -2450,7 +2441,6 @@ def filter_batch( self.reqs[i].has_pending_chunk or self.reqs[i].pending_middle_outputs > 0 or self.reqs[i].is_dllm() - or self.reqs[i] is self.chunked_req ) ) ] diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py index 897a4cc139f1..2dd0ea94e103 100644 --- a/python/sglang/srt/managers/scheduler.py +++ b/python/sglang/srt/managers/scheduler.py @@ -2854,15 +2854,6 @@ def _get_new_batch_prefill_raw( self.enable_overlap, self.spec_algorithm, ) - # Stamp the batch's chunked_req at admit time so subsequent filter_batch - # calls (across PP microbatches) can exclude this req from running_batch - # merging until its last chunk's forward result has been processed. - # has_pending_chunk-based filtering alone is insufficient: in PP, when - # mb_a admits the last chunk (has_pending_chunk -> False) but mb_b still - # holds a middle-chunk batch in its last_batch slot, mb_b would merge - # the still-prefilling req into running_batch. - if chunked_in_batch: - new_batch.chunked_req = chunked_in_batch[0] self.max_prefill_bs = max(self.max_prefill_bs, len(can_run_list)) if self.enable_hierarchical_cache: # todo (zhiqiang): disable cuda graph execution if hicache loading triggered From b3a7b9f2a10cde180f20e01fcd81ca7a76b10224 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Thu, 14 May 2026 18:39:17 +0800 Subject: [PATCH 35/52] Bump pending_middle_outputs for last-chunk admits + decrement-first output proc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PP+chunked-prefill correctness: pending_middle_outputs is the count of forwards launched but not yet output-processed. The c445 design only bumped this counter for mid-chunk admits (has_pending_chunk True after), and output proc checked the counter at entry to decide produce-output vs middle-handling. 
In PP, this drops a forward result on the floor when a sibling mb is concurrently processing a later chunk's middle forward — the LAST chunk's admit doesn't bump the counter, so a mid-chunk forward result for the same req can take the produce-output branch with stale state. Fix: - At admit, bump pending_middle_outputs for any req participating in a multi-chunk prefill: has_pending_chunk (mid-chunk this iter) OR kv_committed_len > 0 (was-resume → this iter is last chunk OR another mid). kv_committed_len here reflects the prior iter's prepare_for_extend, not this iter's. - Output proc decrements first, then produces output iff the counter has just hit zero AND has_pending_chunk is False. Otherwise this is a non-last forward in the PP pipeline; suppress the produce-output path. Filter_batch's existing pending_middle_outputs > 0 predicate now correctly excludes a mid-prefill req from running_batch merge until ALL its in-flight forwards (across mbs) have been output-processed — no per-batch chunked_req marker required, preserving the v2 design of per-Req-only chunked state. --- python/sglang/srt/managers/scheduler.py | 29 +++++++++----- .../scheduler_output_processor_mixin.py | 39 ++++++++++++++----- 2 files changed, 50 insertions(+), 18 deletions(-) diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py index 2dd0ea94e103..a6924662b23b 100644 --- a/python/sglang/srt/managers/scheduler.py +++ b/python/sglang/srt/managers/scheduler.py @@ -2824,18 +2824,29 @@ def _get_new_batch_prefill_raw( for req in adder.preempt_list: self._add_request_to_queue(req) - # Bump pending_middle_outputs (the pending_middle_outputs counter) for every - # admitted req that's still mid-prefill — output processor uses this - # to know its forward's sample is garbage. Counter semantics needed - # for PP, where multiple microbatches may admit the same req. 
- chunked_in_batch = [r for r in can_run_list if r.has_pending_chunk] + # Bump pending_middle_outputs for every admitted req whose admission is + # part of a multi-chunk prefill — both mid-chunk admits (has_pending_chunk + # stays True after this admit) AND the last-chunk admit of a previously + # chunked-resume req (kv_committed_len > 0 means a prior chunk's prepare + # already wrote to its row, so this req has been chunk-prefilled before). + # The counter is the number of forwards launched but not yet output- + # processed. Output processor decrements first, then checks whether + # this was the last pending forward; required for PP, where the LAST + # chunk's forward result may not be the last forward in flight for the + # req (a sibling mb may hold a mid-chunk forward still pipelined). + # kv_committed_len here reflects the PRIOR iter's prepare_for_extend; + # this iter's prepare_for_extend has not yet run. + chunk_admits = [ + r for r in can_run_list if r.has_pending_chunk or r.kv_committed_len > 0 + ] assert ( - len(chunked_in_batch) <= 1 - ), "single-flight invariant: at most one chunked-resume req per batch" + sum(1 for r in chunk_admits if r.has_pending_chunk) <= 1 + ), "single-flight invariant: at most one mid-chunk admit per batch" chunk_deduct = 0 - for r in chunked_in_batch: + for r in chunk_admits: r.pending_middle_outputs += 1 - chunk_deduct = r.extend_input_len + if r.has_pending_chunk: + chunk_deduct = r.extend_input_len # Record for logging prefill stats after forward self.adder = adder diff --git a/python/sglang/srt/managers/scheduler_output_processor_mixin.py b/python/sglang/srt/managers/scheduler_output_processor_mixin.py index 5fa5313678d5..d74b78d1c8cc 100644 --- a/python/sglang/srt/managers/scheduler_output_processor_mixin.py +++ b/python/sglang/srt/managers/scheduler_output_processor_mixin.py @@ -241,7 +241,22 @@ def process_batch_result_prefill( # decode req in mixed batch or retracted req continue - if req.pending_middle_outputs <= 0: + # 
Decrement-first semantics: pending_middle_outputs is the number + # of forwards launched for this req that have not yet been output- + # processed. After decrement, this is the LAST pending forward iff + # the counter hits zero AND no more chunks are coming + # (has_pending_chunk is cleared by add_one_req's last-chunk path). + # In PP, a sibling mb may hold a mid-chunk forward in its pipeline + # whose result has yet to be processed — we must not finalize the + # prefill until pmo reaches zero. + is_last_chunk_output = True + if req.pending_middle_outputs > 0: + req.pending_middle_outputs -= 1 + is_last_chunk_output = ( + req.pending_middle_outputs == 0 and not req.has_pending_chunk + ) + + if is_last_chunk_output: req.time_stats.set_prefill_finished_time() # req output_ids are set here @@ -313,11 +328,9 @@ def process_batch_result_prefill( req.grammar.finished = req.finished() else: - # being chunked reqs' prefill is not finished - req.pending_middle_outputs -= 1 - # There is only at most one request being currently chunked. - # Because this request does not finish prefill, - # we don't want to stream the request currently being chunked. + # Middle chunk forward (or non-last forward in PP pipeline): + # prefill not yet finalized; counter already decremented above. + # We don't want to stream the request currently being chunked. skip_stream_req = req # Incrementally update input logprobs. @@ -380,7 +393,17 @@ def process_batch_result_prefill( req.embedding = embeddings[i] if req.return_pooled_hidden_states and phs is not None: req.pooled_hidden_state = phs[i] - if req.pending_middle_outputs <= 0: + + # Decrement-first; mirrors the generation-model branch above. + # See that branch for the PP rationale. 
+ is_last_chunk_output = True + if req.pending_middle_outputs > 0: + req.pending_middle_outputs -= 1 + is_last_chunk_output = ( + req.pending_middle_outputs == 0 and not req.has_pending_chunk + ) + + if is_last_chunk_output: req.time_stats.set_prefill_finished_time() # Dummy output token for embedding models req.output_ids.append(0) @@ -392,8 +415,6 @@ def process_batch_result_prefill( else: maybe_cache_unfinished_req(req, self.tree_cache) else: - # being chunked reqs' prefill is not finished - req.pending_middle_outputs -= 1 req.time_stats.set_last_chunked_prefill_finish_time() self.stream_output(batch.reqs, batch.return_logprob, skip_stream_req) From e875cd36e4afbb80097cb73498fb16a157592123 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Thu, 14 May 2026 19:12:58 +0800 Subject: [PATCH 36/52] Revert "Bump pending_middle_outputs for last-chunk admits + decrement-first output proc" This reverts commit b3a7b9f2a10cde180f20e01fcd81ca7a76b10224. --- python/sglang/srt/managers/scheduler.py | 29 +++++--------- .../scheduler_output_processor_mixin.py | 39 +++++-------------- 2 files changed, 18 insertions(+), 50 deletions(-) diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py index a6924662b23b..2dd0ea94e103 100644 --- a/python/sglang/srt/managers/scheduler.py +++ b/python/sglang/srt/managers/scheduler.py @@ -2824,29 +2824,18 @@ def _get_new_batch_prefill_raw( for req in adder.preempt_list: self._add_request_to_queue(req) - # Bump pending_middle_outputs for every admitted req whose admission is - # part of a multi-chunk prefill — both mid-chunk admits (has_pending_chunk - # stays True after this admit) AND the last-chunk admit of a previously - # chunked-resume req (kv_committed_len > 0 means a prior chunk's prepare - # already wrote to its row, so this req has been chunk-prefilled before). - # The counter is the number of forwards launched but not yet output- - # processed. 
Output processor decrements first, then checks whether - # this was the last pending forward; required for PP, where the LAST - # chunk's forward result may not be the last forward in flight for the - # req (a sibling mb may hold a mid-chunk forward still pipelined). - # kv_committed_len here reflects the PRIOR iter's prepare_for_extend; - # this iter's prepare_for_extend has not yet run. - chunk_admits = [ - r for r in can_run_list if r.has_pending_chunk or r.kv_committed_len > 0 - ] + # Bump pending_middle_outputs (the pending_middle_outputs counter) for every + # admitted req that's still mid-prefill — output processor uses this + # to know its forward's sample is garbage. Counter semantics needed + # for PP, where multiple microbatches may admit the same req. + chunked_in_batch = [r for r in can_run_list if r.has_pending_chunk] assert ( - sum(1 for r in chunk_admits if r.has_pending_chunk) <= 1 - ), "single-flight invariant: at most one mid-chunk admit per batch" + len(chunked_in_batch) <= 1 + ), "single-flight invariant: at most one chunked-resume req per batch" chunk_deduct = 0 - for r in chunk_admits: + for r in chunked_in_batch: r.pending_middle_outputs += 1 - if r.has_pending_chunk: - chunk_deduct = r.extend_input_len + chunk_deduct = r.extend_input_len # Record for logging prefill stats after forward self.adder = adder diff --git a/python/sglang/srt/managers/scheduler_output_processor_mixin.py b/python/sglang/srt/managers/scheduler_output_processor_mixin.py index d74b78d1c8cc..5fa5313678d5 100644 --- a/python/sglang/srt/managers/scheduler_output_processor_mixin.py +++ b/python/sglang/srt/managers/scheduler_output_processor_mixin.py @@ -241,22 +241,7 @@ def process_batch_result_prefill( # decode req in mixed batch or retracted req continue - # Decrement-first semantics: pending_middle_outputs is the number - # of forwards launched for this req that have not yet been output- - # processed. 
After decrement, this is the LAST pending forward iff - # the counter hits zero AND no more chunks are coming - # (has_pending_chunk is cleared by add_one_req's last-chunk path). - # In PP, a sibling mb may hold a mid-chunk forward in its pipeline - # whose result has yet to be processed — we must not finalize the - # prefill until pmo reaches zero. - is_last_chunk_output = True - if req.pending_middle_outputs > 0: - req.pending_middle_outputs -= 1 - is_last_chunk_output = ( - req.pending_middle_outputs == 0 and not req.has_pending_chunk - ) - - if is_last_chunk_output: + if req.pending_middle_outputs <= 0: req.time_stats.set_prefill_finished_time() # req output_ids are set here @@ -328,9 +313,11 @@ def process_batch_result_prefill( req.grammar.finished = req.finished() else: - # Middle chunk forward (or non-last forward in PP pipeline): - # prefill not yet finalized; counter already decremented above. - # We don't want to stream the request currently being chunked. + # being chunked reqs' prefill is not finished + req.pending_middle_outputs -= 1 + # There is only at most one request being currently chunked. + # Because this request does not finish prefill, + # we don't want to stream the request currently being chunked. skip_stream_req = req # Incrementally update input logprobs. @@ -393,17 +380,7 @@ def process_batch_result_prefill( req.embedding = embeddings[i] if req.return_pooled_hidden_states and phs is not None: req.pooled_hidden_state = phs[i] - - # Decrement-first; mirrors the generation-model branch above. - # See that branch for the PP rationale. 
- is_last_chunk_output = True - if req.pending_middle_outputs > 0: - req.pending_middle_outputs -= 1 - is_last_chunk_output = ( - req.pending_middle_outputs == 0 and not req.has_pending_chunk - ) - - if is_last_chunk_output: + if req.pending_middle_outputs <= 0: req.time_stats.set_prefill_finished_time() # Dummy output token for embedding models req.output_ids.append(0) @@ -415,6 +392,8 @@ def process_batch_result_prefill( else: maybe_cache_unfinished_req(req, self.tree_cache) else: + # being chunked reqs' prefill is not finished + req.pending_middle_outputs -= 1 req.time_stats.set_last_chunked_prefill_finish_time() self.stream_output(batch.reqs, batch.return_logprob, skip_stream_req) From 5c523049dbe0ca7cbff0e87f598caaae0119aea2 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Thu, 14 May 2026 19:24:05 +0800 Subject: [PATCH 37/52] Exclude in-flight other-mb reqs in filter_batch (PP chunked-resume race) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In PP+chunked-prefill: mb_a admits the LAST chunk of req X (clearing has_pending_chunk, with pending_middle_outputs unchanged since chunked admits only bump for mid-chunk). mb_b is still holding a prior mid-chunk batch in its last_batch slot. By the time mb_b runs filter_batch on that last_batch, X's last-chunk forward result has not yet arrived in the output processor — but X's dynamic predicate fields (has_pending_chunk False, pending_middle_outputs may already be 0 from an earlier mid-chunk output proc) make filter_batch keep X. X merges into running_batch and runs a decode forward on stale state → wrong tokens. Fix: at filter_batch call sites in get_next_batch_to_run, compute the set of req rids whose forward batches are in flight in other PP microbatches (self.mbs[other_id] != self.last_batch) and pass it to filter_batch as an additional exclusion set. 
filter_batch keeps the dynamic per-Req predicate intact and only adds a transient "in-flight elsewhere" check that lives in the caller's scope — no per-batch chunked-aware state, preserving the stateless-scheduler v2 design invariant. mbs[other_id] at this point always holds the other mb's most-recently launched batch; for pp_loop_size==2, it is guaranteed to be in-flight (launched in this iter mb_(id-1)%pp step, processed at end of this iter mb_(id+1)%pp step). For pp_size==1, the set is empty and filter_batch behaves exactly as before. --- python/sglang/srt/managers/schedule_batch.py | 3 ++ python/sglang/srt/managers/scheduler.py | 47 +++++++++++++++++++- 2 files changed, 48 insertions(+), 2 deletions(-) diff --git a/python/sglang/srt/managers/schedule_batch.py b/python/sglang/srt/managers/schedule_batch.py index 9345d8c8d243..dd4dd36e1a19 100755 --- a/python/sglang/srt/managers/schedule_batch.py +++ b/python/sglang/srt/managers/schedule_batch.py @@ -2425,12 +2425,14 @@ def filter_batch( # FIXME(lsyin): deprecate this API after spec v1 is deprecated v1_spec_info_filtered: Optional[bool] = False, exclude_chunked_req: bool = False, + exclude_in_flight_other_mb: Optional[set] = None, ): # FIXME(lsyin): used here to get the correct seq_lens # The batch has been launched but we need it verified to get correct next batch info self.maybe_wait_verify_done() if keep_indices is None: + in_flight_rids = exclude_in_flight_other_mb or set() keep_indices = [ i for i in range(len(self.reqs)) @@ -2443,6 +2445,7 @@ def filter_batch( or self.reqs[i].is_dllm() ) ) + and self.reqs[i].rid not in in_flight_rids ] if keep_indices is None or len(keep_indices) == 0: diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py index 2dd0ea94e103..94f24653c088 100644 --- a/python/sglang/srt/managers/scheduler.py +++ b/python/sglang/srt/managers/scheduler.py @@ -2480,6 +2480,33 @@ def _build_hisparse_decode_batch(self, reqs): # todo hisparse, maybe other 
info to contain for the new batch return batch + def _in_flight_other_mb_rids(self) -> set: + """rids of reqs whose forward is launched in another mb but whose + result has not yet been processed by the output processor. + + Used by filter_batch on last_batch / running_batch to exclude + chunked-prefill reqs whose LAST chunk admit cleared has_pending_chunk + (and pending_middle_outputs may have been decremented to 0 by an + earlier mid-chunk forward result), but whose actual last-chunk + forward result has not yet arrived in the output processor — they + must not be merged into running_batch as decode reqs yet. + + At PP loop iter T mb_id step's filter_batch time, mbs[other_id != + mb_id] holds an in-flight forward batch (launched, not yet + processed). For pp_loop_size==2, the other mb's batch is always + in-flight at this point. Skip self.last_batch (==mbs[mb_id], the + batch being filtered itself). + """ + if self.pp_size <= 1 or not hasattr(self, "mbs"): + return set() + rids = set() + for mb in self.mbs: + if mb is None or mb is self.last_batch: + continue + for r in mb.reqs: + rids.add(r.rid) + return rids + def get_next_batch_to_run(self) -> Optional[ScheduleBatch]: if self.enable_fpm: self._fpm_batch_t0 = time.monotonic() @@ -2535,7 +2562,20 @@ def get_next_batch_to_run(self) -> Optional[ScheduleBatch]: # dropped reqs persist in self.waiting_queue (retention at # ~line 2775: `x not in can_run_set or x.has_pending_chunk`) # and re-enter via next iter's Stage A stash + admission. - self.last_batch.filter_batch(exclude_chunked_req=True) + # + # PP cross-mb: also drop reqs whose forward result is still + # pending in another mb's pipeline. 
has_pending_chunk + + # pending_middle_outputs alone do not cover the window where + # mb_a admitted the LAST chunk (clearing has_pending_chunk; not + # bumping pending_middle_outputs since chunked_in_batch only + # counts mid-chunk admits) but mb_a's forward result has not + # yet been processed — without this exclusion, mb_b would merge + # the still-prefilling req into running_batch and run a decode + # forward on stale state. + self.last_batch.filter_batch( + exclude_chunked_req=True, + exclude_in_flight_other_mb=self._in_flight_other_mb_rids(), + ) if self.last_batch.batch_size() < last_bs: self.running_batch.batch_is_full = False @@ -2558,7 +2598,10 @@ def get_next_batch_to_run(self) -> Optional[ScheduleBatch]: # shouldn't normally hold one. Keep the flag set so any leak in # that invariant doesn't survive here; the dropped req still # has its waiting_queue retention to re-admit next iter. - self.running_batch.filter_batch(exclude_chunked_req=True) + self.running_batch.filter_batch( + exclude_chunked_req=True, + exclude_in_flight_other_mb=self._in_flight_other_mb_rids(), + ) if self.running_batch.is_empty(): self.running_batch.batch_is_full = False From 45347ca3a32590985dead61a5b818734b40d8516 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Thu, 14 May 2026 21:14:58 +0800 Subject: [PATCH 38/52] Revert "Exclude in-flight other-mb reqs in filter_batch (PP chunked-resume race)" This reverts commit 5c523049dbe0ca7cbff0e87f598caaae0119aea2. 
--- python/sglang/srt/managers/schedule_batch.py | 3 -- python/sglang/srt/managers/scheduler.py | 47 +------------------- 2 files changed, 2 insertions(+), 48 deletions(-) diff --git a/python/sglang/srt/managers/schedule_batch.py b/python/sglang/srt/managers/schedule_batch.py index dd4dd36e1a19..9345d8c8d243 100755 --- a/python/sglang/srt/managers/schedule_batch.py +++ b/python/sglang/srt/managers/schedule_batch.py @@ -2425,14 +2425,12 @@ def filter_batch( # FIXME(lsyin): deprecate this API after spec v1 is deprecated v1_spec_info_filtered: Optional[bool] = False, exclude_chunked_req: bool = False, - exclude_in_flight_other_mb: Optional[set] = None, ): # FIXME(lsyin): used here to get the correct seq_lens # The batch has been launched but we need it verified to get correct next batch info self.maybe_wait_verify_done() if keep_indices is None: - in_flight_rids = exclude_in_flight_other_mb or set() keep_indices = [ i for i in range(len(self.reqs)) @@ -2445,7 +2443,6 @@ def filter_batch( or self.reqs[i].is_dllm() ) ) - and self.reqs[i].rid not in in_flight_rids ] if keep_indices is None or len(keep_indices) == 0: diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py index 94f24653c088..2dd0ea94e103 100644 --- a/python/sglang/srt/managers/scheduler.py +++ b/python/sglang/srt/managers/scheduler.py @@ -2480,33 +2480,6 @@ def _build_hisparse_decode_batch(self, reqs): # todo hisparse, maybe other info to contain for the new batch return batch - def _in_flight_other_mb_rids(self) -> set: - """rids of reqs whose forward is launched in another mb but whose - result has not yet been processed by the output processor. 
- - Used by filter_batch on last_batch / running_batch to exclude - chunked-prefill reqs whose LAST chunk admit cleared has_pending_chunk - (and pending_middle_outputs may have been decremented to 0 by an - earlier mid-chunk forward result), but whose actual last-chunk - forward result has not yet arrived in the output processor — they - must not be merged into running_batch as decode reqs yet. - - At PP loop iter T mb_id step's filter_batch time, mbs[other_id != - mb_id] holds an in-flight forward batch (launched, not yet - processed). For pp_loop_size==2, the other mb's batch is always - in-flight at this point. Skip self.last_batch (==mbs[mb_id], the - batch being filtered itself). - """ - if self.pp_size <= 1 or not hasattr(self, "mbs"): - return set() - rids = set() - for mb in self.mbs: - if mb is None or mb is self.last_batch: - continue - for r in mb.reqs: - rids.add(r.rid) - return rids - def get_next_batch_to_run(self) -> Optional[ScheduleBatch]: if self.enable_fpm: self._fpm_batch_t0 = time.monotonic() @@ -2562,20 +2535,7 @@ def get_next_batch_to_run(self) -> Optional[ScheduleBatch]: # dropped reqs persist in self.waiting_queue (retention at # ~line 2775: `x not in can_run_set or x.has_pending_chunk`) # and re-enter via next iter's Stage A stash + admission. - # - # PP cross-mb: also drop reqs whose forward result is still - # pending in another mb's pipeline. has_pending_chunk + - # pending_middle_outputs alone do not cover the window where - # mb_a admitted the LAST chunk (clearing has_pending_chunk; not - # bumping pending_middle_outputs since chunked_in_batch only - # counts mid-chunk admits) but mb_a's forward result has not - # yet been processed — without this exclusion, mb_b would merge - # the still-prefilling req into running_batch and run a decode - # forward on stale state. 
- self.last_batch.filter_batch( - exclude_chunked_req=True, - exclude_in_flight_other_mb=self._in_flight_other_mb_rids(), - ) + self.last_batch.filter_batch(exclude_chunked_req=True) if self.last_batch.batch_size() < last_bs: self.running_batch.batch_is_full = False @@ -2598,10 +2558,7 @@ def get_next_batch_to_run(self) -> Optional[ScheduleBatch]: # shouldn't normally hold one. Keep the flag set so any leak in # that invariant doesn't survive here; the dropped req still # has its waiting_queue retention to re-admit next iter. - self.running_batch.filter_batch( - exclude_chunked_req=True, - exclude_in_flight_other_mb=self._in_flight_other_mb_rids(), - ) + self.running_batch.filter_batch(exclude_chunked_req=True) if self.running_batch.is_empty(): self.running_batch.batch_is_full = False From 69ef71edc45c80958386229fc1d7bed2875ab70a Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Thu, 14 May 2026 21:29:55 +0800 Subject: [PATCH 39/52] Conditionally exclude in-flight other-mb chunked-resume reqs (PP, max_new_tokens > 1) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In PP+chunked-prefill, mb_a's LAST chunk admit clears has_pending_chunk on the req while mb_a's chunk forward result is still in flight in another mb's pipeline. Without exclusion, mb_b's filter_batch merges the req into running_batch and mb_b's decode forward writes WRONG K,V at row position N (input falls back to origin[-1] since req.output_ids is empty at that point). For max_new_tokens > 1, the wrong K,V at N persists in the KV pool and corrupts every subsequent decode position. For max_new_tokens == 1, the wrong decode result is filtered by the req.finished() check in the output processor BEFORE being appended to output_ids, and the wrong K,V at N is released with the rest of the row when the req finishes — no observable effect. Excluding such reqs would delay them by 1 mb step for no correctness gain, so we skip them. 
This conditional preserves the parallelism that test_pp_long_context_prefill (output_len=1) relies on while still fixing PP gsm8k correctness (max_new_tokens=512+, score 0.66 -> 0.77). The fix uses no per-batch or scheduler-level chunked-aware state — only a transient set of rids computed at filter_batch call time from self.mbs. Reads mbs[other_id != current_mb_id] which holds the other mb's most recently launched batch; for pp_loop_size==2, that batch is guaranteed to still be in flight (launched in the current iter's mb_(id-1)%pp step, processed at end of the current iter's mb_(id+1)%pp step). --- python/sglang/srt/managers/schedule_batch.py | 3 ++ python/sglang/srt/managers/scheduler.py | 49 +++++++++++++++++++- 2 files changed, 50 insertions(+), 2 deletions(-) diff --git a/python/sglang/srt/managers/schedule_batch.py b/python/sglang/srt/managers/schedule_batch.py index 9345d8c8d243..dd4dd36e1a19 100755 --- a/python/sglang/srt/managers/schedule_batch.py +++ b/python/sglang/srt/managers/schedule_batch.py @@ -2425,12 +2425,14 @@ def filter_batch( # FIXME(lsyin): deprecate this API after spec v1 is deprecated v1_spec_info_filtered: Optional[bool] = False, exclude_chunked_req: bool = False, + exclude_in_flight_other_mb: Optional[set] = None, ): # FIXME(lsyin): used here to get the correct seq_lens # The batch has been launched but we need it verified to get correct next batch info self.maybe_wait_verify_done() if keep_indices is None: + in_flight_rids = exclude_in_flight_other_mb or set() keep_indices = [ i for i in range(len(self.reqs)) @@ -2443,6 +2445,7 @@ def filter_batch( or self.reqs[i].is_dllm() ) ) + and self.reqs[i].rid not in in_flight_rids ] if keep_indices is None or len(keep_indices) == 0: diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py index 2dd0ea94e103..56215174adaa 100644 --- a/python/sglang/srt/managers/scheduler.py +++ b/python/sglang/srt/managers/scheduler.py @@ -2480,6 +2480,41 @@ def 
_build_hisparse_decode_batch(self, reqs): # todo hisparse, maybe other info to contain for the new batch return batch + def _in_flight_other_mb_rids(self) -> set: + """rids of reqs whose chunked-prefill forward is launched in another + PP microbatch but whose result has not yet been processed by the + output processor — AND for which a follow-up decode would actually + propagate corruption (max_new_tokens > 1). + + In PP+chunked-prefill, mb_a's LAST chunk admit clears has_pending_chunk + on the req while mb_a's chunk forward result is still in flight. If + mb_b's filter_batch merges this req into running_batch, mb_b's decode + forward runs on stale state — input falls back to origin[-1] and + writes WRONG K,V at row position N. The wrong K,V at N persists in + the KV pool and corrupts every subsequent decode position. + + For req.sampling_params.max_new_tokens == 1, the wrong decode result + is filtered by `req.finished()` (line ~240) before being appended, + and the wrong K,V at N is released with the rest of the row when + the req finishes — no observable effect. Excluding such reqs would + delay them by 1 mb step for no correctness gain, so we skip them + here and only return rids of reqs that genuinely need protection. + """ + if self.pp_size <= 1 or not hasattr(self, "mbs"): + return set() + rids = set() + for mb in self.mbs: + if mb is None or mb is self.last_batch: + continue + for r in mb.reqs: + # max_new_tokens is normalized to a non-None int in + # _prepare_input_for_image_request / similar paths during + # request admission, but defensively handle missing/zero. 
+ max_new = r.sampling_params.max_new_tokens or 0 + if max_new > 1: + rids.add(r.rid) + return rids + def get_next_batch_to_run(self) -> Optional[ScheduleBatch]: if self.enable_fpm: self._fpm_batch_t0 = time.monotonic() @@ -2535,7 +2570,14 @@ def get_next_batch_to_run(self) -> Optional[ScheduleBatch]: # dropped reqs persist in self.waiting_queue (retention at # ~line 2775: `x not in can_run_set or x.has_pending_chunk`) # and re-enter via next iter's Stage A stash + admission. - self.last_batch.filter_batch(exclude_chunked_req=True) + # + # PP cross-mb: also drop reqs whose LAST chunk forward is still + # in flight in another mb (when more decodes will follow — i.e., + # max_new_tokens > 1). See _in_flight_other_mb_rids for rationale. + self.last_batch.filter_batch( + exclude_chunked_req=True, + exclude_in_flight_other_mb=self._in_flight_other_mb_rids(), + ) if self.last_batch.batch_size() < last_bs: self.running_batch.batch_is_full = False @@ -2558,7 +2600,10 @@ def get_next_batch_to_run(self) -> Optional[ScheduleBatch]: # shouldn't normally hold one. Keep the flag set so any leak in # that invariant doesn't survive here; the dropped req still # has its waiting_queue retention to re-admit next iter. - self.running_batch.filter_batch(exclude_chunked_req=True) + self.running_batch.filter_batch( + exclude_chunked_req=True, + exclude_in_flight_other_mb=self._in_flight_other_mb_rids(), + ) if self.running_batch.is_empty(): self.running_batch.batch_is_full = False From 14adb095469b1dc69e95d5036a7d1ec30b6b5fba Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Tue, 19 May 2026 23:00:49 +0800 Subject: [PATCH 40/52] Rename inflight_middle_chunks -> pending_middle_outputs (revert upstream rename) The name 'pending_middle_outputs' more precisely describes what the counter tracks: middle-block prefill forwards that are admitted but not yet output-processed (output processor uses it to decide whether this forward's sample is real (==0) or garbage (>0)). 
Restore the local-branch name across all call sites. --- python/sglang/srt/disaggregation/prefill.py | 4 ++-- python/sglang/srt/dllm/mixin/scheduler.py | 6 +++--- python/sglang/srt/managers/schedule_batch.py | 10 +++++----- python/sglang/srt/managers/schedule_policy.py | 2 +- python/sglang/srt/managers/scheduler.py | 12 ++++++------ .../scheduler_components/batch_result_processor.py | 8 ++++---- test/registered/unit/managers/test_hisparse_unit.py | 2 +- 7 files changed, 22 insertions(+), 22 deletions(-) diff --git a/python/sglang/srt/disaggregation/prefill.py b/python/sglang/srt/disaggregation/prefill.py index c1fdb96aeb3e..6891603e4fa8 100644 --- a/python/sglang/srt/disaggregation/prefill.py +++ b/python/sglang/srt/disaggregation/prefill.py @@ -514,7 +514,7 @@ def process_batch_result_disagg_prefill( for i, (req, next_token_id) in enumerate( zip(batch.reqs, next_token_ids, strict=True) ): - if req.inflight_middle_chunks <= 0: + if req.pending_middle_outputs <= 0: req.time_stats.set_prefill_finished_time() # There is no output_ids for prefill @@ -564,7 +564,7 @@ def process_batch_result_disagg_prefill( req.grammar.finished = req.finished() else: # being chunked reqs' prefill is not finished - req.inflight_middle_chunks -= 1 + req.pending_middle_outputs -= 1 if req.return_logprob: extend_logprob_start_len = extend_logprob_start_len_per_req[i] diff --git a/python/sglang/srt/dllm/mixin/scheduler.py b/python/sglang/srt/dllm/mixin/scheduler.py index b438bb2e583b..37110c315e62 100644 --- a/python/sglang/srt/dllm/mixin/scheduler.py +++ b/python/sglang/srt/dllm/mixin/scheduler.py @@ -200,7 +200,7 @@ def _update_state_for_batch( if can_run_list: self.dllm_manager.add_staging_reqs(can_run_list) - self.dllm_manager.increment_inflight_middle_chunks() + self.dllm_manager.increment_pending_middle_outputs() self.adder = adder self.can_run_list = can_run_list @@ -337,10 +337,10 @@ def is_empty(self) -> bool: return True return len(self.waiting_queue) == 0 - def 
increment_inflight_middle_chunks(self) -> None: + def increment_pending_middle_outputs(self) -> None: """Increment chunked count for all staging requests.""" for req in self.staging_queue: - req.inflight_middle_chunks += 1 + req.pending_middle_outputs += 1 def filter_finished_reqs(self) -> None: """Remove finished requests from both queues.""" diff --git a/python/sglang/srt/managers/schedule_batch.py b/python/sglang/srt/managers/schedule_batch.py index e0d3f42730ac..7e8734c756ba 100755 --- a/python/sglang/srt/managers/schedule_batch.py +++ b/python/sglang/srt/managers/schedule_batch.py @@ -791,13 +791,13 @@ def __init__( # req in flight concurrently. In non-PP, oscillates 0/1 within each # iter. Used by output_processor to know whether this forward's # sample is real (==0) or garbage (>0). - self.inflight_middle_chunks = 0 + self.pending_middle_outputs = 0 # Persistent (cross-iter) flag set by admission when this req's # current admission was truncated (more chunks remain). Cleared # when last chunk is admitted (truncated=False) or on retract. # Used by Stage A stash detection, filter_batch exclusion, and - # add_one_req's reuse-vs-fresh branch. Independent of inflight_middle_chunks + # add_one_req's reuse-vs-fresh branch. Independent of pending_middle_outputs # counter (transient) and kv_committed_len (derived). 
self.has_pending_chunk = False @@ -1292,7 +1292,7 @@ def reset_for_retract(self): self.temp_input_top_logprobs_val = None self.temp_input_top_logprobs_idx = None self.extend_logprob_start_len = 0 - self.inflight_middle_chunks = 0 + self.pending_middle_outputs = 0 self.has_pending_chunk = False self.mamba_pool_idx = None self.mamba_ping_pong_track_buffer = None @@ -2501,7 +2501,7 @@ def filter_batch( exclude_chunked_req and ( self.reqs[i].has_pending_chunk - or self.reqs[i].inflight_middle_chunks > 0 + or self.reqs[i].pending_middle_outputs > 0 or self.reqs[i].is_dllm() ) ) @@ -2585,7 +2585,7 @@ def merge_batch(self, other: "ScheduleBatch"): # the full exclude_chunked_req predicate so PP middle-chunk and DLLM # staging reqs are also caught here. assert not any( - r.has_pending_chunk or r.inflight_middle_chunks > 0 or r.is_dllm() + r.has_pending_chunk or r.pending_middle_outputs > 0 or r.is_dllm() for r in other.reqs ) diff --git a/python/sglang/srt/managers/schedule_policy.py b/python/sglang/srt/managers/schedule_policy.py index b0564f33be78..c9c4a3636437 100644 --- a/python/sglang/srt/managers/schedule_policy.py +++ b/python/sglang/srt/managers/schedule_policy.py @@ -959,7 +959,7 @@ def add_one_req(self, req: Req, truncation_align_size: Optional[int]): truncated = True # has_pending_chunk: persistent flag carrying chunked-resume state - # across iters. DLLM uses its own staging_queue + inflight_middle_chunks counter. + # across iters. DLLM uses its own staging_queue + pending_middle_outputs counter. 
if not req.is_dllm(): req.has_pending_chunk = truncated diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py index b974bdd763b6..7e33f060c999 100644 --- a/python/sglang/srt/managers/scheduler.py +++ b/python/sglang/srt/managers/scheduler.py @@ -1022,7 +1022,7 @@ def init_chunked_prefill(self): elif self.chunked_prefill_size is not None and self.chunked_prefill_size <= 0: self.chunked_prefill_size = None # Chunked-resume tracking is now per-req (Req.has_pending_chunk + - # inflight_middle_chunks counter); the scheduler no longer holds a global pointer. + # pending_middle_outputs counter); the scheduler no longer holds a global pointer. # Stage A stashes any waiting_queue req with has_pending_chunk; cache # impls bound row reads by kv_committed_len so a stash after # init_next_round_input is safe without the old gate. @@ -2651,7 +2651,7 @@ def _get_new_batch_prefill_raw( for req in adder.preempt_list: self._add_request_to_queue(req) - # Bump inflight_middle_chunks for every admitted req that's still + # Bump pending_middle_outputs for every admitted req that's still # mid-prefill — output processor uses this to know its forward's # sample is garbage. Counter semantics needed for PP, where multiple # microbatches may admit the same req. @@ -2661,7 +2661,7 @@ def _get_new_batch_prefill_raw( ), "single-flight invariant: at most one chunked-resume req per batch" chunk_deduct = 0 for r in chunked_in_batch: - r.inflight_middle_chunks += 1 + r.pending_middle_outputs += 1 chunk_deduct = r.extend_input_len set_time_batch(can_run_list, "set_forward_entry_time") @@ -3463,11 +3463,11 @@ def abort_request(self, recv_req: AbortReq): # 'in batch'. Each mb's forward was launched against the req's # req_pool_idx + KV slots; the output processor on a different mb # iteration consumes the result later. 
Without this, a chunked-resume - # req with inflight_middle_chunks > 0 sitting in waiting_queue would + # req with pending_middle_outputs > 0 sitting in waiting_queue would # fall into the waiting-only abort path, release_kv_cache would free # the row + KV underneath the still-launched forward, and the delayed # output processor would crash on a None req_pool_idx (or, with - # inflight_middle_chunks cleared to 0, mistake the middle-chunk + # pending_middle_outputs cleared to 0, mistake the middle-chunk # result for a full output and append garbage tokens). if self.pp_size > 1 and hasattr(self, "mbs"): for mb_list in (self.mbs, self.last_mbs, self.running_mbs): @@ -3515,7 +3515,7 @@ def abort_request(self, recv_req: AbortReq): # Defensive: clear pending-chunk flags on the orphaned req so a # stale reference can't trigger Stage A re-stash of the freed row. req.has_pending_chunk = False - req.inflight_middle_chunks = 0 + req.pending_middle_outputs = 0 logger.debug(f"Abort queued request. {req.rid=}") # Delete the requests in the grammar queue diff --git a/python/sglang/srt/managers/scheduler_components/batch_result_processor.py b/python/sglang/srt/managers/scheduler_components/batch_result_processor.py index ce52385987a4..6a88e72b8be2 100644 --- a/python/sglang/srt/managers/scheduler_components/batch_result_processor.py +++ b/python/sglang/srt/managers/scheduler_components/batch_result_processor.py @@ -215,7 +215,7 @@ def process_batch_result_prefill( # decode req in mixed batch or retracted req continue - if req.inflight_middle_chunks <= 0: + if req.pending_middle_outputs <= 0: req.time_stats.set_prefill_finished_time() # req output_ids are set here @@ -264,7 +264,7 @@ def process_batch_result_prefill( else: # being chunked reqs' prefill is not finished - req.inflight_middle_chunks -= 1 + req.pending_middle_outputs -= 1 # There is only at most one request being currently chunked. 
# Because this request does not finish prefill, # we don't want to stream the request currently being chunked. @@ -304,7 +304,7 @@ def process_batch_result_prefill( req.embedding = embeddings[i] if req.return_pooled_hidden_states and phs is not None: req.pooled_hidden_state = phs[i] - if req.inflight_middle_chunks <= 0: + if req.pending_middle_outputs <= 0: req.time_stats.set_prefill_finished_time() # Dummy output token for embedding models req.output_ids.append(0) @@ -317,7 +317,7 @@ def process_batch_result_prefill( maybe_cache_unfinished_req(req, self.tree_cache) else: # being chunked reqs' prefill is not finished - req.inflight_middle_chunks -= 1 + req.pending_middle_outputs -= 1 req.time_stats.set_last_chunked_prefill_finish_time() self.output_streamer.stream_output( diff --git a/test/registered/unit/managers/test_hisparse_unit.py b/test/registered/unit/managers/test_hisparse_unit.py index 56fc32a1620e..d5d272f91e53 100644 --- a/test/registered/unit/managers/test_hisparse_unit.py +++ b/test/registered/unit/managers/test_hisparse_unit.py @@ -52,7 +52,7 @@ def _make_req(rid="test-req-0", origin_input_ids=None, output_ids=None): finished_reason=None, hisparse_staging=False, staging=False, - inflight_middle_chunks=0, + pending_middle_outputs=0, ) req.finished = lambda: req.finished_reason is not None req.set_extend_input_len = lambda extend_input_len: setattr( From be72b26f7ecc2131fd10bbc9a5cf09299fc62e8f Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Tue, 19 May 2026 23:11:54 +0800 Subject: [PATCH 41/52] Fix Scheduler.pp_size refs: use ps.pp_size after ParallelState refactor Upstream PR #25444 moved Scheduler.pp_size onto a frozen ParallelState container (self.ps.pp_size). My branch's chunked-resume PP code still referenced the old direct attribute, causing AttributeError: 'Scheduler' object has no attribute 'pp_size' in _in_flight_other_mb_rids and abort_request. 
--- python/sglang/srt/managers/scheduler.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py index 7e33f060c999..dd3375971ab8 100644 --- a/python/sglang/srt/managers/scheduler.py +++ b/python/sglang/srt/managers/scheduler.py @@ -2278,7 +2278,7 @@ def _in_flight_other_mb_rids(self) -> set: delay them by 1 mb step for no correctness gain, so we skip them here and only return rids of reqs that genuinely need protection. """ - if self.pp_size <= 1 or not hasattr(self, "mbs"): + if self.ps.pp_size <= 1 or not hasattr(self, "mbs"): return set() rids = set() for mb in self.mbs: @@ -3469,7 +3469,7 @@ def abort_request(self, recv_req: AbortReq): # output processor would crash on a None req_pool_idx (or, with # pending_middle_outputs cleared to 0, mistake the middle-chunk # result for a full output and append garbage tokens). - if self.pp_size > 1 and hasattr(self, "mbs"): + if self.ps.pp_size > 1 and hasattr(self, "mbs"): for mb_list in (self.mbs, self.last_mbs, self.running_mbs): for mb in mb_list: if mb is not None and not mb.is_empty(): From 2a07502c3f49c74254bfe2523094ff29d5b7d177 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Wed, 27 May 2026 11:45:03 +0800 Subject: [PATCH 42/52] Refactor: Introduce Scheduler.active_reqs ownership tracker (C1) Adds a by-rid dict tracking sync-mode reqs scheduler currently owns the lifecycle of (admitted, not finished, not retracted). Runs as a parallel tracker alongside existing waiting_queue / running_batch without changing scheduler behavior. DEBUG_INVARIANTS=1 enables _assert_invariants checks at get_next_batch_to_run boundaries. Part of waiting_queue refactor plan, commit 1/7. See agent-drafts/ 2026-05-25-waiting-queue-refactor-plan.md. 
--- python/sglang/srt/managers/scheduler.py | 92 +++++++++++++++++++ .../batch_result_processor.py | 9 ++ 2 files changed, 101 insertions(+) diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py index 72f2f413007c..614a5ba77a6a 100644 --- a/python/sglang/srt/managers/scheduler.py +++ b/python/sglang/srt/managers/scheduler.py @@ -698,6 +698,7 @@ def __init__( ), output_streamer=self.output_streamer, abort_request=self.abort_request, + deactivate_req=self._deactivate, ) self.is_initializing = False @@ -996,6 +997,12 @@ def init_model_worker(self): def init_running_status(self): self.waiting_queue: List[Req] = [] + # By-rid ownership tracker for sync-mode reqs the scheduler currently + # owns the lifecycle of (admitted, not finished, not retracted). Runs + # as a parallel tracker alongside waiting_queue / running_batch.reqs / + # chunked retention without changing scheduler behavior. See + # agent-drafts/2026-05-25-waiting-queue-refactor-plan.md (C1). + self.active_reqs: Dict[str, Req] = {} # The running decoding batch for continuous batching self.running_batch: ScheduleBatch = ScheduleBatch(reqs=[], batch_is_full=False) # The current forward batch @@ -1013,6 +1020,60 @@ def init_running_status(self): self.forward_sleep_time = None self._engine_paused = False + def _activate(self, req: Req) -> None: + """Mark req as entering active lifecycle (initial admission). + + Caller must ensure req.rid is not already in active_reqs (chunked-resume + re-admit is filtered at the call site). See refactor plan §C1. + """ + assert req.rid not in self.active_reqs, f"already active: {req.rid}" + self.active_reqs[req.rid] = req + + def _deactivate(self, req: Req) -> None: + """Mark req as leaving active lifecycle (finish / abort / retract). + + Important: this function ONLY pops from active_reqs dict. 
+ - Does not clear req.req_pool_idx: batch_result_processor.py:774-787 PP + cross-mb idempotency guard relies on it as an "already released" + sentinel. + - Does not clear req.has_pending_chunk / req.pending_middle_outputs: + owned by the semantic finish/abort/retract sites. + - Does not call release_kv_cache: that is the responsibility of + release_req / abort / finish paths. + This function only answers "scheduler no longer owns this req's + lifecycle". + """ + self.active_reqs.pop(req.rid, None) + + def _assert_invariants(self) -> None: + """Debug-only invariant checks for active_reqs ownership tracking. + + Gated by DEBUG_INVARIANTS=1 to avoid slowing down normal runs. Skipped + in disagg modes (Q1=(c): disagg has its own ownership model). + """ + if not os.environ.get("DEBUG_INVARIANTS"): + return + if self.disaggregation_mode != DisaggregationMode.NULL: + return + waiting_rids = {r.rid for r in self.waiting_queue} + active_rids = set(self.active_reqs.keys()) + running_rids = {r.rid for r in self.running_batch.reqs} + + # sync mode: chunked-resume reqs still live in waiting_queue until C4 + # deletes the retention. Relax waiting ∩ active here: any rid in the + # intersection must be a chunked-resume req. 
+ intersection_rids = waiting_rids & active_rids + for rid in intersection_rids: + assert self.active_reqs[ + rid + ].has_pending_chunk, ( + f"{rid} in both waiting and active but not chunked-resume" + ) + + assert ( + running_rids <= active_rids + ), f"running not subset of active: {running_rids - active_rids}" + def init_chunked_prefill(self): self.chunked_prefill_size = self.server_args.chunked_prefill_size uses_transformers_backend = ( @@ -2331,6 +2392,7 @@ def _in_flight_other_mb_rids(self) -> set: return rids def get_next_batch_to_run(self) -> Optional[ScheduleBatch]: + self._assert_invariants() if self.enable_fpm: self._fpm_batch_t0 = time.monotonic() self._abort_on_waiting_timeout() @@ -2467,6 +2529,7 @@ def get_next_batch_to_run(self) -> Optional[ScheduleBatch]: if self.enable_fpm: ret.fpm_start_time = self._fpm_batch_t0 + self._assert_invariants() return ret def get_num_allocatable_reqs(self, running_bs): @@ -2676,6 +2739,14 @@ def _get_new_batch_prefill_raw( if len(can_run_list) == 0: return None + # audit A1: mark newly-admitted reqs as active. Filter chunked-resume + # re-admit (already active from a prior iter) — _activate's internal + # assert would trip otherwise. Filter can be removed once C3+C4 move + # chunked-resume out of the main admission loop. + for req in can_run_list: + if req.rid not in self.active_reqs: + self._activate(req) + # Drop admitted reqs from waiting_queue, but KEEP chunked-resume reqs # (has_pending_chunk == True after admission) so they stay at the head # for the next iter's stash + admission. Single-flight is preserved @@ -2686,6 +2757,10 @@ def _get_new_batch_prefill_raw( ] if adder.preempt_list: for req in adder.preempt_list: + # audit R2: PrefillAdder.preempt_to_schedule already released + # the victim's resources via running_batch.release_req. Drop + # from active_reqs before re-enqueueing as a waiting req. 
+ self._deactivate(req) self._add_request_to_queue(req) # Bump pending_middle_outputs for every admitted req that's still @@ -2859,7 +2934,13 @@ def update_running_batch(self, batch: ScheduleBatch) -> Optional[ScheduleBatch]: ) logger.warning(msg_prefix + msg_details) + # audit R1: retract_decode released row + KV via release_req for + # both retracted_reqs (re-enqueued as waiting) and reqs_to_abort + # (final OOM eviction). Drop both from active_reqs. + for req in reqs_to_abort: + self._deactivate(req) for req in retracted_reqs: + self._deactivate(req) self._add_request_to_queue(req, is_retracted=True) else: self.new_token_ratio_tracker.decay_step() @@ -3567,6 +3648,9 @@ def abort_request(self, recv_req: AbortReq): # stale reference can't trigger Stage A re-stash of the freed row. req.has_pending_chunk = False req.pending_middle_outputs = 0 + # audit D6: orphan release in waiting_queue (sync mode mamba or + # chunked-resume mid-prefill); drop from active set. + self._deactivate(req) logger.debug(f"Abort queued request. {req.rid=}") # Delete the requests in the grammar queue @@ -3676,7 +3760,11 @@ def pause_generation(self, recv_req: PauseGenerationReqInput): self.running_batch.filter_batch(v1_spec_info_filtered=True) if len(self.running_batch.reqs) != 0: retracted_reqs = self.running_batch.retract_all(self.server_args) + # audit R3: retract_all released resources via release_req + # for every running req; drop from active_reqs before + # re-enqueueing as waiting. for req in retracted_reqs: + self._deactivate(req) self._add_request_to_queue(req) self.running_batch.batch_is_full = False @@ -3702,6 +3790,10 @@ def pause_generation(self, recv_req: PauseGenerationReqInput): req.disagg_kv_sender = None release_kv_cache(req, self.tree_cache, is_insert=False) req.reset_for_retract() + # audit D7: chunked-resume req released via reset_for_retract + # stays in waiting_queue for re-prefill but no longer holds + # row/KV, so it leaves the active set. 
+ self._deactivate(req) def continue_generation(self, recv_req: ContinueGenerationReqInput): if recv_req.torch_empty_cache: diff --git a/python/sglang/srt/managers/scheduler_components/batch_result_processor.py b/python/sglang/srt/managers/scheduler_components/batch_result_processor.py index 9385d2d03d4c..06f8aa29826e 100644 --- a/python/sglang/srt/managers/scheduler_components/batch_result_processor.py +++ b/python/sglang/srt/managers/scheduler_components/batch_result_processor.py @@ -78,6 +78,7 @@ class SchedulerBatchResultProcessor: logprob_result_processor: "SchedulerLogprobResultProcessor" output_streamer: "SchedulerOutputStreamer" abort_request: Callable + deactivate_req: Callable def process_batch_result_prebuilt(self, batch: ScheduleBatch): assert self.disaggregation_mode == DisaggregationMode.DECODE @@ -231,6 +232,8 @@ def process_batch_result_prefill( self._maybe_collect_routed_experts(req) self._maybe_collect_indexer_topk(req) release_kv_cache(req, self.tree_cache) + # audit D1: sync prefill finish + self.deactivate_req(req) req.time_stats.set_completion_time() elif not batch.decoding_reqs or req not in batch.decoding_reqs: maybe_cache_unfinished_req(req, self.tree_cache) @@ -315,6 +318,8 @@ def process_batch_result_prefill( if req.finished(): release_kv_cache(req, self.tree_cache) + # audit D2: embedding/reward prefill finish + self.deactivate_req(req) req.time_stats.set_completion_time() else: maybe_cache_unfinished_req(req, self.tree_cache) @@ -800,6 +805,10 @@ def _handle_finished_req( if self.server_args.enable_hisparse: self.hisparse_coordinator.request_finished(req) release_kv_cache(req, self.tree_cache) + # audit D3: sync decode finish (non-offload path). The DECODE + # offload branch (D4) does not call _deactivate — disagg DECODE + # is not in active_reqs (Q1=(c)). 
+ self.deactivate_req(req) req.time_stats.set_completion_time() From c8cb8eed9d0772f03acf4f627837621475c18791 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Wed, 27 May 2026 14:14:04 +0800 Subject: [PATCH 43/52] Refactor: Scan chunked_reqs() in Stage A instead of waiting_queue (C2) Eliminates H3 hack (Stage A scanning the full waiting_queue to find chunked-resume reqs). Now scans the chunked_reqs() view derived from active_reqs. Behavior identical to C1 because C1's retention keeps waiting_queue and active_reqs in sync for chunked-resume reqs. Part of waiting_queue refactor plan, commit 2/7. --- python/sglang/srt/managers/scheduler.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py index 614a5ba77a6a..935d27c36951 100644 --- a/python/sglang/srt/managers/scheduler.py +++ b/python/sglang/srt/managers/scheduler.py @@ -25,7 +25,7 @@ from contextlib import contextmanager, nullcontext from functools import partial from http import HTTPStatus -from typing import Any, Deque, Dict, List, Optional, Tuple, Union +from typing import Any, Deque, Dict, Iterable, List, Optional, Tuple, Union from sglang.srt.utils.common import suppress_noisy_warnings @@ -1074,6 +1074,12 @@ def _assert_invariants(self) -> None: running_rids <= active_rids ), f"running not subset of active: {running_rids - active_rids}" + def chunked_reqs(self) -> Iterable[Req]: + """active_reqs 中 has_pending_chunk=True 的派生 view。 + Single-flight 不变量(Q5,§7-Q5):len(list(chunked_reqs())) <= 1, + 在 _get_new_batch_prefill_raw 顶端断言(C3 引入)。""" + return (r for r in self.active_reqs.values() if r.has_pending_chunk) + def init_chunked_prefill(self): self.chunked_prefill_size = self.server_args.chunked_prefill_size uses_transformers_backend = ( @@ -2414,8 +2420,11 @@ def get_next_batch_to_run(self) -> Optional[ScheduleBatch]: # for the duration of the scheduling pass. 
vLLM / TokenSpeed do not # need this because their admission reads a single monotone counter # (num_computed_tokens / FSM state), not a prefix-indices splice. - for req in self.waiting_queue: - if req.has_pending_chunk and not req.is_dllm(): + # audit P1: Stage A — stash chunked-resume KV into radix tree at iter + # boundary. Switch from scanning waiting_queue (H3 hack) to iterating + # the chunked_reqs() view directly. + for req in self.chunked_reqs(): + if not req.is_dllm(): maybe_cache_unfinished_req(req, self.tree_cache, chunked=True) if self.dllm_config is not None and self.dllm_manager.any_staging_reqs(): From 0810ca8a269d2863f2c2915da8aee7b1c24c30b9 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Wed, 27 May 2026 14:20:39 +0800 Subject: [PATCH 44/52] Refactor: Inline chunked admission, strip main-loop chunked branches (C3) Adds an inline chunked admission block at the top of _get_new_batch_prefill_raw that consumes chunked_reqs() directly. Strips has_pending_chunk branches from the main waiting_queue loop (H6 LoRA drainer bypass, H7 init_next_round_input split). The waiting_queue retention for chunked-resume is still in place; it is removed in C4. Single-flight assertion enforced at the inline admission entry. Part of waiting_queue refactor plan, commit 3/7. 
--- python/sglang/srt/managers/scheduler.py | 105 ++++++++++++++++++++---- 1 file changed, 87 insertions(+), 18 deletions(-) diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py index 935d27c36951..540d1800bd36 100644 --- a/python/sglang/srt/managers/scheduler.py +++ b/python/sglang/srt/managers/scheduler.py @@ -156,6 +156,7 @@ ScheduleBatch, ) from sglang.srt.managers.schedule_policy import ( + CLIP_MAX_NEW_TOKENS, AddReqResult, PrefillAdder, SchedulePolicy, @@ -2569,6 +2570,11 @@ def get_new_batch_prefill(self) -> Optional[ScheduleBatch]: def _get_new_batch_prefill_raw( self, prefill_delayer_single_pass: Optional[PrefillDelayerSinglePassExecutor] ) -> Optional[ScheduleBatch]: + # Chunked-resume admission: handled by the inline block below + # (`chunked_reqs()` filter + adder bookkeeping). The main + # waiting_queue loop further down admits ONLY truly-waiting reqs (no + # has_pending_chunk paths). See agent-drafts/ + # 2026-05-25-waiting-queue-refactor-plan.md §C3. # Check if the grammar is ready in the grammar queue if self.grammar_manager.has_waiting_grammars(): ready_grammar_requests = self.grammar_manager.get_ready_grammar_requests() @@ -2649,6 +2655,73 @@ def _get_new_batch_prefill_raw( waiting_queue_len=len(self.waiting_queue), ) + # audit Q5: single-flight invariant — at most one chunked-resume req + # in active at any time. + chunked_in_active = list(self.chunked_reqs()) + assert len(chunked_in_active) <= 1, ( + f"single-flight violated: {len(chunked_in_active)} chunked reqs " + f"in active ({[r.rid for r in chunked_in_active]})" + ) + + # Inline chunked admission (Plan §C3, do NOT recreate + # PrefillAdder.add_chunked_req per §10 decision). The chunked-resume + # req keeps its row/KV/lock_ref from prior admission, so + # init_next_round_input must NOT re-match prefix (no tree_cache arg, + # H7 elimination). Budget bookkeeping is inlined here — minor LOC; + # intentionally not extracted. 
Mirrors the body of upstream/main + # PrefillAdder.add_chunked_req. + if chunked_in_active: + chunked_req = chunked_in_active[0] + chunked_req.init_next_round_input() + if adder.dllm_config is not None: + _rem_tokens = adder._get_dllm_remain_tokens() + else: + _rem_tokens = min(adder.rem_chunk_tokens, int(adder.rem_total_tokens)) + if adder.is_hybrid_swa: + # alloc_extend needs extend_num_tokens + page_size per + # request, so reserve one page here to avoid OOM. + _rem_tokens = min( + _rem_tokens, int(adder.rem_swa_tokens) - adder.page_size + ) + # The chunked_req must be added to the list; otherwise, it + # will cause a memory leak. Therefore, in certain cases where + # _rem_tokens <= 0, it should be replaced with + # rem_chunk_tokens. Under hybrid_swa with no room, skip this + # iter — the chunked req stays in active_reqs and is retried + # next iter (mirrors upstream `return req`). + if _rem_tokens <= 0: + if adder.is_hybrid_swa: + _rem_tokens = None + else: + _rem_tokens = adder.rem_chunk_tokens + + if _rem_tokens is not None: + truncated = chunked_req.extend_input_len > _rem_tokens + chunked_req.set_extend_input_len( + min(chunked_req.extend_input_len, _rem_tokens) + ) + chunked_req.fill_ids = chunked_req.fill_ids[ + : len(chunked_req.prefix_indices) + chunked_req.extend_input_len + ] + adder.can_run_list.append(chunked_req) + adder._update_prefill_budget( + 0, + chunked_req.extend_input_len, + ( + min( + chunked_req.sampling_params.max_new_tokens, + CLIP_MAX_NEW_TOKENS, + ) + if not truncated + else 0 + ), + ) + # has_pending_chunk: persistent flag carrying chunked-resume + # state across iters. When truncated=False, this was the last + # chunk — clear the flag so the req exits chunked_reqs(). 
+ if not chunked_req.is_dllm(): + chunked_req.has_pending_chunk = truncated + if self.enable_lora: running_loras = {req.lora_id for req in self.running_batch.reqs} @@ -2660,16 +2733,17 @@ def _get_new_batch_prefill_raw( # Get requests from the waiting queue to a new prefill batch for req in self.waiting_queue: - # Chunked-resume reqs hold a row + tree lock_ref from their prior - # admission. If the LoRA drainer rejects them mid-prefill, they - # stay in waiting_queue forever — deadlock + KV leak. Their LoRA - # adapter was already accepted on the first admission, so the - # drainer/validate check is moot for them. - if ( - self.enable_lora - and not req.has_pending_chunk - and not self._can_schedule_lora_req(req, running_loras) - ): + # Chunked-resume req is admitted via the inline block above + # (Plan §C3). It still rides H2 retention in waiting_queue until + # C4 removes that — skip it here to avoid double-admit. Once C4 + # drops the retention, this guard becomes a no-op and can be + # removed. + if req.has_pending_chunk: + continue + + # audit H6: chunked-resume no longer flows through main loop; + # drainer check applies uniformly. + if self.enable_lora and not self._can_schedule_lora_req(req, running_loras): continue running_bs = len(self.running_batch.reqs) @@ -2698,14 +2772,9 @@ def _get_new_batch_prefill_raw( req.rid ) - # Chunked-resume reqs must NOT re-match prefix at admission - # (would re-assign req.last_node without rebalancing lock_ref, - # corrupting cache_unfinished_req's dec_lock_ref/inc_lock_ref - # pairing). They keep last_node from previous stash. - if req.has_pending_chunk: - req.init_next_round_input() - else: - req.init_next_round_input(self.tree_cache) + # audit H7: chunked-resume handled in inline admission above; + # main loop unconditional. 
+ req.init_next_round_input(self.tree_cache) res = adder.add_one_req( req, truncation_align_size=self.truncation_align_size, From c19d510601b156681a7efc8c2759eadb52882946 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Wed, 27 May 2026 14:24:14 +0800 Subject: [PATCH 45/52] Refactor: Remove waiting_queue retention for chunked-resume (C4) Chunked-resume reqs no longer anchor in waiting_queue (H2 hack elimination). The retention `or x.has_pending_chunk` is removed; the transitional guard added in C3 to prevent double-admit is also removed. After this commit, chunked-resume reqs live exclusively in active_reqs and are re-admitted via the inline block at the top of _get_new_batch_prefill_raw. Part of waiting_queue refactor plan, commit 4/7. --- python/sglang/srt/managers/scheduler.py | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py index 540d1800bd36..bbd4ae1931a8 100644 --- a/python/sglang/srt/managers/scheduler.py +++ b/python/sglang/srt/managers/scheduler.py @@ -2733,14 +2733,6 @@ def _get_new_batch_prefill_raw( # Get requests from the waiting queue to a new prefill batch for req in self.waiting_queue: - # Chunked-resume req is admitted via the inline block above - # (Plan §C3). It still rides H2 retention in waiting_queue until - # C4 removes that — skip it here to avoid double-admit. Once C4 - # drops the retention, this guard becomes a no-op and can be - # removed. - if req.has_pending_chunk: - continue - # audit H6: chunked-resume no longer flows through main loop; # drainer check applies uniformly. 
if self.enable_lora and not self._can_schedule_lora_req(req, running_loras): @@ -2825,14 +2817,11 @@ def _get_new_batch_prefill_raw( if req.rid not in self.active_reqs: self._activate(req) - # Drop admitted reqs from waiting_queue, but KEEP chunked-resume reqs - # (has_pending_chunk == True after admission) so they stay at the head - # for the next iter's stash + admission. Single-flight is preserved - # naturally by budget + priority. + # audit H2: retention removed. chunked-resume reqs are no longer + # anchored in waiting_queue — they live in active_reqs and are + # re-admitted via the inline chunked admission loop (C3). can_run_set = set(can_run_list) - self.waiting_queue = [ - x for x in self.waiting_queue if x not in can_run_set or x.has_pending_chunk - ] + self.waiting_queue = [x for x in self.waiting_queue if x not in can_run_set] if adder.preempt_list: for req in adder.preempt_list: # audit R2: PrefillAdder.preempt_to_schedule already released From b00d5f6bad0ca1f7e191978c6112066b0b1d38f9 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Wed, 27 May 2026 14:27:25 +0800 Subject: [PATCH 46/52] Refactor: Remove early-exit / dynamic-chunking / abort-timeout chunked bypasses (C5) Now that chunked-resume reqs live in active_reqs (post-C4), the defensive bypasses that scanned waiting_queue for has_pending_chunk become dead code. Eliminates H4 (early-exit has_chunked_resume scan), H5 (dynamic-chunking lookup), AB7 (_abort_on_waiting_timeout has_pending_chunk skip), plus a stale comment referencing the deleted retention. Single chunked_in_active computation reused throughout _get_new_batch_prefill_raw. Part of waiting_queue refactor plan, commit 5/7. 
--- python/sglang/srt/managers/scheduler.py | 58 ++++++++++--------------- 1 file changed, 23 insertions(+), 35 deletions(-) diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py index bbd4ae1931a8..15e98f58a7b1 100644 --- a/python/sglang/srt/managers/scheduler.py +++ b/python/sglang/srt/managers/scheduler.py @@ -2219,13 +2219,8 @@ def _abort_on_waiting_timeout(self): deleted_reqs = set() deadline = time.perf_counter() - timeout_s + # audit AB7: chunked-resume no longer in waiting_queue (C4), bypass removed. for req in self.waiting_queue: - # Chunked-resume reqs sit in waiting_queue across iters while - # actively prefilling — they are not idle. Their entry_time is - # from their original arrival, so a long prefill would falsely - # trigger the timeout and leak KV + row. - if req.has_pending_chunk: - continue entry_time = req.time_stats.wait_queue_entry_time if 0 < entry_time < deadline: if self.enable_hicache_storage: @@ -2454,9 +2449,8 @@ def get_next_batch_to_run(self) -> Optional[ScheduleBatch]: # Drop chunked-resume reqs before merging last_batch into # running_batch. running_batch runs decode forward and admitting # a mid-prefill req there breaks shapes + KV accounting. The - # dropped reqs persist in self.waiting_queue (retention at - # ~line 2775: `x not in can_run_set or x.has_pending_chunk`) - # and re-enter via next iter's Stage A stash + admission. + # dropped reqs persist in self.active_reqs and re-enter via the + # inline chunked admission in _get_new_batch_prefill_raw. # # PP cross-mb: also drop reqs whose LAST chunk forward is still # in flight in another mb (when more decodes will follow — i.e., @@ -2588,15 +2582,19 @@ def _get_new_batch_prefill_raw( # Reset batch_is_full to try preemption with a prefill adder. 
self.running_batch.batch_is_full = False - # Identify any in-flight chunked-resume req held in waiting_queue — - # priority + has_pending_chunk make it sit at the head, but its - # presence relaxes the "is queue empty / pool full" early exits below - # (we must keep scheduling it to make progress, or memory leaks). - has_chunked_resume = any(r.has_pending_chunk for r in self.waiting_queue) + # audit H4 + Q5: chunked-resume now lives in active_reqs (not + # waiting_queue, post-C4). Compute the single-flight view once here + # and reuse below for early-exit relaxation, dynamic chunking, and + # the inline chunked admission entry. + chunked_in_active = list(self.chunked_reqs()) + assert len(chunked_in_active) <= 1, ( + f"single-flight violated: {len(chunked_in_active)} chunked reqs " + f"in active ({[r.rid for r in chunked_in_active]})" + ) if ( self.running_batch.batch_is_full or len(self.waiting_queue) == 0 - ) and not has_chunked_resume: + ) and not chunked_in_active: return None running_bs = len(self.running_batch.reqs) @@ -2607,7 +2605,7 @@ def _get_new_batch_prefill_raw( # check should not block them. if ( self.get_num_allocatable_reqs(running_bs) <= 0 - and not has_chunked_resume + and not chunked_in_active and not self.enable_priority_preemption ): self.running_batch.batch_is_full = True @@ -2624,17 +2622,15 @@ def _get_new_batch_prefill_raw( # Determine chunked_prefill_size for this batch chunked_prefill_size = self.chunked_prefill_size - if self.enable_dynamic_chunking: - # Single-flight invariant: at most one chunked-resume req in the - # queue at any time (priority + budget enforce this naturally). 
- chunked_resume = next( - (r for r in self.waiting_queue if r.has_pending_chunk), None - ) - if chunked_resume is not None: - history_len = len(chunked_resume.prefix_indices) - dynamic_size = self.predict_next_chunk_size(history_len) - if dynamic_size is not None: - chunked_prefill_size = dynamic_size + if self.enable_dynamic_chunking and chunked_in_active: + # audit H5: chunked-resume lives in active_reqs; reuse the + # single-flight view computed above instead of scanning + # waiting_queue. + chunked_resume = chunked_in_active[0] + history_len = len(chunked_resume.prefix_indices) + dynamic_size = self.predict_next_chunk_size(history_len) + if dynamic_size is not None: + chunked_prefill_size = dynamic_size # Prefill policy adder = PrefillAdder( @@ -2655,14 +2651,6 @@ def _get_new_batch_prefill_raw( waiting_queue_len=len(self.waiting_queue), ) - # audit Q5: single-flight invariant — at most one chunked-resume req - # in active at any time. - chunked_in_active = list(self.chunked_reqs()) - assert len(chunked_in_active) <= 1, ( - f"single-flight violated: {len(chunked_in_active)} chunked reqs " - f"in active ({[r.rid for r in chunked_in_active]})" - ) - # Inline chunked admission (Plan §C3, do NOT recreate # PrefillAdder.add_chunked_req per §10 decision). The chunked-resume # req keeps its row/KV/lock_ref from prior admission, so From 294fb739ee0a247e16283de6bb5d5298d8c31819 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Wed, 27 May 2026 14:31:42 +0800 Subject: [PATCH 47/52] Refactor: Simplify abort_request, handle stashed chunked-resume (C6) Eliminates H1 (dual-existence comment) and H8 (defensive has_pending_chunk / pending_middle_outputs reset on waiting-segment orphan release). Post-C4 chunked-resume reqs no longer live in waiting_queue, so the waiting-segment orphan branch is narrowed to mamba-pool reqs only. 
Critical: the active-segment loop now iterates active_reqs instead of batch_reqs, distinguishing in-batch reqs (FINISH_ABORT via batch result path) from stashed chunked-resume reqs (immediate release + _deactivate, audit finding 2). Without this, aborting a chunked-resume mid-prefill outside of any current batch would leak row + KV + lock_ref. Part of waiting_queue refactor plan, commit 6/7. --- python/sglang/srt/managers/scheduler.py | 66 ++++++++++++++++--------- 1 file changed, 42 insertions(+), 24 deletions(-) diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py index 15e98f58a7b1..e7124fafbcaf 100644 --- a/python/sglang/srt/managers/scheduler.py +++ b/python/sglang/srt/managers/scheduler.py @@ -3638,10 +3638,7 @@ def handle_rpc_request(self, recv_req: RpcReqInput): def abort_request(self, recv_req: AbortReq): # todo hisparse, release resources for abort requests in hisparse coordinator - # Build batch rid set: chunked-resume reqs may live in both waiting_queue - # and batch.reqs simultaneously (stateless-scheduler refactor). Skip the - # waiting_queue removal for those — let the to_finish path below handle - # them, otherwise we send_output / release_kv_cache twice. + # Post-C4: chunked-resume reqs live in active_reqs only, never in waiting_queue. if self.cur_batch is self.running_batch or self.cur_batch is None: batch_reqs = list(self.running_batch.reqs) else: @@ -3690,21 +3687,18 @@ def abort_request(self, recv_req: AbortReq): req, self.req_to_metadata_buffer_idx_allocator ) - # For mamba radix cache, or for chunked-resume reqs whose prior - # admissions already allocated a row + KV + radix lock_ref. Without - # this branch, aborting a chunked-resume req that is currently only - # in waiting_queue (not in any batch's reqs) leaks all three. + # audit AB4 simplified post-C4: only mamba radix cache reqs can be + # in waiting_queue with mamba_pool_idx held. 
Chunked-resume reqs + # are NOT in waiting_queue anymore (live in active_reqs); their + # abort-time release happens in the active_reqs loop below. if ( req.mamba_pool_idx is not None - or (req.has_pending_chunk and req.req_pool_idx is not None) - ) and self.disaggregation_mode != DisaggregationMode.DECODE: + and self.disaggregation_mode != DisaggregationMode.DECODE + ): release_kv_cache(req, self.tree_cache, is_insert=False) - # Defensive: clear pending-chunk flags on the orphaned req so a - # stale reference can't trigger Stage A re-stash of the freed row. - req.has_pending_chunk = False - req.pending_middle_outputs = 0 - # audit D6: orphan release in waiting_queue (sync mode mamba or - # chunked-resume mid-prefill); drop from active set. + # audit D6 (mamba branch): drop from active set if present. + # (mamba-radix path may or may not put req in active_reqs; + # _deactivate is idempotent.) self._deactivate(req) logger.debug(f"Abort queued request. {req.rid=}") @@ -3757,16 +3751,40 @@ def abort_request(self, recv_req: AbortReq): remaining_retracted.append(decode_req) self.disagg_decode_prealloc_queue.retracted_queue = remaining_retracted - # Delete requests in the running batch (reuse batch_reqs built above) - for req in batch_reqs: - if not req.finished() and ( - recv_req.abort_all or req.rid.startswith(recv_req.rid) - ): - # Abort method 3: set `to_finish` - # The request will still run one decode forward pass. - # Then we reuse all existing code to clean up the KV cache allocation. + # audit finding 2 (Plan §C6 Edit 3): iterate active_reqs instead of + # batch_reqs so that stashed chunked-resume reqs (in active_reqs but + # NOT in any current batch) get their resources released immediately. + # batch_rids was built above and includes cur_batch + running_batch + + # PP mbs[*]; "in-batch" reqs go through to_finish, "stashed-chunked" + # reqs need explicit release because no batch result path will pick + # them up. 
+ for rid in list(self.active_reqs.keys()): + req = self.active_reqs[rid] + if req.finished(): + continue + if not (recv_req.abort_all or rid.startswith(recv_req.rid)): + continue + + if rid in batch_rids: + # In some batch: standard to_finish path; release_kv_cache + + # _deactivate happen in process_batch_result_*. logger.debug(f"Abort running request. {req.rid=}") req.to_finish = FINISH_ABORT() + else: + # Active but not in any batch — the only legitimate case is + # a stashed chunked-resume mid-prefill (audit finding 2). + # Release immediately, else row+KV+lock_ref leak. + assert req.has_pending_chunk and req.req_pool_idx is not None, ( + f"unexpected active-but-not-in-batch req: {rid} " + f"has_pending_chunk={req.has_pending_chunk} " + f"req_pool_idx={req.req_pool_idx}" + ) + if self.disaggregation_mode != DisaggregationMode.DECODE: + release_kv_cache(req, self.tree_cache, is_insert=False) + req.has_pending_chunk = False + req.pending_middle_outputs = 0 + self._deactivate(req) + logger.debug(f"Abort stashed chunked-resume request. {req.rid=}") def _pause_engine(self) -> Tuple[List[Req], int]: raise NotImplementedError() From 68cec4a1c39f74c7da736cde06e03f71e024ebcb Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Wed, 27 May 2026 14:39:00 +0800 Subject: [PATCH 48/52] Refactor: Tighten invariants, finalize docstrings (C7) - Tightens _assert_invariants: waiting_queue and active_reqs are now strictly disjoint (sync mode); C1's relaxed transitional clause removed. - Removes C1's _activate idempotency filter at the admission call site; the main admission loop no longer produces re-admits after C3/C4. - Adds comprehensive invariant documentation as field-level comments on Scheduler.active_reqs and method docstring on chunked_reqs(). - Migrates pause_generation(retract) chunked release path to iterate chunked_reqs() instead of scanning waiting_queue (dead post-C4), and flags a pre-existing latent bug (req not re-enqueued after reset_for_retract). 
Concludes the waiting_queue refactor chain (commit 7/7). See agent-drafts/2026-05-25-waiting-queue-refactor-plan.md and audit. --- python/sglang/srt/managers/scheduler.py | 107 ++++++++++++++++-------- 1 file changed, 73 insertions(+), 34 deletions(-) diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py index e7124fafbcaf..1267b8f603ca 100644 --- a/python/sglang/srt/managers/scheduler.py +++ b/python/sglang/srt/managers/scheduler.py @@ -998,11 +998,29 @@ def init_model_worker(self): def init_running_status(self): self.waiting_queue: List[Req] = [] - # By-rid ownership tracker for sync-mode reqs the scheduler currently - # owns the lifecycle of (admitted, not finished, not retracted). Runs - # as a parallel tracker alongside waiting_queue / running_batch.reqs / - # chunked retention without changing scheduler behavior. See - # agent-drafts/2026-05-25-waiting-queue-refactor-plan.md (C1). + # `active_reqs`: sync-mode reqs the scheduler currently owns the + # lifecycle of (admitted, not finished, not retracted, not aborted- + # released). by-rid indexed. + # + # Definition (Plan §7-Q7): admitted via `_get_new_batch_prefill_raw` + # and not yet released through finish/retract/abort. Includes normal + # decode reqs AND mid-prefill chunked-resume reqs AND PP cross-mb + # in-flight reqs (the last two: NOT in running_batch.reqs but still + # holding row + KV + lock_ref). + # + # Invariants: + # * `waiting_queue ∩ active_reqs == ∅` (sync mode; disagg modes use + # their own ownership managers, see Q1=(c)). + # * `set(running_batch.reqs) ⊆ active_reqs` (in-batch always active). + # * `set(chunked_reqs()) ⊆ active_reqs` (by definition). + # * `len(list(chunked_reqs())) <= 1` (Q5 single-flight; asserted at + # inline chunked admission entry). + # * `active_reqs` keys are in 1:1 correspondence with allocated + # `req_to_token_pool` rows (sync mode). + # + # Maintained at: `_activate` / `_deactivate` (only entry points). 
+ # See agent-drafts/2026-05-25-waiting-queue-refactor-plan.md and + # 2026-05-25-scheduler-lifecycle-audit.md. self.active_reqs: Dict[str, Req] = {} # The running decoding batch for continuous batching self.running_batch: ScheduleBatch = ScheduleBatch(reqs=[], batch_is_full=False) @@ -1060,25 +1078,30 @@ def _assert_invariants(self) -> None: active_rids = set(self.active_reqs.keys()) running_rids = {r.rid for r in self.running_batch.reqs} - # sync mode: chunked-resume reqs still live in waiting_queue until C4 - # deletes the retention. Relax waiting ∩ active here: any rid in the - # intersection must be a chunked-resume req. - intersection_rids = waiting_rids & active_rids - for rid in intersection_rids: - assert self.active_reqs[ - rid - ].has_pending_chunk, ( - f"{rid} in both waiting and active but not chunked-resume" - ) + # sync mode: waiting_queue and active_reqs are strictly disjoint + # (C4 removed chunked-resume retention; chunked-resume now lives in + # active_reqs only). + assert not waiting_rids & active_rids, ( + f"waiting_queue and active_reqs must be disjoint (sync mode); " + f"overlap: {waiting_rids & active_rids}" + ) assert ( running_rids <= active_rids ), f"running not subset of active: {running_rids - active_rids}" def chunked_reqs(self) -> Iterable[Req]: - """active_reqs 中 has_pending_chunk=True 的派生 view。 - Single-flight 不变量(Q5,§7-Q5):len(list(chunked_reqs())) <= 1, - 在 _get_new_batch_prefill_raw 顶端断言(C3 引入)。""" + """Active reqs currently in mid-prefill (`has_pending_chunk=True`). + + Derived view over `active_reqs` — no separate storage. Single-flight + invariant (Q5): `len(list(chunked_reqs())) <= 1` at any iter + boundary; asserted at the entry of the inline chunked admission + block in `_get_new_batch_prefill_raw`. + + Iteration semantics: returns a fresh generator each call; consume + once or wrap in `list(...)`. Callers that mutate `active_reqs` + during iteration must `list(...)` first. 
+ """ return (r for r in self.active_reqs.values() if r.has_pending_chunk) def init_chunked_prefill(self): @@ -2797,13 +2820,18 @@ def _get_new_batch_prefill_raw( if len(can_run_list) == 0: return None - # audit A1: mark newly-admitted reqs as active. Filter chunked-resume - # re-admit (already active from a prior iter) — _activate's internal - # assert would trip otherwise. Filter can be removed once C3+C4 move - # chunked-resume out of the main admission loop. + # audit A1: mark newly-admitted reqs as active. Post-C3/C4 the main + # admission loop (the for-loop over waiting_queue above) only + # produces brand-new admissions. The inline chunked admission block + # also appends to `can_run_list` for chunked-resume re-admit, and + # those reqs are already in active_reqs from a prior iter (the + # inline block does NOT call _activate). Skip them here so the + # strict `_activate` assert (post-C7) catches accidental + # double-admission for everything else. for req in can_run_list: - if req.rid not in self.active_reqs: - self._activate(req) + if req.rid in self.active_reqs: + continue + self._activate(req) # audit H2: retention removed. chunked-resume reqs are no longer # anchored in waiting_queue — they live in active_reqs and are @@ -3842,14 +3870,18 @@ def pause_generation(self, recv_req: PauseGenerationReqInput): self.running_batch.batch_is_full = False - # Chunked-resume reqs in waiting_queue still hold their row + KV + - # radix lock_ref from prior admissions. Without explicit release, - # pause(retract)'s 'flush_cache can succeed' contract (see - # PauseGenerationReqInput docstring) is violated. Release in-place - # and reset their chunked state so continue_generation re-prefills - # them from origin_input_ids. - for req in self.waiting_queue: - if req.has_pending_chunk and req.req_pool_idx is not None: + # Chunked-resume reqs still hold their row + KV + radix lock_ref + # from prior admissions. 
Without explicit release, pause(retract)'s + # 'flush_cache can succeed' contract (see PauseGenerationReqInput + # docstring) is violated. Release in-place and reset their chunked + # state so continue_generation re-prefills them from + # origin_input_ids. + # + # audit C7: chunked-resume lives in active_reqs (post-C4), + # iterate chunked_reqs() directly. list(...) because we mutate + # active_reqs via _deactivate inside the loop. + for req in list(self.chunked_reqs()): + if req.req_pool_idx is not None: # Disagg-prefill: signal the decode side that the send was # retracted and drop our sender ref so re-prefill rebuilds # the bootstrap state. start_send_idx / tmp_end_idx are @@ -3864,9 +3896,16 @@ def pause_generation(self, recv_req: PauseGenerationReqInput): release_kv_cache(req, self.tree_cache, is_insert=False) req.reset_for_retract() # audit D7: chunked-resume req released via reset_for_retract - # stays in waiting_queue for re-prefill but no longer holds - # row/KV, so it leaves the active set. + # no longer holds row/KV, so it leaves the active set. self._deactivate(req) + # TODO(post-refactor follow-up): plan §10 flag — after + # reset_for_retract, this req is NOT re-enqueued to + # waiting_queue. Either the design relies on the original + # reference staying in waiting_queue (but C4 removed + # retention!) or this is a pre-existing latent bug from + # before the refactor. Investigate separately. See + # agent-drafts/2026-05-25-waiting-queue-refactor-plan.md + # §10. def continue_generation(self, recv_req: ContinueGenerationReqInput): if recv_req.torch_empty_cache: From a1e67f78b28c477a7c223bd3fa1bfaccb4edc306 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Wed, 27 May 2026 15:08:46 +0800 Subject: [PATCH 49/52] Fix: Gate Scheduler._activate to sync-mode non-DLLM only (C8) Review of C1-C7 revealed two P0 bugs and one P1: 1. _activate fired unconditionally in _get_new_batch_prefill_raw, enrolling disagg PREFILL and DLLM reqs into active_reqs. 
Neither path has a corresponding _deactivate (disagg PREFILL uses process_batch_result_disagg_prefill; DLLM uses dllm/mixin paths), leaking active_reqs entries indefinitely and crashing abort_all via the new stashed-chunked assert (C6). 2. flush_cache cleared tree cache / pool but not active_reqs, leaving stale dict entries pointing at freed req_pool_idx. Fix: gate _activate at the helper itself (single point of control) to enforce the "sync-mode non-DLLM only" invariant that the plan + audit always assumed but code didn't enforce. flush_cache.clear() ensures the dict is reset alongside other ownership pools. Also: rewrite two stale comments referencing pre-C4 waiting_queue retention. Part of waiting_queue refactor chain, commit 8/7 (post-review fix). --- python/sglang/srt/managers/scheduler.py | 38 ++++++++++++++++++------- 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py index 1267b8f603ca..6924e6f23610 100644 --- a/python/sglang/srt/managers/scheduler.py +++ b/python/sglang/srt/managers/scheduler.py @@ -1040,11 +1040,23 @@ def init_running_status(self): self._engine_paused = False def _activate(self, req: Req) -> None: - """Mark req as entering active lifecycle (initial admission). - - Caller must ensure req.rid is not already in active_reqs (chunked-resume - re-admit is filtered at the call site). See refactor plan §C1. + """Mark req as entering active lifecycle. + + Gated: only sync-mode non-DLLM reqs enter active_reqs. Disagg + PREFILL/DECODE reqs are owned by their respective queues + (disagg_*_queue); DLLM reqs are owned by dllm_manager.staging_queue. + See plan §2 Scope (Q1=(c)) and audit §1 总览. 
+ + Without this gate, _activate would enroll disagg PREFILL / DLLM + admits (they share _get_new_batch_prefill_raw with sync mode) into + active_reqs, but their finish paths don't call _deactivate, leading + to memory leak + abort_all crash on the active-segment stashed- + chunked assert (C6). """ + if self.disaggregation_mode != DisaggregationMode.NULL: + return + if req.is_dllm(): + return assert req.rid not in self.active_reqs, f"already active: {req.rid}" self.active_reqs[req.rid] = req @@ -1122,11 +1134,13 @@ def init_chunked_prefill(self): self.chunked_prefill_size = None elif self.chunked_prefill_size is not None and self.chunked_prefill_size <= 0: self.chunked_prefill_size = None - # Chunked-resume tracking is now per-req (Req.has_pending_chunk + - # pending_middle_outputs counter); the scheduler no longer holds a global pointer. - # Stage A stashes any waiting_queue req with has_pending_chunk; cache - # impls bound row reads by kv_committed_len so a stash after - # init_next_round_input is safe without the old gate. + # Chunked-resume tracking: per-Req (has_pending_chunk + + # pending_middle_outputs). After the C1-C7 refactor, chunked-resume + # reqs live exclusively in `active_reqs` (not waiting_queue); Stage A + # iterates `chunked_reqs()` derived from active_reqs. The inline + # chunked admission block at the top of `_get_new_batch_prefill_raw` + # re-admits them each iter. See agent-drafts/ + # 2026-05-25-waiting-queue-refactor-plan.md. self.is_mixed_chunk = ( self.chunked_prefill_size is not None and self.server_args.enable_mixed_chunk @@ -2502,8 +2516,9 @@ def get_next_batch_to_run(self) -> Optional[ScheduleBatch]: # Defensive exclude_chunked_req: the merge step above already # drops chunked-resume reqs from last_batch, so running_batch # shouldn't normally hold one. Keep the flag set so any leak in - # that invariant doesn't survive here; the dropped req still - # has its waiting_queue retention to re-admit next iter. 
+ # that invariant doesn't survive here; the dropped req remains + # in active_reqs (post-C4) and is re-admitted next iter via the + # inline chunked admission block in _get_new_batch_prefill_raw. self.running_batch.filter_batch( exclude_chunked_req=True, exclude_in_flight_other_mb=self._in_flight_other_mb_rids(), @@ -3537,6 +3552,7 @@ def flush_cache(self, empty_cache: bool = True): self.last_batch = None self.tree_cache.reset() self.req_to_token_pool.clear() + self.active_reqs.clear() # audit: keep parallel to req_to_token_pool reset (C8) self.token_to_kv_pool_allocator.clear() self.grammar_manager.clear() self.metrics_reporter.reset_metrics() From 404bdb7f104c20b12e8db28045d5d70939a3ca23 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Wed, 27 May 2026 15:59:34 +0800 Subject: [PATCH 50/52] Refactor: Replace inline chunked budget bookkeeping with add_one_req (C9) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit C3 inlined the body of upstream's `PrefillAdder.add_chunked_req` into `_get_new_batch_prefill_raw` to avoid resurrecting the special method. But `add_one_req` already supports chunked-resume via its `is_resume` path (`has_pending_chunk and not is_dllm`), which gates: - budget_prefix=0 (no prefix double-count) - skip _req_inc_lock_ref (already held from prior admission) - update has_pending_chunk = truncated So the inline manual budget code was a copy of logic that `add_one_req` already encapsulates. C9 replaces the ~30-line inline block with a single `adder.add_one_req(chunked_req, ...)` call; chunked admission still runs BEFORE the main waiting_queue loop so it skips LoRA drainer / hicache prefetch checks that don't apply to in-flight chunked. Removes scheduler.py access to PrefillAdder protected methods (`_get_dllm_remain_tokens`, `_update_prefill_budget`) — these stay encapsulated. 
Behavior change: `prefill_delayer_single_pass` / `prefill_max_requests` / `dsa_prefill_cp_in_seq_split` early-exit gates now apply to chunked too. Safe in practice: chunked runs first so can_run_list is empty for `_max_requests` / `cp_in_seq_split` checks; prefill_delayer blocking chunked just delays one iter. Part of waiting_queue refactor chain, commit 9/7. --- python/sglang/srt/managers/scheduler.py | 83 +++++++------------------ 1 file changed, 22 insertions(+), 61 deletions(-) diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py index 6924e6f23610..825b9b5a6be8 100644 --- a/python/sglang/srt/managers/scheduler.py +++ b/python/sglang/srt/managers/scheduler.py @@ -156,7 +156,6 @@ ScheduleBatch, ) from sglang.srt.managers.schedule_policy import ( - CLIP_MAX_NEW_TOKENS, AddReqResult, PrefillAdder, SchedulePolicy, @@ -2602,11 +2601,14 @@ def get_new_batch_prefill(self) -> Optional[ScheduleBatch]: def _get_new_batch_prefill_raw( self, prefill_delayer_single_pass: Optional[PrefillDelayerSinglePassExecutor] ) -> Optional[ScheduleBatch]: - # Chunked-resume admission: handled by the inline block below - # (`chunked_reqs()` filter + adder bookkeeping). The main - # waiting_queue loop further down admits ONLY truly-waiting reqs (no - # has_pending_chunk paths). See agent-drafts/ - # 2026-05-25-waiting-queue-refactor-plan.md §C3. + # Chunked-resume admission: handled by the small block at the top of this + # method, which feeds the single chunked-resume req (if any) through + # `adder.add_one_req`. PrefillAdder.add_one_req detects chunked-resume via + # the `is_resume` flag (has_pending_chunk and not is_dllm) and handles all + # budget bookkeeping in one place — no special add_chunked_req method + # resurrected. The main waiting_queue loop below admits ONLY truly-waiting + # reqs. See agent-drafts/2026-05-25-waiting-queue-refactor-plan.md §C3 (and + # C9 follow-up). 
# Check if the grammar is ready in the grammar queue if self.grammar_manager.has_waiting_grammars(): ready_grammar_requests = self.grammar_manager.get_ready_grammar_requests() @@ -2689,64 +2691,23 @@ def _get_new_batch_prefill_raw( waiting_queue_len=len(self.waiting_queue), ) - # Inline chunked admission (Plan §C3, do NOT recreate - # PrefillAdder.add_chunked_req per §10 decision). The chunked-resume - # req keeps its row/KV/lock_ref from prior admission, so - # init_next_round_input must NOT re-match prefix (no tree_cache arg, - # H7 elimination). Budget bookkeeping is inlined here — minor LOC; - # intentionally not extracted. Mirrors the body of upstream/main - # PrefillAdder.add_chunked_req. if chunked_in_active: chunked_req = chunked_in_active[0] + # No tree_cache: chunked-resume MUST NOT re-match prefix (H7). + # Its row + KV + lock_ref are already held from prior admission. chunked_req.init_next_round_input() - if adder.dllm_config is not None: - _rem_tokens = adder._get_dllm_remain_tokens() - else: - _rem_tokens = min(adder.rem_chunk_tokens, int(adder.rem_total_tokens)) - if adder.is_hybrid_swa: - # alloc_extend needs extend_num_tokens + page_size per - # request, so reserve one page here to avoid OOM. - _rem_tokens = min( - _rem_tokens, int(adder.rem_swa_tokens) - adder.page_size - ) - # The chunked_req must be added to the list; otherwise, it - # will cause a memory leak. Therefore, in certain cases where - # _rem_tokens <= 0, it should be replaced with - # rem_chunk_tokens. Under hybrid_swa with no room, skip this - # iter — the chunked req stays in active_reqs and is retried - # next iter (mirrors upstream `return req`). 
- if _rem_tokens <= 0: - if adder.is_hybrid_swa: - _rem_tokens = None - else: - _rem_tokens = adder.rem_chunk_tokens - - if _rem_tokens is not None: - truncated = chunked_req.extend_input_len > _rem_tokens - chunked_req.set_extend_input_len( - min(chunked_req.extend_input_len, _rem_tokens) - ) - chunked_req.fill_ids = chunked_req.fill_ids[ - : len(chunked_req.prefix_indices) + chunked_req.extend_input_len - ] - adder.can_run_list.append(chunked_req) - adder._update_prefill_budget( - 0, - chunked_req.extend_input_len, - ( - min( - chunked_req.sampling_params.max_new_tokens, - CLIP_MAX_NEW_TOKENS, - ) - if not truncated - else 0 - ), - ) - # has_pending_chunk: persistent flag carrying chunked-resume - # state across iters. When truncated=False, this was the last - # chunk — clear the flag so the req exits chunked_reqs(). - if not chunked_req.is_dllm(): - chunked_req.has_pending_chunk = truncated + # Use the standard adder.add_one_req — its `is_resume` branch + # (schedule_policy.py:811) handles chunked-resume correctly: + # - budget_prefix=0 (don't double-count prefix) + # - skip _req_inc_lock_ref (already held) + # - update has_pending_chunk = truncated + # By running BEFORE the main waiting_queue loop, the chunked req + # also skips LoRA drainer / hicache prefetch checks that the + # main loop applies to fresh reqs. + adder.add_one_req( + chunked_req, + truncation_align_size=self.truncation_align_size, + ) if self.enable_lora: running_loras = {req.lora_id for req in self.running_batch.reqs} From d5bf8baab40408fe933cafb695c86c33846fa8ba Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Wed, 27 May 2026 16:30:30 +0800 Subject: [PATCH 51/52] Refactor: Clean cross-file chunked-in-waiting refs + fix disagg PREFILL leak (C10) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two motivations: 1. 
BUG: C8's `_activate` gate excluded ALL disagg modes, but disagg PREFILL shares _get_new_batch_prefill_raw with sync — chunked-resume reqs were admitted, then orphaned (out of waiting_queue per C4, not in active_reqs per C8), leaking row + KV + lock_ref. Fix: gate to DECODE only (which has its own prealloc/transfer queue ownership), then wire _deactivate at disagg/prefill.py's three release_kv_cache sites and migrate its Stage A loop to chunked_reqs(). 2. CLEANUP: post-C4 chunked-resume never lives in waiting_queue, but several supporting files still split waiting_queue by has_pending_chunk (schedule_policy.py 3 sites, pool_stats_observer.py, invariant_checker.py, several stale comments). Revert/migrate to read active_reqs. DECODE mode is still excluded from active_reqs (Q1=(c)); only PREFILL is now correctly tracked. Part of waiting_queue refactor chain, commit 10/7. --- python/sglang/srt/disaggregation/decode.py | 2 +- python/sglang/srt/disaggregation/prefill.py | 18 ++++++---- python/sglang/srt/managers/schedule_batch.py | 2 +- python/sglang/srt/managers/schedule_policy.py | 33 ++++--------------- python/sglang/srt/managers/scheduler.py | 20 ++++------- .../scheduler_components/invariant_checker.py | 13 ++++---- .../pool_stats_observer.py | 9 ++--- 7 files changed, 40 insertions(+), 57 deletions(-) diff --git a/python/sglang/srt/disaggregation/decode.py b/python/sglang/srt/disaggregation/decode.py index b552b66e487f..afad8b4adfc0 100644 --- a/python/sglang/srt/disaggregation/decode.py +++ b/python/sglang/srt/disaggregation/decode.py @@ -1654,7 +1654,7 @@ def get_next_disagg_decode_batch_to_run( # Process pending prebuilt batch: output processing + filter + merge new_prebuilt_batch = self.get_new_prebuilt_batch() if new_prebuilt_batch: - assert not any(r.has_pending_chunk for r in self.waiting_queue) + # C10: dead assert removed — post-C4 chunked-resume not in waiting_queue. 
self.batch_result_processor.process_batch_result_prebuilt( new_prebuilt_batch ) diff --git a/python/sglang/srt/disaggregation/prefill.py b/python/sglang/srt/disaggregation/prefill.py index ecf4bd784863..b4272ccebb57 100644 --- a/python/sglang/srt/disaggregation/prefill.py +++ b/python/sglang/srt/disaggregation/prefill.py @@ -556,6 +556,8 @@ def process_batch_result_disagg_prefill( # This can happen if the grammar is not set correctly or the token is invalid. error_message = f"Grammar accept_token failed for req {req.rid} with token {next_token_id}: {e}" release_kv_cache(req, self.tree_cache) + # audit D-prefill-1: disagg PREFILL release path + self._deactivate(req) prepare_abort( req, error_message, @@ -640,6 +642,8 @@ def process_disagg_prefill_inflight_queue( undone_reqs.append(req) elif poll == KVPoll.Success: # transfer done release_kv_cache(req, self.tree_cache) # unlock the tree + # audit D-prefill-2: disagg PREFILL release path + self._deactivate(req) req.finished_reason = FINISH_LENGTH(length=0) # FIXME: clean up req's data in transfer engine if hasattr(req.disagg_kv_sender, "clear"): @@ -655,6 +659,8 @@ def process_disagg_prefill_inflight_queue( logger.warning(error_message) req.time_stats.trace_ctx.abort(abort_info={"reason": error_message}) release_kv_cache(req, self.tree_cache) # unlock the tree + # audit D-prefill-3: disagg PREFILL release path + self._deactivate(req) prepare_abort( req, error_message, status_code=HTTPStatus.INTERNAL_SERVER_ERROR ) @@ -725,10 +731,10 @@ def get_transferred_rids(self: Scheduler) -> List[str]: return transferred_rids def process_prefill_chunk(self: Scheduler) -> None: - # Per-req stash for any in-flight chunked-resume reqs (now sitting in - # the waiting_queue with has_pending_chunk == True). - for req in self.waiting_queue: - if req.has_pending_chunk and not req.is_dllm(): + # audit C10: disagg PREFILL chunked-resume now lives in active_reqs + # (same as sync mode post-C4); iterate chunked_reqs() view. 
+ for req in self.chunked_reqs(): + if not req.is_dllm(): maybe_cache_unfinished_req(req, self.tree_cache, chunked=True) if self.enable_overlap: # Delay KV transfer to process_batch_result_disagg_prefill @@ -746,8 +752,8 @@ def process_prefill_chunk(self: Scheduler) -> None: # Drop chunked-resume reqs from last_batch — running_batch runs # decode forward and admitting a mid-prefill req there breaks # shape + KV accounting. The dropped reqs stay in - # self.waiting_queue (chunked-resume retention) and re-enter via - # the next iter's Stage A stash + admission cycle. + # self.active_reqs and re-enter via the next iter's Stage A + # stash + admission cycle. self.last_batch.filter_batch(exclude_chunked_req=True) if self.last_batch.batch_size() < last_bs: self.running_batch.batch_is_full = False diff --git a/python/sglang/srt/managers/schedule_batch.py b/python/sglang/srt/managers/schedule_batch.py index 11d70b306970..dd059014038c 100755 --- a/python/sglang/srt/managers/schedule_batch.py +++ b/python/sglang/srt/managers/schedule_batch.py @@ -1351,7 +1351,7 @@ def reset_for_retract(self): # Disagg-prefill send-side bookkeeping. The pre-v2 retract path never # ran against a req that had started sending (retract only touched # running_batch), so these stayed at init values. After v2 added - # pause(retract) coverage for waiting chunked-resume reqs, a retracted + # pause(retract) coverage for active chunked-resume reqs, a retracted # disagg-prefill req's stale start_send_idx would index garbage in the # new row on re-prefill. 
self.start_send_idx = 0 diff --git a/python/sglang/srt/managers/schedule_policy.py b/python/sglang/srt/managers/schedule_policy.py index fb5892135a91..7654c4c230f4 100644 --- a/python/sglang/srt/managers/schedule_policy.py +++ b/python/sglang/srt/managers/schedule_policy.py @@ -234,13 +234,8 @@ def _compute_prefix_matches( temporary_deprioritized: Set[int] = set() self.waiting_queue_radix_tree.reset() + # C10: chunked-resume no longer in waiting_queue (post-C4); revert to main-upstream sort. for r in waiting_queue: - if r.has_pending_chunk: - # Chunked-resume reqs already have prefix_indices + last_node - # set by the prior chunk's Stage A stash, plus an inc'd - # lock_ref on last_node. Re-running match_prefix here would - # overwrite both, leaving the prior inc unbalanced. - continue prefix_ids = r.origin_input_ids + r.output_ids extra_key = r.extra_key match_result = match_prefix_for_req(self.tree_cache, r, prefix_ids) @@ -283,19 +278,12 @@ def _sort_by_longest_prefix( waiting_queue: List[Req], temporary_deprioritized: Set[int] ) -> None: """Sorts the waiting queue based on the longest prefix match.""" - # Chunked-resume reqs sort first: their prefix_indices length only - # reflects the chunks already prefilled (kv_committed_len), not the - # full prompt prefix they could have hit had they been fresh. Without - # this floor, a fresh req with a long cached prefix outranks them - # every iter, starving them under tight budget. + # C10: chunked-resume no longer in waiting_queue (post-C4); revert to main-upstream sort. 
waiting_queue.sort( key=lambda r: ( - 0 if r.has_pending_chunk else 1, - ( - -len(r.prefix_indices) - if r.rid not in temporary_deprioritized - else float("inf") - ), + -len(r.prefix_indices) + if r.rid not in temporary_deprioritized + else float("inf") ) ) @@ -304,15 +292,9 @@ def _sort_by_dfs_weight( waiting_queue: List[Req], tree_cache: BasePrefixCache ) -> None: """Sorts the waiting queue based on a depth-first search weighting.""" - # Pull chunked-resume reqs out before DFS — their last_node points at - # a mid-chunk stash node with weight 1 (no siblings share it), which - # otherwise drops them to a low DFS priority and starves them under - # tight budget. They go back to the front of the queue afterwards. - chunked_reqs = [req for req in waiting_queue if req.has_pending_chunk] - non_chunked_reqs = [req for req in waiting_queue if not req.has_pending_chunk] - + # C10: chunked-resume no longer in waiting_queue (post-C4); revert to main-upstream sort. last_node_to_reqs = defaultdict(list) - for req in non_chunked_reqs: + for req in waiting_queue: last_node_to_reqs[req.last_node].append(req) node_to_weight = defaultdict(int) @@ -321,7 +303,6 @@ def _sort_by_dfs_weight( SchedulePolicy._calc_weight(tree_cache.root_node, node_to_weight) waiting_queue.clear() - waiting_queue.extend(chunked_reqs) SchedulePolicy._get_dfs_priority( tree_cache.root_node, node_to_weight, diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py index 825b9b5a6be8..f0acc401857a 100644 --- a/python/sglang/srt/managers/scheduler.py +++ b/python/sglang/srt/managers/scheduler.py @@ -609,7 +609,7 @@ def __init__( max_total_num_tokens=self.max_total_num_tokens, get_last_batch=lambda: self.last_batch, get_running_batch=lambda: self.running_batch, - get_waiting_queue=lambda: self.waiting_queue, + get_active_reqs=lambda: self.active_reqs, ) self.invariant_checker = SchedulerInvariantChecker( @@ -627,7 +627,7 @@ def __init__( 
pool_stats_observer=self.pool_stats_observer, get_last_batch=lambda: self.last_batch, get_running_batch=lambda: self.running_batch, - get_waiting_queue=lambda: self.waiting_queue, + get_active_reqs=lambda: self.active_reqs, ) self.kv_events_publisher = SchedulerKvEventsPublisher( @@ -1041,18 +1041,12 @@ def init_running_status(self): def _activate(self, req: Req) -> None: """Mark req as entering active lifecycle. - Gated: only sync-mode non-DLLM reqs enter active_reqs. Disagg - PREFILL/DECODE reqs are owned by their respective queues - (disagg_*_queue); DLLM reqs are owned by dllm_manager.staging_queue. - See plan §2 Scope (Q1=(c)) and audit §1 总览. - - Without this gate, _activate would enroll disagg PREFILL / DLLM - admits (they share _get_new_batch_prefill_raw with sync mode) into - active_reqs, but their finish paths don't call _deactivate, leading - to memory leak + abort_all crash on the active-segment stashed- - chunked assert (C6). + Gated: only sync mode + disagg PREFILL + non-DLLM reqs enter + active_reqs. Disagg DECODE has its own prealloc/transfer queue + ownership; DLLM has its own staging_queue. See plan §2 Scope and + C10 fix plan §2 (disagg PREFILL bug). 
""" - if self.disaggregation_mode != DisaggregationMode.NULL: + if self.disaggregation_mode == DisaggregationMode.DECODE: return if req.is_dllm(): return diff --git a/python/sglang/srt/managers/scheduler_components/invariant_checker.py b/python/sglang/srt/managers/scheduler_components/invariant_checker.py index 9bebbe1dded9..8a236aba68d1 100644 --- a/python/sglang/srt/managers/scheduler_components/invariant_checker.py +++ b/python/sglang/srt/managers/scheduler_components/invariant_checker.py @@ -50,7 +50,7 @@ class SchedulerInvariantChecker: pool_stats_observer: SchedulerPoolStatsObserver get_last_batch: Callable get_running_batch: Callable - get_waiting_queue: Callable + get_active_reqs: Callable count_req_pool_leak_warnings: int = 0 count_memory_leak_warnings: int = 0 @@ -163,20 +163,21 @@ def _get_total_uncached_sizes( and not self.get_running_batch().is_empty() ): req_groups.append(list(self.get_running_batch().reqs)) - # Chunked-resume reqs in waiting_queue carry uncached tail + # Chunked-resume reqs in active_reqs carry uncached tail # (kv_committed_len - cache_protected_len, < page_size) that # filter_batch just removed from last_batch but haven't been # re-admitted to running_batch yet. The leak invariant must count it. + # C10: chunked-resume now lives in active_reqs (post-C4). 
seen_ids = {id(req) for group in req_groups for req in group} - chunked_in_queue = [ + chunked_in_active = [ req - for req in self.get_waiting_queue() + for req in self.get_active_reqs().values() if req.has_pending_chunk and req.req_pool_idx is not None and id(req) not in seen_ids ] - if chunked_in_queue: - req_groups.append(chunked_in_queue) + if chunked_in_active: + req_groups.append(chunked_in_active) full_uncached = 0 swa_uncached = 0 diff --git a/python/sglang/srt/managers/scheduler_components/pool_stats_observer.py b/python/sglang/srt/managers/scheduler_components/pool_stats_observer.py index 782147653c20..f01cee22814c 100644 --- a/python/sglang/srt/managers/scheduler_components/pool_stats_observer.py +++ b/python/sglang/srt/managers/scheduler_components/pool_stats_observer.py @@ -153,7 +153,7 @@ class SchedulerPoolStatsObserver: max_total_num_tokens: int get_last_batch: Callable get_running_batch: Callable - get_waiting_queue: Callable + get_active_reqs: Callable def streaming_session_count(self) -> int: return sum( @@ -164,7 +164,7 @@ def streaming_session_count(self) -> int: def active_pool_idxs(self) -> set: """Pool idxs currently owned by reqs in last_batch / running_batch or - held by chunked-resume reqs sitting in waiting_queue. + held by chunked-resume reqs in active_reqs. Used to decide which session slots' KV is owned by batch reqs (and thus counted via uncached_size, not session_held). @@ -176,10 +176,11 @@ def active_pool_idxs(self) -> set: for req in batch.reqs: if req.req_pool_idx is not None: idxs.add(req.req_pool_idx) - # Chunked-resume reqs in waiting_queue still own their row across iters + # Chunked-resume reqs in active_reqs still own their row across iters # (filter_batch may have just moved them out of last_batch but they # haven't yet been re-admitted to running_batch). - for req in self.get_waiting_queue(): + # C10: chunked-resume now lives in active_reqs (post-C4). 
+ for req in self.get_active_reqs().values(): if req.has_pending_chunk and req.req_pool_idx is not None: idxs.add(req.req_pool_idx) return idxs From e6a9f0771225a5251cc8f9eb694a8215dfb3f853 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Wed, 27 May 2026 16:41:55 +0800 Subject: [PATCH 52/52] Fix: abort_request stashed-chunked disagg PREFILL cleanup (C11) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit C10 narrowed _activate's gate to DECODE-only, so disagg PREFILL chunked-resume reqs now enter active_reqs and can be reached by the abort_request active-segment stashed-chunked branch (C6). But that branch only does release_kv_cache + _deactivate — missing two pieces of disagg PREFILL cleanup that pause_generation(retract) does correctly: 1. disagg_kv_sender.abort() — without this, the peer decode node waits forever for the remaining chunks (hang). 2. release_req_to_metadata_buffer() — metadata buffer slot leak. Mirrors pause_generation(retract) PREFILL handling and abort_request waiting-segment PREFILL handling. Also: clean stale "assert above" comment in disagg/decode.py (the assert was deleted in C10). Part of waiting_queue refactor chain, commit 11/7. --- python/sglang/srt/disaggregation/decode.py | 6 +++--- python/sglang/srt/managers/scheduler.py | 21 +++++++++++++++++++++ 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/python/sglang/srt/disaggregation/decode.py b/python/sglang/srt/disaggregation/decode.py index afad8b4adfc0..9f9046a2d050 100644 --- a/python/sglang/srt/disaggregation/decode.py +++ b/python/sglang/srt/disaggregation/decode.py @@ -1660,9 +1660,9 @@ def get_next_disagg_decode_batch_to_run( ) # Defensive: chunked prefill is a prefill-side concept; decode-side # prebuilt batches shouldn't carry has_pending_chunk reqs. The - # assert above already guards waiting_queue; this flag protects - # against any future code that would route a chunked req through - # the disagg decode path.
+ # waiting_queue invariant is checked by _assert_invariants in sync + # mode; this flag protects against any future code that would route + # a chunked req through the disagg decode path. new_prebuilt_batch.filter_batch(exclude_chunked_req=True) if not new_prebuilt_batch.is_empty(): if self.running_batch.is_empty(): diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py index f0acc401857a..f8b65e5ca070 100644 --- a/python/sglang/srt/managers/scheduler.py +++ b/python/sglang/srt/managers/scheduler.py @@ -3779,7 +3779,28 @@ def abort_request(self, recv_req: AbortReq): f"req_pool_idx={req.req_pool_idx}" ) if self.disaggregation_mode != DisaggregationMode.DECODE: + # C11: disagg PREFILL stashed-chunked req has already been + # sending KV chunks to the peer decode node. Signal abort so + # the peer doesn't wait forever for the remaining chunks. + # Mirrors pause_generation(retract) PREFILL handling + # (scheduler.py pause section). + if ( + self.disaggregation_mode == DisaggregationMode.PREFILL + and req.disagg_kv_sender is not None + ): + if hasattr(req.disagg_kv_sender, "abort"): + req.disagg_kv_sender.abort() + req.disagg_kv_sender = None + release_kv_cache(req, self.tree_cache, is_insert=False) + + # C11: PREFILL mode also needs to release the metadata buffer + # slot. Mirrors abort_request waiting-segment PREFILL handling. + if self.disaggregation_mode == DisaggregationMode.PREFILL: + release_req_to_metadata_buffer( + req, self.req_to_metadata_buffer_idx_allocator + ) + req.has_pending_chunk = False req.pending_middle_outputs = 0 self._deactivate(req)