From de3859646b9041ad3c3e87f13f6b8c96cd937b64 Mon Sep 17 00:00:00 2001
From: fzyzcjy <ch271828n@outlook.com>
Date: Wed, 13 May 2026 18:43:21 +0800
Subject: [PATCH 01/52] Prep: abort_request dedup for chunked-resume dual-queue
 holding
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When chunked-resume reqs are held in both waiting_queue and batch.reqs
(stateless-scheduler refactor), abort_request would otherwise process
them twice (queue pop + to_finish), causing duplicate send_output and
double release_kv_cache. Build batch_rids upfront and skip waiting_queue
removal for reqs already in batch — let to_finish path handle them.

Pre-flight for stateless-scheduler v2.
---
 python/sglang/srt/managers/scheduler.py | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py
index 10df1914af20..40b027cbdf8b 100644
--- a/python/sglang/srt/managers/scheduler.py
+++ b/python/sglang/srt/managers/scheduler.py
@@ -3550,10 +3550,22 @@ def handle_rpc_request(self, recv_req: RpcReqInput):
 
     def abort_request(self, recv_req: AbortReq):
         # todo hisparse, release resources for abort requests in hisparse coordinator
+        # Build batch rid set: chunked-resume reqs may live in both waiting_queue
+        # and batch.reqs simultaneously (stateless-scheduler refactor). Skip the
+        # waiting_queue removal for those — let the to_finish path below handle
+        # them, otherwise we send_output / release_kv_cache twice.
+        if self.cur_batch is self.running_batch or self.cur_batch is None:
+            batch_reqs = self.running_batch.reqs
+        else:
+            batch_reqs = self.running_batch.reqs + self.cur_batch.reqs
+        batch_rids = {r.rid for r in batch_reqs}
+
         # Delete requests in the waiting queue
         to_del = []
         for i, req in enumerate(self.waiting_queue):
-            if recv_req.abort_all or req.rid.startswith(recv_req.rid):
+            if (recv_req.abort_all or req.rid.startswith(recv_req.rid)) and (
+                req.rid not in batch_rids
+            ):
                 to_del.append(i)
 
         # Sort in reverse order to avoid index issues when deleting
@@ -3632,13 +3644,8 @@ def abort_request(self, recv_req: AbortReq):
                         remaining_retracted.append(decode_req)
                 self.disagg_decode_prealloc_queue.retracted_queue = remaining_retracted
 
-        # Delete requests in the running batch
-        if self.cur_batch is self.running_batch or self.cur_batch is None:
-            reqs = self.running_batch.reqs
-        else:
-            reqs = self.running_batch.reqs + self.cur_batch.reqs
-
-        for req in reqs:
+        # Delete requests in the running batch (reuse batch_reqs built above)
+        for req in batch_reqs:
             if not req.finished() and (
                 recv_req.abort_all or req.rid.startswith(recv_req.rid)
             ):

From c79a73bec4ae0d007dd80529882f3b59545b8e78 Mon Sep 17 00:00:00 2001
From: fzyzcjy <ch271828n@outlook.com>
Date: Wed, 13 May 2026 18:44:35 +0800
Subject: [PATCH 02/52] Prep: subtract prefix_indices from waiting_queue
 pending tokens sum

For chunked-resume reqs (after the upcoming stateless-scheduler switch)
that live in waiting_queue with non-empty prefix_indices, summing
req.seqlen overcounts the committed prefix. Switch to
seqlen - len(prefix_indices) for waiting reqs; keep the chunked_req block
until that field is removed.

Today's behavior is unchanged for fresh waiting reqs whose prefix_indices
is empty.

Pre-flight for stateless-scheduler v2.
---
 python/sglang/srt/observability/scheduler_metrics_mixin.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/python/sglang/srt/observability/scheduler_metrics_mixin.py b/python/sglang/srt/observability/scheduler_metrics_mixin.py
index 050895373542..86cd5bfb1e81 100644
--- a/python/sglang/srt/observability/scheduler_metrics_mixin.py
+++ b/python/sglang/srt/observability/scheduler_metrics_mixin.py
@@ -973,7 +973,9 @@ def _get_num_pending_tokens(self: Scheduler, chunk_deduct: int = 0) -> int:
                 time ``prefix_indices`` is already up-to-date, so the default
                 0 is correct.
         """
-        num_pending_tokens = sum(req.seqlen for req in self.waiting_queue)
+        num_pending_tokens = sum(
+            req.seqlen - len(req.prefix_indices) for req in self.waiting_queue
+        )
         if self.chunked_req is not None:
             req = self.chunked_req
             num_pending_tokens += req.seqlen - len(req.prefix_indices) - chunk_deduct

From a5915a193fa21dfd525e667bbd8783442288baea Mon Sep 17 00:00:00 2001
From: fzyzcjy <ch271828n@outlook.com>
Date: Wed, 13 May 2026 18:45:07 +0800
Subject: [PATCH 03/52] Prep: document filter_batch chunked-exclusion invariant

Explicit comment that reqs still doing prefill (chunked-resume or DLLM
staging) must not be merged into running_batch. Today enforced via
chunked_req_to_exclude param; stateless-scheduler v2 will move to a
per-req predicate. Pre-flight for v2.
---
 python/sglang/srt/managers/schedule_batch.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/python/sglang/srt/managers/schedule_batch.py b/python/sglang/srt/managers/schedule_batch.py
index 75ed09458364..97deb27b03f2 100755
--- a/python/sglang/srt/managers/schedule_batch.py
+++ b/python/sglang/srt/managers/schedule_batch.py
@@ -2403,6 +2403,12 @@ def filter_batch(
         # FIXME(lsyin): deprecate this API after spec v1 is deprecated
         v1_spec_info_filtered: Optional[bool] = False,
     ):
+        # Invariant: reqs still doing prefill (chunked-resume or DLLM staging)
+        # must never be merged into running_batch via this filter — running_batch
+        # runs decode forward, and admitting a mid-prefill req there causes
+        # shape mismatch + double KV accounting. Today the invariant is enforced
+        # by callers passing chunked_req_to_exclude; the stateless-scheduler v2
+        # refactor will move this to a per-req predicate.
         # FIXME(lsyin): used here to get the correct seq_lens
         # The batch has been launched but we need it verified to get correct next batch info
         self.maybe_wait_verify_done()

From 1c3bf8e7dbc7f6a099a4af6b079209f48eed7c1a Mon Sep 17 00:00:00 2001
From: fzyzcjy <ch271828n@outlook.com>
Date: Wed, 13 May 2026 18:46:47 +0800
Subject: [PATCH 04/52] Bound cache_unfinished_req row read by kv_committed_len

init_next_round_input resets req.fill_ids to the full origin_input_ids +
output_ids sequence before stash, but the req_to_token row only holds valid
KV indices up to kv_committed_len. Under SWA early-return (and other
paths where admission backs off after init_next_round_input), reading
[req_pool_idx, :len(fill_ids)] yields garbage beyond kv_committed_len,
which then gets inserted into the radix tree as a prefix entry, causing
prefix-hit corruption.

Bound the read to req.kv_committed_len in all 6 cache impls. Add
assert kv_committed_len >= cache_protected_len at each entry (except
chunk_cache, which only gets the bounded read) to surface state-machine
violations as crashes rather than silent slice underflow.

Touches: radix_cache, swa_radix_cache, unified_radix_cache,
mamba_radix_cache, radix_cache_cpp, chunk_cache.

Pre-flight for stateless-scheduler v2.
---
 python/sglang/srt/mem_cache/chunk_cache.py         |  3 ++-
 python/sglang/srt/mem_cache/mamba_radix_cache.py   |  9 ++++++---
 python/sglang/srt/mem_cache/radix_cache.py         | 11 +++++++++--
 python/sglang/srt/mem_cache/radix_cache_cpp.py     |  6 ++++--
 python/sglang/srt/mem_cache/swa_radix_cache.py     | 11 ++++++++---
 python/sglang/srt/mem_cache/unified_radix_cache.py |  9 ++++++---
 6 files changed, 35 insertions(+), 14 deletions(-)

diff --git a/python/sglang/srt/mem_cache/chunk_cache.py b/python/sglang/srt/mem_cache/chunk_cache.py
index 6d34a3aa1fc2..8a970b4bedcb 100644
--- a/python/sglang/srt/mem_cache/chunk_cache.py
+++ b/python/sglang/srt/mem_cache/chunk_cache.py
@@ -84,8 +84,9 @@ def cache_finished_req(self, req: Req, is_insert: bool = True):
         self.token_to_kv_pool_allocator.free(kv_indices)
 
     def cache_unfinished_req(self, req: Req, chunked=False):
+        # Bound row read by kv_committed_len; see radix_cache.py for rationale.
         kv_indices = self.req_to_token_pool.req_to_token[
-            req.req_pool_idx, : len(req.fill_ids)
+            req.req_pool_idx, : req.kv_committed_len
         ]
         # `req.prefix_indices` will be used in `PrefillAdder::add_chunked_req` later
         req.prefix_indices = kv_indices.to(dtype=torch.int64, copy=True)
diff --git a/python/sglang/srt/mem_cache/mamba_radix_cache.py b/python/sglang/srt/mem_cache/mamba_radix_cache.py
index 55ee7983e953..325b3aa08aae 100644
--- a/python/sglang/srt/mem_cache/mamba_radix_cache.py
+++ b/python/sglang/srt/mem_cache/mamba_radix_cache.py
@@ -599,17 +599,20 @@ def cache_finished_req(self, req: Req, is_insert: bool = True) -> None:
 
     def cache_unfinished_req(self, req: Req, chunked=False) -> None:
         """Cache request when it is unfinished."""
+        # Bound row read by kv_committed_len; see radix_cache.py for rationale.
+        assert req.kv_committed_len >= req.cache_protected_len
+        read_len = req.kv_committed_len
 
         def _skip_cache_unfinished_req(req: Req) -> None:
             kv_indices = self.req_to_token_pool.req_to_token[
-                req.req_pool_idx, : len(req.fill_ids)
+                req.req_pool_idx, :read_len
             ]
 
             # `req.prefix_indices` will be used in `PrefillAdder::add_chunked_req` later
             req.prefix_indices = kv_indices.to(dtype=torch.int64, copy=True)
             return
 
-        token_ids = req.fill_ids
+        token_ids = req.fill_ids[:read_len]
         cache_len = (
             req.mamba_last_track_seqlen
             if self.enable_mamba_extra_buffer
@@ -619,7 +622,7 @@ def _skip_cache_unfinished_req(req: Req) -> None:
             return _skip_cache_unfinished_req(req)
 
         kv_indices_orig = self.req_to_token_pool.req_to_token[
-            req.req_pool_idx, : len(token_ids)
+            req.req_pool_idx, :read_len
         ]
         # kv_indices is the kv indices to be cached
         kv_indices = kv_indices_orig[:cache_len]
diff --git a/python/sglang/srt/mem_cache/radix_cache.py b/python/sglang/srt/mem_cache/radix_cache.py
index 8a24c5e15926..9e1e93e48a79 100644
--- a/python/sglang/srt/mem_cache/radix_cache.py
+++ b/python/sglang/srt/mem_cache/radix_cache.py
@@ -487,9 +487,16 @@ def cache_unfinished_req(self, req: Req, chunked=False):
         if self.disable:
             return
 
-        token_ids = req.fill_ids
+        # Bound the row read by kv_committed_len (the actually-written prefix
+        # length on the row), not by len(fill_ids). They are equal in the
+        # common path, but init_next_round_input resets fill_ids to the full
+        # origin + output length while the row only holds KV up to
+        # kv_committed_len — reading beyond that yields garbage slot indices.
+        assert req.kv_committed_len >= req.cache_protected_len
+        read_len = req.kv_committed_len
+        token_ids = req.fill_ids[:read_len]
         kv_indices = self.req_to_token_pool.req_to_token[
-            req.req_pool_idx, : len(token_ids)
+            req.req_pool_idx, :read_len
         ]
 
         radix_key = RadixKey(
diff --git a/python/sglang/srt/mem_cache/radix_cache_cpp.py b/python/sglang/srt/mem_cache/radix_cache_cpp.py
index 66f9fad96ad7..834654e8e8f8 100644
--- a/python/sglang/srt/mem_cache/radix_cache_cpp.py
+++ b/python/sglang/srt/mem_cache/radix_cache_cpp.py
@@ -209,8 +209,10 @@ def cache_finished_req(self, req: Req, is_insert: bool = True):
     def cache_unfinished_req(self, req: Req, chunked=False):
         """Cache request when it is unfinished."""
         assert req.req_pool_idx is not None
-        token_ids = req.fill_ids
-        prefill_len = len(token_ids)  # prefill only (maybe chunked)
+        # Bound row read by kv_committed_len; see radix_cache.py for rationale.
+        assert req.kv_committed_len >= req.cache_protected_len
+        prefill_len = req.kv_committed_len
+        token_ids = req.fill_ids[:prefill_len]
         kv_indices = self.req_to_token_pool.req_to_token[
             req.req_pool_idx, :prefill_len
         ].to(dtype=torch.int64, copy=True)
diff --git a/python/sglang/srt/mem_cache/swa_radix_cache.py b/python/sglang/srt/mem_cache/swa_radix_cache.py
index af2d99e96e6d..2457ec817446 100644
--- a/python/sglang/srt/mem_cache/swa_radix_cache.py
+++ b/python/sglang/srt/mem_cache/swa_radix_cache.py
@@ -482,18 +482,23 @@ def cache_finished_req(self, req: Req, is_insert: bool = True) -> None:
 
     def cache_unfinished_req(self, req: Req, chunked=False) -> None:
         """Cache request when it is unfinished."""
+        # Bound the row read by kv_committed_len, not len(fill_ids); see
+        # radix_cache.py:cache_unfinished_req for the rationale (SWA early-
+        # return + init_next_round_input leaves fill_ids longer than the row).
+        assert req.kv_committed_len >= req.cache_protected_len
+        read_len = req.kv_committed_len
         if self.disable:
             kv_indices = self.req_to_token_pool.req_to_token[
-                req.req_pool_idx, : len(req.fill_ids)
+                req.req_pool_idx, :read_len
             ]
 
             # `req.prefix_indices` will be used in `PrefillAdder::add_chunked_req` later
             req.prefix_indices = kv_indices
             return
 
-        token_ids = req.fill_ids
+        token_ids = req.fill_ids[:read_len]
         kv_indices = self.req_to_token_pool.req_to_token[
-            req.req_pool_idx, : len(token_ids)
+            req.req_pool_idx, :read_len
         ]
 
         radix_key = RadixKey(
diff --git a/python/sglang/srt/mem_cache/unified_radix_cache.py b/python/sglang/srt/mem_cache/unified_radix_cache.py
index 80a3da5bb190..ad0bff3b80b6 100644
--- a/python/sglang/srt/mem_cache/unified_radix_cache.py
+++ b/python/sglang/srt/mem_cache/unified_radix_cache.py
@@ -490,17 +490,20 @@ def cache_unfinished_req(self, req: Req, chunked=False, **kwargs) -> None:
         if self.session.try_cache_unfinished_req(req, chunked=chunked, **kwargs):
             return
 
-        token_ids = req.fill_ids
+        # Bound row read by kv_committed_len; see radix_cache.py for rationale.
+        assert req.kv_committed_len >= req.cache_protected_len
+        read_len = req.kv_committed_len
+        token_ids = req.fill_ids[:read_len]
 
         if self.disable:
             kv_indices = self.req_to_token_pool.req_to_token[
-                req.req_pool_idx, : len(token_ids)
+                req.req_pool_idx, :read_len
             ]
             req.prefix_indices = kv_indices
             return
 
         kv_indices_orig = self.req_to_token_pool.req_to_token[
-            req.req_pool_idx, : len(token_ids)
+            req.req_pool_idx, :read_len
         ]
 
         # components prepare insert data + return effective cache_len

From 9b361aef46ae6af35700f0b1cf3c7aa59a4fa38a Mon Sep 17 00:00:00 2001
From: fzyzcjy <ch271828n@outlook.com>
Date: Wed, 13 May 2026 18:47:22 +0800
Subject: [PATCH 05/52] Drop is_chunked from req_to_token_pool alloc assert
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Row-pool reuse should only check 'this row has committed KV' — not
whether the req is chunked. kv_committed_len > 0 covers chunked-resume,
DLLM staging, and any other reuse case. Step in decoupling chunked from
req_pool_idx.

Same simplification applied to disaggregation/decode.py.

Pre-flight for stateless-scheduler v2.
---
 python/sglang/srt/disaggregation/decode.py |  4 ++--
 python/sglang/srt/mem_cache/memory_pool.py | 13 +++++--------
 2 files changed, 7 insertions(+), 10 deletions(-)

diff --git a/python/sglang/srt/disaggregation/decode.py b/python/sglang/srt/disaggregation/decode.py
index b9f8e5994381..03c70f8ee84b 100644
--- a/python/sglang/srt/disaggregation/decode.py
+++ b/python/sglang/srt/disaggregation/decode.py
@@ -152,8 +152,8 @@ def alloc(self, reqs: List["Req"]) -> Optional[List[int]]:
             len(reusing) <= 1
         ), "only one chunked request may reuse req_pool_idx in a batch"
         assert all(
-            reqs[i].is_chunked > 0 or reqs[i].kv_committed_len > 0 for i in reusing
-        ), "reusing request must be chunked or have committed KV"
+            reqs[i].kv_committed_len > 0 for i in reusing
+        ), "reusing request must have committed KV"
 
         need_size = len(reqs) - len(reusing)
         if need_size > len(self.free_slots):
diff --git a/python/sglang/srt/mem_cache/memory_pool.py b/python/sglang/srt/mem_cache/memory_pool.py
index 23d15f71ac4d..43f76c112c34 100644
--- a/python/sglang/srt/mem_cache/memory_pool.py
+++ b/python/sglang/srt/mem_cache/memory_pool.py
@@ -160,15 +160,12 @@ def alloc(self, reqs: list[Req]) -> Optional[List[int]]:
         # Indices of reqs that already have a req_pool_idx and will reuse
         # their existing slot (e.g. chunked prefill continuing across chunks).
         reusing = [i for i, r in enumerate(reqs) if r.req_pool_idx is not None]
-        # NOTE: this check is relaxed temporarily
-        # https://github.com/sgl-project/sglang/pull/20476
-        # if not any(r.is_dllm() for r in reqs):
-        #     assert (
-        #         sum(1 for i in reusing if reqs[i].is_chunked > 0) <= 1
-        #     ), "only one chunked request may reuse req_pool_idx in a batch"
+        # The row pool only cares whether the row has committed KV — it does
+        # not need to know whether the req is chunked. kv_committed_len > 0
+        # naturally covers chunked-resume + DLLM staging + any reuse case.
         assert all(
-            reqs[i].is_chunked > 0 or reqs[i].kv_committed_len > 0 for i in reusing
-        ), "reusing request must be chunked or have committed KV"
+            reqs[i].kv_committed_len > 0 for i in reusing
+        ), "reusing request must have committed KV"
 
         need_size = len(reqs) - len(reusing)
         if need_size > len(self.free_slots):

From 74f1d8bbabba6b9395d7612e4e826c0978d7d10e Mon Sep 17 00:00:00 2001
From: fzyzcjy <ch271828n@outlook.com>
Date: Wed, 13 May 2026 18:51:40 +0800
Subject: [PATCH 06/52] Unify chunked admission via add_one_req reuse branch +
 add has_pending_chunk
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace the dedicated add_chunked_req method with a reuse branch inside
add_one_req, gated on (req.kv_committed_len > 0 and not req.is_dllm()).
The reuse branch:
  - skips _req_inc_lock_ref (lock already held by previous stash)
  - skips init_load_back (host_hit_length naturally 0 after reset in
    prepare_for_extend post-metric-computation)
  - passes 0 as prefix budget (already counted by previous stash)

Add Req.has_pending_chunk: bool — persistent cross-iter flag set by
admission when truncated=True, cleared on last-chunk admit or retract.
Used (here) to mirror Scheduler.chunked_req; future commits will use it
to drive Stage A stash and filter_batch predicates.

Delete:
  - PrefillAdder.add_chunked_req method
  - PrefillAdder.new_chunked_req field
  - has_chunked_req= parameter on add_one_req (unused; argument dropped at
    the call sites)

Scheduler.chunked_req is retained at this commit and synchronized via
has_pending_chunk after admission (single-flight invariant asserted).

Add host_hit_length reset inside prepare_for_extend right after the
cached_tokens_host metric is recorded — required so chunked-resume reqs
don't re-trigger init_load_back on subsequent admissions (preflight 7).

Part of stateless-scheduler v2.
---
 python/sglang/srt/dllm/mixin/scheduler.py     |  1 -
 python/sglang/srt/managers/schedule_batch.py  | 13 +++
 python/sglang/srt/managers/schedule_policy.py | 83 +++++++++----------
 python/sglang/srt/managers/scheduler.py       | 36 ++++++--
 4 files changed, 78 insertions(+), 55 deletions(-)

diff --git a/python/sglang/srt/dllm/mixin/scheduler.py b/python/sglang/srt/dllm/mixin/scheduler.py
index 157ab219276b..e8a563703811 100644
--- a/python/sglang/srt/dllm/mixin/scheduler.py
+++ b/python/sglang/srt/dllm/mixin/scheduler.py
@@ -256,7 +256,6 @@ def process_dllm_incoming_reqs(
             req.init_next_round_input(self.tree_cache)
             res = adder.add_one_req(
                 req,
-                has_chunked_req=True,
                 truncation_align_size=self.truncation_align_size,
             )
 
diff --git a/python/sglang/srt/managers/schedule_batch.py b/python/sglang/srt/managers/schedule_batch.py
index 97deb27b03f2..2d1733d9f4b2 100755
--- a/python/sglang/srt/managers/schedule_batch.py
+++ b/python/sglang/srt/managers/schedule_batch.py
@@ -757,6 +757,14 @@ def __init__(
         # processed.
         self.is_chunked = 0
 
+        # Persistent (cross-iter) flag set by admission when this req's
+        # current admission was truncated (more chunks remain). Cleared
+        # when last chunk is admitted (truncated=False) or on retract.
+        # Used by Stage A stash detection, filter_batch exclusion, and
+        # add_one_req's reuse-vs-fresh branch. Independent of is_chunked
+        # counter (transient) and kv_committed_len (derived).
+        self.has_pending_chunk = False
+
         # For retraction
         self.is_retracted = False
         # Indicates if the req has ever been retracted.
@@ -1258,6 +1266,7 @@ def reset_for_retract(self):
         self.temp_input_top_logprobs_idx = None
         self.extend_logprob_start_len = 0
         self.is_chunked = 0
+        self.has_pending_chunk = False
         self.mamba_pool_idx = None
         self.mamba_ping_pong_track_buffer = None
         self.mamba_next_track_idx = None
@@ -1835,6 +1844,10 @@ def prepare_for_extend(self):
                     req.cached_tokens_host = host_portion
                     req.cached_tokens_storage = storage_portion
                     req._cache_breakdown_computed = True
+                    # Reset host_hit_length after metric is computed so that
+                    # subsequent chunks' admission paths see host_hit_length == 0
+                    # and naturally skip init_load_back (host KV already loaded).
+                    req.host_hit_length = 0
 
                 req.already_computed = seq_len
             req.is_retracted = False
diff --git a/python/sglang/srt/managers/schedule_policy.py b/python/sglang/srt/managers/schedule_policy.py
index 29b90038ad26..b924592fc1f1 100644
--- a/python/sglang/srt/managers/schedule_policy.py
+++ b/python/sglang/srt/managers/schedule_policy.py
@@ -441,7 +441,6 @@ def __init__(
         self.req_states = None
         self.can_run_list = []
         self.preempt_list = []
-        self.new_chunked_req = None
         self.log_hit_tokens = 0
         # TODO(lsyin): report the real input tokens excluding page alignment
         self.log_input_tokens = 0
@@ -663,41 +662,6 @@ def add_dllm_staging_req(self, req: Req):
             else AddReqResult.CONTINUE
         )
 
-    def add_chunked_req(self, req: Req):
-        if self.dllm_config is not None:
-            _rem_tokens = self._get_dllm_remain_tokens()
-        else:
-            _rem_tokens = min(self.rem_chunk_tokens, int(self.rem_total_tokens))
-            if self.is_hybrid_swa:
-                # alloc_extend needs extend_num_tokens + page_size per request,
-                # so reserve one page here to avoid OOM
-                _rem_tokens = min(
-                    _rem_tokens, int(self.rem_swa_tokens) - self.page_size
-                )
-            # The chunked_req must be added to the list; otherwise, it will cause a memory leak.
-            # Therefore, in certain cases where _rem_tokens <= 0, it should be replaced with rem_chunk_tokens.
-            if _rem_tokens <= 0:
-                if self.is_hybrid_swa:
-                    return req
-                _rem_tokens = self.rem_chunk_tokens
-
-        truncated = req.extend_input_len > _rem_tokens
-        req.set_extend_input_len(min(req.extend_input_len, _rem_tokens))
-        req.fill_ids = req.fill_ids[: len(req.prefix_indices) + req.extend_input_len]
-        self.can_run_list.append(req)
-        self._update_prefill_budget(
-            0,
-            req.extend_input_len,
-            (
-                min(req.sampling_params.max_new_tokens, CLIP_MAX_NEW_TOKENS)
-                if not truncated
-                else 0
-            ),
-        )
-
-        # Return if chunked prefill not finished
-        return req if truncated else None
-
     @contextmanager
     def _lock_node(self, last_node: TreeNode):
         dec_lock_params = None
@@ -784,6 +748,7 @@ def add_req_state(r, insert_sort=False):
                 return AddReqResult.OTHER
 
             self._add_dllm_req(req, 0)
+            truncated = False
         elif (
             self.rem_chunk_tokens is None  # chunked prefill is disabled
             or req.extend_input_len <= self.rem_chunk_tokens  # it is the last chunk
@@ -795,6 +760,7 @@ def add_req_state(r, insert_sort=False):
                 req.extend_input_len,
                 min(req.sampling_params.max_new_tokens, CLIP_MAX_NEW_TOKENS),
             )
+            truncated = False
         else:
             if self.rem_chunk_tokens <= 0:
                 return AddReqResult.OTHER
@@ -805,14 +771,24 @@ def add_req_state(r, insert_sort=False):
             req.set_extend_input_len(trunc_len)
             req.fill_ids = req.fill_ids[:trunc_len]
             self.can_run_list.append(req)
-            self.new_chunked_req = req
             self._update_prefill_budget(0, trunc_len, 0)
+            truncated = True
+
+        if not req.is_dllm():
+            req.has_pending_chunk = truncated
 
         return self.budget_state()
 
     def add_one_req(
-        self, req: Req, has_chunked_req: bool, truncation_align_size: Optional[int]
+        self, req: Req, truncation_align_size: Optional[int]
     ):
+        # Reuse path: this req was admitted in a previous iter, has a row
+        # with committed KV (kv_committed_len > 0), and is mid-prefill. Skip
+        # fresh-req setup (lock_ref already held by previous stash;
+        # init_load_back already ran on first admission; prefix already
+        # counted in tree). DLLM has its own path and never takes reuse here.
+        is_resume = req.kv_committed_len > 0 and not req.is_dllm()
+
         if (self.prefill_delayer_single_pass is not None) and (
             not self.prefill_delayer_single_pass.negotiate_should_allow_prefill(
                 local_prefillable=True,
@@ -874,6 +850,10 @@ def add_one_req(
                 if swa_needed >= self.rem_swa_tokens:
                     return AddReqResult.NO_TOKEN
 
+            # Fresh-only init_load_back. For reuse, host_hit_length was set
+            # on first admission and reset by prepare_for_extend after the
+            # cache-breakdown metric was computed, so the predicate naturally
+            # short-circuits here for reuse.
             if req.host_hit_length > 0:
                 new_indices, req.last_node = self.tree_cache.init_load_back(
                     InitLoadBackParams(
@@ -892,6 +872,10 @@ def add_one_req(
             if input_tokens >= self.rem_input_tokens and len(self.can_run_list) != 0:
                 return AddReqResult.OTHER
 
+            # Budget prefix_len: 0 for reuse (already counted by previous
+            # admission's stash into tree); actual prefix_len for fresh.
+            budget_prefix = 0 if is_resume else prefix_len
+
             if self.dllm_config is not None:
                 if self.rem_dllm_tokens <= 0:
                     return AddReqResult.OTHER
@@ -902,20 +886,24 @@ def add_one_req(
 
                 self._add_dllm_req(req, prefix_len)
                 self._req_inc_lock_ref(req)
+                truncated = False
             elif self.rem_chunk_tokens is None or input_tokens <= self.rem_chunk_tokens:
-                # Non-chunked prefill
+                # Non-chunked prefill (or last chunk of a chunked-resume req).
                 self.can_run_list.append(req)
 
-                self._req_inc_lock_ref(req)
+                if not is_resume:
+                    self._req_inc_lock_ref(req)
                 self._update_prefill_budget(
-                    prefix_len,
+                    budget_prefix,
                     input_tokens,
                     min(
                         req.sampling_params.max_new_tokens,
                         CLIP_MAX_NEW_TOKENS,
                     ),
                 )
+                truncated = False
             else:
+                # Chunked prefill: this admission doesn't complete the prefill.
                 # Make sure at least one page is available
                 trunc_len = self.rem_chunk_tokens // self.page_size * self.page_size
 
@@ -940,15 +928,20 @@ def add_one_req(
                 if trunc_len <= 0:
                     return AddReqResult.OTHER
 
-                # Chunked prefill
                 req.set_extend_input_len(trunc_len)
                 req.fill_ids = req.fill_ids[: len(req.prefix_indices) + trunc_len]
 
                 self.can_run_list.append(req)
-                self.new_chunked_req = req
 
-                self._req_inc_lock_ref(req)
-                self._update_prefill_budget(prefix_len, trunc_len, 0)
+                if not is_resume:
+                    self._req_inc_lock_ref(req)
+                self._update_prefill_budget(budget_prefix, trunc_len, 0)
+                truncated = True
+
+        # has_pending_chunk: persistent flag carrying chunked-resume state
+        # across iters. DLLM uses its own staging_queue + is_chunked counter.
+        if not req.is_dllm():
+            req.has_pending_chunk = truncated
 
         return self.budget_state()
 
diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py
index 40b027cbdf8b..79d680e57d15 100644
--- a/python/sglang/srt/managers/scheduler.py
+++ b/python/sglang/srt/managers/scheduler.py
@@ -2698,12 +2698,23 @@ def _get_new_batch_prefill_raw(
             waiting_queue_len=len(self.waiting_queue),
         )
 
+        # Re-admit the in-flight chunked req via the unified add_one_req
+        # entry. add_one_req's reuse branch (gated on kv_committed_len > 0)
+        # mirrors the old add_chunked_req's behavior: skip lock_ref inc,
+        # init_load_back, and prefix budget. Sets req.has_pending_chunk to
+        # truncated.
         if self.chunked_req is not None:
             self.chunked_req.init_next_round_input()
-            self.chunked_req = adder.add_chunked_req(self.chunked_req)
-            self._chunked_req_scheduled_last_iter = (
-                self.chunked_req in adder.can_run_list
+            adder.add_one_req(
+                self.chunked_req,
+                truncation_align_size=self.truncation_align_size,
             )
+            # After admit, has_pending_chunk reflects whether more chunks
+            # remain. Mirror it into self.chunked_req for the existing
+            # Stage A stash path (deleted in a later commit).
+            if not self.chunked_req.has_pending_chunk:
+                self.chunked_req = None
+            self._chunked_req_scheduled_last_iter = self.chunked_req is not None
         else:
             self._chunked_req_scheduled_last_iter = False
 
@@ -2750,7 +2761,6 @@ def _get_new_batch_prefill_raw(
             req.init_next_round_input(self.tree_cache)
             res = adder.add_one_req(
                 req,
-                has_chunked_req=(self.chunked_req is not None),
                 truncation_align_size=self.truncation_align_size,
             )
 
@@ -2793,12 +2803,20 @@ def _get_new_batch_prefill_raw(
             for req in adder.preempt_list:
                 self._add_request_to_queue(req)
 
-        if adder.new_chunked_req is not None:
-            # Update chunked prefill
+        # Identify newly-truncated chunked-resume reqs admitted this iter via
+        # add_one_req's reuse/chunked branch. has_pending_chunk is set by
+        # add_one_req when truncated=True. The "newly chunked" set excludes
+        # self.chunked_req which was already tracked from previous iter.
+        new_chunked = [
+            r for r in can_run_list if r.has_pending_chunk and r is not self.chunked_req
+        ]
+        assert (
+            len(new_chunked) <= 1
+        ), "single-flight invariant: at most one new chunked req per iter"
+        if new_chunked:
             assert self.chunked_req is None
-            self.chunked_req = adder.new_chunked_req
-            # new_chunked_req is added to can_run_list by add_one_req,
-            # so it will be scheduled this iter -> stash is needed next iter.
+            self.chunked_req = new_chunked[0]
+            # The chunked req is scheduled this iter -> stash needed next iter.
             self._chunked_req_scheduled_last_iter = True
 
         if self.chunked_req is not None:

From c445a82cf5738f6165456e3ae4541596e8fa4d72 Mon Sep 17 00:00:00 2001
From: fzyzcjy <ch271828n@outlook.com>
Date: Wed, 13 May 2026 18:56:56 +0800
Subject: [PATCH 07/52] Switch chunked-resume to waiting_queue holding; delete
 chunked_req fields

Core semantic switch of the stateless-scheduler refactor.

Scheduler / Batch no longer maintain any chunked-aware state. Chunked-
resume reqs sit in self.waiting_queue across iters with priority + the
new Req.has_pending_chunk flag from commit 6.

Deletes:
  - Scheduler.chunked_req field
  - Scheduler._chunked_req_scheduled_last_iter field
  - Scheduler.stash_chunked_request method
  - ScheduleBatch.chunked_req field
  - filter_batch's chunked_req_to_exclude= parameter
  - ScheduleBatch.init_new's chunked_req= parameter

Changes:
  - Stage A stash: replaced 'if self.chunked_req: stash(self.chunked_req)'
    with 'for req in waiting_queue: if req.has_pending_chunk and not
    req.is_dllm(): maybe_cache_unfinished_req(...)'. DLLM staging stash kept
    as a separate sub-loop (DLLM reqs live in dllm_manager.staging_queue, not
    waiting_queue).
  - filter_batch predicate: a req is kept only if it is not finished AND not
    has_pending_chunk AND not (is_chunked > 0) AND not is_dllm(). The
    is_chunked > 0 clause covers the PP window where mb_a's last-chunk admit
    cleared has_pending_chunk but mb_b's middle chunk is still in-flight
    (preflight 2).
  - merge_batch asserts no req has has_pending_chunk (downstream invariant).
  - get_new_batch_prefill: admission loop now handles chunked-resume
    naturally via priority + reuse branch in add_one_req. Removed dedicated
    pre-loop block. Dynamic chunking + chunk_deduct now derived from the
    single chunked-resume req in waiting_queue.
  - waiting_queue removal at end of admission: keeps reqs with
    has_pending_chunk so they stay for next iter.
  - Use init_next_round_input() without tree_cache for chunked-resume in
    the admission loop (preserves last_node + lock_ref pairing).

Disaggregation:
  - prefill.py process_prefill_chunk: per-req stash for waiting_queue
    chunked-resume; filter_batch uses internal predicate.
  - decode.py prebuilt path: assert reframed against has_pending_chunk.

Metrics:
  - _get_num_pending_tokens: drop the chunked_req block (already counted
    via waiting_queue sum).

Part of stateless-scheduler v2.
---
 python/sglang/srt/disaggregation/decode.py    |   2 +-
 python/sglang/srt/disaggregation/prefill.py   |  38 ++--
 python/sglang/srt/managers/schedule_batch.py  |  30 +--
 python/sglang/srt/managers/scheduler.py       | 177 ++++++++----------
 python/sglang/srt/mem_cache/chunk_cache.py    |   2 +-
 .../sglang/srt/mem_cache/mamba_radix_cache.py |   4 +-
 python/sglang/srt/mem_cache/radix_cache.py    |   2 +-
 .../sglang/srt/mem_cache/swa_radix_cache.py   |   4 +-
 .../observability/scheduler_metrics_mixin.py  |   9 +-
 9 files changed, 118 insertions(+), 150 deletions(-)

diff --git a/python/sglang/srt/disaggregation/decode.py b/python/sglang/srt/disaggregation/decode.py
index 03c70f8ee84b..797071794d68 100644
--- a/python/sglang/srt/disaggregation/decode.py
+++ b/python/sglang/srt/disaggregation/decode.py
@@ -1646,7 +1646,7 @@ def get_next_disagg_decode_batch_to_run(
         # Process pending prebuilt batch: output processing + filter + merge
         new_prebuilt_batch = self.get_new_prebuilt_batch()
         if new_prebuilt_batch:
-            assert self.chunked_req is None
+            assert not any(r.has_pending_chunk for r in self.waiting_queue)
             self.process_batch_result_prebuilt(new_prebuilt_batch)
             new_prebuilt_batch.filter_batch()
             if not new_prebuilt_batch.is_empty():
diff --git a/python/sglang/srt/disaggregation/prefill.py b/python/sglang/srt/disaggregation/prefill.py
index 7ddcbe169d7d..715b7739ccbf 100644
--- a/python/sglang/srt/disaggregation/prefill.py
+++ b/python/sglang/srt/disaggregation/prefill.py
@@ -715,30 +715,26 @@ def get_transferred_rids(self: Scheduler) -> List[str]:
         return transferred_rids
 
     def process_prefill_chunk(self: Scheduler) -> None:
-        chunked_req_to_exclude = set()
-        if self.chunked_req:
-            chunked_req_to_exclude.add(self.chunked_req)
-            maybe_cache_unfinished_req(self.chunked_req, self.tree_cache, chunked=True)
-            if self.enable_overlap:
-                # Delay KV transfer to process_batch_result_disagg_prefill when overlap is enabled to ensure results are resolved
-                self.chunked_req.tmp_end_idx = min(
-                    len(self.chunked_req.fill_ids),
-                    len(self.chunked_req.origin_input_ids),
-                )
-            else:
-                self.send_kv_chunk(self.chunked_req)
-            self.running_batch.batch_is_full = False
+        # Per-req stash for any in-flight chunked-resume reqs (now sitting in
+        # the waiting_queue with has_pending_chunk == True).
+        for req in self.waiting_queue:
+            if req.has_pending_chunk and not req.is_dllm():
+                maybe_cache_unfinished_req(req, self.tree_cache, chunked=True)
+                if self.enable_overlap:
+                    # Delay KV transfer to process_batch_result_disagg_prefill
+                    # when overlap is enabled to ensure results are resolved.
+                    req.tmp_end_idx = min(
+                        len(req.fill_ids),
+                        len(req.origin_input_ids),
+                    )
+                else:
+                    self.send_kv_chunk(req)
+                self.running_batch.batch_is_full = False
 
         if self.last_batch and self.last_batch.forward_mode.is_extend():
-            if self.last_batch.chunked_req:
-                # In the context pipeline parallelism, after the last chunk, the current microbatch still track outdated chunked_req.
-                # We need to discard it.
-                chunked_req_to_exclude.add(self.last_batch.chunked_req)
-
+            # filter_batch's internal predicate excludes still-prefilling reqs.
             last_bs = self.last_batch.batch_size()
-            self.last_batch.filter_batch(
-                chunked_req_to_exclude=list(chunked_req_to_exclude)
-            )
+            self.last_batch.filter_batch()
             if self.last_batch.batch_size() < last_bs:
                 self.running_batch.batch_is_full = False
 
diff --git a/python/sglang/srt/managers/schedule_batch.py b/python/sglang/srt/managers/schedule_batch.py
index 2d1733d9f4b2..83254cb50e9f 100755
--- a/python/sglang/srt/managers/schedule_batch.py
+++ b/python/sglang/srt/managers/schedule_batch.py
@@ -1402,9 +1402,6 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
     # This is an optimization to reduce the overhead of the prefill check.
     batch_is_full: bool = False
 
-    # For chunked prefill in PP
-    chunked_req: Optional[Req] = None
-
     # Sampling info
     sampling_info: SamplingBatchInfo = None
 
@@ -1538,7 +1535,6 @@ def init_new(
         model_config: ModelConfig,
         enable_overlap: bool,
         spec_algorithm: SpeculativeAlgorithm,
-        chunked_req: Optional[Req] = None,
         dllm_config: Optional[DllmConfig] = None,
     ):
         return_logprob = any(req.return_logprob for req in reqs)
@@ -1564,7 +1560,6 @@ def init_new(
             return_routed_experts=any(req.return_routed_experts for req in reqs),
             return_indexer_topk=any(req.return_indexer_topk for req in reqs),
             is_prefill_only=all(req.is_prefill_only for req in reqs),
-            chunked_req=chunked_req,
             dllm_config=dllm_config,
         )
         return batch
@@ -2411,31 +2406,29 @@ def maybe_wait_verify_done(self):
 
     def filter_batch(
         self,
-        chunked_req_to_exclude: Optional[Union[Req, List[Req]]] = None,
         keep_indices: Optional[List[int]] = None,
         # FIXME(lsyin): deprecate this API after spec v1 is deprecated
         v1_spec_info_filtered: Optional[bool] = False,
     ):
-        # Invariant: reqs still doing prefill (chunked-resume or DLLM staging)
+        # Invariant: reqs still doing prefill (chunked-resume or DLLM-managed)
         # must never be merged into running_batch via this filter — running_batch
         # runs decode forward, and admitting a mid-prefill req there causes
-        # shape mismatch + double KV accounting. Today the invariant is enforced
-        # by callers passing chunked_req_to_exclude; the stateless-scheduler v2
-        # refactor will move this to a per-req predicate.
+        # shape mismatch + double KV accounting. Enforced per-req:
+        #   - has_pending_chunk: chunked-resume scheduled to continue
+        #   - is_chunked > 0: PP in-flight middle chunk for this req
+        #   - is_dllm(): DllmManager-managed (separate staging queue)
         # FIXME(lsyin): used here to get the correct seq_lens
         # The batch has been launched but we need it verified to get correct next batch info
         self.maybe_wait_verify_done()
 
         if keep_indices is None:
-            if isinstance(chunked_req_to_exclude, Req):
-                chunked_req_to_exclude = [chunked_req_to_exclude]
-            elif chunked_req_to_exclude is None:
-                chunked_req_to_exclude = []
             keep_indices = [
                 i
                 for i in range(len(self.reqs))
                 if not self.reqs[i].finished()
-                and self.reqs[i] not in chunked_req_to_exclude
+                and not self.reqs[i].has_pending_chunk
+                and not self.reqs[i].is_chunked > 0
+                and not self.reqs[i].is_dllm()
             ]
 
         if keep_indices is None or len(keep_indices) == 0:
@@ -2506,6 +2499,13 @@ def merge_batch(self, other: "ScheduleBatch"):
         # future. Synchronize here to avoid a cross-stream data race.
         self.maybe_wait_verify_done()
 
+        # Invariant: chunked-resume / mid-prefill reqs must never reach
+        # running_batch via merge — running_batch runs decode forward and
+        # admitting a prefill-in-progress req there breaks shape + KV accounting.
+        # filter_batch's predicate is responsible for excluding these from
+        # last_batch before this merge call.
+        assert not any(r.has_pending_chunk for r in other.reqs)
+
         # Penalizer orchestrator must be merged before Batch.reqs is merged. This is because
         # orchestrator.merge() depends on Batch.reqs during preparation of each penalizers, so it
         # needs to be called with pre-merged Batch.reqs.
diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py
index 79d680e57d15..644171ec888a 100644
--- a/python/sglang/srt/managers/scheduler.py
+++ b/python/sglang/srt/managers/scheduler.py
@@ -1082,16 +1082,11 @@ def init_chunked_prefill(self):
             self.chunked_prefill_size = None
         elif self.chunked_prefill_size is not None and self.chunked_prefill_size <= 0:
             self.chunked_prefill_size = None
-        self.chunked_req = None
-        # Tracks whether the current self.chunked_req was actually scheduled
-        # into last iteration's batch (i.e., in can_run_list -> got a fresh
-        # req_pool_idx from prepare_for_extend). Used to gate the
-        # stash_chunked_request call at the top of get_next_batch_to_run:
-        # if add_chunked_req early-returned under hybrid-SWA pressure,
-        # the req_pool_idx was already freed and fill_ids was reset by
-        # init_next_round_input, so running stash would double-free and
-        # corrupt prefix_indices.
-        self._chunked_req_scheduled_last_iter = False
+        # Chunked-resume tracking is now per-req (Req.has_pending_chunk +
+        # is_chunked counter); the scheduler no longer holds a global pointer.
+        # Stage A stashes any waiting_queue req with has_pending_chunk; cache
+        # impls bound row reads by kv_committed_len so a stash after
+        # init_next_round_input is safe without the old gate.
         self.is_mixed_chunk = (
             self.chunked_prefill_size is not None
             and self.server_args.enable_mixed_chunk
@@ -2443,9 +2438,6 @@ def handle_batch_embedding_request(
         for tokenized_req in recv_req:
             self.handle_embedding_request(tokenized_req)
 
-    def stash_chunked_request(self, req: Req):
-        maybe_cache_unfinished_req(req, self.tree_cache, chunked=True)
-
     def _build_hisparse_decode_batch(self, reqs):
         """Build a ScheduleBatch for hisparse requests transitioning from staging to decode."""
         device = self.device
@@ -2490,21 +2482,17 @@ def get_next_batch_to_run(self) -> Optional[ScheduleBatch]:
         if self.dllm_config is not None:
             self.dllm_manager.filter_finished_reqs()
 
-        # Merge the prefill batch into the running batch
-        chunked_req_to_exclude = set()
+        # Stage A: stash any in-flight chunked prefill KV into radix tree.
+        # Per-req loop over waiting_queue covers chunked-resume; DLLM staging
+        # reqs are owned by DllmManager (not in waiting_queue), handled
+        # separately below.
+        for req in self.waiting_queue:
+            if req.has_pending_chunk and not req.is_dllm():
+                maybe_cache_unfinished_req(req, self.tree_cache, chunked=True)
 
         if self.dllm_config is not None and self.dllm_manager.any_staging_reqs():
-            chunked_req_to_exclude.update(self.dllm_manager.staging_queue)
             for req in self.dllm_manager.staging_queue:
-                self.stash_chunked_request(req)
-
-        if self.chunked_req is not None:
-            # Move the chunked request out of the batch so that we can merge
-            # only finished requests to running_batch.
-            chunked_req_to_exclude.add(self.chunked_req)
-
-            if self._chunked_req_scheduled_last_iter:
-                self.stash_chunked_request(self.chunked_req)
+                maybe_cache_unfinished_req(req, self.tree_cache, chunked=True)
 
         # HiSparse has its own prefill-to-decode transition; skip last_batch merge.
         if self.enable_hisparse:
@@ -2524,19 +2512,10 @@ def get_next_batch_to_run(self) -> Optional[ScheduleBatch]:
             and self.last_batch
             and self.last_batch.forward_mode.is_extend()
         ):
-            if self.last_batch.chunked_req is not None:
-                # In the context pipeline parallelism, after the last chunk, the current microbatch still track outdated chunked_req.
-                # We need to discard it.
-                chunked_req_to_exclude.add(self.last_batch.chunked_req)
-
-            if self.dllm_config is not None and self.last_batch.reqs:
-                chunked_req_to_exclude.update(self.last_batch.reqs)
-
-            # Filter batch
+            # filter_batch's internal predicate excludes still-prefilling reqs
+            # (has_pending_chunk / is_chunked > 0 / is_dllm) from merge.
             last_bs = self.last_batch.batch_size()
-            self.last_batch.filter_batch(
-                chunked_req_to_exclude=list(chunked_req_to_exclude)
-            )
+            self.last_batch.filter_batch()
             if self.last_batch.batch_size() < last_bs:
                 self.running_batch.batch_is_full = False
 
@@ -2642,21 +2621,26 @@ def _get_new_batch_prefill_raw(
             # Reset batch_is_full to try preemption with a prefill adder.
             self.running_batch.batch_is_full = False
 
+        # Identify any in-flight chunked-resume req held in waiting_queue —
+        # priority + has_pending_chunk make it sit at the head, but its
+        # presence relaxes the "is queue empty / pool full" early exits below
+        # (we must keep scheduling it to make progress, or memory leaks).
+        has_chunked_resume = any(r.has_pending_chunk for r in self.waiting_queue)
+
         if (
             self.running_batch.batch_is_full or len(self.waiting_queue) == 0
-        ) and self.chunked_req is None:
+        ) and not has_chunked_resume:
             return None
 
         running_bs = len(self.running_batch.reqs)
 
-        # Ignore the check if self.chunked_req is not None.
-        # In the non-PP case, when self.chunked_req is not None, num_allocatable_reqs should always be greater than 0,
-        # as the space for the chunked requests has just been released.
-        # In PP case, chunked requests (or dllm requests) can start in one microbatch and end in another microbatch, so the max_running_requests per microbatch should not be strict.
-        # Instead, we should always allow chunked requests to be added, otherwise, there will be a memory leak.
+        # Ignore the check if there is a chunked-resume in flight.
+        # In the non-PP case the row was just released so the count is fine;
+        # in PP case, chunked reqs span microbatches so the per-mb max_running
+        # check should not block them.
         if (
             self.get_num_allocatable_reqs(running_bs) <= 0
-            and self.chunked_req is None
+            and not has_chunked_resume
             and not self.enable_priority_preemption
         ):
             self.running_batch.batch_is_full = True
@@ -2673,11 +2657,17 @@ def _get_new_batch_prefill_raw(
 
         # Determine chunked_prefill_size for this batch
         chunked_prefill_size = self.chunked_prefill_size
-        if self.chunked_req is not None and self.enable_dynamic_chunking:
-            history_len = len(self.chunked_req.prefix_indices)
-            dynamic_size = self.predict_next_chunk_size(history_len)
-            if dynamic_size is not None:
-                chunked_prefill_size = dynamic_size
+        if self.enable_dynamic_chunking:
+            # Single-flight invariant: at most one chunked-resume req in the
+            # queue at any time (priority + budget enforce this naturally).
+            chunked_resume = next(
+                (r for r in self.waiting_queue if r.has_pending_chunk), None
+            )
+            if chunked_resume is not None:
+                history_len = len(chunked_resume.prefix_indices)
+                dynamic_size = self.predict_next_chunk_size(history_len)
+                if dynamic_size is not None:
+                    chunked_prefill_size = dynamic_size
 
         # Prefill policy
         adder = PrefillAdder(
@@ -2698,26 +2688,6 @@ def _get_new_batch_prefill_raw(
             waiting_queue_len=len(self.waiting_queue),
         )
 
-        # Re-admit the in-flight chunked req via the unified add_one_req
-        # entry. add_one_req's reuse branch (gated on kv_committed_len > 0)
-        # mirrors the old add_chunked_req's behavior: skip lock_ref inc,
-        # init_load_back, and prefix budget. Sets req.has_pending_chunk to
-        # truncated.
-        if self.chunked_req is not None:
-            self.chunked_req.init_next_round_input()
-            adder.add_one_req(
-                self.chunked_req,
-                truncation_align_size=self.truncation_align_size,
-            )
-            # After admit, has_pending_chunk reflects whether more chunks
-            # remain. Mirror it into self.chunked_req for the existing
-            # Stage A stash path (deleted in a later commit).
-            if not self.chunked_req.has_pending_chunk:
-                self.chunked_req = None
-            self._chunked_req_scheduled_last_iter = self.chunked_req is not None
-        else:
-            self._chunked_req_scheduled_last_iter = False
-
         if self.enable_lora:
             running_loras = {req.lora_id for req in self.running_batch.reqs}
 
@@ -2758,7 +2728,14 @@ def _get_new_batch_prefill_raw(
                     req.rid
                 )
 
-            req.init_next_round_input(self.tree_cache)
+            # Chunked-resume reqs must NOT re-match prefix at admission
+            # (would re-assign req.last_node without rebalancing lock_ref,
+            # corrupting cache_unfinished_req's dec_lock_ref/inc_lock_ref
+            # pairing). They keep last_node from previous stash.
+            if req.has_pending_chunk:
+                req.init_next_round_input()
+            else:
+                req.init_next_round_input(self.tree_cache)
             res = adder.add_one_req(
                 req,
                 truncation_align_size=self.truncation_align_size,
@@ -2797,30 +2774,32 @@ def _get_new_batch_prefill_raw(
         if len(can_run_list) == 0:
             return None
 
+        # Drop admitted reqs from waiting_queue, but KEEP chunked-resume reqs
+        # (has_pending_chunk == True after admission) so they stay at the head
+        # for the next iter's stash + admission. Single-flight is preserved
+        # naturally by budget + priority.
         can_run_set = set(can_run_list)
-        self.waiting_queue = [x for x in self.waiting_queue if x not in can_run_set]
+        self.waiting_queue = [
+            x
+            for x in self.waiting_queue
+            if x not in can_run_set or x.has_pending_chunk
+        ]
         if adder.preempt_list:
             for req in adder.preempt_list:
                 self._add_request_to_queue(req)
 
-        # Identify newly-truncated chunked-resume reqs admitted this iter via
-        # add_one_req's reuse/chunked branch. has_pending_chunk is set by
-        # add_one_req when truncated=True. The "newly chunked" set excludes
-        # self.chunked_req which was already tracked from previous iter.
-        new_chunked = [
-            r for r in can_run_list if r.has_pending_chunk and r is not self.chunked_req
-        ]
+        # Bump pending_middle_outputs (the is_chunked counter) for every
+        # admitted req that's still mid-prefill — output processor uses this
+        # to know its forward's sample is garbage. Counter semantics needed
+        # for PP, where multiple microbatches may admit the same req.
+        chunked_in_batch = [r for r in can_run_list if r.has_pending_chunk]
         assert (
-            len(new_chunked) <= 1
-        ), "single-flight invariant: at most one new chunked req per iter"
-        if new_chunked:
-            assert self.chunked_req is None
-            self.chunked_req = new_chunked[0]
-            # The chunked req is scheduled this iter -> stash needed next iter.
-            self._chunked_req_scheduled_last_iter = True
-
-        if self.chunked_req is not None:
-            self.chunked_req.is_chunked += 1
+            len(chunked_in_batch) <= 1
+        ), "single-flight invariant: at most one chunked-resume req per batch"
+        chunk_deduct = 0
+        for r in chunked_in_batch:
+            r.is_chunked += 1
+            chunk_deduct = r.extend_input_len
 
         # Record for logging prefill stats after forward
         self.adder = adder
@@ -2838,7 +2817,6 @@ def _get_new_batch_prefill_raw(
             self.model_config,
             self.enable_overlap,
             self.spec_algorithm,
-            chunked_req=self.chunked_req,
         )
         self.max_prefill_bs = max(self.max_prefill_bs, len(can_run_list))
         if self.enable_hierarchical_cache:
@@ -2855,11 +2833,7 @@ def _get_new_batch_prefill_raw(
             self.running_batch.reqs,
             self.enable_priority_scheduling,
             num_pending_tokens=self._get_num_pending_tokens(
-                chunk_deduct=(
-                    self.chunked_req.extend_input_len
-                    if self.chunked_req is not None
-                    else 0
-                )
+                chunk_deduct=chunk_deduct
             ),
         )
 
@@ -3313,7 +3287,6 @@ def is_fully_idle(self, for_health_check=False) -> bool:
         # Batch running status
         idle = (
             self.running_batch.is_empty()
-            and self.chunked_req is None
             and not self.dllm_manager.any_staging_reqs()
             and (self.last_batch is None or self.last_batch.is_empty())
             and (self.cur_batch is None or self.cur_batch.is_empty())
@@ -3681,11 +3654,11 @@ def pause_generation(self, recv_req: PauseGenerationReqInput):
 
         if recv_req.mode == "in_place":
             # In-place pause: just set the flag and return immediately.
-            # All scheduler state (running_batch, last_batch, chunked_req,
+            # All scheduler state (running_batch, last_batch, waiting_queue,
             # result_queue) is left untouched. On resume, the normal event
             # loop (get_next_batch_to_run) handles last_batch merge,
-            # chunked_req cleanup, and overlap result processing through
-            # the standard code paths. This avoids duplicating batch
+            # chunked-resume re-admission, and overlap result processing
+            # through the standard code paths. This avoids duplicating batch
             # manipulation logic and the accounting bugs that come with it.
             return
 
@@ -3695,10 +3668,9 @@ def pause_generation(self, recv_req: PauseGenerationReqInput):
             self.process_batch_result(tmp_batch, tmp_result)
 
         if self.last_batch and self.last_batch.forward_mode.is_extend():
-            chunked_req_to_exclude = set()
-            self.last_batch.filter_batch(
-                chunked_req_to_exclude=list(chunked_req_to_exclude)
-            )
+            # filter_batch's internal predicate excludes still-prefilling reqs
+            # (has_pending_chunk / is_chunked > 0 / is_dllm).
+            self.last_batch.filter_batch()
             # Skip merge for disagg prefill: completed prefill requests are
             # already in disagg_prefill_inflight_queue. Merging them into
             # running_batch leaks them, since the prefill event loop never
@@ -3723,7 +3695,6 @@ def pause_generation(self, recv_req: PauseGenerationReqInput):
                     self._add_request_to_queue(req)
 
             self.running_batch.batch_is_full = False
-            self.chunked_req = None
 
     def continue_generation(self, recv_req: ContinueGenerationReqInput):
         if recv_req.torch_empty_cache:
diff --git a/python/sglang/srt/mem_cache/chunk_cache.py b/python/sglang/srt/mem_cache/chunk_cache.py
index 8a970b4bedcb..facccabff45d 100644
--- a/python/sglang/srt/mem_cache/chunk_cache.py
+++ b/python/sglang/srt/mem_cache/chunk_cache.py
@@ -88,7 +88,7 @@ def cache_unfinished_req(self, req: Req, chunked=False):
         kv_indices = self.req_to_token_pool.req_to_token[
             req.req_pool_idx, : req.kv_committed_len
         ]
-        # `req.prefix_indices` will be used in `PrefillAdder::add_chunked_req` later
+        # `req.prefix_indices` will be used by add_one_req reuse branch next iter
         req.prefix_indices = kv_indices.to(dtype=torch.int64, copy=True)
 
     def evict(self, params: EvictParams) -> EvictResult:
diff --git a/python/sglang/srt/mem_cache/mamba_radix_cache.py b/python/sglang/srt/mem_cache/mamba_radix_cache.py
index 325b3aa08aae..00d4c165e50b 100644
--- a/python/sglang/srt/mem_cache/mamba_radix_cache.py
+++ b/python/sglang/srt/mem_cache/mamba_radix_cache.py
@@ -608,7 +608,7 @@ def _skip_cache_unfinished_req(req: Req) -> None:
                 req.req_pool_idx, :read_len
             ]
 
-            # `req.prefix_indices` will be used in `PrefillAdder::add_chunked_req` later
+            # `req.prefix_indices` will be used by add_one_req reuse branch next iter
             req.prefix_indices = kv_indices.to(dtype=torch.int64, copy=True)
             return
 
@@ -708,7 +708,7 @@ def _skip_cache_unfinished_req(req: Req) -> None:
         self.dec_lock_ref(req.last_node)
         self.inc_lock_ref(new_last_node)
 
-        # `req.prefix_indices` will be used in `PrefillAdder::add_chunked_req` later
+        # `req.prefix_indices` will be used by add_one_req reuse branch next iter
         # NOTE: this is needed for both page_size == 1 and page_size > 1
         req.prefix_indices = torch.cat(
             [new_indices, kv_indices_orig[len(new_indices) :]]
diff --git a/python/sglang/srt/mem_cache/radix_cache.py b/python/sglang/srt/mem_cache/radix_cache.py
index 9e1e93e48a79..f4b193c73965 100644
--- a/python/sglang/srt/mem_cache/radix_cache.py
+++ b/python/sglang/srt/mem_cache/radix_cache.py
@@ -543,7 +543,7 @@ def cache_unfinished_req(self, req: Req, chunked=False):
         self.dec_lock_ref(req.last_node)
         self.inc_lock_ref(new_last_node)
 
-        # `req.prefix_indices` will be used in `PrefillAdder::add_chunked_req` later
+        # `req.prefix_indices` will be used by add_one_req reuse branch next iter
         # - page_size != 1: there is a partial page at the end, keep the full kv_indices
         # - eagle case: bigram keys will only cache len - 1 kv indices
         if len(new_indices) < len(kv_indices):
diff --git a/python/sglang/srt/mem_cache/swa_radix_cache.py b/python/sglang/srt/mem_cache/swa_radix_cache.py
index 2457ec817446..a3936683e16f 100644
--- a/python/sglang/srt/mem_cache/swa_radix_cache.py
+++ b/python/sglang/srt/mem_cache/swa_radix_cache.py
@@ -492,7 +492,7 @@ def cache_unfinished_req(self, req: Req, chunked=False) -> None:
                 req.req_pool_idx, :read_len
             ]
 
-            # `req.prefix_indices` will be used in `PrefillAdder::add_chunked_req` later
+            # `req.prefix_indices` will be used by add_one_req reuse branch next iter
             req.prefix_indices = kv_indices
             return
 
@@ -543,7 +543,7 @@ def cache_unfinished_req(self, req: Req, chunked=False) -> None:
         result = self.inc_lock_ref(new_last_node)
         swa_uuid_for_lock = result.swa_uuid_for_lock
 
-        # `req.prefix_indices` will be used in `PrefillAdder::add_chunked_req` later
+        # `req.prefix_indices` will be used by add_one_req reuse branch next iter
         if len(new_indices) < len(kv_indices):
             req.prefix_indices = torch.cat(
                 [new_indices, kv_indices[len(new_indices) :]]
diff --git a/python/sglang/srt/observability/scheduler_metrics_mixin.py b/python/sglang/srt/observability/scheduler_metrics_mixin.py
index 86cd5bfb1e81..e4590c0ba6c4 100644
--- a/python/sglang/srt/observability/scheduler_metrics_mixin.py
+++ b/python/sglang/srt/observability/scheduler_metrics_mixin.py
@@ -976,10 +976,11 @@ def _get_num_pending_tokens(self: Scheduler, chunk_deduct: int = 0) -> int:
         num_pending_tokens = sum(
             req.seqlen - len(req.prefix_indices) for req in self.waiting_queue
         )
-        if self.chunked_req is not None:
-            req = self.chunked_req
-            num_pending_tokens += req.seqlen - len(req.prefix_indices) - chunk_deduct
-        return num_pending_tokens
+        # The chunked-resume req (if any) is now in self.waiting_queue, so
+        # it's already counted in the sum above. chunk_deduct subtracts the
+        # current chunk's extend that has been planned but not yet reflected
+        # in prefix_indices.
+        return num_pending_tokens - chunk_deduct
 
     def get_loads(self: Scheduler, req: GetLoadsReqInput = None) -> GetLoadsReqOutput:
         """

From b9d5d6ed5fddd3e360468da06e8f344e79f160a8 Mon Sep 17 00:00:00 2001
From: fzyzcjy <ch271828n@outlook.com>
Date: Wed, 13 May 2026 18:58:05 +0800
Subject: [PATCH 08/52] refactor: rename Req.is_chunked ->
 Req.pending_middle_outputs

Pure rename. The field is an int counter ("how many middle-block prefill
forwards are admitted but not yet output-processed"), not a boolean.
The old name suggested an 'is this req chunked?' boolean and made call
sites like 'is_chunked += 1' and 'if is_chunked <= 0' confusing to read.

Also renames the DLLM mixin helper increment_chunked_count() to
increment_pending_middle_outputs() for symmetry.

Also updates the field's comment to describe its counter semantics and
PP behavior.

No semantic changes.
---
 python/sglang/srt/disaggregation/prefill.py   |  4 ++--
 python/sglang/srt/dllm/mixin/scheduler.py     |  6 +++---
 python/sglang/srt/managers/schedule_batch.py  | 20 +++++++++++--------
 python/sglang/srt/managers/schedule_policy.py |  2 +-
 python/sglang/srt/managers/scheduler.py       | 10 +++++-----
 .../scheduler_output_processor_mixin.py       |  8 ++++----
 6 files changed, 27 insertions(+), 23 deletions(-)

diff --git a/python/sglang/srt/disaggregation/prefill.py b/python/sglang/srt/disaggregation/prefill.py
index 715b7739ccbf..0c65f97f5da0 100644
--- a/python/sglang/srt/disaggregation/prefill.py
+++ b/python/sglang/srt/disaggregation/prefill.py
@@ -504,7 +504,7 @@ def process_batch_result_disagg_prefill(
         for i, (req, next_token_id) in enumerate(
             zip(batch.reqs, next_token_ids, strict=True)
         ):
-            if req.is_chunked <= 0:
+            if req.pending_middle_outputs <= 0:
                 req.time_stats.set_prefill_finished_time()
 
                 # There is no output_ids for prefill
@@ -554,7 +554,7 @@ def process_batch_result_disagg_prefill(
                     req.grammar.finished = req.finished()
             else:
                 # being chunked reqs' prefill is not finished
-                req.is_chunked -= 1
+                req.pending_middle_outputs -= 1
 
                 if req.return_logprob:
                     extend_logprob_start_len = extend_logprob_start_len_per_req[i]
diff --git a/python/sglang/srt/dllm/mixin/scheduler.py b/python/sglang/srt/dllm/mixin/scheduler.py
index e8a563703811..4246822c9b6e 100644
--- a/python/sglang/srt/dllm/mixin/scheduler.py
+++ b/python/sglang/srt/dllm/mixin/scheduler.py
@@ -200,7 +200,7 @@ def _update_state_for_batch(
 
         if can_run_list:
             self.dllm_manager.add_staging_reqs(can_run_list)
-            self.dllm_manager.increment_chunked_count()
+            self.dllm_manager.increment_pending_middle_outputs()
 
         self.adder = adder
         self.can_run_list = can_run_list
@@ -335,10 +335,10 @@ def is_empty(self) -> bool:
             return True
         return len(self.waiting_queue) == 0
 
-    def increment_chunked_count(self) -> None:
+    def increment_pending_middle_outputs(self) -> None:
         """Increment chunked count for all staging requests."""
         for req in self.staging_queue:
-            req.is_chunked += 1
+            req.pending_middle_outputs += 1
 
     def filter_finished_reqs(self) -> None:
         """Remove finished requests from both queues."""
diff --git a/python/sglang/srt/managers/schedule_batch.py b/python/sglang/srt/managers/schedule_batch.py
index 83254cb50e9f..51781d8d28a2 100755
--- a/python/sglang/srt/managers/schedule_batch.py
+++ b/python/sglang/srt/managers/schedule_batch.py
@@ -752,16 +752,20 @@ def __init__(
         # The prefix length that is inserted into the tree cache
         self.cache_protected_len: int = 0
 
-        # Whether or not if it is chunked. It increments whenever
-        # it is chunked, and decrement whenever chunked request is
-        # processed.
-        self.is_chunked = 0
+        # Counter of middle-block prefill forwards that have been admitted
+        # but not yet output-processed for this req. Increments at admission
+        # for non-last chunks; decrements at output_processor. In PP, can
+        # exceed 1 because multiple microbatches may hold the same chunked
+        # req in flight concurrently. In non-PP, oscillates 0/1 within each
+        # iter. Used by output_processor to know whether this forward's
+        # sample is real (==0) or garbage (>0).
+        self.pending_middle_outputs = 0
 
         # Persistent (cross-iter) flag set by admission when this req's
         # current admission was truncated (more chunks remain). Cleared
         # when last chunk is admitted (truncated=False) or on retract.
         # Used by Stage A stash detection, filter_batch exclusion, and
-        # add_one_req's reuse-vs-fresh branch. Independent of is_chunked
+        # add_one_req's reuse-vs-fresh branch. Independent of pending_middle_outputs
         # counter (transient) and kv_committed_len (derived).
         self.has_pending_chunk = False
 
@@ -1265,7 +1269,7 @@ def reset_for_retract(self):
         self.temp_input_top_logprobs_val = None
         self.temp_input_top_logprobs_idx = None
         self.extend_logprob_start_len = 0
-        self.is_chunked = 0
+        self.pending_middle_outputs = 0
         self.has_pending_chunk = False
         self.mamba_pool_idx = None
         self.mamba_ping_pong_track_buffer = None
@@ -2415,7 +2419,7 @@ def filter_batch(
         # runs decode forward, and admitting a mid-prefill req there causes
         # shape mismatch + double KV accounting. Enforced per-req:
         #   - has_pending_chunk: chunked-resume scheduled to continue
-        #   - is_chunked > 0: PP in-flight middle chunk for this req
+        #   - pending_middle_outputs > 0: PP in-flight middle chunk for this req
         #   - is_dllm(): DllmManager-managed (separate staging queue)
         # FIXME(lsyin): used here to get the correct seq_lens
         # The batch has been launched but we need it verified to get correct next batch info
@@ -2427,7 +2431,7 @@ def filter_batch(
                 for i in range(len(self.reqs))
                 if not self.reqs[i].finished()
                 and not self.reqs[i].has_pending_chunk
-                and not self.reqs[i].is_chunked > 0
+                and not self.reqs[i].pending_middle_outputs > 0
                 and not self.reqs[i].is_dllm()
             ]
 
diff --git a/python/sglang/srt/managers/schedule_policy.py b/python/sglang/srt/managers/schedule_policy.py
index b924592fc1f1..61f1027a675f 100644
--- a/python/sglang/srt/managers/schedule_policy.py
+++ b/python/sglang/srt/managers/schedule_policy.py
@@ -939,7 +939,7 @@ def add_one_req(
                 truncated = True
 
         # has_pending_chunk: persistent flag carrying chunked-resume state
-        # across iters. DLLM uses its own staging_queue + is_chunked counter.
+        # across iters. DLLM uses its own staging_queue + pending_middle_outputs counter.
         if not req.is_dllm():
             req.has_pending_chunk = truncated
 
diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py
index 644171ec888a..a6fa1c3176f7 100644
--- a/python/sglang/srt/managers/scheduler.py
+++ b/python/sglang/srt/managers/scheduler.py
@@ -1083,7 +1083,7 @@ def init_chunked_prefill(self):
         elif self.chunked_prefill_size is not None and self.chunked_prefill_size <= 0:
             self.chunked_prefill_size = None
         # Chunked-resume tracking is now per-req (Req.has_pending_chunk +
-        # is_chunked counter); the scheduler no longer holds a global pointer.
+        # pending_middle_outputs counter); the scheduler no longer holds a global pointer.
         # Stage A stashes any waiting_queue req with has_pending_chunk; cache
         # impls bound row reads by kv_committed_len so a stash after
         # init_next_round_input is safe without the old gate.
@@ -2513,7 +2513,7 @@ def get_next_batch_to_run(self) -> Optional[ScheduleBatch]:
             and self.last_batch.forward_mode.is_extend()
         ):
             # filter_batch's internal predicate excludes still-prefilling reqs
-            # (has_pending_chunk / is_chunked > 0 / is_dllm) from merge.
+            # (has_pending_chunk / pending_middle_outputs > 0 / is_dllm) from merge.
             last_bs = self.last_batch.batch_size()
             self.last_batch.filter_batch()
             if self.last_batch.batch_size() < last_bs:
@@ -2788,7 +2788,7 @@ def _get_new_batch_prefill_raw(
             for req in adder.preempt_list:
                 self._add_request_to_queue(req)
 
-        # Bump pending_middle_outputs (the is_chunked counter) for every
+        # Bump the pending_middle_outputs counter for every
         # admitted req that's still mid-prefill — output processor uses this
         # to know its forward's sample is garbage. Counter semantics needed
         # for PP, where multiple microbatches may admit the same req.
@@ -2798,7 +2798,7 @@ def _get_new_batch_prefill_raw(
         ), "single-flight invariant: at most one chunked-resume req per batch"
         chunk_deduct = 0
         for r in chunked_in_batch:
-            r.is_chunked += 1
+            r.pending_middle_outputs += 1
             chunk_deduct = r.extend_input_len
 
         # Record for logging prefill stats after forward
@@ -3669,7 +3669,7 @@ def pause_generation(self, recv_req: PauseGenerationReqInput):
 
         if self.last_batch and self.last_batch.forward_mode.is_extend():
             # filter_batch's internal predicate excludes still-prefilling reqs
-            # (has_pending_chunk / is_chunked > 0 / is_dllm).
+            # (has_pending_chunk / pending_middle_outputs > 0 / is_dllm).
             self.last_batch.filter_batch()
             # Skip merge for disagg prefill: completed prefill requests are
             # already in disagg_prefill_inflight_queue. Merging them into
diff --git a/python/sglang/srt/managers/scheduler_output_processor_mixin.py b/python/sglang/srt/managers/scheduler_output_processor_mixin.py
index ae6f732fe934..234b56b78865 100644
--- a/python/sglang/srt/managers/scheduler_output_processor_mixin.py
+++ b/python/sglang/srt/managers/scheduler_output_processor_mixin.py
@@ -241,7 +241,7 @@ def process_batch_result_prefill(
                     # decode req in mixed batch or retracted req
                     continue
 
-                if req.is_chunked <= 0:
+                if req.pending_middle_outputs <= 0:
                     req.time_stats.set_prefill_finished_time()
 
                     # req output_ids are set here
@@ -314,7 +314,7 @@ def process_batch_result_prefill(
 
                 else:
                     # being chunked reqs' prefill is not finished
-                    req.is_chunked -= 1
+                    req.pending_middle_outputs -= 1
                     # There is only at most one request being currently chunked.
                     # Because this request does not finish prefill,
                     # we don't want to stream the request currently being chunked.
@@ -380,7 +380,7 @@ def process_batch_result_prefill(
                 req.embedding = embeddings[i]
                 if req.return_pooled_hidden_states and phs is not None:
                     req.pooled_hidden_state = phs[i]
-                if req.is_chunked <= 0:
+                if req.pending_middle_outputs <= 0:
                     req.time_stats.set_prefill_finished_time()
                     # Dummy output token for embedding models
                     req.output_ids.append(0)
@@ -393,7 +393,7 @@ def process_batch_result_prefill(
                         maybe_cache_unfinished_req(req, self.tree_cache)
                 else:
                     # being chunked reqs' prefill is not finished
-                    req.is_chunked -= 1
+                    req.pending_middle_outputs -= 1
                     req.time_stats.set_last_chunked_prefill_finish_time()
 
         self.stream_output(batch.reqs, batch.return_logprob, skip_stream_req)

From f0388931bf6d3bde95d24fa967bb78bdc24532fd Mon Sep 17 00:00:00 2001
From: fzyzcjy <ch271828n@outlook.com>
Date: Wed, 13 May 2026 19:07:47 +0800
Subject: [PATCH 09/52] Fix retract_all passing List[Req] to filter_batch as
 keep_indices

After 3fd7319a3d removed the chunked_req_to_exclude first-positional
parameter from filter_batch, retract_all's existing call
`self.filter_batch(retracted_reqs)` silently broke: the new first
positional is keep_indices: Optional[List[int]], so we were trying to
index reqs by Req objects.

Pass keep_indices=[] explicitly to clear all reqs (the original intent).
---
 python/sglang/srt/managers/schedule_batch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/sglang/srt/managers/schedule_batch.py b/python/sglang/srt/managers/schedule_batch.py
index 51781d8d28a2..7fa6f054abb2 100755
--- a/python/sglang/srt/managers/schedule_batch.py
+++ b/python/sglang/srt/managers/schedule_batch.py
@@ -2173,7 +2173,7 @@ def retract_all(self, server_args: ServerArgs):
         for idx in range(len(self.reqs)):
             self.release_req(idx, len(self.reqs) - idx, server_args)
 
-        self.filter_batch(retracted_reqs)
+        self.filter_batch(keep_indices=[])
         return retracted_reqs
 
     def retract_decode(

From fd3dcca22fd8f6dca91ffe24f102b79cfbc9d497 Mon Sep 17 00:00:00 2001
From: fzyzcjy <ch271828n@outlook.com>
Date: Wed, 13 May 2026 19:09:28 +0800
Subject: [PATCH 10/52] Refactor filter_batch to use explicit
 exclude_chunked_req flag
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

An earlier commit added an internal predicate to filter_batch that always
excluded chunked-resume / PP middle-chunk / DLLM-staging reqs, with an
inline invariant comment explaining what was filtered. Two issues:

1. Implicit behavior — callers had no way to grep for which sites
   actually rely on the prefill-pending exclusion.
2. Awkward API — retract_all and retract_decode (which pass
   keep_indices) had no use for the predicate, and the predicate's
   surface area drifted from the original chunked_req_to_exclude API.

Reintroduce caller-supplied opt-in via exclude_chunked_req: bool,
matching the spirit of the original chunked_req_to_exclude parameter.
All sites that need the exclusion pass True; the few that pass
keep_indices remain unchanged.
---
 python/sglang/srt/disaggregation/decode.py   |  2 +-
 python/sglang/srt/disaggregation/prefill.py  |  3 +--
 python/sglang/srt/managers/schedule_batch.py | 27 +++++++++-----------
 python/sglang/srt/managers/scheduler.py      | 14 +++++-----
 4 files changed, 20 insertions(+), 26 deletions(-)

diff --git a/python/sglang/srt/disaggregation/decode.py b/python/sglang/srt/disaggregation/decode.py
index 797071794d68..b778c7eea712 100644
--- a/python/sglang/srt/disaggregation/decode.py
+++ b/python/sglang/srt/disaggregation/decode.py
@@ -1648,7 +1648,7 @@ def get_next_disagg_decode_batch_to_run(
         if new_prebuilt_batch:
             assert not any(r.has_pending_chunk for r in self.waiting_queue)
             self.process_batch_result_prebuilt(new_prebuilt_batch)
-            new_prebuilt_batch.filter_batch()
+            new_prebuilt_batch.filter_batch(exclude_chunked_req=True)
             if not new_prebuilt_batch.is_empty():
                 if self.running_batch.is_empty():
                     self.running_batch = new_prebuilt_batch
diff --git a/python/sglang/srt/disaggregation/prefill.py b/python/sglang/srt/disaggregation/prefill.py
index 0c65f97f5da0..24292837e75c 100644
--- a/python/sglang/srt/disaggregation/prefill.py
+++ b/python/sglang/srt/disaggregation/prefill.py
@@ -732,9 +732,8 @@ def process_prefill_chunk(self: Scheduler) -> None:
                 self.running_batch.batch_is_full = False
 
         if self.last_batch and self.last_batch.forward_mode.is_extend():
-            # filter_batch's internal predicate excludes still-prefilling reqs.
             last_bs = self.last_batch.batch_size()
-            self.last_batch.filter_batch()
+            self.last_batch.filter_batch(exclude_chunked_req=True)
             if self.last_batch.batch_size() < last_bs:
                 self.running_batch.batch_is_full = False
 
diff --git a/python/sglang/srt/managers/schedule_batch.py b/python/sglang/srt/managers/schedule_batch.py
index 7fa6f054abb2..b889d7321efa 100755
--- a/python/sglang/srt/managers/schedule_batch.py
+++ b/python/sglang/srt/managers/schedule_batch.py
@@ -2413,14 +2413,8 @@ def filter_batch(
         keep_indices: Optional[List[int]] = None,
         # FIXME(lsyin): deprecate this API after spec v1 is deprecated
         v1_spec_info_filtered: Optional[bool] = False,
+        exclude_chunked_req: bool = False,
     ):
-        # Invariant: reqs still doing prefill (chunked-resume or DLLM-managed)
-        # must never be merged into running_batch via this filter — running_batch
-        # runs decode forward, and admitting a mid-prefill req there causes
-        # shape mismatch + double KV accounting. Enforced per-req:
-        #   - has_pending_chunk: chunked-resume scheduled to continue
-        #   - pending_middle_outputs > 0: PP in-flight middle chunk for this req
-        #   - is_dllm(): DllmManager-managed (separate staging queue)
         # FIXME(lsyin): used here to get the correct seq_lens
         # The batch has been launched but we need it verified to get correct next batch info
         self.maybe_wait_verify_done()
@@ -2430,9 +2424,14 @@ def filter_batch(
                 i
                 for i in range(len(self.reqs))
                 if not self.reqs[i].finished()
-                and not self.reqs[i].has_pending_chunk
-                and not self.reqs[i].pending_middle_outputs > 0
-                and not self.reqs[i].is_dllm()
+                and not (
+                    exclude_chunked_req
+                    and (
+                        self.reqs[i].has_pending_chunk
+                        or self.reqs[i].pending_middle_outputs > 0
+                        or self.reqs[i].is_dllm()
+                    )
+                )
             ]
 
         if keep_indices is None or len(keep_indices) == 0:
@@ -2503,11 +2502,9 @@ def merge_batch(self, other: "ScheduleBatch"):
         # future. Synchronize here to avoid a cross-stream data race.
         self.maybe_wait_verify_done()
 
-        # Invariant: chunked-resume / mid-prefill reqs must never reach
-        # running_batch via merge — running_batch runs decode forward and
-        # admitting a prefill-in-progress req there breaks shape + KV accounting.
-        # filter_batch's predicate is responsible for excluding these from
-        # last_batch before this merge call.
+        # Caller must filter_batch(exclude_chunked_req=True) on the other batch
+        # before merging — running_batch runs decode forward and admitting a
+        # prefill-in-progress req there breaks shape + KV accounting.
         assert not any(r.has_pending_chunk for r in other.reqs)
 
         # Penalizer orchestrator must be merged before Batch.reqs is merged. This is because
diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py
index a6fa1c3176f7..12a77a14fbb0 100644
--- a/python/sglang/srt/managers/scheduler.py
+++ b/python/sglang/srt/managers/scheduler.py
@@ -2512,10 +2512,8 @@ def get_next_batch_to_run(self) -> Optional[ScheduleBatch]:
             and self.last_batch
             and self.last_batch.forward_mode.is_extend()
         ):
-            # filter_batch's internal predicate excludes still-prefilling reqs
-            # (has_pending_chunk / pending_middle_outputs > 0 / is_dllm) from merge.
             last_bs = self.last_batch.batch_size()
-            self.last_batch.filter_batch()
+            self.last_batch.filter_batch(exclude_chunked_req=True)
             if self.last_batch.batch_size() < last_bs:
                 self.running_batch.batch_is_full = False
 
@@ -2533,7 +2531,7 @@ def get_next_batch_to_run(self) -> Optional[ScheduleBatch]:
         # Runs outside the last_batch block so stale requests are cleaned
         # even when no new batches arrive (e.g. traffic stops).
         if self.running_batch.is_prefill_only:
-            self.running_batch.filter_batch()
+            self.running_batch.filter_batch(exclude_chunked_req=True)
             if self.running_batch.is_empty():
                 self.running_batch.batch_is_full = False
 
@@ -2846,7 +2844,9 @@ def _get_new_batch_prefill_raw(
             and new_batch.input_embeds is None
         ):
             # TODO (lianmin): support return_logprob + mixed chunked prefill
-            self.running_batch.filter_batch(v1_spec_info_filtered=True)
+            self.running_batch.filter_batch(
+                v1_spec_info_filtered=True, exclude_chunked_req=True
+            )
             if not self.running_batch.is_empty():
                 self.running_batch.prepare_for_decode()
                 new_batch.mix_with_running(self.running_batch)
@@ -3668,9 +3668,7 @@ def pause_generation(self, recv_req: PauseGenerationReqInput):
             self.process_batch_result(tmp_batch, tmp_result)
 
         if self.last_batch and self.last_batch.forward_mode.is_extend():
-            # filter_batch's internal predicate excludes still-prefilling reqs
-            # (has_pending_chunk / pending_middle_outputs > 0 / is_dllm).
-            self.last_batch.filter_batch()
+            self.last_batch.filter_batch(exclude_chunked_req=True)
             # Skip merge for disagg prefill: completed prefill requests are
             # already in disagg_prefill_inflight_queue. Merging them into
             # running_batch leaks them, since the prefill event loop never

From a79ba1b2f79c199e19693187cc69e0490fc9cb37 Mon Sep 17 00:00:00 2001
From: fzyzcjy <ch271828n@outlook.com>
Date: Wed, 13 May 2026 19:17:55 +0800
Subject: [PATCH 11/52] Tighten add_one_req reuse gate to has_pending_chunk

The 'is_resume' predicate previously fired for any req with
kv_committed_len > 0, which incorrectly included streaming-session
turn N>1 reqs (they inherit kv_committed_len from the session slot
but are NOT chunked-resume). The reuse branch skips
_req_inc_lock_ref, so those reqs would leave their last_node lock
underbalanced.

Tighten to the persistent chunked-resume flag (req.has_pending_chunk)
so only true mid-prefill reqs take the reuse path.
---
 python/sglang/srt/managers/schedule_policy.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/python/sglang/srt/managers/schedule_policy.py b/python/sglang/srt/managers/schedule_policy.py
index 61f1027a675f..a4167e01e05a 100644
--- a/python/sglang/srt/managers/schedule_policy.py
+++ b/python/sglang/srt/managers/schedule_policy.py
@@ -782,12 +782,13 @@ def add_req_state(r, insert_sort=False):
     def add_one_req(
         self, req: Req, truncation_align_size: Optional[int]
     ):
-        # Reuse path: this req was admitted in a previous iter, has a row
-        # with committed KV (kv_committed_len > 0), and is mid-prefill. Skip
-        # fresh-req setup (lock_ref already held by previous stash;
-        # init_load_back already ran on first admission; prefix already
-        # counted in tree). DLLM has its own path and never takes reuse here.
-        is_resume = req.kv_committed_len > 0 and not req.is_dllm()
+        # Reuse path: this req's previous chunk left lock_ref held, prefix
+        # already in tree, and init_load_back already consumed host KV. We
+        # must skip fresh-req setup. Gate on `has_pending_chunk` (the
+        # persistent chunked-resume flag) — `kv_committed_len > 0` alone is
+        # wider (streaming-session turn N>1 also has it without being
+        # chunked-resume) and would skip _req_inc_lock_ref incorrectly.
+        is_resume = req.has_pending_chunk and not req.is_dllm()
 
         if (self.prefill_delayer_single_pass is not None) and (
             not self.prefill_delayer_single_pass.negotiate_should_allow_prefill(

From d7fa48baad6c5e3f5221b0f849f65c3ecd56c6e5 Mon Sep 17 00:00:00 2001
From: fzyzcjy <ch271828n@outlook.com>
Date: Wed, 13 May 2026 19:18:16 +0800
Subject: [PATCH 12/52] Reset host_hit_length unconditionally in
 prepare_for_extend
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The reset was nested inside two conditionals:
  if not req.retracted_stain:          # skip after retract
      ...
      if not req._cache_breakdown_computed:   # skip after first chunk
          ...
          req.host_hit_length = 0

After a req is retracted (retracted_stain stays True forever) and
re-admitted, the outer block is skipped, so the reset never fires.
The re-admission's match_prefix sets host_hit_length non-zero, then
init_load_back consumes it on chunk 1 — but chunk 2's admission still
sees the stale value and runs init_load_back a second time
(double-load + lock_ref imbalance).

Move the reset out of both conditionals so it runs once per
admission. The breakdown metric still computes only on the first
chunk via _cache_breakdown_computed.
---
 python/sglang/srt/managers/schedule_batch.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/python/sglang/srt/managers/schedule_batch.py b/python/sglang/srt/managers/schedule_batch.py
index b889d7321efa..83f8ffd62bd6 100755
--- a/python/sglang/srt/managers/schedule_batch.py
+++ b/python/sglang/srt/managers/schedule_batch.py
@@ -1843,12 +1843,15 @@ def prepare_for_extend(self):
                     req.cached_tokens_host = host_portion
                     req.cached_tokens_storage = storage_portion
                     req._cache_breakdown_computed = True
-                    # Reset host_hit_length after metric is computed so that
-                    # subsequent chunks' admission paths see host_hit_length == 0
-                    # and naturally skip init_load_back (host KV already loaded).
-                    req.host_hit_length = 0
 
                 req.already_computed = seq_len
+            # Reset host_hit_length after init_load_back consumed it so that
+            # subsequent chunks' admissions skip init_load_back (host KV
+            # already loaded). Runs unconditionally: post-retract reqs have
+            # retracted_stain=True (skipping the outer block) but still
+            # match_prefix + init_load_back on their re-admission, so the
+            # reset must apply to them too.
+            req.host_hit_length = 0
             req.is_retracted = False
 
             if get_global_server_args().enable_mamba_extra_buffer():

From aaf3752d2b603ff5c0b3e66b36452614cfd5a29d Mon Sep 17 00:00:00 2001
From: fzyzcjy <ch271828n@outlook.com>
Date: Wed, 13 May 2026 19:18:32 +0800
Subject: [PATCH 13/52] Skip chunked-resume reqs in calc_priority prefix
 matching
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

_compute_prefix_matches runs match_prefix_for_req on every
waiting_queue item. match_prefix_for_req unconditionally overwrites
req.prefix_indices, req.last_node, req.last_host_node, and
req.host_hit_length from the new match result.

For a chunked-resume req:
  - its last_node was inc_lock_ref'd by the prior Stage A stash
  - overwriting last_node leaves that lock_ref permanently inflated
  - prefix_indices reset would mislead next chunk's admission (the
    KV row was written up to kv_committed_len; admission must see
    that length as the prefix)
  - host_hit_length would re-trigger init_load_back on next chunk

Skip these reqs — their prefix_indices/last_node from the prior
stash are already authoritative, and the LPM/DFS_WEIGHT sort uses
len(prefix_indices)/last_node, which read correctly from the stashed
state.

Only triggers under --schedule-policy lpm/dfs-weight; FCFS path is
unaffected.
---
 python/sglang/srt/managers/schedule_policy.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/python/sglang/srt/managers/schedule_policy.py b/python/sglang/srt/managers/schedule_policy.py
index a4167e01e05a..f809f76ade94 100644
--- a/python/sglang/srt/managers/schedule_policy.py
+++ b/python/sglang/srt/managers/schedule_policy.py
@@ -235,6 +235,12 @@ def _compute_prefix_matches(
         self.waiting_queue_radix_tree.reset()
 
         for r in waiting_queue:
+            if r.has_pending_chunk:
+                # Chunked-resume reqs already have prefix_indices + last_node
+                # set by the prior chunk's Stage A stash, plus an inc'd
+                # lock_ref on last_node. Re-running match_prefix here would
+                # overwrite both, leaving the prior inc unbalanced.
+                continue
             prefix_ids = r.origin_input_ids + r.output_ids
             extra_key = r.extra_key
             match_result = match_prefix_for_req(self.tree_cache, r, prefix_ids)

From 359e5ed7bd075fe70e44f0d344c153475c35c6d9 Mon Sep 17 00:00:00 2001
From: fzyzcjy <ch271828n@outlook.com>
Date: Wed, 13 May 2026 19:18:47 +0800
Subject: [PATCH 14/52] Skip chunked-resume reqs in _abort_on_waiting_timeout
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

After the v2 refactor, chunked-resume reqs live in waiting_queue
across iters while actively prefilling. Their wait_queue_entry_time
is set on original arrival and never refreshed, so a sufficiently
long prefill (large prompt, many chunks, slow GPU) makes them look
'stuck' to _abort_on_waiting_timeout — which would abort them and
leak the held req_to_token row + radix tree lock_ref + committed KV.

Skip reqs with has_pending_chunk=True. Only takes effect when
SGLANG_REQ_WAITING_TIMEOUT > 0 (env-gated; off by default).
---
 python/sglang/srt/managers/scheduler.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py
index 12a77a14fbb0..4d05a9ab2ff7 100644
--- a/python/sglang/srt/managers/scheduler.py
+++ b/python/sglang/srt/managers/scheduler.py
@@ -2341,6 +2341,12 @@ def _abort_on_waiting_timeout(self):
         deleted_reqs = set()
         deadline = time.perf_counter() - timeout_s
         for req in self.waiting_queue:
+            # Chunked-resume reqs sit in waiting_queue across iters while
+            # actively prefilling — they are not idle. Their entry_time is
+            # from their original arrival, so a long prefill would falsely
+            # trigger the timeout and leak KV + row.
+            if req.has_pending_chunk:
+                continue
             entry_time = req.time_stats.wait_queue_entry_time
             if 0 < entry_time < deadline:
                 if self.enable_hicache_storage:

From 5ed4faf0ab66b43e05dced12ab846a15164f8317 Mon Sep 17 00:00:00 2001
From: fzyzcjy <ch271828n@outlook.com>
Date: Wed, 13 May 2026 19:18:58 +0800
Subject: [PATCH 15/52] Bypass LoRA scheduling gate for chunked-resume reqs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

After the v2 refactor, chunked-resume reqs share the waiting_queue
loop with fresh admissions. If _can_schedule_lora_req rejects a
chunked-resume req (e.g. its adapter entered the drainer between
chunks), the req stays in waiting_queue indefinitely while holding
its req_to_token row, tree lock_ref, and committed KV — a deadlock
that no other code path clears.

The LoRA admission check is meaningful only at first-chunk
admission; once chunked-resume is in flight, the adapter is already
loaded and the drainer cannot meaningfully reject it. Skip the gate
for these reqs.
---
 python/sglang/srt/managers/scheduler.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py
index 4d05a9ab2ff7..08b5c12d08db 100644
--- a/python/sglang/srt/managers/scheduler.py
+++ b/python/sglang/srt/managers/scheduler.py
@@ -2703,7 +2703,16 @@ def _get_new_batch_prefill_raw(
 
         # Get requests from the waiting queue to a new prefill batch
         for req in self.waiting_queue:
-            if self.enable_lora and not self._can_schedule_lora_req(req, running_loras):
+            # Chunked-resume reqs hold a row + tree lock_ref from their prior
+            # admission. If the LoRA drainer rejects them mid-prefill, they
+            # stay in waiting_queue forever — deadlock + KV leak. Their LoRA
+            # adapter was already accepted on the first admission, so the
+            # drainer/validate check is moot for them.
+            if (
+                self.enable_lora
+                and not req.has_pending_chunk
+                and not self._can_schedule_lora_req(req, running_loras)
+            ):
                 continue
 
             running_bs = len(self.running_batch.reqs)

From dbdcdde24520401b69b9d1054acc688568109132 Mon Sep 17 00:00:00 2001
From: fzyzcjy <ch271828n@outlook.com>
Date: Wed, 13 May 2026 19:19:13 +0800
Subject: [PATCH 16/52] Skip mamba_pool_idx cleanup for chunked-resume on
 NO_TOKEN

The NO_TOKEN failure path in get_new_batch_prefill frees the req's
mamba_pool_idx on the assumption that the slot was freshly allocated
this iter and the admission was rolled back. For a chunked-resume
req that hits NO_TOKEN this iter (budget transiently full), the
mamba_pool_idx was actually allocated on its first admission and
holds live mamba state needed for the remaining chunks.

Add has_pending_chunk to the existing 'don't free' guard alongside
the session check, matching the same intent: the slot's lifecycle
extends beyond this admission attempt.
---
 python/sglang/srt/managers/scheduler.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py
index 08b5c12d08db..410636344fec 100644
--- a/python/sglang/srt/managers/scheduler.py
+++ b/python/sglang/srt/managers/scheduler.py
@@ -2770,9 +2770,13 @@ def _get_new_batch_prefill_raw(
                 # Only free if the slot was freshly allocated in this batch (not
                 # pre-existing from a session). Session-held slots have their own
                 # lifecycle and freeing them here causes double-free.
+                # Chunked-resume reqs inherit mamba_pool_idx from their first
+                # admission; freeing it on a transient NO_TOKEN this iter would
+                # discard a live mamba state still needed by subsequent chunks.
                 added = len(adder.can_run_list) > 0 and req is adder.can_run_list[-1]
                 if (
                     not added
+                    and not req.has_pending_chunk
                     and req.mamba_pool_idx is not None
                     and not getattr(req, "session", None)
                 ):

From 36ec1d7269a398a39bdd7e2f6e2753b48e767e93 Mon Sep 17 00:00:00 2001
From: fzyzcjy <ch271828n@outlook.com>
Date: Wed, 13 May 2026 19:19:25 +0800
Subject: [PATCH 17/52] Widen merge_batch assert to match filter_batch
 predicate

The exclude_chunked_req predicate in filter_batch covers three
states (has_pending_chunk, pending_middle_outputs > 0, is_dllm), but
the safety assert in merge_batch only checked the first. If a future
caller forgets exclude_chunked_req=True or uses an explicit
keep_indices that lets a PP middle-chunk or DLLM staging req
through, the assert wouldn't catch it.

Mirror all three clauses so the assert is a true defense-in-depth
for the documented invariant.
---
 python/sglang/srt/managers/schedule_batch.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/python/sglang/srt/managers/schedule_batch.py b/python/sglang/srt/managers/schedule_batch.py
index 83f8ffd62bd6..19a0a202fedd 100755
--- a/python/sglang/srt/managers/schedule_batch.py
+++ b/python/sglang/srt/managers/schedule_batch.py
@@ -2507,8 +2507,13 @@ def merge_batch(self, other: "ScheduleBatch"):
 
         # Caller must filter_batch(exclude_chunked_req=True) on the other batch
         # before merging — running_batch runs decode forward and admitting a
-        # prefill-in-progress req there breaks shape + KV accounting.
-        assert not any(r.has_pending_chunk for r in other.reqs)
+        # prefill-in-progress req there breaks shape + KV accounting. Mirror
+        # the full exclude_chunked_req predicate so PP middle-chunk and DLLM
+        # staging reqs are also caught here.
+        assert not any(
+            r.has_pending_chunk or r.pending_middle_outputs > 0 or r.is_dllm()
+            for r in other.reqs
+        )
 
         # Penalizer orchestrator must be merged before Batch.reqs is merged. This is because
         # orchestrator.merge() depends on Batch.reqs during preparation of each penalizers, so it

From 116584e8faa401bbaaf1d2e376e9592fae95678f Mon Sep 17 00:00:00 2001
From: fzyzcjy <ch271828n@outlook.com>
Date: Wed, 13 May 2026 19:19:43 +0800
Subject: [PATCH 18/52] Bound streaming-session chunked stash by
 kv_committed_len
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

83e48ec295 fixed cache_unfinished_req in all 6 non-session cache
impls to read req_to_token[:req.kv_committed_len] instead of
[:len(req.fill_ids)] — required because init_next_round_input
restores fill_ids to origin+output (full length) on subsequent
admissions, while the row only holds KV up to kv_committed_len.

StreamingSession.try_cache_unfinished_req(chunked=True) was missed
in that pass. It typically saw len(fill_ids) == kv_committed_len in
the success path, but after a SWA early-return:
  - chunk N succeeds → fill_ids truncated to chunk N end == kv_committed_len
  - chunk N+1 admission attempt: init_next_round_input() restores
    fill_ids to full length, then SWA budget rejects → AddReqResult.NO_TOKEN
  - next iter's Stage A stash reads
    req_to_token[req_pool_idx, :len(fill_ids) = full_length]
    which holds garbage for positions [chunk_N_end : full_length]
  - that garbage gets copied into prefix_indices, corrupting the
    subsequent admission's view of the cached prefix.

Bound by kv_committed_len and add the same protected-len assert as
the other cache impls.
---
 python/sglang/srt/session/streaming_session.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/python/sglang/srt/session/streaming_session.py b/python/sglang/srt/session/streaming_session.py
index a60b3376c080..e295efe6d39b 100644
--- a/python/sglang/srt/session/streaming_session.py
+++ b/python/sglang/srt/session/streaming_session.py
@@ -330,8 +330,15 @@ def try_cache_unfinished_req(
         if not _is_streaming(req):
             return False
         if chunked:
+            # Bound row read by kv_committed_len, NOT len(fill_ids): after
+            # a SWA early-return the next iter's init_next_round_input
+            # restores fill_ids to origin+output (full length), but the
+            # row only holds KV up to kv_committed_len — reading beyond
+            # that yields garbage slot indices. See radix_cache.py for
+            # the same fix applied to the non-session caches.
+            assert req.kv_committed_len >= req.cache_protected_len
             kv_indices = self.req_to_token_pool.req_to_token[
-                req.req_pool_idx, : len(req.fill_ids)
+                req.req_pool_idx, : req.kv_committed_len
             ]
             req.prefix_indices = kv_indices.to(dtype=torch.int64, copy=True)
             return True

From 96d47490947b3444f626f7a81c7682830da43ee8 Mon Sep 17 00:00:00 2001
From: fzyzcjy <ch271828n@outlook.com>
Date: Wed, 13 May 2026 20:02:24 +0800
Subject: [PATCH 19/52] Release row + KV + lock_ref when aborting a
 chunked-resume req from waiting_queue
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The waiting-queue abort path in abort_request only frees disagg-decode KV
and mamba state. Before v2, that covered every kind of resource a waiting
req could hold — fresh waiters had no row/KV/lock_ref.

The stateless-scheduler v2 refactor changed this: chunked-resume reqs now
live in waiting_queue across iterations while holding their req_to_token
row, committed KV slots, and a radix tree lock_ref on req.last_node from
the prior Stage A stash. Aborting such a req while it sits only in
waiting_queue (i.e. it is in no batch's reqs, so only the waiting-queue
abort path handles it, not the to_finish path) left all three
permanently leaked.

Extend the existing mamba branch's release_kv_cache(is_insert=False) call
to also cover has_pending_chunk + req_pool_idx-holding reqs. Defensively
clear has_pending_chunk + pending_middle_outputs after release so any
stale reference can't drag the freed row into a subsequent Stage A scan.

Confirmed by two independent round-2 audits (Claude Opus 'R2-A',
Codex retract 'HIGH #1').
---
 python/sglang/srt/managers/scheduler.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py
index 410636344fec..cc8a751ec823 100644
--- a/python/sglang/srt/managers/scheduler.py
+++ b/python/sglang/srt/managers/scheduler.py
@@ -3597,12 +3597,19 @@ def abort_request(self, recv_req: AbortReq):
                     req, self.req_to_metadata_buffer_idx_allocator
                 )
 
-            # For mamba radix cache
+            # For mamba radix cache, or for chunked-resume reqs whose prior
+            # admissions already allocated a row + KV + radix lock_ref. Without
+            # this branch, aborting a chunked-resume req that is currently only
+            # in waiting_queue (not in any batch's reqs) leaks all three.
             if (
                 req.mamba_pool_idx is not None
-                and self.disaggregation_mode != DisaggregationMode.DECODE
-            ):
+                or (req.has_pending_chunk and req.req_pool_idx is not None)
+            ) and self.disaggregation_mode != DisaggregationMode.DECODE:
                 release_kv_cache(req, self.tree_cache, is_insert=False)
+                # Defensive: clear pending-chunk flags on the orphaned req so a
+                # stale reference can't trigger Stage A re-stash of the freed row.
+                req.has_pending_chunk = False
+                req.pending_middle_outputs = 0
             logger.debug(f"Abort queued request. {req.rid=}")
 
         # Delete the requests in the grammar queue

From bf5b4e9a104f4535023e1e888b79711b400e9a2e Mon Sep 17 00:00:00 2001
From: fzyzcjy <ch271828n@outlook.com>
Date: Wed, 13 May 2026 20:03:25 +0800
Subject: [PATCH 20/52] Give chunked-resume reqs priority in LPM and DFS_WEIGHT
 sorts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A chunked-resume req's prefix_indices length reflects only its
already-prefilled chunks (~ kv_committed_len), not the full prompt
prefix it could have matched as a fresh req. Under LPM/DFS_WEIGHT
with tight budget, fresh reqs hitting a long cached prefix outrank
chunked-resume reqs every iter, starving them.

This stuck state is doubly bad because the v2 timeout watchdog skips
chunked-resume reqs (commit 83dc7877e0) — without progress they hold
their row + KV + radix lock_ref forever, until user-initiated abort,
which (until the previous commit) also leaked those resources.

LPM: prepend 'is chunked-resume?' as the primary sort key.
DFS_WEIGHT: extract chunked-resume reqs before DFS, prepend them
afterwards. (Their last_node points at a mid-chunk stash node whose
weight=1 — fold them into DFS and they sink to low priority.)
---
 python/sglang/srt/managers/schedule_policy.py | 24 +++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/python/sglang/srt/managers/schedule_policy.py b/python/sglang/srt/managers/schedule_policy.py
index f809f76ade94..4a59555bc7e6 100644
--- a/python/sglang/srt/managers/schedule_policy.py
+++ b/python/sglang/srt/managers/schedule_policy.py
@@ -283,11 +283,19 @@ def _sort_by_longest_prefix(
         waiting_queue: List[Req], temporary_deprioritized: Set[int]
     ) -> None:
         """Sorts the waiting queue based on the longest prefix match."""
+        # Chunked-resume reqs sort first: their prefix_indices length only
+        # reflects the chunks already prefilled (kv_committed_len), not the
+        # full prompt prefix they could have hit had they been fresh. Without
+        # this floor, a fresh req with a long cached prefix outranks them
+        # every iter, starving them under tight budget.
         waiting_queue.sort(
             key=lambda r: (
-                -len(r.prefix_indices)
-                if r.rid not in temporary_deprioritized
-                else float("inf")
+                0 if r.has_pending_chunk else 1,
+                (
+                    -len(r.prefix_indices)
+                    if r.rid not in temporary_deprioritized
+                    else float("inf")
+                ),
             )
         )
 
@@ -296,8 +304,15 @@ def _sort_by_dfs_weight(
         waiting_queue: List[Req], tree_cache: BasePrefixCache
     ) -> None:
         """Sorts the waiting queue based on a depth-first search weighting."""
+        # Pull chunked-resume reqs out before DFS — their last_node points at
+        # a mid-chunk stash node with weight 1 (no siblings share it), which
+        # otherwise drops them to a low DFS priority and starves them under
+        # tight budget. They go back to the front of the queue afterwards.
+        chunked_reqs = [req for req in waiting_queue if req.has_pending_chunk]
+        non_chunked_reqs = [req for req in waiting_queue if not req.has_pending_chunk]
+
         last_node_to_reqs = defaultdict(list)
-        for req in waiting_queue:
+        for req in non_chunked_reqs:
             last_node_to_reqs[req.last_node].append(req)
 
         node_to_weight = defaultdict(int)
@@ -306,6 +321,7 @@ def _sort_by_dfs_weight(
         SchedulePolicy._calc_weight(tree_cache.root_node, node_to_weight)
 
         waiting_queue.clear()
+        waiting_queue.extend(chunked_reqs)
         SchedulePolicy._get_dfs_priority(
             tree_cache.root_node,
             node_to_weight,

From f38e69f87dbb7b1eff0808824cd7601bea5846f7 Mon Sep 17 00:00:00 2001
From: fzyzcjy <ch271828n@outlook.com>
Date: Wed, 13 May 2026 20:04:07 +0800
Subject: [PATCH 21/52] Extend pause(retract) to waiting chunked-resume reqs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

pause_generation(retract)'s documented contract:

  retract: Pause the scheduler's event loop ... all currently running
  requests will be retracted back to the waiting_queue. The KV cache
  can be flushed in this mode and will be automatically recomputed
  after continue_generation.

Pre-v2 that contract held: every req holding KV was in running_batch.
After v2, chunked-resume reqs live in waiting_queue across iterations
while holding their req_to_token row, committed KV slots, and a radix
tree lock_ref from prior Stage A stash. pause(retract) only touched
running_batch — those waiting chunked-resume resources were never
released, so flush_cache silently couldn't free everything (is_fully_idle
also stays False because waiting_queue is non-empty).

Add an explicit pass that releases each waiting chunked-resume req's
resources (release_kv_cache(is_insert=False)) and resets its
chunked-prefill state via reset_for_retract, so continue_generation
re-prefills the request from origin_input_ids. Also lift the
'running_batch non-empty' guard one level so the new pass runs even
when retract is invoked with only waiting chunked-resume present.
---
 python/sglang/srt/managers/scheduler.py | 26 ++++++++++++++++++-------
 1 file changed, 19 insertions(+), 7 deletions(-)

diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py
index cc8a751ec823..cddfb6ebc2b5 100644
--- a/python/sglang/srt/managers/scheduler.py
+++ b/python/sglang/srt/managers/scheduler.py
@@ -3711,14 +3711,26 @@ def pause_generation(self, recv_req: PauseGenerationReqInput):
         self.last_batch = None
         self.cur_batch = None
 
-        if recv_req.mode == "retract" and not self.running_batch.is_empty():
-            self.running_batch.filter_batch(v1_spec_info_filtered=True)
-            if len(self.running_batch.reqs) != 0:
-                retracted_reqs = self.running_batch.retract_all(self.server_args)
-                for req in retracted_reqs:
-                    self._add_request_to_queue(req)
+        if recv_req.mode == "retract":
+            if not self.running_batch.is_empty():
+                self.running_batch.filter_batch(v1_spec_info_filtered=True)
+                if len(self.running_batch.reqs) != 0:
+                    retracted_reqs = self.running_batch.retract_all(self.server_args)
+                    for req in retracted_reqs:
+                        self._add_request_to_queue(req)
 
-            self.running_batch.batch_is_full = False
+                self.running_batch.batch_is_full = False
+
+            # Chunked-resume reqs in waiting_queue still hold their row + KV +
+            # radix lock_ref from prior admissions. Without explicit release,
+            # pause(retract)'s 'flush_cache can succeed' contract (see
+            # PauseGenerationReqInput docstring) is violated. Release in-place
+            # and reset their chunked state so continue_generation re-prefills
+            # them from origin_input_ids.
+            for req in self.waiting_queue:
+                if req.has_pending_chunk and req.req_pool_idx is not None:
+                    release_kv_cache(req, self.tree_cache, is_insert=False)
+                    req.reset_for_retract()
 
     def continue_generation(self, recv_req: ContinueGenerationReqInput):
         if recv_req.torch_empty_cache:

From 414efd4a27e5f0c5affa21436dcc534029c20ee4 Mon Sep 17 00:00:00 2001
From: fzyzcjy <ch271828n@outlook.com>
Date: Wed, 13 May 2026 20:23:48 +0800
Subject: [PATCH 22/52] Reset disagg send-side state on chunked-resume retract
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Round-2 commit ecbe732255 added a pause(retract) sweep that calls
release_kv_cache + reset_for_retract on waiting chunked-resume reqs.
Pre-v2, retract only operated on running_batch reqs, which in
disagg-prefill mode is empty for prefilling reqs — so reset_for_retract
never had to consider the disagg send-side fields. After ecbe732255,
the same path now hits disagg-prefill chunked-resume reqs that carry:

  - req.start_send_idx > 0 (mid-prompt position already sent)
  - req.tmp_end_idx (deferred end_idx for overlap)
  - req.disagg_kv_sender (live sender object bound to the decode peer)

reset_for_retract didn't reset start_send_idx / tmp_end_idx. After
continue_generation, the same req gets re-admitted with a fresh
req_pool_idx and kv_committed_len starting at 0, but start_send_idx
still holds the stale value. process_batch_result_disagg_prefill then
calls send_kv_chunk(start_idx=start_send_idx), which reads
req_to_token[new_row, stale_idx:end_idx] — either garbage slots or
slots that now belong to a different req. The decode peer gets corrupt
KV.

Fix:
  - schedule_batch.py: reset_for_retract now zeros start_send_idx and
    restores tmp_end_idx to -1. Safe in non-disagg modes because the
    fields are init-only there.
  - scheduler.py pause(retract): for disagg-prefill mode, abort the
    sender protocol and drop our reference so the next admit goes
    through bootstrap again.

Confirmed by round-3 Claude Opus audit (R3-A).
---
 python/sglang/srt/managers/schedule_batch.py |  8 ++++++++
 python/sglang/srt/managers/scheduler.py      | 11 +++++++++++
 2 files changed, 19 insertions(+)

diff --git a/python/sglang/srt/managers/schedule_batch.py b/python/sglang/srt/managers/schedule_batch.py
index 19a0a202fedd..9345d8c8d243 100755
--- a/python/sglang/srt/managers/schedule_batch.py
+++ b/python/sglang/srt/managers/schedule_batch.py
@@ -1284,6 +1284,14 @@ def reset_for_retract(self):
         self.swa_evicted_seqlen = 0
         self.extend_batch_idx = 0
         self.decode_batch_idx = 0
+        # Disagg-prefill send-side bookkeeping. The pre-v2 retract path never
+        # ran against a req that had started sending (retract only touched
+        # running_batch), so these stayed at init values. After v2 added
+        # pause(retract) coverage for waiting chunked-resume reqs, a retracted
+        # disagg-prefill req's stale start_send_idx would index garbage in the
+        # new row on re-prefill.
+        self.start_send_idx = 0
+        self.tmp_end_idx = -1
 
         # When using input_embeds, we cannot easily mix the original input embeddings
         # with the newly generated output token IDs during re-prefill of retracted request.
diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py
index cddfb6ebc2b5..4af0a19b544c 100644
--- a/python/sglang/srt/managers/scheduler.py
+++ b/python/sglang/srt/managers/scheduler.py
@@ -3729,6 +3729,17 @@ def pause_generation(self, recv_req: PauseGenerationReqInput):
             # them from origin_input_ids.
             for req in self.waiting_queue:
                 if req.has_pending_chunk and req.req_pool_idx is not None:
+                    # Disagg-prefill: signal the decode side that the send was
+                    # retracted and drop our sender ref so re-prefill rebuilds
+                    # the bootstrap state. start_send_idx / tmp_end_idx are
+                    # reset by reset_for_retract.
+                    if (
+                        self.disaggregation_mode == DisaggregationMode.PREFILL
+                        and req.disagg_kv_sender is not None
+                    ):
+                        if hasattr(req.disagg_kv_sender, "abort"):
+                            req.disagg_kv_sender.abort()
+                        req.disagg_kv_sender = None
                     release_kv_cache(req, self.tree_cache, is_insert=False)
                     req.reset_for_retract()
 

From b433e1ea351a373aa041098c67ff150fdb1284be Mon Sep 17 00:00:00 2001
From: fzyzcjy <ch271828n@outlook.com>
Date: Wed, 13 May 2026 20:25:02 +0800
Subject: [PATCH 23/52] Count chunked-resume tail in runtime mem check
 (page_size > 1)

self_check_during_busy enforces a strict invariant:
  available + evictable + protected + session_held + uncached == total

After v2's chunked-resume refactor, when a chunked-resume req is in
waiting_queue but not in last_batch/running_batch (filter_batch just
removed it, admission this iter failed budget), its row still holds:
  - cache_protected_len worth of tree-protected KV (counted in protected)
  - an unaligned tail of (kv_committed_len - cache_protected_len) tokens
    in the row (< page_size, not in tree, not in any of the available/
    evictable/protected/session_held buckets)

_active_pool_idxs and _get_total_uncached_sizes only iterated batches,
so this tail was uncounted on the LHS. With page_size > 1 (DSv4 = 64,
paged-attention configs 16/64/128), the invariant fires a false-positive
leak assert.

  - _active_pool_idxs: also include chunked-resume req_pool_idx from
    waiting_queue, so session_held correctly identifies these slots as
    'owned by an active req' (not held tokens to subtract).
  - _get_total_uncached_sizes: add chunked-resume reqs from waiting_queue
    to the groups iterated for uncached accounting. Dedup by id() in
    case the same req is in both a batch and the queue (transient state
    around admission boundaries).

Only triggers when SGLANG_ENABLE_STRICT_MEM_CHECK_DURING_BUSY > 0
(off by default, used in dev/debug). Confirmed by round-3 Claude Opus
audit (R3-B).
---
 .../scheduler_runtime_checker_mixin.py        | 31 ++++++++++++++++---
 1 file changed, 26 insertions(+), 5 deletions(-)

diff --git a/python/sglang/srt/managers/scheduler_runtime_checker_mixin.py b/python/sglang/srt/managers/scheduler_runtime_checker_mixin.py
index ebf929f71251..4fe33e477a81 100644
--- a/python/sglang/srt/managers/scheduler_runtime_checker_mixin.py
+++ b/python/sglang/srt/managers/scheduler_runtime_checker_mixin.py
@@ -146,7 +146,8 @@ def _streaming_session_count(self: Scheduler) -> int:
         )
 
     def _active_pool_idxs(self: Scheduler) -> set:
-        """Pool idxs currently owned by reqs in last_batch / running_batch.
+        """Pool idxs currently owned by reqs in last_batch / running_batch or
+        held by chunked-resume reqs sitting in waiting_queue.
 
         Used to decide which session slots' KV is owned by batch reqs
         (and thus counted via uncached_size, not session_held).
@@ -158,6 +159,12 @@ def _active_pool_idxs(self: Scheduler) -> set:
             for req in batch.reqs:
                 if req.req_pool_idx is not None:
                     idxs.add(req.req_pool_idx)
+        # Chunked-resume reqs in waiting_queue still own their row across iters
+        # (filter_batch may have just moved them out of last_batch but they
+        # haven't yet been re-admitted to running_batch).
+        for req in self.waiting_queue:
+            if req.has_pending_chunk and req.req_pool_idx is not None:
+                idxs.add(req.req_pool_idx)
         return idxs
 
     def _session_held_tokens(self: Scheduler) -> int:
@@ -393,17 +400,31 @@ def _get_total_uncached_sizes(self: Scheduler) -> Tuple[int, int]:
         """
         # After decode: running_batch IS last_batch (same object), count once.
         # After prefill: they differ, both hold uncached tokens.
-        batches = [self.last_batch]
+        req_groups = [list(self.last_batch.reqs)]
         if (
             self.running_batch not in (None, self.last_batch)
             and not self.running_batch.is_empty()
         ):
-            batches.append(self.running_batch)
+            req_groups.append(list(self.running_batch.reqs))
+        # Chunked-resume reqs in waiting_queue carry uncached tail
+        # (kv_committed_len - cache_protected_len, < page_size) that
+        # filter_batch just removed from last_batch but haven't been
+        # re-admitted to running_batch yet. The leak invariant must count it.
+        seen_ids = {id(req) for group in req_groups for req in group}
+        chunked_in_queue = [
+            req
+            for req in self.waiting_queue
+            if req.has_pending_chunk
+            and req.req_pool_idx is not None
+            and id(req) not in seen_ids
+        ]
+        if chunked_in_queue:
+            req_groups.append(chunked_in_queue)
 
         full_uncached = 0
         swa_uncached = 0
-        for batch in batches:
-            for req in batch.reqs:
+        for group in req_groups:
+            for req in group:
                 assert req.kv_committed_freed == req.kv_overallocated_freed
                 if req.kv_committed_freed or req.req_pool_idx is None:
                     continue

From f0af5105abd9ea5f6ad63a97ec7110e1297944a6 Mon Sep 17 00:00:00 2001
From: fzyzcjy <ch271828n@outlook.com>
Date: Wed, 13 May 2026 20:28:41 +0800
Subject: [PATCH 24/52] Document filter_batch(exclude_chunked_req=True) at
 every call site
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

All 6 sites where filter_batch is invoked with exclude_chunked_req=True
drop chunked-resume reqs from a batch. The reqs themselves are NOT lost
— they persist in self.waiting_queue thanks to the retention filter in
_get_new_batch_prefill_raw:

    self.waiting_queue = [
        x for x in self.waiting_queue
        if x not in can_run_set or x.has_pending_chunk
    ]

so the next iter's Stage A scan re-stashes them and admission re-admits.
This is the load-bearing invariant the whole stateless-scheduler v2
design rests on, but it was implicit at the call sites.

Per-site notes (line numbers are the pre-patch position of each call):
  - scheduler.py:2522, get_next_batch_to_run (last_batch before merge):
    drop is required — running_batch runs decode forward,
    chunked-resume is mid-prefill.
  - scheduler.py:2540, get_next_batch_to_run (running_batch when
    is_prefill_only): defensive, the merge step already drops
    chunked-resume.
  - scheduler.py:2866, _get_new_batch_prefill_raw (running_batch before
    mix_with_running): defensive, same reason.
  - scheduler.py:3697, pause_generation (disagg-prefill last_batch):
    same as the first site.
  - disaggregation/prefill.py:736, process_prefill_chunk (last_batch):
    same as the first site.
  - disaggregation/decode.py:1651, get_next_disagg_decode_batch_to_run
    (new_prebuilt_batch): defensive — chunked prefill is prefill-side,
    decode-side shouldn't see it; an assert above already guards
    waiting_queue.
---
 python/sglang/srt/disaggregation/decode.py  |  5 +++++
 python/sglang/srt/disaggregation/prefill.py |  5 +++++
 python/sglang/srt/managers/scheduler.py     | 18 ++++++++++++++++++
 3 files changed, 28 insertions(+)

diff --git a/python/sglang/srt/disaggregation/decode.py b/python/sglang/srt/disaggregation/decode.py
index b778c7eea712..8b9bcecd31fb 100644
--- a/python/sglang/srt/disaggregation/decode.py
+++ b/python/sglang/srt/disaggregation/decode.py
@@ -1648,6 +1648,11 @@ def get_next_disagg_decode_batch_to_run(
         if new_prebuilt_batch:
             assert not any(r.has_pending_chunk for r in self.waiting_queue)
             self.process_batch_result_prebuilt(new_prebuilt_batch)
+            # Defensive: chunked prefill is a prefill-side concept; decode-side
+            # prebuilt batches shouldn't carry has_pending_chunk reqs. The
+            # assert above already guards waiting_queue; this flag protects
+            # against any future code that would route a chunked req through
+            # the disagg decode path.
             new_prebuilt_batch.filter_batch(exclude_chunked_req=True)
             if not new_prebuilt_batch.is_empty():
                 if self.running_batch.is_empty():
diff --git a/python/sglang/srt/disaggregation/prefill.py b/python/sglang/srt/disaggregation/prefill.py
index 24292837e75c..10fabe87c9a2 100644
--- a/python/sglang/srt/disaggregation/prefill.py
+++ b/python/sglang/srt/disaggregation/prefill.py
@@ -733,6 +733,11 @@ def process_prefill_chunk(self: Scheduler) -> None:
 
         if self.last_batch and self.last_batch.forward_mode.is_extend():
             last_bs = self.last_batch.batch_size()
+            # Drop chunked-resume reqs from last_batch — running_batch runs
+            # decode forward and admitting a mid-prefill req there breaks
+            # shape + KV accounting. The dropped reqs stay in
+            # self.waiting_queue (chunked-resume retention) and re-enter via
+            # the next iter's Stage A stash + admission cycle.
             self.last_batch.filter_batch(exclude_chunked_req=True)
             if self.last_batch.batch_size() < last_bs:
                 self.running_batch.batch_is_full = False
diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py
index 4af0a19b544c..771051aef4b4 100644
--- a/python/sglang/srt/managers/scheduler.py
+++ b/python/sglang/srt/managers/scheduler.py
@@ -2519,6 +2519,12 @@ def get_next_batch_to_run(self) -> Optional[ScheduleBatch]:
             and self.last_batch.forward_mode.is_extend()
         ):
             last_bs = self.last_batch.batch_size()
+            # Drop chunked-resume reqs before merging last_batch into
+            # running_batch. running_batch runs decode forward and admitting
+            # a mid-prefill req there breaks shapes + KV accounting. The
+            # dropped reqs persist in self.waiting_queue (retention at
+            # ~line 2775: `x not in can_run_set or x.has_pending_chunk`)
+            # and re-enter via next iter's Stage A stash + admission.
             self.last_batch.filter_batch(exclude_chunked_req=True)
             if self.last_batch.batch_size() < last_bs:
                 self.running_batch.batch_is_full = False
@@ -2537,6 +2543,11 @@ def get_next_batch_to_run(self) -> Optional[ScheduleBatch]:
         # Runs outside the last_batch block so stale requests are cleaned
         # even when no new batches arrive (e.g. traffic stops).
         if self.running_batch.is_prefill_only:
+            # Defensive exclude_chunked_req: the merge step above already
+            # drops chunked-resume reqs from last_batch, so running_batch
+            # shouldn't normally hold one. Keep the flag set so any leak in
+            # that invariant doesn't survive here; the dropped req still
+            # has its waiting_queue retention to re-admit next iter.
             self.running_batch.filter_batch(exclude_chunked_req=True)
             if self.running_batch.is_empty():
                 self.running_batch.batch_is_full = False
@@ -2863,6 +2874,10 @@ def _get_new_batch_prefill_raw(
             and new_batch.input_embeds is None
         ):
             # TODO (lianmin): support return_logprob + mixed chunked prefill
+            # exclude_chunked_req here is defensive — by design running_batch
+            # holds decode reqs only (the last_batch filter+merge step above
+            # already drops chunked-resume), and any dropped chunked-resume
+            # would still ride waiting_queue retention to next iter's Stage A.
             self.running_batch.filter_batch(
                 v1_spec_info_filtered=True, exclude_chunked_req=True
             )
@@ -3694,6 +3709,9 @@ def pause_generation(self, recv_req: PauseGenerationReqInput):
             self.process_batch_result(tmp_batch, tmp_result)
 
         if self.last_batch and self.last_batch.forward_mode.is_extend():
+            # Same invariant as the non-disagg merge path: drop chunked-resume
+            # reqs before potentially folding last_batch into running_batch.
+            # They re-enter via waiting_queue retention + Stage A next iter.
             self.last_batch.filter_batch(exclude_chunked_req=True)
             # Skip merge for disagg prefill: completed prefill requests are
             # already in disagg_prefill_inflight_queue. Merging them into

From b823c16e6048905c5176fd732b5ccc19f347b4db Mon Sep 17 00:00:00 2001
From: fzyzcjy <ch271828n@outlook.com>
Date: Wed, 13 May 2026 20:31:50 +0800
Subject: [PATCH 25/52] Include PP microbatch reqs in abort_request batch_rids
 dedup
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

abort_request builds batch_rids from running_batch + cur_batch to
distinguish 'in batch' reqs (need to_finish, output processor releases
on the next iter) from 'waiting only' reqs (can be popped and aborted
immediately). The v2 round-2 commit ffaae91c79 added a release_kv_cache
+ has_pending_chunk clear inside the waiting-pop branch so that
chunked-resume reqs sitting in waiting_queue get their row + KV +
lock_ref properly released on abort.

PP breaks this categorization. Under pipeline parallelism, multiple
microbatches are in flight at once: self.mbs holds batches that have
been launched but not yet processed, self.last_mbs holds prior-iter
launches whose results are coming, self.running_mbs is per-mb running
state. A chunked-resume req X can be:

  - in waiting_queue (chunked-resume retention)
  - in self.mbs[mb_a] (forward launched, result pending)

with pending_middle_outputs > 0. abort_request only sees running_batch
+ cur_batch (one mb), so X falls into batch_rids miss → waiting-pop
path → release_kv_cache while mb_a's forward still references the row.

When mb_a's delayed output finally lands:
  - pending_middle_outputs was cleared to 0 by waiting-pop, so
    output processor takes the full-output branch
  - req_pool_idx was cleared by release_kv_cache, so
    maybe_cache_unfinished_req or release_kv_cache crashes / corrupts

Fix: extend batch_rids with rids from every non-empty mb across mbs,
last_mbs, and running_mbs whenever pp_size > 1 (and self.mbs exists).
This treats in-flight PP reqs as 'in batch', routing them through the
to_finish path, which the PP output processor drains correctly.

Confirmed by round-4 Codex counter audit (HIGH bug).
---
 python/sglang/srt/managers/scheduler.py | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py
index 771051aef4b4..4bbdc7e833a9 100644
--- a/python/sglang/srt/managers/scheduler.py
+++ b/python/sglang/srt/managers/scheduler.py
@@ -3580,9 +3580,24 @@ def abort_request(self, recv_req: AbortReq):
         # waiting_queue removal for those — let the to_finish path below handle
         # them, otherwise we send_output / release_kv_cache twice.
         if self.cur_batch is self.running_batch or self.cur_batch is None:
-            batch_reqs = self.running_batch.reqs
+            batch_reqs = list(self.running_batch.reqs)
         else:
-            batch_reqs = self.running_batch.reqs + self.cur_batch.reqs
+            batch_reqs = list(self.running_batch.reqs) + list(self.cur_batch.reqs)
+        # PP: rids from every in-flight microbatch must also be treated as
+        # 'in batch'. Each mb's forward was launched against the req's
+        # req_pool_idx + KV slots; the output processor on a different mb
+        # iteration consumes the result later. Without this, a chunked-resume
+        # req with pending_middle_outputs > 0 sitting in waiting_queue would
+        # fall into the waiting-only abort path, release_kv_cache would free
+        # the row + KV underneath the still-launched forward, and the delayed
+        # output processor would crash on a None req_pool_idx (or, with
+        # pending_middle_outputs cleared to 0, mistake the middle-chunk
+        # result for a full output and append garbage tokens).
+        if self.pp_size > 1 and hasattr(self, "mbs"):
+            for mb_list in (self.mbs, self.last_mbs, self.running_mbs):
+                for mb in mb_list:
+                    if mb is not None and not mb.is_empty():
+                        batch_reqs.extend(mb.reqs)
         batch_rids = {r.rid for r in batch_reqs}
 
         # Delete requests in the waiting queue

From 678bba26f097693d2fec35e94d15cdbf12392533 Mon Sep 17 00:00:00 2001
From: fzyzcjy <ch271828n@outlook.com>
Date: Wed, 13 May 2026 20:53:21 +0800
Subject: [PATCH 26/52] Document why Stage A chunk-stash runs at iter boundary
 instead of end-of-prior-iter

---
 python/sglang/srt/managers/scheduler.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py
index 4bbdc7e833a9..24a8ab52fbb0 100644
--- a/python/sglang/srt/managers/scheduler.py
+++ b/python/sglang/srt/managers/scheduler.py
@@ -2492,6 +2492,16 @@ def get_next_batch_to_run(self) -> Optional[ScheduleBatch]:
         # Per-req loop over waiting_queue covers chunked-resume; DLLM staging
         # reqs are owned by DllmManager (not in waiting_queue), handled
         # separately below.
+        #
+        # Why this runs at the iter boundary (not at the end of the prior iter):
+        # admission inside get_new_batch_prefill_raw reads req.prefix_indices to
+        # decide extend_input_len. Stashing in the middle of admission would let
+        # a chunked-resume req "match itself" — the tree would expose KV this
+        # same req just wrote, double-counting it as cached prefix. Keeping
+        # stash here means admission only ever sees tree state that is stable
+        # for the duration of the scheduling pass. vLLM / TokenSpeed do not
+        # need this because their admission reads a single monotone counter
+        # (num_computed_tokens / FSM state), not a prefix-indices splice.
         for req in self.waiting_queue:
             if req.has_pending_chunk and not req.is_dllm():
                 maybe_cache_unfinished_req(req, self.tree_cache, chunked=True)

From 34c02d6a6746a410368d6be14bc0d9a912c78e53 Mon Sep 17 00:00:00 2001
From: fzyzcjy <ch271828n@outlook.com>
Date: Wed, 13 May 2026 21:22:21 +0800
Subject: [PATCH 27/52] Filter chunked-resume reqs from split_prefill_batch
 before pdmux merge

The new merge_batch invariant in schedule_batch.py asserts that the source
batch holds no has_pending_chunk / pending_middle_outputs / dllm reqs. The
pdmux split-prefill path was the only merge site missing the matching
filter_batch(exclude_chunked_req=True) before merge. With chunked prefill
enabled under pdmux, admitting a non-last chunk would trip the assert.
---
 .../sglang/srt/multiplex/multiplexing_mixin.py  | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/python/sglang/srt/multiplex/multiplexing_mixin.py b/python/sglang/srt/multiplex/multiplexing_mixin.py
index 9902afe5c16f..befc37e8b206 100644
--- a/python/sglang/srt/multiplex/multiplexing_mixin.py
+++ b/python/sglang/srt/multiplex/multiplexing_mixin.py
@@ -208,10 +208,19 @@ def event_loop_pdmux(self: Scheduler):
                         self.process_batch_result(
                             self.split_prefill_batch, prefill_result
                         )
-                        if self.running_batch and not self.running_batch.is_empty():
-                            self.running_batch.merge_batch(self.split_prefill_batch)
-                        else:
-                            self.running_batch = self.split_prefill_batch
+                        # Drop chunked-resume reqs before folding split_prefill_batch
+                        # into running_batch. running_batch runs decode forward and
+                        # admitting a mid-prefill req there breaks shape + KV
+                        # accounting; the dropped reqs persist in self.waiting_queue
+                        # (retention in get_new_batch_prefill) and re-enter via the
+                        # next iter's Stage A stash + admission cycle. Mirrors the
+                        # standard event_loop path at scheduler.py:2514.
+                        self.split_prefill_batch.filter_batch(exclude_chunked_req=True)
+                        if not self.split_prefill_batch.is_empty():
+                            if self.running_batch and not self.running_batch.is_empty():
+                                self.running_batch.merge_batch(self.split_prefill_batch)
+                            else:
+                                self.running_batch = self.split_prefill_batch
 
                         self.split_prefill_batch = None
                         wait_prefill_kernel_done = False

From 2868334e3371c2025a3b7f30e0a76465b3d00df3 Mon Sep 17 00:00:00 2001
From: fzyzcjy <ch271828n@outlook.com>
Date: Wed, 13 May 2026 21:39:23 +0800
Subject: [PATCH 28/52] Apply black-jupyter formatting (CI lint fixup)

---
 python/sglang/srt/managers/schedule_policy.py  | 4 +---
 python/sglang/srt/managers/scheduler.py        | 8 ++------
 python/sglang/srt/mem_cache/radix_cache.py     | 4 +---
 python/sglang/srt/mem_cache/swa_radix_cache.py | 4 +---
 4 files changed, 5 insertions(+), 15 deletions(-)

diff --git a/python/sglang/srt/managers/schedule_policy.py b/python/sglang/srt/managers/schedule_policy.py
index 4a59555bc7e6..325f07d573d7 100644
--- a/python/sglang/srt/managers/schedule_policy.py
+++ b/python/sglang/srt/managers/schedule_policy.py
@@ -801,9 +801,7 @@ def add_req_state(r, insert_sort=False):
 
         return self.budget_state()
 
-    def add_one_req(
-        self, req: Req, truncation_align_size: Optional[int]
-    ):
+    def add_one_req(self, req: Req, truncation_align_size: Optional[int]):
         # Reuse path: this req's previous chunk left lock_ref held, prefix
         # already in tree, and init_load_back already consumed host KV. We
         # must skip fresh-req setup. Gate on `has_pending_chunk` (the
diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py
index 24a8ab52fbb0..2dd0ea94e103 100644
--- a/python/sglang/srt/managers/scheduler.py
+++ b/python/sglang/srt/managers/scheduler.py
@@ -2818,9 +2818,7 @@ def _get_new_batch_prefill_raw(
         # naturally by budget + priority.
         can_run_set = set(can_run_list)
         self.waiting_queue = [
-            x
-            for x in self.waiting_queue
-            if x not in can_run_set or x.has_pending_chunk
+            x for x in self.waiting_queue if x not in can_run_set or x.has_pending_chunk
         ]
         if adder.preempt_list:
             for req in adder.preempt_list:
@@ -2870,9 +2868,7 @@ def _get_new_batch_prefill_raw(
             adder,
             self.running_batch.reqs,
             self.enable_priority_scheduling,
-            num_pending_tokens=self._get_num_pending_tokens(
-                chunk_deduct=chunk_deduct
-            ),
+            num_pending_tokens=self._get_num_pending_tokens(chunk_deduct=chunk_deduct),
         )
 
         # Mixed-style chunked prefill
diff --git a/python/sglang/srt/mem_cache/radix_cache.py b/python/sglang/srt/mem_cache/radix_cache.py
index f4b193c73965..2a6b0a4ba02d 100644
--- a/python/sglang/srt/mem_cache/radix_cache.py
+++ b/python/sglang/srt/mem_cache/radix_cache.py
@@ -495,9 +495,7 @@ def cache_unfinished_req(self, req: Req, chunked=False):
         assert req.kv_committed_len >= req.cache_protected_len
         read_len = req.kv_committed_len
         token_ids = req.fill_ids[:read_len]
-        kv_indices = self.req_to_token_pool.req_to_token[
-            req.req_pool_idx, :read_len
-        ]
+        kv_indices = self.req_to_token_pool.req_to_token[req.req_pool_idx, :read_len]
 
         radix_key = RadixKey(
             token_ids, req.extra_key, is_bigram=self.is_eagle
diff --git a/python/sglang/srt/mem_cache/swa_radix_cache.py b/python/sglang/srt/mem_cache/swa_radix_cache.py
index a3936683e16f..bf1b46a58809 100644
--- a/python/sglang/srt/mem_cache/swa_radix_cache.py
+++ b/python/sglang/srt/mem_cache/swa_radix_cache.py
@@ -497,9 +497,7 @@ def cache_unfinished_req(self, req: Req, chunked=False) -> None:
             return
 
         token_ids = req.fill_ids[:read_len]
-        kv_indices = self.req_to_token_pool.req_to_token[
-            req.req_pool_idx, :read_len
-        ]
+        kv_indices = self.req_to_token_pool.req_to_token[req.req_pool_idx, :read_len]
 
         radix_key = RadixKey(
             token_ids, req.extra_key, is_bigram=self.is_eagle

From daf9c42f17b8737c1de7b68724915039619f73e1 Mon Sep 17 00:00:00 2001
From: fzyzcjy <ch271828n@outlook.com>
Date: Wed, 13 May 2026 21:40:21 +0800
Subject: [PATCH 29/52] Remove v1 SWA chunked-req stash gate test (gate was
 deleted in v2)

The test exercised _chunked_req_scheduled_last_iter, a v1 gate that
prevented a spurious stash on a deferred chunked_req (#24252). v2
replaces that gate with has_pending_chunk + Stage A scan + page-aligned
cache_protected_len, none of which the test instruments. The test
sets s.chunked_req and s._chunked_req_scheduled_last_iter on a bare
Scheduler.__new__(Scheduler) stub that never sets self.waiting_queue,
so it raises AttributeError in CI under v2's get_next_batch_to_run path.
---
 .../test_scheduler_chunked_req_gate.py        | 161 ------------------
 1 file changed, 161 deletions(-)
 delete mode 100644 test/registered/unit/managers/test_scheduler_chunked_req_gate.py

diff --git a/test/registered/unit/managers/test_scheduler_chunked_req_gate.py b/test/registered/unit/managers/test_scheduler_chunked_req_gate.py
deleted file mode 100644
index 87a6daf7e293..000000000000
--- a/test/registered/unit/managers/test_scheduler_chunked_req_gate.py
+++ /dev/null
@@ -1,161 +0,0 @@
-"""Regression tests for the SWA chunked-req stash gate (#24252)."""
-
-import unittest
-from types import SimpleNamespace
-from unittest.mock import MagicMock
-
-import torch
-
-from sglang.test.ci.ci_register import register_cpu_ci
-from sglang.test.test_utils import CustomTestCase, maybe_stub_sgl_kernel
-
-maybe_stub_sgl_kernel()
-
-from sglang.srt.managers.schedule_batch import Req
-from sglang.srt.managers.scheduler import Scheduler
-from sglang.srt.mem_cache.chunk_cache import ChunkCache
-
-register_cpu_ci(est_time=6, suite="stage-a-test-cpu")
-
-
-def _make_req(
-    *,
-    req_pool_idx: int,
-    fill_ids: list,
-    prefix_indices: torch.Tensor,
-    extend_input_len: int,
-) -> Req:
-    req = Req.__new__(Req)
-    req.rid = "test-req"
-    req.origin_input_ids = list(fill_ids)
-    req.output_ids = []
-    req.fill_ids = list(fill_ids)
-    req.prefix_indices = prefix_indices
-    req.req_pool_idx = req_pool_idx
-    req.extend_input_len = extend_input_len
-    req.is_chunked = 0
-    req.host_hit_length = 0
-    req.cache_protected_len = 0
-    req.skip_radix_cache_insert = False
-    req.last_node = None
-    req.swa_uuid_for_lock = None
-    req.session = None
-    req.return_logprob = False
-    req.logprob_start_len = -1
-    req.positional_embed_overrides = None
-    req.extra_key = None
-    req.mamba_pool_idx = None
-    req.sampling_params = SimpleNamespace(max_new_tokens=128, ignore_eos=False)
-    return req
-
-
-def _make_req_to_token_pool(num_slots: int, max_context: int) -> SimpleNamespace:
-    # Slot s contains a recognizable fingerprint [s*1000, s*1000+1, ...]
-    # so we can tell a corrupted prefix_indices from a healthy one by content.
-    pool = SimpleNamespace()
-    pool.req_to_token = (
-        torch.arange(max_context, dtype=torch.int32).unsqueeze(0).repeat(num_slots, 1)
-        + torch.arange(num_slots, dtype=torch.int32).unsqueeze(1) * 1000
-    )
-    return pool
-
-
-def _make_chunk_cache(req_to_token_pool) -> ChunkCache:
-    return ChunkCache(
-        SimpleNamespace(
-            req_to_token_pool=req_to_token_pool,
-            token_to_kv_pool_allocator=None,
-            page_size=1,
-        )
-    )
-
-
-def _scheduler_for_get_next_batch(*, tree_cache, chunked_req) -> Scheduler:
-    s = Scheduler.__new__(Scheduler)
-    s._abort_on_waiting_timeout = MagicMock()
-    s._abort_on_running_timeout = MagicMock()
-    s.dllm_config = None
-    s.dllm_manager = None
-    s.enable_hisparse = False
-    s.last_batch = None
-    s.require_mlp_sync = False
-    s.spec_algorithm = MagicMock()
-    s.server_args = MagicMock(speculative_skip_dp_mlp_sync=True)
-    s.running_batch = MagicMock()
-    s.running_batch.is_empty.return_value = True
-    s.running_batch.is_prefill_only = False
-    s.running_batch.batch_is_full = False
-    s.running_batch.reqs = []
-    s.get_new_batch_prefill = MagicMock(return_value=None)
-    s.maybe_prepare_mlp_sync_batch = MagicMock(side_effect=lambda batch, **_: batch)
-    s._maybe_prepare_ngram_embedding = MagicMock(side_effect=lambda batch: batch)
-    s.update_running_batch = MagicMock(side_effect=lambda batch: batch)
-    s.tree_cache = tree_cache
-    s.chunked_req = chunked_req
-    return s
-
-
-class TestStashGatePreservesPrefixIndices(CustomTestCase):
-    """Consumer side: real ChunkCache.cache_unfinished_req mutates
-    req.prefix_indices iff stash actually runs, so prefix_indices content
-    is the bug-detection signal."""
-
-    POOL_IDX = 4
-    INITIAL_PREFIX_LEN = 8  # what was really cached last iter
-    POST_RESET_FILL_LEN = 32  # length after init_next_round_input
-    NUM_SLOTS = 8
-    MAX_CONTEXT = 64
-
-    def _build(self, flag: bool):
-        pool = _make_req_to_token_pool(self.NUM_SLOTS, self.MAX_CONTEXT)
-        cache = _make_chunk_cache(pool)
-        initial_prefix = pool.req_to_token[self.POOL_IDX, : self.INITIAL_PREFIX_LEN].to(
-            dtype=torch.int64, copy=True
-        )
-        req = _make_req(
-            req_pool_idx=self.POOL_IDX,
-            fill_ids=list(range(self.POST_RESET_FILL_LEN)),
-            prefix_indices=initial_prefix,
-            extend_input_len=0,
-        )
-        s = _scheduler_for_get_next_batch(tree_cache=cache, chunked_req=req)
-        s._chunked_req_scheduled_last_iter = flag
-        return s, req, initial_prefix, pool
-
-    def test_deferred_chunked_req_keeps_real_prefix_indices(self):
-        # The bug case: a spurious stash on a deferred chunked_req
-        # would extend prefix_indices to len(fill_ids).
-        s, req, initial_prefix, _ = self._build(flag=False)
-
-        Scheduler.get_next_batch_to_run(s)
-
-        self.assertEqual(req.prefix_indices.shape[0], self.INITIAL_PREFIX_LEN)
-        self.assertTrue(torch.equal(req.prefix_indices, initial_prefix))
-
-    def test_scheduled_chunked_req_advances_prefix_indices_via_real_stash(self):
-        # Symmetric guard against over-gating: when the chunked_req was
-        # actually scheduled, stash must run and advance prefix_indices.
-        s, req, _, pool = self._build(flag=True)
-
-        Scheduler.get_next_batch_to_run(s)
-
-        expected = pool.req_to_token[self.POOL_IDX, : self.POST_RESET_FILL_LEN].to(
-            dtype=torch.int64
-        )
-        self.assertEqual(req.prefix_indices.shape[0], self.POST_RESET_FILL_LEN)
-        self.assertTrue(torch.equal(req.prefix_indices, expected))
-
-    def test_no_chunked_req_never_mutates_state_even_with_stale_flag(self):
-        # Retract path clears chunked_req without resetting the flag;
-        # the outer `if chunked_req is not None` guard must hold.
-        pool = _make_req_to_token_pool(self.NUM_SLOTS, self.MAX_CONTEXT)
-        cache = _make_chunk_cache(pool)
-        s = _scheduler_for_get_next_batch(tree_cache=cache, chunked_req=None)
-        s._chunked_req_scheduled_last_iter = True
-
-        Scheduler.get_next_batch_to_run(s)
-        self.assertIsNone(s.chunked_req)
-
-
-if __name__ == "__main__":
-    unittest.main()

From a94e842611d425656ac9e457e7c2356d6a55eac2 Mon Sep 17 00:00:00 2001
From: fzyzcjy <ch271828n@outlook.com>
Date: Wed, 13 May 2026 21:46:45 +0800
Subject: [PATCH 30/52] Drop v1 has_chunked_req kwarg + delete v1
 add_chunked_req SWA tests

test_prefill_adder.py exercised two v1 APIs that v2 deleted:
- add_one_req(req, has_chunked_req=..., truncation_align_size=...) ->
  v2 signature is add_one_req(req, truncation_align_size). Drop kwarg.
- adder.add_chunked_req(req) -> v2 unified into add_one_req's reuse
  branch (gated on req.has_pending_chunk; see schedule_policy.py:811).
  The three add_chunked_req tests assert v1-only semantics: truncate-to-fit
  on hybrid SWA (set_extend_input_len(REM_SWA - PAGE_SIZE)), defer
  unchanged when rem_swa <= page_size, and no SWA reservation on the
  non-hybrid path. v2 intentionally replaces these: _swa_budget_for_req
  rejects with AddReqResult.NO_TOKEN when swa_needed >= rem_swa_tokens
  and lets waiting_queue retention re-admit the chunked-resume req next
  iter. These tests no longer correspond to live behavior; delete them
  along with the _build_hybrid_swa_chunked_req helper they share.
---
 .../unit/managers/test_prefill_adder.py       | 94 +------------------
 1 file changed, 3 insertions(+), 91 deletions(-)

diff --git a/test/registered/unit/managers/test_prefill_adder.py b/test/registered/unit/managers/test_prefill_adder.py
index 14d4eab70061..6ee5f921134e 100644
--- a/test/registered/unit/managers/test_prefill_adder.py
+++ b/test/registered/unit/managers/test_prefill_adder.py
@@ -383,9 +383,7 @@ def test_mixed_chunk_prefill_budgets(self):
         req1.last_node = MagicMock()
         req1.sampling_params.ignore_eos = False
 
-        result1 = adder.add_one_req(
-            req1, has_chunked_req=False, truncation_align_size=None
-        )
+        result1 = adder.add_one_req(req1, truncation_align_size=None)
 
         self.assertEqual(len(adder.can_run_list), 1)
         self.assertEqual(adder.rem_chunk_tokens, 0)  # 56 - 56
@@ -417,9 +415,7 @@ def test_mixed_chunk_prefill_budgets(self):
         req2.last_node = MagicMock()
         req2.sampling_params.ignore_eos = False
 
-        result2 = adder2.add_one_req(
-            req2, has_chunked_req=False, truncation_align_size=None
-        )
+        result2 = adder2.add_one_req(req2, truncation_align_size=None)
 
         self.assertEqual(len(adder2.can_run_list), 1)
         self.assertEqual(adder2.rem_chunk_tokens, 3)  # 59 - 56 = 3 remaining
@@ -434,78 +430,12 @@ def test_mixed_chunk_prefill_budgets(self):
         req3.last_node = MagicMock()
         req3.sampling_params.ignore_eos = False
 
-        result3 = adder2.add_one_req(
-            req3, has_chunked_req=False, truncation_align_size=None
-        )
+        result3 = adder2.add_one_req(req3, truncation_align_size=None)
 
         self.assertEqual(len(adder2.can_run_list), 2)
         self.assertEqual(adder2.rem_chunk_tokens, 0)  # 3 - 3 = 0
         self.assertEqual(result3, AddReqResult.OTHER)
 
-    def _build_hybrid_swa_chunked_req(
-        self,
-        *,
-        page_size,
-        rem_swa,
-        rem_chunk=2048,
-        extend_input_len=500,
-        is_hybrid_swa=True,
-        full_available=100_000,
-    ):
-        self.mock_token_allocator.swa_available_size.return_value = rem_swa
-        self.mock_token_allocator.full_available_size.return_value = full_available
-        self.mock_token_allocator.available_size.return_value = full_available
-        self.mock_tree_cache.sliding_window_size = 128
-        adder = self.create_adder(
-            self.create_running_batch(),
-            page_size=page_size,
-            rem_chunk_tokens=rem_chunk,
-        )
-        adder.is_hybrid_swa = is_hybrid_swa
-
-        req = self.create_mock_req("chunked", priority=0, max_new_tokens=128)
-        req.extend_input_len = extend_input_len
-        req.prefix_indices = []
-        req.fill_ids = list(range(extend_input_len))
-        req.set_extend_input_len = MagicMock()
-        return adder, req
-
-    def test_add_chunked_req_hybrid_swa_reserves_page_for_alloc_extend(self):
-        # alloc_extend needs extend_num_tokens + page_size per request. If the
-        # scheduler hands out all of rem_swa_tokens, alloc_extend cannot get its
-        # extra page and OOMs. With the fix, extend_input_len must cap at
-        # rem_swa_tokens - page_size so the page is reserved.
-        PAGE_SIZE = 64
-        REM_SWA = 100
-        adder, req = self._build_hybrid_swa_chunked_req(
-            page_size=PAGE_SIZE, rem_swa=REM_SWA
-        )
-
-        result = adder.add_chunked_req(req)
-
-        self.assertIs(result, req)  # truncated → chunked prefill continues
-        req.set_extend_input_len.assert_called_once()
-        new_len = req.set_extend_input_len.call_args.args[0]
-        self.assertLessEqual(new_len + PAGE_SIZE, REM_SWA)
-        self.assertEqual(new_len, REM_SWA - PAGE_SIZE)
-
-    def test_add_chunked_req_hybrid_swa_defers_when_swa_below_page(self):
-        # When rem_swa_tokens <= page_size there is no room to serve even the
-        # reservation, so the chunked req must be deferred (returned unchanged)
-        # instead of falling back to rem_chunk_tokens and bypassing SWA budget.
-        PAGE_SIZE = 64
-        adder, req = self._build_hybrid_swa_chunked_req(
-            page_size=PAGE_SIZE, rem_swa=PAGE_SIZE
-        )
-        original_len = req.extend_input_len
-
-        result = adder.add_chunked_req(req)
-
-        self.assertIs(result, req)
-        req.set_extend_input_len.assert_not_called()
-        self.assertEqual(req.extend_input_len, original_len)
-        self.assertEqual(len(adder.can_run_list), 0)
-
     def test_swa_budget_for_req(self):
         cases = [
             # (extend, rem_chunk, window, page, expected, label)
@@ -526,24 +456,6 @@ def test_swa_budget_for_req(self):
                 )
                 self.assertEqual(adder._swa_budget_for_req(extend), expected)
 
-    def test_add_chunked_req_non_hybrid_no_swa_reservation(self):
-        # Non-hybrid path: the SWA-pool reservation must NOT apply, otherwise
-        # the fix would regress non-SWA models.
-        PAGE_SIZE = 16
-        adder, req = self._build_hybrid_swa_chunked_req(
-            page_size=PAGE_SIZE,
-            rem_swa=10,
-            rem_chunk=500,
-            extend_input_len=200,
-            is_hybrid_swa=False,
-            full_available=300,
-        )
-
-        result = adder.add_chunked_req(req)
-        self.assertIsNone(result)
-        req.set_extend_input_len.assert_called_once_with(200)
-        self.assertIn(req, adder.can_run_list)
-
 
 if __name__ == "__main__":
     unittest.main()

From 02b1785f0a3db25cc21e861f84b2bbedba8159ca Mon Sep 17 00:00:00 2001
From: fzyzcjy <ch271828n@outlook.com>
Date: Wed, 13 May 2026 22:21:39 +0800
Subject: [PATCH 31/52] Guard _handle_finished_req against PP cross-microbatch
 double-finalize

In PP+chunked-prefill, the same Req object can sit in several in-flight
mbs[*] batches because chunks are pipelined across microbatch slots.
The slot that processes the last chunk's result finalizes the req
(release_kv_cache nulls req_pool_idx); a sibling slot's pending result
then re-enters _handle_finished_req on the same Req and trips the
'Only MambaRadixCache allow freeing before alloc' assert inside
release_kv_cache.

Treat 'req_pool_idx is None at finalize' as 'already released' for
non-Mamba caches and skip the redundant cleanup. The first finalize
already ran multimodal_inputs.release_features, the experts/indexer
collectors, hisparse request_finished, release_kv_cache, and set
completion_time. maybe_collect_customized_info(i, req, logits_output)
is still called for index i so its diagnostic capture isn't dropped.

Reproduces under test_pp_long_context_prefill (128k random_input_len,
random_output_len=1, --pp-size 2, chunked_prefill_size=8192, fp8 70B):
PP0 raises AssertionError in scheduler_output_processor_mixin.py:657
during process_batch_result_decode.
---
 .../managers/scheduler_output_processor_mixin.py    | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/python/sglang/srt/managers/scheduler_output_processor_mixin.py b/python/sglang/srt/managers/scheduler_output_processor_mixin.py
index 234b56b78865..5fa5313678d5 100644
--- a/python/sglang/srt/managers/scheduler_output_processor_mixin.py
+++ b/python/sglang/srt/managers/scheduler_output_processor_mixin.py
@@ -641,6 +641,19 @@ def _handle_finished_req(
             self.decode_offload_manager.offload_kv_cache(req)
 
         if req.finished():
+            # Idempotency guard for PP cross-microbatch races: in PP+chunked
+            # prefill the same Req object can sit in multiple in-flight
+            # mbs[*] batches when chunks of one req are pipelined across
+            # microbatch slots. The slot that processes the last chunk's
+            # result finalizes the req (release_kv_cache nulls req_pool_idx),
+            # then a sibling slot's pending result hits the same req again
+            # here and would trip the assert in release_kv_cache. Treat
+            # `req_pool_idx is None at finalize` as "already released" and
+            # skip the redundant cleanup; the first call already collected
+            # multimodal/experts/indexer/time-stats state.
+            if req.req_pool_idx is None and not self.tree_cache.supports_mamba():
+                self.maybe_collect_customized_info(i, req, logits_output)
+                return
             # delete feature to save memory
             if req.multimodal_inputs is not None and req.session is None:
                 req.multimodal_inputs.release_features()

From b0f21388b396b39da085e7652fc3eefb8af8179d Mon Sep 17 00:00:00 2001
From: fzyzcjy <ch271828n@outlook.com>
Date: Wed, 13 May 2026 23:28:27 +0800
Subject: [PATCH 32/52] Seed has_pending_chunk/is_dllm/host_hit_length on
 test_prefill_adder mocks

v2 add_one_req (schedule_policy.py:811) reads req.has_pending_chunk
and req.is_dllm() at the reuse-branch gate. MagicMock(spec=Req) only
surfaces class-level attributes; has_pending_chunk is set inside
Req.__init__ and was therefore unreachable on the mock, raising
AttributeError under stage-b-test-1-gpu-small test_mixed_chunk_prefill_budgets.
Seed the three attributes that v2 newly touches on the mock so the
reuse-branch gate evaluates cleanly with is_resume=False.
---
 test/registered/unit/managers/test_prefill_adder.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/test/registered/unit/managers/test_prefill_adder.py b/test/registered/unit/managers/test_prefill_adder.py
index 6ee5f921134e..b43f3efdf1b8 100644
--- a/test/registered/unit/managers/test_prefill_adder.py
+++ b/test/registered/unit/managers/test_prefill_adder.py
@@ -77,6 +77,11 @@ def create_mock_req(self, rid, priority, max_new_tokens, output_len=0, wait_time
         req.sampling_params = SimpleNamespace(max_new_tokens=max_new_tokens)
         req.time_stats = SimpleNamespace(wait_queue_entry_time=wait_time)
         req.finished.return_value = False
+        # v2 add_one_req reads these on the reuse-branch gate; MagicMock(spec=Req)
+        # doesn't surface attributes set only in Req.__init__, so seed them.
+        req.has_pending_chunk = False
+        req.is_dllm.return_value = False
+        req.host_hit_length = 0
         return req
 
     def create_adder(self, running_batch, **kwargs):

From 33f981ce935489434f0e99e46bf45b2655bbe981 Mon Sep 17 00:00:00 2001
From: fzyzcjy <ch271828n@outlook.com>
Date: Thu, 14 May 2026 18:12:56 +0800
Subject: [PATCH 33/52] Re-add ScheduleBatch.chunked_req marker for PP cross-mb
 filter exclusion
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In PP+chunked-prefill, mb_a's last-chunk admit clears req.has_pending_chunk
but mb_b still holds the prior middle-chunk batch in its last_batch slot.
The dynamic filter_batch predicate (has_pending_chunk OR pending_middle_outputs>0)
becomes False for the req before mb_a's last-chunk forward result has been
processed by output_processor, so mb_b would merge the still-prefilling req
into running_batch and run a decode forward on stale state — wrong logits,
wrong output tokens.

Restore the OLD-code idiom: stamp the batch's chunked_req at admit time
from chunked_in_batch[0] (the req that was admitted as mid-prefill in this
batch). filter_batch then excludes any req that is the batch's own chunked_req,
not just reqs whose dynamic counters happen to be > 0 at filter time.

Mirrors the behavior that existed before c445a82cf5 (Switch chunked-resume
to waiting_queue holding; delete chunked_req fields); only the storage moved
from Scheduler.chunked_req to per-batch — this brings back the per-batch
marker without re-introducing scheduler-level chunked-aware state.
---
 python/sglang/srt/managers/schedule_batch.py | 10 ++++++++++
 python/sglang/srt/managers/scheduler.py      |  9 +++++++++
 2 files changed, 19 insertions(+)

diff --git a/python/sglang/srt/managers/schedule_batch.py b/python/sglang/srt/managers/schedule_batch.py
index 9345d8c8d243..e521ca60671a 100755
--- a/python/sglang/srt/managers/schedule_batch.py
+++ b/python/sglang/srt/managers/schedule_batch.py
@@ -1414,6 +1414,15 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
     # This is an optimization to reduce the overhead of the prefill check.
     batch_is_full: bool = False
 
+    # The chunked-resume req that was admitted into this batch as mid-prefill
+    # (truncated at admit time -> has_pending_chunk True). Set by the scheduler
+    # right after init_new; consulted by filter_batch to exclude this req from
+    # merging into running_batch in subsequent iters. Required for PP, where
+    # mb_a's last-chunk admit clears has_pending_chunk but mb_b is still holding
+    # a middle-chunk batch in its last_batch slot — without this per-batch
+    # marker, mb_b would merge the still-prefilling req into running_batch.
+    chunked_req: Optional[Req] = None
+
     # Sampling info
     sampling_info: SamplingBatchInfo = None
 
@@ -2441,6 +2450,7 @@ def filter_batch(
                         self.reqs[i].has_pending_chunk
                         or self.reqs[i].pending_middle_outputs > 0
                         or self.reqs[i].is_dllm()
+                        or self.reqs[i] is self.chunked_req
                     )
                 )
             ]
diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py
index 2dd0ea94e103..897a4cc139f1 100644
--- a/python/sglang/srt/managers/scheduler.py
+++ b/python/sglang/srt/managers/scheduler.py
@@ -2854,6 +2854,15 @@ def _get_new_batch_prefill_raw(
             self.enable_overlap,
             self.spec_algorithm,
         )
+        # Stamp the batch's chunked_req at admit time so subsequent filter_batch
+        # calls (across PP microbatches) can exclude this req from running_batch
+        # merging until its last chunk's forward result has been processed.
+        # has_pending_chunk-based filtering alone is insufficient: in PP, when
+        # mb_a admits the last chunk (has_pending_chunk -> False) but mb_b still
+        # holds a middle-chunk batch in its last_batch slot, mb_b would merge
+        # the still-prefilling req into running_batch.
+        if chunked_in_batch:
+            new_batch.chunked_req = chunked_in_batch[0]
         self.max_prefill_bs = max(self.max_prefill_bs, len(can_run_list))
         if self.enable_hierarchical_cache:
             # todo (zhiqiang): disable cuda graph execution if hicache loading triggered

From 11db3a4192446432cbe0144b5b627b89024331a3 Mon Sep 17 00:00:00 2001
From: fzyzcjy <ch271828n@outlook.com>
Date: Thu, 14 May 2026 18:26:07 +0800
Subject: [PATCH 34/52] Revert "Re-add ScheduleBatch.chunked_req marker for PP
 cross-mb filter exclusion"

This reverts commit 33f981ce935489434f0e99e46bf45b2655bbe981.
---
 python/sglang/srt/managers/schedule_batch.py | 10 ----------
 python/sglang/srt/managers/scheduler.py      |  9 ---------
 2 files changed, 19 deletions(-)

diff --git a/python/sglang/srt/managers/schedule_batch.py b/python/sglang/srt/managers/schedule_batch.py
index e521ca60671a..9345d8c8d243 100755
--- a/python/sglang/srt/managers/schedule_batch.py
+++ b/python/sglang/srt/managers/schedule_batch.py
@@ -1414,15 +1414,6 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
     # This is an optimization to reduce the overhead of the prefill check.
     batch_is_full: bool = False
 
-    # The chunked-resume req that was admitted into this batch as mid-prefill
-    # (truncated at admit time -> has_pending_chunk True). Set by the scheduler
-    # right after init_new; consulted by filter_batch to exclude this req from
-    # merging into running_batch in subsequent iters. Required for PP, where
-    # mb_a's last-chunk admit clears has_pending_chunk but mb_b is still holding
-    # a middle-chunk batch in its last_batch slot — without this per-batch
-    # marker, mb_b would merge the still-prefilling req into running_batch.
-    chunked_req: Optional[Req] = None
-
     # Sampling info
     sampling_info: SamplingBatchInfo = None
 
@@ -2450,7 +2441,6 @@ def filter_batch(
                         self.reqs[i].has_pending_chunk
                         or self.reqs[i].pending_middle_outputs > 0
                         or self.reqs[i].is_dllm()
-                        or self.reqs[i] is self.chunked_req
                     )
                 )
             ]
diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py
index 897a4cc139f1..2dd0ea94e103 100644
--- a/python/sglang/srt/managers/scheduler.py
+++ b/python/sglang/srt/managers/scheduler.py
@@ -2854,15 +2854,6 @@ def _get_new_batch_prefill_raw(
             self.enable_overlap,
             self.spec_algorithm,
         )
-        # Stamp the batch's chunked_req at admit time so subsequent filter_batch
-        # calls (across PP microbatches) can exclude this req from running_batch
-        # merging until its last chunk's forward result has been processed.
-        # has_pending_chunk-based filtering alone is insufficient: in PP, when
-        # mb_a admits the last chunk (has_pending_chunk -> False) but mb_b still
-        # holds a middle-chunk batch in its last_batch slot, mb_b would merge
-        # the still-prefilling req into running_batch.
-        if chunked_in_batch:
-            new_batch.chunked_req = chunked_in_batch[0]
         self.max_prefill_bs = max(self.max_prefill_bs, len(can_run_list))
         if self.enable_hierarchical_cache:
             # todo (zhiqiang): disable cuda graph execution if hicache loading triggered

From b3a7b9f2a10cde180f20e01fcd81ca7a76b10224 Mon Sep 17 00:00:00 2001
From: fzyzcjy <ch271828n@outlook.com>
Date: Thu, 14 May 2026 18:39:17 +0800
Subject: [PATCH 35/52] Bump pending_middle_outputs for last-chunk admits +
 decrement-first output proc
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

PP+chunked-prefill correctness: pending_middle_outputs is the count of
forwards launched but not yet output-processed. The c445 design only
bumped this counter for mid-chunk admits (has_pending_chunk True after),
and output proc checked the counter at entry to decide produce-output vs
middle-handling. In PP, this drops a forward result on the floor when a
sibling mb is concurrently processing a later chunk's middle forward —
the LAST chunk's admit doesn't bump the counter, so a mid-chunk forward
result for the same req can take the produce-output branch with stale
state.

Fix:
  - At admit, bump pending_middle_outputs for any req participating in a
    multi-chunk prefill: has_pending_chunk (mid-chunk this iter) OR
    kv_committed_len > 0 (was-resume → this iter is last chunk OR another
    mid). kv_committed_len here reflects the prior iter's prepare_for_extend,
    not this iter's.
  - Output proc decrements first, then produces output iff the counter has
    just hit zero AND has_pending_chunk is False. Otherwise this is a
    non-last forward in the PP pipeline; suppress the produce-output path.

Filter_batch's existing pending_middle_outputs > 0 predicate now correctly
excludes a mid-prefill req from running_batch merge until ALL its in-flight
forwards (across mbs) have been output-processed — no per-batch chunked_req
marker required, preserving the v2 design of per-Req-only chunked state.
---
 python/sglang/srt/managers/scheduler.py       | 29 +++++++++-----
 .../scheduler_output_processor_mixin.py       | 39 ++++++++++++++-----
 2 files changed, 50 insertions(+), 18 deletions(-)

diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py
index 2dd0ea94e103..a6924662b23b 100644
--- a/python/sglang/srt/managers/scheduler.py
+++ b/python/sglang/srt/managers/scheduler.py
@@ -2824,18 +2824,29 @@ def _get_new_batch_prefill_raw(
             for req in adder.preempt_list:
                 self._add_request_to_queue(req)
 
-        # Bump pending_middle_outputs (the pending_middle_outputs counter) for every
-        # admitted req that's still mid-prefill — output processor uses this
-        # to know its forward's sample is garbage. Counter semantics needed
-        # for PP, where multiple microbatches may admit the same req.
-        chunked_in_batch = [r for r in can_run_list if r.has_pending_chunk]
+        # Bump pending_middle_outputs for every admitted req whose admission is
+        # part of a multi-chunk prefill — both mid-chunk admits (has_pending_chunk
+        # stays True after this admit) AND the last-chunk admit of a previously
+        # chunked-resume req (kv_committed_len > 0 means a prior chunk's prepare
+        # already wrote to its row, so this req has been chunk-prefilled before).
+        # The counter is the number of forwards launched but not yet output-
+        # processed. Output processor decrements first, then checks whether
+        # this was the last pending forward; required for PP, where the LAST
+        # chunk's forward result may not be the last forward in flight for the
+        # req (a sibling mb may hold a mid-chunk forward still pipelined).
+        # kv_committed_len here reflects the PRIOR iter's prepare_for_extend;
+        # this iter's prepare_for_extend has not yet run.
+        chunk_admits = [
+            r for r in can_run_list if r.has_pending_chunk or r.kv_committed_len > 0
+        ]
         assert (
-            len(chunked_in_batch) <= 1
-        ), "single-flight invariant: at most one chunked-resume req per batch"
+            sum(1 for r in chunk_admits if r.has_pending_chunk) <= 1
+        ), "single-flight invariant: at most one mid-chunk admit per batch"
         chunk_deduct = 0
-        for r in chunked_in_batch:
+        for r in chunk_admits:
             r.pending_middle_outputs += 1
-            chunk_deduct = r.extend_input_len
+            if r.has_pending_chunk:
+                chunk_deduct = r.extend_input_len
 
         # Record for logging prefill stats after forward
         self.adder = adder
diff --git a/python/sglang/srt/managers/scheduler_output_processor_mixin.py b/python/sglang/srt/managers/scheduler_output_processor_mixin.py
index 5fa5313678d5..d74b78d1c8cc 100644
--- a/python/sglang/srt/managers/scheduler_output_processor_mixin.py
+++ b/python/sglang/srt/managers/scheduler_output_processor_mixin.py
@@ -241,7 +241,22 @@ def process_batch_result_prefill(
                     # decode req in mixed batch or retracted req
                     continue
 
-                if req.pending_middle_outputs <= 0:
+                # Decrement-first semantics: pending_middle_outputs is the number
+                # of forwards launched for this req that have not yet been output-
+                # processed. After decrement, this is the LAST pending forward iff
+                # the counter hits zero AND no more chunks are coming
+                # (has_pending_chunk is cleared by add_one_req's last-chunk path).
+                # In PP, a sibling mb may hold a mid-chunk forward in its pipeline
+                # whose result has yet to be processed — we must not finalize the
+                # prefill until pmo reaches zero.
+                is_last_chunk_output = True
+                if req.pending_middle_outputs > 0:
+                    req.pending_middle_outputs -= 1
+                    is_last_chunk_output = (
+                        req.pending_middle_outputs == 0 and not req.has_pending_chunk
+                    )
+
+                if is_last_chunk_output:
                     req.time_stats.set_prefill_finished_time()
 
                     # req output_ids are set here
@@ -313,11 +328,9 @@ def process_batch_result_prefill(
                         req.grammar.finished = req.finished()
 
                 else:
-                    # being chunked reqs' prefill is not finished
-                    req.pending_middle_outputs -= 1
-                    # There is only at most one request being currently chunked.
-                    # Because this request does not finish prefill,
-                    # we don't want to stream the request currently being chunked.
+                    # Middle chunk forward (or non-last forward in PP pipeline):
+                    # prefill not yet finalized; counter already decremented above.
+                    # We don't want to stream the request currently being chunked.
                     skip_stream_req = req
 
                     # Incrementally update input logprobs.
@@ -380,7 +393,17 @@ def process_batch_result_prefill(
                 req.embedding = embeddings[i]
                 if req.return_pooled_hidden_states and phs is not None:
                     req.pooled_hidden_state = phs[i]
-                if req.pending_middle_outputs <= 0:
+
+                # Decrement-first; mirrors the generation-model branch above.
+                # See that branch for the PP rationale.
+                is_last_chunk_output = True
+                if req.pending_middle_outputs > 0:
+                    req.pending_middle_outputs -= 1
+                    is_last_chunk_output = (
+                        req.pending_middle_outputs == 0 and not req.has_pending_chunk
+                    )
+
+                if is_last_chunk_output:
                     req.time_stats.set_prefill_finished_time()
                     # Dummy output token for embedding models
                     req.output_ids.append(0)
@@ -392,8 +415,6 @@ def process_batch_result_prefill(
                     else:
                         maybe_cache_unfinished_req(req, self.tree_cache)
                 else:
-                    # being chunked reqs' prefill is not finished
-                    req.pending_middle_outputs -= 1
                     req.time_stats.set_last_chunked_prefill_finish_time()
 
         self.stream_output(batch.reqs, batch.return_logprob, skip_stream_req)

From e875cd36e4afbb80097cb73498fb16a157592123 Mon Sep 17 00:00:00 2001
From: fzyzcjy <ch271828n@outlook.com>
Date: Thu, 14 May 2026 19:12:58 +0800
Subject: [PATCH 36/52] Revert "Bump pending_middle_outputs for last-chunk
 admits + decrement-first output proc"

This reverts commit b3a7b9f2a10cde180f20e01fcd81ca7a76b10224.
---
 python/sglang/srt/managers/scheduler.py       | 29 +++++---------
 .../scheduler_output_processor_mixin.py       | 39 +++++--------------
 2 files changed, 18 insertions(+), 50 deletions(-)

diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py
index a6924662b23b..2dd0ea94e103 100644
--- a/python/sglang/srt/managers/scheduler.py
+++ b/python/sglang/srt/managers/scheduler.py
@@ -2824,29 +2824,18 @@ def _get_new_batch_prefill_raw(
             for req in adder.preempt_list:
                 self._add_request_to_queue(req)
 
-        # Bump pending_middle_outputs for every admitted req whose admission is
-        # part of a multi-chunk prefill — both mid-chunk admits (has_pending_chunk
-        # stays True after this admit) AND the last-chunk admit of a previously
-        # chunked-resume req (kv_committed_len > 0 means a prior chunk's prepare
-        # already wrote to its row, so this req has been chunk-prefilled before).
-        # The counter is the number of forwards launched but not yet output-
-        # processed. Output processor decrements first, then checks whether
-        # this was the last pending forward; required for PP, where the LAST
-        # chunk's forward result may not be the last forward in flight for the
-        # req (a sibling mb may hold a mid-chunk forward still pipelined).
-        # kv_committed_len here reflects the PRIOR iter's prepare_for_extend;
-        # this iter's prepare_for_extend has not yet run.
-        chunk_admits = [
-            r for r in can_run_list if r.has_pending_chunk or r.kv_committed_len > 0
-        ]
+        # Bump pending_middle_outputs (the pending_middle_outputs counter) for every
+        # admitted req that's still mid-prefill — output processor uses this
+        # to know its forward's sample is garbage. Counter semantics needed
+        # for PP, where multiple microbatches may admit the same req.
+        chunked_in_batch = [r for r in can_run_list if r.has_pending_chunk]
         assert (
-            sum(1 for r in chunk_admits if r.has_pending_chunk) <= 1
-        ), "single-flight invariant: at most one mid-chunk admit per batch"
+            len(chunked_in_batch) <= 1
+        ), "single-flight invariant: at most one chunked-resume req per batch"
         chunk_deduct = 0
-        for r in chunk_admits:
+        for r in chunked_in_batch:
             r.pending_middle_outputs += 1
-            if r.has_pending_chunk:
-                chunk_deduct = r.extend_input_len
+            chunk_deduct = r.extend_input_len
 
         # Record for logging prefill stats after forward
         self.adder = adder
diff --git a/python/sglang/srt/managers/scheduler_output_processor_mixin.py b/python/sglang/srt/managers/scheduler_output_processor_mixin.py
index d74b78d1c8cc..5fa5313678d5 100644
--- a/python/sglang/srt/managers/scheduler_output_processor_mixin.py
+++ b/python/sglang/srt/managers/scheduler_output_processor_mixin.py
@@ -241,22 +241,7 @@ def process_batch_result_prefill(
                     # decode req in mixed batch or retracted req
                     continue
 
-                # Decrement-first semantics: pending_middle_outputs is the number
-                # of forwards launched for this req that have not yet been output-
-                # processed. After decrement, this is the LAST pending forward iff
-                # the counter hits zero AND no more chunks are coming
-                # (has_pending_chunk is cleared by add_one_req's last-chunk path).
-                # In PP, a sibling mb may hold a mid-chunk forward in its pipeline
-                # whose result has yet to be processed — we must not finalize the
-                # prefill until pmo reaches zero.
-                is_last_chunk_output = True
-                if req.pending_middle_outputs > 0:
-                    req.pending_middle_outputs -= 1
-                    is_last_chunk_output = (
-                        req.pending_middle_outputs == 0 and not req.has_pending_chunk
-                    )
-
-                if is_last_chunk_output:
+                if req.pending_middle_outputs <= 0:
                     req.time_stats.set_prefill_finished_time()
 
                     # req output_ids are set here
@@ -328,9 +313,11 @@ def process_batch_result_prefill(
                         req.grammar.finished = req.finished()
 
                 else:
-                    # Middle chunk forward (or non-last forward in PP pipeline):
-                    # prefill not yet finalized; counter already decremented above.
-                    # We don't want to stream the request currently being chunked.
+                    # being chunked reqs' prefill is not finished
+                    req.pending_middle_outputs -= 1
+                    # There is only at most one request being currently chunked.
+                    # Because this request does not finish prefill,
+                    # we don't want to stream the request currently being chunked.
                     skip_stream_req = req
 
                     # Incrementally update input logprobs.
@@ -393,17 +380,7 @@ def process_batch_result_prefill(
                 req.embedding = embeddings[i]
                 if req.return_pooled_hidden_states and phs is not None:
                     req.pooled_hidden_state = phs[i]
-
-                # Decrement-first; mirrors the generation-model branch above.
-                # See that branch for the PP rationale.
-                is_last_chunk_output = True
-                if req.pending_middle_outputs > 0:
-                    req.pending_middle_outputs -= 1
-                    is_last_chunk_output = (
-                        req.pending_middle_outputs == 0 and not req.has_pending_chunk
-                    )
-
-                if is_last_chunk_output:
+                if req.pending_middle_outputs <= 0:
                     req.time_stats.set_prefill_finished_time()
                     # Dummy output token for embedding models
                     req.output_ids.append(0)
@@ -415,6 +392,8 @@ def process_batch_result_prefill(
                     else:
                         maybe_cache_unfinished_req(req, self.tree_cache)
                 else:
+                    # being chunked reqs' prefill is not finished
+                    req.pending_middle_outputs -= 1
                     req.time_stats.set_last_chunked_prefill_finish_time()
 
         self.stream_output(batch.reqs, batch.return_logprob, skip_stream_req)

From 5c523049dbe0ca7cbff0e87f598caaae0119aea2 Mon Sep 17 00:00:00 2001
From: fzyzcjy <ch271828n@outlook.com>
Date: Thu, 14 May 2026 19:24:05 +0800
Subject: [PATCH 37/52] Exclude in-flight other-mb reqs in filter_batch (PP
 chunked-resume race)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In PP+chunked-prefill: mb_a admits the LAST chunk of req X, clearing
has_pending_chunk but leaving pending_middle_outputs unchanged (the
counter is only bumped for mid-chunk admits). mb_b is still holding a
prior mid-chunk batch in its last_batch slot. By the time mb_b runs
filter_batch on that last_batch, X's last-chunk forward result has not
yet arrived in the output processor — but X's dynamic predicate fields
(has_pending_chunk is False, and pending_middle_outputs may already be 0
after an earlier mid-chunk result was output-processed) make filter_batch
keep X. X merges into running_batch and runs a decode forward on stale
state → wrong tokens.

Fix: at the filter_batch call sites in get_next_batch_to_run, compute the
set of req rids whose forward batches are in flight in other PP
microbatches (every non-None entry of self.mbs that is not
self.last_batch) and pass it to filter_batch as an additional exclusion
set. filter_batch keeps the dynamic per-Req predicate intact and only adds
a transient "in-flight elsewhere" check that lives in the caller's scope —
no per-batch chunked-aware state, preserving the stateless-scheduler v2
design invariant.

mbs[other_id] at this point always holds the other mb's most-recently
launched batch; for pp_loop_size==2, it is guaranteed to be in-flight
(launched in this iter mb_(id-1)%pp step, processed at end of this iter
mb_(id+1)%pp step). For pp_size==1, the set is empty and filter_batch
behaves exactly as before.
---
 python/sglang/srt/managers/schedule_batch.py |  3 ++
 python/sglang/srt/managers/scheduler.py      | 47 +++++++++++++++++++-
 2 files changed, 48 insertions(+), 2 deletions(-)

diff --git a/python/sglang/srt/managers/schedule_batch.py b/python/sglang/srt/managers/schedule_batch.py
index 9345d8c8d243..dd4dd36e1a19 100755
--- a/python/sglang/srt/managers/schedule_batch.py
+++ b/python/sglang/srt/managers/schedule_batch.py
@@ -2425,12 +2425,14 @@ def filter_batch(
         # FIXME(lsyin): deprecate this API after spec v1 is deprecated
         v1_spec_info_filtered: Optional[bool] = False,
         exclude_chunked_req: bool = False,
+        exclude_in_flight_other_mb: Optional[set] = None,
     ):
         # FIXME(lsyin): used here to get the correct seq_lens
         # The batch has been launched but we need it verified to get correct next batch info
         self.maybe_wait_verify_done()
 
         if keep_indices is None:
+            in_flight_rids = exclude_in_flight_other_mb or set()
             keep_indices = [
                 i
                 for i in range(len(self.reqs))
@@ -2443,6 +2445,7 @@ def filter_batch(
                         or self.reqs[i].is_dllm()
                     )
                 )
+                and self.reqs[i].rid not in in_flight_rids
             ]
 
         if keep_indices is None or len(keep_indices) == 0:
diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py
index 2dd0ea94e103..94f24653c088 100644
--- a/python/sglang/srt/managers/scheduler.py
+++ b/python/sglang/srt/managers/scheduler.py
@@ -2480,6 +2480,33 @@ def _build_hisparse_decode_batch(self, reqs):
         # todo hisparse, maybe other info to contain for the new batch
         return batch
 
+    def _in_flight_other_mb_rids(self) -> set:
+        """rids of reqs whose forward is launched in another mb but whose
+        result has not yet been processed by the output processor.
+
+        Used by filter_batch on last_batch / running_batch to exclude
+        chunked-prefill reqs whose LAST chunk admit cleared has_pending_chunk
+        (and pending_middle_outputs may have been decremented to 0 by an
+        earlier mid-chunk forward result), but whose actual last-chunk
+        forward result has not yet arrived in the output processor — they
+        must not be merged into running_batch as decode reqs yet.
+
+        At PP loop iter T mb_id step's filter_batch time, mbs[other_id !=
+        mb_id] holds an in-flight forward batch (launched, not yet
+        processed). For pp_loop_size==2, the other mb's batch is always
+        in-flight at this point. Skip self.last_batch (==mbs[mb_id], the
+        batch being filtered itself).
+        """
+        if self.pp_size <= 1 or not hasattr(self, "mbs"):
+            return set()
+        rids = set()
+        for mb in self.mbs:
+            if mb is None or mb is self.last_batch:
+                continue
+            for r in mb.reqs:
+                rids.add(r.rid)
+        return rids
+
     def get_next_batch_to_run(self) -> Optional[ScheduleBatch]:
         if self.enable_fpm:
             self._fpm_batch_t0 = time.monotonic()
@@ -2535,7 +2562,20 @@ def get_next_batch_to_run(self) -> Optional[ScheduleBatch]:
             # dropped reqs persist in self.waiting_queue (retention at
             # ~line 2775: `x not in can_run_set or x.has_pending_chunk`)
             # and re-enter via next iter's Stage A stash + admission.
-            self.last_batch.filter_batch(exclude_chunked_req=True)
+            #
+            # PP cross-mb: also drop reqs whose forward result is still
+            # pending in another mb's pipeline. has_pending_chunk +
+            # pending_middle_outputs alone do not cover the window where
+            # mb_a admitted the LAST chunk (clearing has_pending_chunk; not
+            # bumping pending_middle_outputs since chunked_in_batch only
+            # counts mid-chunk admits) but mb_a's forward result has not
+            # yet been processed — without this exclusion, mb_b would merge
+            # the still-prefilling req into running_batch and run a decode
+            # forward on stale state.
+            self.last_batch.filter_batch(
+                exclude_chunked_req=True,
+                exclude_in_flight_other_mb=self._in_flight_other_mb_rids(),
+            )
             if self.last_batch.batch_size() < last_bs:
                 self.running_batch.batch_is_full = False
 
@@ -2558,7 +2598,10 @@ def get_next_batch_to_run(self) -> Optional[ScheduleBatch]:
             # shouldn't normally hold one. Keep the flag set so any leak in
             # that invariant doesn't survive here; the dropped req still
             # has its waiting_queue retention to re-admit next iter.
-            self.running_batch.filter_batch(exclude_chunked_req=True)
+            self.running_batch.filter_batch(
+                exclude_chunked_req=True,
+                exclude_in_flight_other_mb=self._in_flight_other_mb_rids(),
+            )
             if self.running_batch.is_empty():
                 self.running_batch.batch_is_full = False
 

From 45347ca3a32590985dead61a5b818734b40d8516 Mon Sep 17 00:00:00 2001
From: fzyzcjy <ch271828n@outlook.com>
Date: Thu, 14 May 2026 21:14:58 +0800
Subject: [PATCH 38/52] Revert "Exclude in-flight other-mb reqs in filter_batch
 (PP chunked-resume race)"

This reverts commit 5c523049dbe0ca7cbff0e87f598caaae0119aea2.
---
 python/sglang/srt/managers/schedule_batch.py |  3 --
 python/sglang/srt/managers/scheduler.py      | 47 +-------------------
 2 files changed, 2 insertions(+), 48 deletions(-)

diff --git a/python/sglang/srt/managers/schedule_batch.py b/python/sglang/srt/managers/schedule_batch.py
index dd4dd36e1a19..9345d8c8d243 100755
--- a/python/sglang/srt/managers/schedule_batch.py
+++ b/python/sglang/srt/managers/schedule_batch.py
@@ -2425,14 +2425,12 @@ def filter_batch(
         # FIXME(lsyin): deprecate this API after spec v1 is deprecated
         v1_spec_info_filtered: Optional[bool] = False,
         exclude_chunked_req: bool = False,
-        exclude_in_flight_other_mb: Optional[set] = None,
     ):
         # FIXME(lsyin): used here to get the correct seq_lens
         # The batch has been launched but we need it verified to get correct next batch info
         self.maybe_wait_verify_done()
 
         if keep_indices is None:
-            in_flight_rids = exclude_in_flight_other_mb or set()
             keep_indices = [
                 i
                 for i in range(len(self.reqs))
@@ -2445,7 +2443,6 @@ def filter_batch(
                         or self.reqs[i].is_dllm()
                     )
                 )
-                and self.reqs[i].rid not in in_flight_rids
             ]
 
         if keep_indices is None or len(keep_indices) == 0:
diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py
index 94f24653c088..2dd0ea94e103 100644
--- a/python/sglang/srt/managers/scheduler.py
+++ b/python/sglang/srt/managers/scheduler.py
@@ -2480,33 +2480,6 @@ def _build_hisparse_decode_batch(self, reqs):
         # todo hisparse, maybe other info to contain for the new batch
         return batch
 
-    def _in_flight_other_mb_rids(self) -> set:
-        """rids of reqs whose forward is launched in another mb but whose
-        result has not yet been processed by the output processor.
-
-        Used by filter_batch on last_batch / running_batch to exclude
-        chunked-prefill reqs whose LAST chunk admit cleared has_pending_chunk
-        (and pending_middle_outputs may have been decremented to 0 by an
-        earlier mid-chunk forward result), but whose actual last-chunk
-        forward result has not yet arrived in the output processor — they
-        must not be merged into running_batch as decode reqs yet.
-
-        At PP loop iter T mb_id step's filter_batch time, mbs[other_id !=
-        mb_id] holds an in-flight forward batch (launched, not yet
-        processed). For pp_loop_size==2, the other mb's batch is always
-        in-flight at this point. Skip self.last_batch (==mbs[mb_id], the
-        batch being filtered itself).
-        """
-        if self.pp_size <= 1 or not hasattr(self, "mbs"):
-            return set()
-        rids = set()
-        for mb in self.mbs:
-            if mb is None or mb is self.last_batch:
-                continue
-            for r in mb.reqs:
-                rids.add(r.rid)
-        return rids
-
     def get_next_batch_to_run(self) -> Optional[ScheduleBatch]:
         if self.enable_fpm:
             self._fpm_batch_t0 = time.monotonic()
@@ -2562,20 +2535,7 @@ def get_next_batch_to_run(self) -> Optional[ScheduleBatch]:
             # dropped reqs persist in self.waiting_queue (retention at
             # ~line 2775: `x not in can_run_set or x.has_pending_chunk`)
             # and re-enter via next iter's Stage A stash + admission.
-            #
-            # PP cross-mb: also drop reqs whose forward result is still
-            # pending in another mb's pipeline. has_pending_chunk +
-            # pending_middle_outputs alone do not cover the window where
-            # mb_a admitted the LAST chunk (clearing has_pending_chunk; not
-            # bumping pending_middle_outputs since chunked_in_batch only
-            # counts mid-chunk admits) but mb_a's forward result has not
-            # yet been processed — without this exclusion, mb_b would merge
-            # the still-prefilling req into running_batch and run a decode
-            # forward on stale state.
-            self.last_batch.filter_batch(
-                exclude_chunked_req=True,
-                exclude_in_flight_other_mb=self._in_flight_other_mb_rids(),
-            )
+            self.last_batch.filter_batch(exclude_chunked_req=True)
             if self.last_batch.batch_size() < last_bs:
                 self.running_batch.batch_is_full = False
 
@@ -2598,10 +2558,7 @@ def get_next_batch_to_run(self) -> Optional[ScheduleBatch]:
             # shouldn't normally hold one. Keep the flag set so any leak in
             # that invariant doesn't survive here; the dropped req still
             # has its waiting_queue retention to re-admit next iter.
-            self.running_batch.filter_batch(
-                exclude_chunked_req=True,
-                exclude_in_flight_other_mb=self._in_flight_other_mb_rids(),
-            )
+            self.running_batch.filter_batch(exclude_chunked_req=True)
             if self.running_batch.is_empty():
                 self.running_batch.batch_is_full = False
 

From 69ef71edc45c80958386229fc1d7bed2875ab70a Mon Sep 17 00:00:00 2001
From: fzyzcjy <ch271828n@outlook.com>
Date: Thu, 14 May 2026 21:29:55 +0800
Subject: [PATCH 39/52] Conditionally exclude in-flight other-mb chunked-resume
 reqs (PP, max_new_tokens > 1)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In PP + chunked-prefill, admitting a req's LAST chunk in mb_a clears
has_pending_chunk on the req while mb_a's forward result for that chunk
is still in flight, i.e. not yet processed by the time another
microbatch (mb_b) is scheduled. Without exclusion, mb_b's filter_batch
merges the req into running_batch and mb_b's decode forward writes WRONG
K,V at row position N (the input falls back to origin[-1] because
req.output_ids is still empty at that point). For max_new_tokens > 1,
the wrong K,V at N persists in the KV pool and corrupts every subsequent
decode position.

For max_new_tokens == 1, the wrong decode result is filtered by the
req.finished() check in the output processor BEFORE being appended to
output_ids, and the wrong K,V at N is released with the rest of the row
when the req finishes — no observable effect. Excluding such reqs would
delay them by 1 mb step for no correctness gain, so we skip them.

This conditional preserves the parallelism that test_pp_long_context_prefill
(output_len=1) relies on while still fixing PP gsm8k correctness
(max_new_tokens=512+, score 0.66 -> 0.77). The fix uses no per-batch or
scheduler-level chunked-aware state — only a transient set of rids
computed at filter_batch call time from self.mbs.

Reads mbs[other_id != current_mb_id] which holds the other mb's most
recently launched batch; for pp_loop_size==2, that batch is guaranteed
to still be in flight (launched in the current iter's mb_(id-1)%pp step,
processed at end of the current iter's mb_(id+1)%pp step).
---
 python/sglang/srt/managers/schedule_batch.py |  3 ++
 python/sglang/srt/managers/scheduler.py      | 49 +++++++++++++++++++-
 2 files changed, 50 insertions(+), 2 deletions(-)

diff --git a/python/sglang/srt/managers/schedule_batch.py b/python/sglang/srt/managers/schedule_batch.py
index 9345d8c8d243..dd4dd36e1a19 100755
--- a/python/sglang/srt/managers/schedule_batch.py
+++ b/python/sglang/srt/managers/schedule_batch.py
@@ -2425,12 +2425,14 @@ def filter_batch(
         # FIXME(lsyin): deprecate this API after spec v1 is deprecated
         v1_spec_info_filtered: Optional[bool] = False,
         exclude_chunked_req: bool = False,
+        exclude_in_flight_other_mb: Optional[set] = None,
     ):
         # FIXME(lsyin): used here to get the correct seq_lens
         # The batch has been launched but we need it verified to get correct next batch info
         self.maybe_wait_verify_done()
 
         if keep_indices is None:
+            in_flight_rids = exclude_in_flight_other_mb or set()
             keep_indices = [
                 i
                 for i in range(len(self.reqs))
@@ -2443,6 +2445,7 @@ def filter_batch(
                         or self.reqs[i].is_dllm()
                     )
                 )
+                and self.reqs[i].rid not in in_flight_rids
             ]
 
         if keep_indices is None or len(keep_indices) == 0:
diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py
index 2dd0ea94e103..56215174adaa 100644
--- a/python/sglang/srt/managers/scheduler.py
+++ b/python/sglang/srt/managers/scheduler.py
@@ -2480,6 +2480,41 @@ def _build_hisparse_decode_batch(self, reqs):
         # todo hisparse, maybe other info to contain for the new batch
         return batch
 
+    def _in_flight_other_mb_rids(self) -> set:
+        """rids of reqs whose chunked-prefill forward is launched in another
+        PP microbatch but whose result has not yet been processed by the
+        output processor — AND for which a follow-up decode would actually
+        propagate corruption (max_new_tokens > 1).
+
+        In PP+chunked-prefill, mb_a's LAST chunk admit clears has_pending_chunk
+        on the req while mb_a's chunk forward result is still in flight. If
+        mb_b's filter_batch merges this req into running_batch, mb_b's decode
+        forward runs on stale state — input falls back to origin[-1] and
+        writes WRONG K,V at row position N. The wrong K,V at N persists in
+        the KV pool and corrupts every subsequent decode position.
+
+        For req.sampling_params.max_new_tokens == 1, the wrong decode result
+        is filtered by `req.finished()` (line ~240) before being appended,
+        and the wrong K,V at N is released with the rest of the row when
+        the req finishes — no observable effect. Excluding such reqs would
+        delay them by 1 mb step for no correctness gain, so we skip them
+        here and only return rids of reqs that genuinely need protection.
+        """
+        if self.pp_size <= 1 or not hasattr(self, "mbs"):
+            return set()
+        rids = set()
+        for mb in self.mbs:
+            if mb is None or mb is self.last_batch:
+                continue
+            for r in mb.reqs:
+                # max_new_tokens is normalized to a non-None int in
+                # _prepare_input_for_image_request / similar paths during
+                # request admission, but defensively handle missing/zero.
+                max_new = r.sampling_params.max_new_tokens or 0
+                if max_new > 1:
+                    rids.add(r.rid)
+        return rids
+
     def get_next_batch_to_run(self) -> Optional[ScheduleBatch]:
         if self.enable_fpm:
             self._fpm_batch_t0 = time.monotonic()
@@ -2535,7 +2570,14 @@ def get_next_batch_to_run(self) -> Optional[ScheduleBatch]:
             # dropped reqs persist in self.waiting_queue (retention at
             # ~line 2775: `x not in can_run_set or x.has_pending_chunk`)
             # and re-enter via next iter's Stage A stash + admission.
-            self.last_batch.filter_batch(exclude_chunked_req=True)
+            #
+            # PP cross-mb: also drop reqs whose LAST chunk forward is still
+            # in flight in another mb (when more decodes will follow — i.e.,
+            # max_new_tokens > 1). See _in_flight_other_mb_rids for rationale.
+            self.last_batch.filter_batch(
+                exclude_chunked_req=True,
+                exclude_in_flight_other_mb=self._in_flight_other_mb_rids(),
+            )
             if self.last_batch.batch_size() < last_bs:
                 self.running_batch.batch_is_full = False
 
@@ -2558,7 +2600,10 @@ def get_next_batch_to_run(self) -> Optional[ScheduleBatch]:
             # shouldn't normally hold one. Keep the flag set so any leak in
             # that invariant doesn't survive here; the dropped req still
             # has its waiting_queue retention to re-admit next iter.
-            self.running_batch.filter_batch(exclude_chunked_req=True)
+            self.running_batch.filter_batch(
+                exclude_chunked_req=True,
+                exclude_in_flight_other_mb=self._in_flight_other_mb_rids(),
+            )
             if self.running_batch.is_empty():
                 self.running_batch.batch_is_full = False
 

From 14adb095469b1dc69e95d5036a7d1ec30b6b5fba Mon Sep 17 00:00:00 2001
From: fzyzcjy <ch271828n@outlook.com>
Date: Tue, 19 May 2026 23:00:49 +0800
Subject: [PATCH 40/52] Rename inflight_middle_chunks -> pending_middle_outputs
 (revert upstream rename)

The name 'pending_middle_outputs' more precisely describes what the
counter tracks: middle-chunk prefill forwards that have been admitted
but not yet output-processed. The output processor uses it to decide
whether this forward's sample is real (== 0) or garbage (> 0). Restore
the local-branch name across all call sites.
---
 python/sglang/srt/disaggregation/prefill.py          |  4 ++--
 python/sglang/srt/dllm/mixin/scheduler.py            |  6 +++---
 python/sglang/srt/managers/schedule_batch.py         | 10 +++++-----
 python/sglang/srt/managers/schedule_policy.py        |  2 +-
 python/sglang/srt/managers/scheduler.py              | 12 ++++++------
 .../scheduler_components/batch_result_processor.py   |  8 ++++----
 test/registered/unit/managers/test_hisparse_unit.py  |  2 +-
 7 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/python/sglang/srt/disaggregation/prefill.py b/python/sglang/srt/disaggregation/prefill.py
index c1fdb96aeb3e..6891603e4fa8 100644
--- a/python/sglang/srt/disaggregation/prefill.py
+++ b/python/sglang/srt/disaggregation/prefill.py
@@ -514,7 +514,7 @@ def process_batch_result_disagg_prefill(
         for i, (req, next_token_id) in enumerate(
             zip(batch.reqs, next_token_ids, strict=True)
         ):
-            if req.inflight_middle_chunks <= 0:
+            if req.pending_middle_outputs <= 0:
                 req.time_stats.set_prefill_finished_time()
 
                 # There is no output_ids for prefill
@@ -564,7 +564,7 @@ def process_batch_result_disagg_prefill(
                     req.grammar.finished = req.finished()
             else:
                 # being chunked reqs' prefill is not finished
-                req.inflight_middle_chunks -= 1
+                req.pending_middle_outputs -= 1
 
                 if req.return_logprob:
                     extend_logprob_start_len = extend_logprob_start_len_per_req[i]
diff --git a/python/sglang/srt/dllm/mixin/scheduler.py b/python/sglang/srt/dllm/mixin/scheduler.py
index b438bb2e583b..37110c315e62 100644
--- a/python/sglang/srt/dllm/mixin/scheduler.py
+++ b/python/sglang/srt/dllm/mixin/scheduler.py
@@ -200,7 +200,7 @@ def _update_state_for_batch(
 
         if can_run_list:
             self.dllm_manager.add_staging_reqs(can_run_list)
-            self.dllm_manager.increment_inflight_middle_chunks()
+            self.dllm_manager.increment_pending_middle_outputs()
 
         self.adder = adder
         self.can_run_list = can_run_list
@@ -337,10 +337,10 @@ def is_empty(self) -> bool:
             return True
         return len(self.waiting_queue) == 0
 
-    def increment_inflight_middle_chunks(self) -> None:
+    def increment_pending_middle_outputs(self) -> None:
         """Increment chunked count for all staging requests."""
         for req in self.staging_queue:
-            req.inflight_middle_chunks += 1
+            req.pending_middle_outputs += 1
 
     def filter_finished_reqs(self) -> None:
         """Remove finished requests from both queues."""
diff --git a/python/sglang/srt/managers/schedule_batch.py b/python/sglang/srt/managers/schedule_batch.py
index e0d3f42730ac..7e8734c756ba 100755
--- a/python/sglang/srt/managers/schedule_batch.py
+++ b/python/sglang/srt/managers/schedule_batch.py
@@ -791,13 +791,13 @@ def __init__(
         # req in flight concurrently. In non-PP, oscillates 0/1 within each
         # iter. Used by output_processor to know whether this forward's
         # sample is real (==0) or garbage (>0).
-        self.inflight_middle_chunks = 0
+        self.pending_middle_outputs = 0
 
         # Persistent (cross-iter) flag set by admission when this req's
         # current admission was truncated (more chunks remain). Cleared
         # when last chunk is admitted (truncated=False) or on retract.
         # Used by Stage A stash detection, filter_batch exclusion, and
-        # add_one_req's reuse-vs-fresh branch. Independent of inflight_middle_chunks
+        # add_one_req's reuse-vs-fresh branch. Independent of pending_middle_outputs
         # counter (transient) and kv_committed_len (derived).
         self.has_pending_chunk = False
 
@@ -1292,7 +1292,7 @@ def reset_for_retract(self):
         self.temp_input_top_logprobs_val = None
         self.temp_input_top_logprobs_idx = None
         self.extend_logprob_start_len = 0
-        self.inflight_middle_chunks = 0
+        self.pending_middle_outputs = 0
         self.has_pending_chunk = False
         self.mamba_pool_idx = None
         self.mamba_ping_pong_track_buffer = None
@@ -2501,7 +2501,7 @@ def filter_batch(
                     exclude_chunked_req
                     and (
                         self.reqs[i].has_pending_chunk
-                        or self.reqs[i].inflight_middle_chunks > 0
+                        or self.reqs[i].pending_middle_outputs > 0
                         or self.reqs[i].is_dllm()
                     )
                 )
@@ -2585,7 +2585,7 @@ def merge_batch(self, other: "ScheduleBatch"):
         # the full exclude_chunked_req predicate so PP middle-chunk and DLLM
         # staging reqs are also caught here.
         assert not any(
-            r.has_pending_chunk or r.inflight_middle_chunks > 0 or r.is_dllm()
+            r.has_pending_chunk or r.pending_middle_outputs > 0 or r.is_dllm()
             for r in other.reqs
         )
 
diff --git a/python/sglang/srt/managers/schedule_policy.py b/python/sglang/srt/managers/schedule_policy.py
index b0564f33be78..c9c4a3636437 100644
--- a/python/sglang/srt/managers/schedule_policy.py
+++ b/python/sglang/srt/managers/schedule_policy.py
@@ -959,7 +959,7 @@ def add_one_req(self, req: Req, truncation_align_size: Optional[int]):
                 truncated = True
 
         # has_pending_chunk: persistent flag carrying chunked-resume state
-        # across iters. DLLM uses its own staging_queue + inflight_middle_chunks counter.
+        # across iters. DLLM uses its own staging_queue + pending_middle_outputs counter.
         if not req.is_dllm():
             req.has_pending_chunk = truncated
 
diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py
index b974bdd763b6..7e33f060c999 100644
--- a/python/sglang/srt/managers/scheduler.py
+++ b/python/sglang/srt/managers/scheduler.py
@@ -1022,7 +1022,7 @@ def init_chunked_prefill(self):
         elif self.chunked_prefill_size is not None and self.chunked_prefill_size <= 0:
             self.chunked_prefill_size = None
         # Chunked-resume tracking is now per-req (Req.has_pending_chunk +
-        # inflight_middle_chunks counter); the scheduler no longer holds a global pointer.
+        # pending_middle_outputs counter); the scheduler no longer holds a global pointer.
         # Stage A stashes any waiting_queue req with has_pending_chunk; cache
         # impls bound row reads by kv_committed_len so a stash after
         # init_next_round_input is safe without the old gate.
@@ -2651,7 +2651,7 @@ def _get_new_batch_prefill_raw(
             for req in adder.preempt_list:
                 self._add_request_to_queue(req)
 
-        # Bump inflight_middle_chunks for every admitted req that's still
+        # Bump pending_middle_outputs for every admitted req that's still
         # mid-prefill — output processor uses this to know its forward's
         # sample is garbage. Counter semantics needed for PP, where multiple
         # microbatches may admit the same req.
@@ -2661,7 +2661,7 @@ def _get_new_batch_prefill_raw(
         ), "single-flight invariant: at most one chunked-resume req per batch"
         chunk_deduct = 0
         for r in chunked_in_batch:
-            r.inflight_middle_chunks += 1
+            r.pending_middle_outputs += 1
             chunk_deduct = r.extend_input_len
 
         set_time_batch(can_run_list, "set_forward_entry_time")
@@ -3463,11 +3463,11 @@ def abort_request(self, recv_req: AbortReq):
         # 'in batch'. Each mb's forward was launched against the req's
         # req_pool_idx + KV slots; the output processor on a different mb
         # iteration consumes the result later. Without this, a chunked-resume
-        # req with inflight_middle_chunks > 0 sitting in waiting_queue would
+        # req with pending_middle_outputs > 0 sitting in waiting_queue would
         # fall into the waiting-only abort path, release_kv_cache would free
         # the row + KV underneath the still-launched forward, and the delayed
         # output processor would crash on a None req_pool_idx (or, with
-        # inflight_middle_chunks cleared to 0, mistake the middle-chunk
+        # pending_middle_outputs cleared to 0, mistake the middle-chunk
         # result for a full output and append garbage tokens).
         if self.pp_size > 1 and hasattr(self, "mbs"):
             for mb_list in (self.mbs, self.last_mbs, self.running_mbs):
@@ -3515,7 +3515,7 @@ def abort_request(self, recv_req: AbortReq):
                 # Defensive: clear pending-chunk flags on the orphaned req so a
                 # stale reference can't trigger Stage A re-stash of the freed row.
                 req.has_pending_chunk = False
-                req.inflight_middle_chunks = 0
+                req.pending_middle_outputs = 0
             logger.debug(f"Abort queued request. {req.rid=}")
 
         # Delete the requests in the grammar queue
diff --git a/python/sglang/srt/managers/scheduler_components/batch_result_processor.py b/python/sglang/srt/managers/scheduler_components/batch_result_processor.py
index ce52385987a4..6a88e72b8be2 100644
--- a/python/sglang/srt/managers/scheduler_components/batch_result_processor.py
+++ b/python/sglang/srt/managers/scheduler_components/batch_result_processor.py
@@ -215,7 +215,7 @@ def process_batch_result_prefill(
                     # decode req in mixed batch or retracted req
                     continue
 
-                if req.inflight_middle_chunks <= 0:
+                if req.pending_middle_outputs <= 0:
                     req.time_stats.set_prefill_finished_time()
 
                     # req output_ids are set here
@@ -264,7 +264,7 @@ def process_batch_result_prefill(
 
                 else:
                     # being chunked reqs' prefill is not finished
-                    req.inflight_middle_chunks -= 1
+                    req.pending_middle_outputs -= 1
                     # There is only at most one request being currently chunked.
                     # Because this request does not finish prefill,
                     # we don't want to stream the request currently being chunked.
@@ -304,7 +304,7 @@ def process_batch_result_prefill(
                 req.embedding = embeddings[i]
                 if req.return_pooled_hidden_states and phs is not None:
                     req.pooled_hidden_state = phs[i]
-                if req.inflight_middle_chunks <= 0:
+                if req.pending_middle_outputs <= 0:
                     req.time_stats.set_prefill_finished_time()
                     # Dummy output token for embedding models
                     req.output_ids.append(0)
@@ -317,7 +317,7 @@ def process_batch_result_prefill(
                         maybe_cache_unfinished_req(req, self.tree_cache)
                 else:
                     # being chunked reqs' prefill is not finished
-                    req.inflight_middle_chunks -= 1
+                    req.pending_middle_outputs -= 1
                     req.time_stats.set_last_chunked_prefill_finish_time()
 
         self.output_streamer.stream_output(
diff --git a/test/registered/unit/managers/test_hisparse_unit.py b/test/registered/unit/managers/test_hisparse_unit.py
index 56fc32a1620e..d5d272f91e53 100644
--- a/test/registered/unit/managers/test_hisparse_unit.py
+++ b/test/registered/unit/managers/test_hisparse_unit.py
@@ -52,7 +52,7 @@ def _make_req(rid="test-req-0", origin_input_ids=None, output_ids=None):
         finished_reason=None,
         hisparse_staging=False,
         staging=False,
-        inflight_middle_chunks=0,
+        pending_middle_outputs=0,
     )
     req.finished = lambda: req.finished_reason is not None
     req.set_extend_input_len = lambda extend_input_len: setattr(

From be72b26f7ecc2131fd10bbc9a5cf09299fc62e8f Mon Sep 17 00:00:00 2001
From: fzyzcjy <ch271828n@outlook.com>
Date: Tue, 19 May 2026 23:11:54 +0800
Subject: [PATCH 41/52] Fix Scheduler.pp_size refs: use ps.pp_size after
 ParallelState refactor

Upstream PR #25444 moved Scheduler.pp_size onto a frozen ParallelState
container (self.ps.pp_size). My branch's chunked-resume PP code still
referenced the old direct attribute, causing
AttributeError: 'Scheduler' object has no attribute 'pp_size'
in _in_flight_other_mb_rids and abort_request.
---
 python/sglang/srt/managers/scheduler.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py
index 7e33f060c999..dd3375971ab8 100644
--- a/python/sglang/srt/managers/scheduler.py
+++ b/python/sglang/srt/managers/scheduler.py
@@ -2278,7 +2278,7 @@ def _in_flight_other_mb_rids(self) -> set:
         delay them by 1 mb step for no correctness gain, so we skip them
         here and only return rids of reqs that genuinely need protection.
         """
-        if self.pp_size <= 1 or not hasattr(self, "mbs"):
+        if self.ps.pp_size <= 1 or not hasattr(self, "mbs"):
             return set()
         rids = set()
         for mb in self.mbs:
@@ -3469,7 +3469,7 @@ def abort_request(self, recv_req: AbortReq):
         # output processor would crash on a None req_pool_idx (or, with
         # pending_middle_outputs cleared to 0, mistake the middle-chunk
         # result for a full output and append garbage tokens).
-        if self.pp_size > 1 and hasattr(self, "mbs"):
+        if self.ps.pp_size > 1 and hasattr(self, "mbs"):
             for mb_list in (self.mbs, self.last_mbs, self.running_mbs):
                 for mb in mb_list:
                     if mb is not None and not mb.is_empty():

From 2a07502c3f49c74254bfe2523094ff29d5b7d177 Mon Sep 17 00:00:00 2001
From: fzyzcjy <ch271828n@outlook.com>
Date: Wed, 27 May 2026 11:45:03 +0800
Subject: [PATCH 42/52] Refactor: Introduce Scheduler.active_reqs ownership
 tracker (C1)

Adds a by-rid dict tracking the sync-mode reqs whose lifecycle the
scheduler currently owns (admitted, not finished, not retracted). It
runs as a parallel tracker alongside the existing waiting_queue /
running_batch without changing scheduler behavior. Setting
DEBUG_INVARIANTS=1 enables the _assert_invariants checks at the entry
and exit of get_next_batch_to_run.

Part of waiting_queue refactor plan, commit 1/7. See agent-drafts/
2026-05-25-waiting-queue-refactor-plan.md.
---
 python/sglang/srt/managers/scheduler.py       | 92 +++++++++++++++++++
 .../batch_result_processor.py                 |  9 ++
 2 files changed, 101 insertions(+)

diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py
index 72f2f413007c..614a5ba77a6a 100644
--- a/python/sglang/srt/managers/scheduler.py
+++ b/python/sglang/srt/managers/scheduler.py
@@ -698,6 +698,7 @@ def __init__(
             ),
             output_streamer=self.output_streamer,
             abort_request=self.abort_request,
+            deactivate_req=self._deactivate,
         )
 
         self.is_initializing = False
@@ -996,6 +997,12 @@ def init_model_worker(self):
 
     def init_running_status(self):
         self.waiting_queue: List[Req] = []
+        # By-rid ownership tracker for sync-mode reqs the scheduler currently
+        # owns the lifecycle of (admitted, not finished, not retracted). Runs
+        # as a parallel tracker alongside waiting_queue / running_batch.reqs /
+        # chunked retention without changing scheduler behavior. See
+        # agent-drafts/2026-05-25-waiting-queue-refactor-plan.md (C1).
+        self.active_reqs: Dict[str, Req] = {}
         # The running decoding batch for continuous batching
         self.running_batch: ScheduleBatch = ScheduleBatch(reqs=[], batch_is_full=False)
         # The current forward batch
@@ -1013,6 +1020,60 @@ def init_running_status(self):
         self.forward_sleep_time = None
         self._engine_paused = False
 
+    def _activate(self, req: Req) -> None:
+        """Mark req as entering active lifecycle (initial admission).
+
+        Caller must ensure req.rid is not already in active_reqs (chunked-resume
+        re-admit is filtered at the call site). See refactor plan §C1.
+        """
+        assert req.rid not in self.active_reqs, f"already active: {req.rid}"
+        self.active_reqs[req.rid] = req
+
+    def _deactivate(self, req: Req) -> None:
+        """Mark req as leaving active lifecycle (finish / abort / retract).
+
+        Important: this function ONLY pops from active_reqs dict.
+        - Does not clear req.req_pool_idx: batch_result_processor.py:774-787 PP
+          cross-mb idempotency guard relies on it as an "already released"
+          sentinel.
+        - Does not clear req.has_pending_chunk / req.pending_middle_outputs:
+          owned by the semantic finish/abort/retract sites.
+        - Does not call release_kv_cache: that is the responsibility of
+          release_req / abort / finish paths.
+        This function only answers "scheduler no longer owns this req's
+        lifecycle".
+        """
+        self.active_reqs.pop(req.rid, None)
+
+    def _assert_invariants(self) -> None:
+        """Debug-only invariant checks for active_reqs ownership tracking.
+
+        Gated by DEBUG_INVARIANTS=1 to avoid slowing down normal runs. Skipped
+        in disagg modes (Q1=(c): disagg has its own ownership model).
+        """
+        if not os.environ.get("DEBUG_INVARIANTS"):
+            return
+        if self.disaggregation_mode != DisaggregationMode.NULL:
+            return
+        waiting_rids = {r.rid for r in self.waiting_queue}
+        active_rids = set(self.active_reqs.keys())
+        running_rids = {r.rid for r in self.running_batch.reqs}
+
+        # sync mode: chunked-resume reqs still live in waiting_queue until C4
+        # deletes the retention. Relax waiting ∩ active here: any rid in the
+        # intersection must be a chunked-resume req.
+        intersection_rids = waiting_rids & active_rids
+        for rid in intersection_rids:
+            assert self.active_reqs[
+                rid
+            ].has_pending_chunk, (
+                f"{rid} in both waiting and active but not chunked-resume"
+            )
+
+        assert (
+            running_rids <= active_rids
+        ), f"running not subset of active: {running_rids - active_rids}"
+
     def init_chunked_prefill(self):
         self.chunked_prefill_size = self.server_args.chunked_prefill_size
         uses_transformers_backend = (
@@ -2331,6 +2392,7 @@ def _in_flight_other_mb_rids(self) -> set:
         return rids
 
     def get_next_batch_to_run(self) -> Optional[ScheduleBatch]:
+        self._assert_invariants()
         if self.enable_fpm:
             self._fpm_batch_t0 = time.monotonic()
         self._abort_on_waiting_timeout()
@@ -2467,6 +2529,7 @@ def get_next_batch_to_run(self) -> Optional[ScheduleBatch]:
             if self.enable_fpm:
                 ret.fpm_start_time = self._fpm_batch_t0
 
+        self._assert_invariants()
         return ret
 
     def get_num_allocatable_reqs(self, running_bs):
@@ -2676,6 +2739,14 @@ def _get_new_batch_prefill_raw(
         if len(can_run_list) == 0:
             return None
 
+        # audit A1: mark newly-admitted reqs as active. Filter chunked-resume
+        # re-admit (already active from a prior iter) — _activate's internal
+        # assert would trip otherwise. Filter can be removed once C3+C4 move
+        # chunked-resume out of the main admission loop.
+        for req in can_run_list:
+            if req.rid not in self.active_reqs:
+                self._activate(req)
+
         # Drop admitted reqs from waiting_queue, but KEEP chunked-resume reqs
         # (has_pending_chunk == True after admission) so they stay at the head
         # for the next iter's stash + admission. Single-flight is preserved
@@ -2686,6 +2757,10 @@ def _get_new_batch_prefill_raw(
         ]
         if adder.preempt_list:
             for req in adder.preempt_list:
+                # audit R2: PrefillAdder.preempt_to_schedule already released
+                # the victim's resources via running_batch.release_req. Drop
+                # from active_reqs before re-enqueueing as a waiting req.
+                self._deactivate(req)
                 self._add_request_to_queue(req)
 
         # Bump pending_middle_outputs for every admitted req that's still
@@ -2859,7 +2934,13 @@ def update_running_batch(self, batch: ScheduleBatch) -> Optional[ScheduleBatch]:
                 )
             logger.warning(msg_prefix + msg_details)
 
+            # audit R1: retract_decode released row + KV via release_req for
+            # both retracted_reqs (re-enqueued as waiting) and reqs_to_abort
+            # (final OOM eviction). Drop both from active_reqs.
+            for req in reqs_to_abort:
+                self._deactivate(req)
             for req in retracted_reqs:
+                self._deactivate(req)
                 self._add_request_to_queue(req, is_retracted=True)
         else:
             self.new_token_ratio_tracker.decay_step()
@@ -3567,6 +3648,9 @@ def abort_request(self, recv_req: AbortReq):
                 # stale reference can't trigger Stage A re-stash of the freed row.
                 req.has_pending_chunk = False
                 req.pending_middle_outputs = 0
+                # audit D6: orphan release in waiting_queue (sync mode mamba or
+                # chunked-resume mid-prefill); drop from active set.
+                self._deactivate(req)
             logger.debug(f"Abort queued request. {req.rid=}")
 
         # Delete the requests in the grammar queue
@@ -3676,7 +3760,11 @@ def pause_generation(self, recv_req: PauseGenerationReqInput):
                 self.running_batch.filter_batch(v1_spec_info_filtered=True)
                 if len(self.running_batch.reqs) != 0:
                     retracted_reqs = self.running_batch.retract_all(self.server_args)
+                    # audit R3: retract_all released resources via release_req
+                    # for every running req; drop from active_reqs before
+                    # re-enqueueing as waiting.
                     for req in retracted_reqs:
+                        self._deactivate(req)
                         self._add_request_to_queue(req)
 
                 self.running_batch.batch_is_full = False
@@ -3702,6 +3790,10 @@ def pause_generation(self, recv_req: PauseGenerationReqInput):
                         req.disagg_kv_sender = None
                     release_kv_cache(req, self.tree_cache, is_insert=False)
                     req.reset_for_retract()
+                    # audit D7: chunked-resume req released via reset_for_retract
+                    # stays in waiting_queue for re-prefill but no longer holds
+                    # row/KV, so it leaves the active set.
+                    self._deactivate(req)
 
     def continue_generation(self, recv_req: ContinueGenerationReqInput):
         if recv_req.torch_empty_cache:
diff --git a/python/sglang/srt/managers/scheduler_components/batch_result_processor.py b/python/sglang/srt/managers/scheduler_components/batch_result_processor.py
index 9385d2d03d4c..06f8aa29826e 100644
--- a/python/sglang/srt/managers/scheduler_components/batch_result_processor.py
+++ b/python/sglang/srt/managers/scheduler_components/batch_result_processor.py
@@ -78,6 +78,7 @@ class SchedulerBatchResultProcessor:
     logprob_result_processor: "SchedulerLogprobResultProcessor"
     output_streamer: "SchedulerOutputStreamer"
     abort_request: Callable
+    deactivate_req: Callable
 
     def process_batch_result_prebuilt(self, batch: ScheduleBatch):
         assert self.disaggregation_mode == DisaggregationMode.DECODE
@@ -231,6 +232,8 @@ def process_batch_result_prefill(
                         self._maybe_collect_routed_experts(req)
                         self._maybe_collect_indexer_topk(req)
                         release_kv_cache(req, self.tree_cache)
+                        # audit D1: sync prefill finish
+                        self.deactivate_req(req)
                         req.time_stats.set_completion_time()
                     elif not batch.decoding_reqs or req not in batch.decoding_reqs:
                         maybe_cache_unfinished_req(req, self.tree_cache)
@@ -315,6 +318,8 @@ def process_batch_result_prefill(
 
                     if req.finished():
                         release_kv_cache(req, self.tree_cache)
+                        # audit D2: embedding/reward prefill finish
+                        self.deactivate_req(req)
                         req.time_stats.set_completion_time()
                     else:
                         maybe_cache_unfinished_req(req, self.tree_cache)
@@ -800,6 +805,10 @@ def _handle_finished_req(
                 if self.server_args.enable_hisparse:
                     self.hisparse_coordinator.request_finished(req)
                 release_kv_cache(req, self.tree_cache)
+                # audit D3: sync decode finish (non-offload path). The DECODE
+                # offload branch (D4) does not call _deactivate — disagg DECODE
+                # is not in active_reqs (Q1=(c)).
+                self.deactivate_req(req)
 
             req.time_stats.set_completion_time()
 

From c8cb8eed9d0772f03acf4f627837621475c18791 Mon Sep 17 00:00:00 2001
From: fzyzcjy <ch271828n@outlook.com>
Date: Wed, 27 May 2026 14:14:04 +0800
Subject: [PATCH 43/52] Refactor: Scan chunked_reqs() in Stage A instead of
 waiting_queue (C2)

Eliminates H3 hack (Stage A scanning the full waiting_queue to find
chunked-resume reqs). Now scans the chunked_reqs() view derived from
active_reqs. Behavior identical to C1 because C1's retention keeps
waiting_queue and active_reqs in sync for chunked-resume reqs.

Part of waiting_queue refactor plan, commit 2/7.
---
 python/sglang/srt/managers/scheduler.py | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py
index 614a5ba77a6a..935d27c36951 100644
--- a/python/sglang/srt/managers/scheduler.py
+++ b/python/sglang/srt/managers/scheduler.py
@@ -25,7 +25,7 @@
 from contextlib import contextmanager, nullcontext
 from functools import partial
 from http import HTTPStatus
-from typing import Any, Deque, Dict, List, Optional, Tuple, Union
+from typing import Any, Deque, Dict, Iterable, List, Optional, Tuple, Union
 
 from sglang.srt.utils.common import suppress_noisy_warnings
 
@@ -1074,6 +1074,12 @@ def _assert_invariants(self) -> None:
             running_rids <= active_rids
         ), f"running not subset of active: {running_rids - active_rids}"
 
+    def chunked_reqs(self) -> Iterable[Req]:
+        """active_reqs 中 has_pending_chunk=True 的派生 view。
+        Single-flight 不变量（Q5，§7-Q5）：len(list(chunked_reqs())) <= 1，
+        在 _get_new_batch_prefill_raw 顶端断言（C3 引入）。"""
+        return (r for r in self.active_reqs.values() if r.has_pending_chunk)
+
     def init_chunked_prefill(self):
         self.chunked_prefill_size = self.server_args.chunked_prefill_size
         uses_transformers_backend = (
@@ -2414,8 +2420,11 @@ def get_next_batch_to_run(self) -> Optional[ScheduleBatch]:
         # for the duration of the scheduling pass. vLLM / TokenSpeed do not
         # need this because their admission reads a single monotone counter
         # (num_computed_tokens / FSM state), not a prefix-indices splice.
-        for req in self.waiting_queue:
-            if req.has_pending_chunk and not req.is_dllm():
+        # audit P1: Stage A — stash chunked-resume KV into radix tree at iter
+        # boundary. Switch from scanning waiting_queue (H3 hack) to iterating
+        # the chunked_reqs() view directly.
+        for req in self.chunked_reqs():
+            if not req.is_dllm():
                 maybe_cache_unfinished_req(req, self.tree_cache, chunked=True)
 
         if self.dllm_config is not None and self.dllm_manager.any_staging_reqs():

From 0810ca8a269d2863f2c2915da8aee7b1c24c30b9 Mon Sep 17 00:00:00 2001
From: fzyzcjy <ch271828n@outlook.com>
Date: Wed, 27 May 2026 14:20:39 +0800
Subject: [PATCH 44/52] Refactor: Inline chunked admission, strip main-loop
 chunked branches (C3)

Adds an inline chunked admission block at the top of
_get_new_batch_prefill_raw that consumes chunked_reqs() directly.
Strips has_pending_chunk branches from the main waiting_queue loop
(H6 LoRA drainer bypass, H7 init_next_round_input split). The
waiting_queue retention for chunked-resume is still in place; it is
removed in C4. Single-flight assertion enforced at the inline
admission entry.

Part of waiting_queue refactor plan, commit 3/7.
---
 python/sglang/srt/managers/scheduler.py | 105 ++++++++++++++++++++----
 1 file changed, 87 insertions(+), 18 deletions(-)

diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py
index 935d27c36951..540d1800bd36 100644
--- a/python/sglang/srt/managers/scheduler.py
+++ b/python/sglang/srt/managers/scheduler.py
@@ -156,6 +156,7 @@
     ScheduleBatch,
 )
 from sglang.srt.managers.schedule_policy import (
+    CLIP_MAX_NEW_TOKENS,
     AddReqResult,
     PrefillAdder,
     SchedulePolicy,
@@ -2569,6 +2570,11 @@ def get_new_batch_prefill(self) -> Optional[ScheduleBatch]:
     def _get_new_batch_prefill_raw(
         self, prefill_delayer_single_pass: Optional[PrefillDelayerSinglePassExecutor]
     ) -> Optional[ScheduleBatch]:
+        # Chunked-resume admission: handled by the inline block below
+        # (`chunked_reqs()` filter + adder bookkeeping). The main
+        # waiting_queue loop further down admits ONLY truly-waiting reqs (no
+        # has_pending_chunk paths). See agent-drafts/
+        # 2026-05-25-waiting-queue-refactor-plan.md §C3.
         # Check if the grammar is ready in the grammar queue
         if self.grammar_manager.has_waiting_grammars():
             ready_grammar_requests = self.grammar_manager.get_ready_grammar_requests()
@@ -2649,6 +2655,73 @@ def _get_new_batch_prefill_raw(
             waiting_queue_len=len(self.waiting_queue),
         )
 
+        # audit Q5: single-flight invariant — at most one chunked-resume req
+        # in active at any time.
+        chunked_in_active = list(self.chunked_reqs())
+        assert len(chunked_in_active) <= 1, (
+            f"single-flight violated: {len(chunked_in_active)} chunked reqs "
+            f"in active ({[r.rid for r in chunked_in_active]})"
+        )
+
+        # Inline chunked admission (Plan §C3, do NOT recreate
+        # PrefillAdder.add_chunked_req per §10 decision). The chunked-resume
+        # req keeps its row/KV/lock_ref from prior admission, so
+        # init_next_round_input must NOT re-match prefix (no tree_cache arg,
+        # H7 elimination). Budget bookkeeping is inlined here — minor LOC;
+        # intentionally not extracted. Mirrors the body of upstream/main
+        # PrefillAdder.add_chunked_req.
+        if chunked_in_active:
+            chunked_req = chunked_in_active[0]
+            chunked_req.init_next_round_input()
+            if adder.dllm_config is not None:
+                _rem_tokens = adder._get_dllm_remain_tokens()
+            else:
+                _rem_tokens = min(adder.rem_chunk_tokens, int(adder.rem_total_tokens))
+                if adder.is_hybrid_swa:
+                    # alloc_extend needs extend_num_tokens + page_size per
+                    # request, so reserve one page here to avoid OOM.
+                    _rem_tokens = min(
+                        _rem_tokens, int(adder.rem_swa_tokens) - adder.page_size
+                    )
+                # The chunked_req must be added to the list; otherwise, it
+                # will cause a memory leak. Therefore, in certain cases where
+                # _rem_tokens <= 0, it should be replaced with
+                # rem_chunk_tokens. Under hybrid_swa with no room, skip this
+                # iter — the chunked req stays in active_reqs and is retried
+                # next iter (mirrors upstream `return req`).
+                if _rem_tokens <= 0:
+                    if adder.is_hybrid_swa:
+                        _rem_tokens = None
+                    else:
+                        _rem_tokens = adder.rem_chunk_tokens
+
+            if _rem_tokens is not None:
+                truncated = chunked_req.extend_input_len > _rem_tokens
+                chunked_req.set_extend_input_len(
+                    min(chunked_req.extend_input_len, _rem_tokens)
+                )
+                chunked_req.fill_ids = chunked_req.fill_ids[
+                    : len(chunked_req.prefix_indices) + chunked_req.extend_input_len
+                ]
+                adder.can_run_list.append(chunked_req)
+                adder._update_prefill_budget(
+                    0,
+                    chunked_req.extend_input_len,
+                    (
+                        min(
+                            chunked_req.sampling_params.max_new_tokens,
+                            CLIP_MAX_NEW_TOKENS,
+                        )
+                        if not truncated
+                        else 0
+                    ),
+                )
+                # has_pending_chunk: persistent flag carrying chunked-resume
+                # state across iters. When truncated=False, this was the last
+                # chunk — clear the flag so the req exits chunked_reqs().
+                if not chunked_req.is_dllm():
+                    chunked_req.has_pending_chunk = truncated
+
         if self.enable_lora:
             running_loras = {req.lora_id for req in self.running_batch.reqs}
 
@@ -2660,16 +2733,17 @@ def _get_new_batch_prefill_raw(
 
         # Get requests from the waiting queue to a new prefill batch
         for req in self.waiting_queue:
-            # Chunked-resume reqs hold a row + tree lock_ref from their prior
-            # admission. If the LoRA drainer rejects them mid-prefill, they
-            # stay in waiting_queue forever — deadlock + KV leak. Their LoRA
-            # adapter was already accepted on the first admission, so the
-            # drainer/validate check is moot for them.
-            if (
-                self.enable_lora
-                and not req.has_pending_chunk
-                and not self._can_schedule_lora_req(req, running_loras)
-            ):
+            # Chunked-resume req is admitted via the inline block above
+            # (Plan §C3). It still rides H2 retention in waiting_queue until
+            # C4 removes that — skip it here to avoid double-admit. Once C4
+            # drops the retention, this guard becomes a no-op and can be
+            # removed.
+            if req.has_pending_chunk:
+                continue
+
+            # audit H6: chunked-resume no longer flows through main loop;
+            # drainer check applies uniformly.
+            if self.enable_lora and not self._can_schedule_lora_req(req, running_loras):
                 continue
 
             running_bs = len(self.running_batch.reqs)
@@ -2698,14 +2772,9 @@ def _get_new_batch_prefill_raw(
                     req.rid
                 )
 
-            # Chunked-resume reqs must NOT re-match prefix at admission
-            # (would re-assign req.last_node without rebalancing lock_ref,
-            # corrupting cache_unfinished_req's dec_lock_ref/inc_lock_ref
-            # pairing). They keep last_node from previous stash.
-            if req.has_pending_chunk:
-                req.init_next_round_input()
-            else:
-                req.init_next_round_input(self.tree_cache)
+            # audit H7: chunked-resume handled in inline admission above;
+            # main loop unconditional.
+            req.init_next_round_input(self.tree_cache)
             res = adder.add_one_req(
                 req,
                 truncation_align_size=self.truncation_align_size,

From c19d510601b156681a7efc8c2759eadb52882946 Mon Sep 17 00:00:00 2001
From: fzyzcjy <ch271828n@outlook.com>
Date: Wed, 27 May 2026 14:24:14 +0800
Subject: [PATCH 45/52] Refactor: Remove waiting_queue retention for
 chunked-resume (C4)

Chunked-resume reqs no longer anchor in waiting_queue (H2 hack
elimination). The retention `or x.has_pending_chunk` is removed; the
transitional guard added in C3 to prevent double-admit is also
removed. After this commit, chunked-resume reqs live exclusively in
active_reqs and are re-admitted via the inline block at the top of
_get_new_batch_prefill_raw.

Part of waiting_queue refactor plan, commit 4/7.
---
 python/sglang/srt/managers/scheduler.py | 19 ++++---------------
 1 file changed, 4 insertions(+), 15 deletions(-)

diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py
index 540d1800bd36..bbd4ae1931a8 100644
--- a/python/sglang/srt/managers/scheduler.py
+++ b/python/sglang/srt/managers/scheduler.py
@@ -2733,14 +2733,6 @@ def _get_new_batch_prefill_raw(
 
         # Get requests from the waiting queue to a new prefill batch
         for req in self.waiting_queue:
-            # Chunked-resume req is admitted via the inline block above
-            # (Plan §C3). It still rides H2 retention in waiting_queue until
-            # C4 removes that — skip it here to avoid double-admit. Once C4
-            # drops the retention, this guard becomes a no-op and can be
-            # removed.
-            if req.has_pending_chunk:
-                continue
-
             # audit H6: chunked-resume no longer flows through main loop;
             # drainer check applies uniformly.
             if self.enable_lora and not self._can_schedule_lora_req(req, running_loras):
@@ -2825,14 +2817,11 @@ def _get_new_batch_prefill_raw(
             if req.rid not in self.active_reqs:
                 self._activate(req)
 
-        # Drop admitted reqs from waiting_queue, but KEEP chunked-resume reqs
-        # (has_pending_chunk == True after admission) so they stay at the head
-        # for the next iter's stash + admission. Single-flight is preserved
-        # naturally by budget + priority.
+        # audit H2: retention removed. chunked-resume reqs are no longer
+        # anchored in waiting_queue — they live in active_reqs and are
+        # re-admitted via the inline chunked admission loop (C3).
         can_run_set = set(can_run_list)
-        self.waiting_queue = [
-            x for x in self.waiting_queue if x not in can_run_set or x.has_pending_chunk
-        ]
+        self.waiting_queue = [x for x in self.waiting_queue if x not in can_run_set]
         if adder.preempt_list:
             for req in adder.preempt_list:
                 # audit R2: PrefillAdder.preempt_to_schedule already released

From b00d5f6bad0ca1f7e191978c6112066b0b1d38f9 Mon Sep 17 00:00:00 2001
From: fzyzcjy <ch271828n@outlook.com>
Date: Wed, 27 May 2026 14:27:25 +0800
Subject: [PATCH 46/52] Refactor: Remove early-exit / dynamic-chunking /
 abort-timeout chunked bypasses (C5)

Now that chunked-resume reqs live in active_reqs (post-C4), the
defensive bypasses that scanned waiting_queue for has_pending_chunk
become dead code. Eliminates H4 (early-exit has_chunked_resume scan),
H5 (dynamic-chunking lookup), AB7 (_abort_on_waiting_timeout
has_pending_chunk skip), plus a stale comment referencing the deleted
retention. Single chunked_in_active computation reused throughout
_get_new_batch_prefill_raw.

Part of waiting_queue refactor plan, commit 5/7.
---
 python/sglang/srt/managers/scheduler.py | 58 ++++++++++---------------
 1 file changed, 23 insertions(+), 35 deletions(-)

diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py
index bbd4ae1931a8..15e98f58a7b1 100644
--- a/python/sglang/srt/managers/scheduler.py
+++ b/python/sglang/srt/managers/scheduler.py
@@ -2219,13 +2219,8 @@ def _abort_on_waiting_timeout(self):
 
         deleted_reqs = set()
         deadline = time.perf_counter() - timeout_s
+        # audit AB7: chunked-resume no longer in waiting_queue (C4), bypass removed.
         for req in self.waiting_queue:
-            # Chunked-resume reqs sit in waiting_queue across iters while
-            # actively prefilling — they are not idle. Their entry_time is
-            # from their original arrival, so a long prefill would falsely
-            # trigger the timeout and leak KV + row.
-            if req.has_pending_chunk:
-                continue
             entry_time = req.time_stats.wait_queue_entry_time
             if 0 < entry_time < deadline:
                 if self.enable_hicache_storage:
@@ -2454,9 +2449,8 @@ def get_next_batch_to_run(self) -> Optional[ScheduleBatch]:
             # Drop chunked-resume reqs before merging last_batch into
             # running_batch. running_batch runs decode forward and admitting
             # a mid-prefill req there breaks shapes + KV accounting. The
-            # dropped reqs persist in self.waiting_queue (retention at
-            # ~line 2775: `x not in can_run_set or x.has_pending_chunk`)
-            # and re-enter via next iter's Stage A stash + admission.
+            # dropped reqs persist in self.active_reqs and re-enter via the
+            # inline chunked admission in _get_new_batch_prefill_raw.
             #
             # PP cross-mb: also drop reqs whose LAST chunk forward is still
             # in flight in another mb (when more decodes will follow — i.e.,
@@ -2588,15 +2582,19 @@ def _get_new_batch_prefill_raw(
             # Reset batch_is_full to try preemption with a prefill adder.
             self.running_batch.batch_is_full = False
 
-        # Identify any in-flight chunked-resume req held in waiting_queue —
-        # priority + has_pending_chunk make it sit at the head, but its
-        # presence relaxes the "is queue empty / pool full" early exits below
-        # (we must keep scheduling it to make progress, or memory leaks).
-        has_chunked_resume = any(r.has_pending_chunk for r in self.waiting_queue)
+        # audit H4 + Q5: chunked-resume now lives in active_reqs (not
+        # waiting_queue, post-C4). Compute the single-flight view once here
+        # and reuse below for early-exit relaxation, dynamic chunking, and
+        # the inline chunked admission entry.
+        chunked_in_active = list(self.chunked_reqs())
+        assert len(chunked_in_active) <= 1, (
+            f"single-flight violated: {len(chunked_in_active)} chunked reqs "
+            f"in active ({[r.rid for r in chunked_in_active]})"
+        )
 
         if (
             self.running_batch.batch_is_full or len(self.waiting_queue) == 0
-        ) and not has_chunked_resume:
+        ) and not chunked_in_active:
             return None
 
         running_bs = len(self.running_batch.reqs)
@@ -2607,7 +2605,7 @@ def _get_new_batch_prefill_raw(
         # check should not block them.
         if (
             self.get_num_allocatable_reqs(running_bs) <= 0
-            and not has_chunked_resume
+            and not chunked_in_active
             and not self.enable_priority_preemption
         ):
             self.running_batch.batch_is_full = True
@@ -2624,17 +2622,15 @@ def _get_new_batch_prefill_raw(
 
         # Determine chunked_prefill_size for this batch
         chunked_prefill_size = self.chunked_prefill_size
-        if self.enable_dynamic_chunking:
-            # Single-flight invariant: at most one chunked-resume req in the
-            # queue at any time (priority + budget enforce this naturally).
-            chunked_resume = next(
-                (r for r in self.waiting_queue if r.has_pending_chunk), None
-            )
-            if chunked_resume is not None:
-                history_len = len(chunked_resume.prefix_indices)
-                dynamic_size = self.predict_next_chunk_size(history_len)
-                if dynamic_size is not None:
-                    chunked_prefill_size = dynamic_size
+        if self.enable_dynamic_chunking and chunked_in_active:
+            # audit H5: chunked-resume lives in active_reqs; reuse the
+            # single-flight view computed above instead of scanning
+            # waiting_queue.
+            chunked_resume = chunked_in_active[0]
+            history_len = len(chunked_resume.prefix_indices)
+            dynamic_size = self.predict_next_chunk_size(history_len)
+            if dynamic_size is not None:
+                chunked_prefill_size = dynamic_size
 
         # Prefill policy
         adder = PrefillAdder(
@@ -2655,14 +2651,6 @@ def _get_new_batch_prefill_raw(
             waiting_queue_len=len(self.waiting_queue),
         )
 
-        # audit Q5: single-flight invariant — at most one chunked-resume req
-        # in active at any time.
-        chunked_in_active = list(self.chunked_reqs())
-        assert len(chunked_in_active) <= 1, (
-            f"single-flight violated: {len(chunked_in_active)} chunked reqs "
-            f"in active ({[r.rid for r in chunked_in_active]})"
-        )
-
         # Inline chunked admission (Plan §C3, do NOT recreate
         # PrefillAdder.add_chunked_req per §10 decision). The chunked-resume
         # req keeps its row/KV/lock_ref from prior admission, so

From 294fb739ee0a247e16283de6bb5d5298d8c31819 Mon Sep 17 00:00:00 2001
From: fzyzcjy <ch271828n@outlook.com>
Date: Wed, 27 May 2026 14:31:42 +0800
Subject: [PATCH 47/52] Refactor: Simplify abort_request, handle stashed
 chunked-resume (C6)

Eliminates H1 (dual-existence comment) and H8 (defensive
has_pending_chunk / pending_middle_outputs reset on waiting-segment
orphan release). Post-C4 chunked-resume reqs no longer live in
waiting_queue, so the waiting-segment orphan branch is narrowed to
mamba-pool reqs only.

Critical: the active-segment loop now iterates active_reqs instead of
batch_reqs, distinguishing in-batch reqs (FINISH_ABORT via batch
result path) from stashed chunked-resume reqs (immediate release +
_deactivate, audit finding 2). Without this, aborting a chunked-
resume mid-prefill outside of any current batch would leak
row + KV + lock_ref.

Part of waiting_queue refactor plan, commit 6/7.
---
 python/sglang/srt/managers/scheduler.py | 66 ++++++++++++++++---------
 1 file changed, 42 insertions(+), 24 deletions(-)

diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py
index 15e98f58a7b1..e7124fafbcaf 100644
--- a/python/sglang/srt/managers/scheduler.py
+++ b/python/sglang/srt/managers/scheduler.py
@@ -3638,10 +3638,7 @@ def handle_rpc_request(self, recv_req: RpcReqInput):
 
     def abort_request(self, recv_req: AbortReq):
         # todo hisparse, release resources for abort requests in hisparse coordinator
-        # Build batch rid set: chunked-resume reqs may live in both waiting_queue
-        # and batch.reqs simultaneously (stateless-scheduler refactor). Skip the
-        # waiting_queue removal for those — let the to_finish path below handle
-        # them, otherwise we send_output / release_kv_cache twice.
+        # Post-C4: chunked-resume reqs live in active_reqs only, never in waiting_queue.
         if self.cur_batch is self.running_batch or self.cur_batch is None:
             batch_reqs = list(self.running_batch.reqs)
         else:
@@ -3690,21 +3687,18 @@ def abort_request(self, recv_req: AbortReq):
                     req, self.req_to_metadata_buffer_idx_allocator
                 )
 
-            # For mamba radix cache, or for chunked-resume reqs whose prior
-            # admissions already allocated a row + KV + radix lock_ref. Without
-            # this branch, aborting a chunked-resume req that is currently only
-            # in waiting_queue (not in any batch's reqs) leaks all three.
+            # audit AB4 simplified post-C4: only mamba radix cache reqs can be
+            # in waiting_queue with mamba_pool_idx held. Chunked-resume reqs
+            # are NOT in waiting_queue anymore (live in active_reqs); their
+            # abort-time release happens in the active_reqs loop below.
             if (
                 req.mamba_pool_idx is not None
-                or (req.has_pending_chunk and req.req_pool_idx is not None)
-            ) and self.disaggregation_mode != DisaggregationMode.DECODE:
+                and self.disaggregation_mode != DisaggregationMode.DECODE
+            ):
                 release_kv_cache(req, self.tree_cache, is_insert=False)
-                # Defensive: clear pending-chunk flags on the orphaned req so a
-                # stale reference can't trigger Stage A re-stash of the freed row.
-                req.has_pending_chunk = False
-                req.pending_middle_outputs = 0
-                # audit D6: orphan release in waiting_queue (sync mode mamba or
-                # chunked-resume mid-prefill); drop from active set.
+                # audit D6 (mamba branch): drop from active set if present.
+                # (mamba-radix path may or may not put req in active_reqs;
+                # _deactivate is idempotent.)
                 self._deactivate(req)
             logger.debug(f"Abort queued request. {req.rid=}")
 
@@ -3757,16 +3751,40 @@ def abort_request(self, recv_req: AbortReq):
                         remaining_retracted.append(decode_req)
                 self.disagg_decode_prealloc_queue.retracted_queue = remaining_retracted
 
-        # Delete requests in the running batch (reuse batch_reqs built above)
-        for req in batch_reqs:
-            if not req.finished() and (
-                recv_req.abort_all or req.rid.startswith(recv_req.rid)
-            ):
-                # Abort method 3: set `to_finish`
-                # The request will still run one decode forward pass.
-                # Then we reuse all existing code to clean up the KV cache allocation.
+        # audit finding 2 (Plan §C6 Edit 3): iterate active_reqs instead of
+        # batch_reqs so that stashed chunked-resume reqs (in active_reqs but
+        # NOT in any current batch) get their resources released immediately.
+        # batch_rids was built above and includes cur_batch + running_batch +
+        # PP mbs[*]; "in-batch" reqs go through to_finish, "stashed-chunked"
+        # reqs need explicit release because no batch result path will pick
+        # them up.
+        for rid in list(self.active_reqs.keys()):
+            req = self.active_reqs[rid]
+            if req.finished():
+                continue
+            if not (recv_req.abort_all or rid.startswith(recv_req.rid)):
+                continue
+
+            if rid in batch_rids:
+                # In some batch: standard to_finish path; release_kv_cache +
+                # _deactivate happen in process_batch_result_*.
                 logger.debug(f"Abort running request. {req.rid=}")
                 req.to_finish = FINISH_ABORT()
+            else:
+                # Active but not in any batch — the only legitimate case is
+                # a stashed chunked-resume mid-prefill (audit finding 2).
+                # Release immediately, else row+KV+lock_ref leak.
+                assert req.has_pending_chunk and req.req_pool_idx is not None, (
+                    f"unexpected active-but-not-in-batch req: {rid} "
+                    f"has_pending_chunk={req.has_pending_chunk} "
+                    f"req_pool_idx={req.req_pool_idx}"
+                )
+                if self.disaggregation_mode != DisaggregationMode.DECODE:
+                    release_kv_cache(req, self.tree_cache, is_insert=False)
+                    req.has_pending_chunk = False
+                    req.pending_middle_outputs = 0
+                    self._deactivate(req)
+                logger.debug(f"Abort stashed chunked-resume request. {req.rid=}")
 
     def _pause_engine(self) -> Tuple[List[Req], int]:
         raise NotImplementedError()

From 68cec4a1c39f74c7da736cde06e03f71e024ebcb Mon Sep 17 00:00:00 2001
From: fzyzcjy <ch271828n@outlook.com>
Date: Wed, 27 May 2026 14:39:00 +0800
Subject: [PATCH 48/52] Refactor: Tighten invariants, finalize docstrings (C7)

- Tightens _assert_invariants: waiting_queue and active_reqs are now
  strictly disjoint (sync mode); C1's relaxed transitional clause
  removed.
- Removes C1's _activate idempotency filter at the admission call
  site; the main admission loop no longer produces re-admits after
  C3/C4.
- Adds comprehensive invariant documentation as field-level comments
  on Scheduler.active_reqs and method docstring on chunked_reqs().
- Migrates pause_generation(retract) chunked release path to iterate
  chunked_reqs() instead of scanning waiting_queue (dead post-C4),
  and flags a pre-existing latent bug (req not re-enqueued after
  reset_for_retract).

Concludes the waiting_queue refactor chain (commit 7/7). See
agent-drafts/2026-05-25-waiting-queue-refactor-plan.md and audit.
---
 python/sglang/srt/managers/scheduler.py | 107 ++++++++++++++++--------
 1 file changed, 73 insertions(+), 34 deletions(-)

diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py
index e7124fafbcaf..1267b8f603ca 100644
--- a/python/sglang/srt/managers/scheduler.py
+++ b/python/sglang/srt/managers/scheduler.py
@@ -998,11 +998,29 @@ def init_model_worker(self):
 
     def init_running_status(self):
         self.waiting_queue: List[Req] = []
-        # By-rid ownership tracker for sync-mode reqs the scheduler currently
-        # owns the lifecycle of (admitted, not finished, not retracted). Runs
-        # as a parallel tracker alongside waiting_queue / running_batch.reqs /
-        # chunked retention without changing scheduler behavior. See
-        # agent-drafts/2026-05-25-waiting-queue-refactor-plan.md (C1).
+        # `active_reqs`: sync-mode reqs the scheduler currently owns the
+        # lifecycle of (admitted, not finished, not retracted, not aborted-
+        # released). by-rid indexed.
+        #
+        # Definition (Plan §7-Q7): admitted via `_get_new_batch_prefill_raw`
+        # and not yet released through finish/retract/abort. Includes normal
+        # decode reqs AND mid-prefill chunked-resume reqs AND PP cross-mb
+        # in-flight reqs (the last two: NOT in running_batch.reqs but still
+        # holding row + KV + lock_ref).
+        #
+        # Invariants:
+        # * `waiting_queue ∩ active_reqs == ∅` (sync mode; disagg modes use
+        #    their own ownership managers, see Q1=(c)).
+        # * `set(running_batch.reqs) ⊆ active_reqs` (in-batch always active).
+        # * `set(chunked_reqs()) ⊆ active_reqs` (by definition).
+        # * `len(list(chunked_reqs())) <= 1` (Q5 single-flight; asserted at
+        #    inline chunked admission entry).
+        # * `active_reqs` keys are in 1:1 correspondence with allocated
+        #    `req_to_token_pool` rows (sync mode).
+        #
+        # Maintained at: `_activate` / `_deactivate` (only entry points).
+        # See agent-drafts/2026-05-25-waiting-queue-refactor-plan.md and
+        # 2026-05-25-scheduler-lifecycle-audit.md.
         self.active_reqs: Dict[str, Req] = {}
         # The running decoding batch for continuous batching
         self.running_batch: ScheduleBatch = ScheduleBatch(reqs=[], batch_is_full=False)
@@ -1060,25 +1078,30 @@ def _assert_invariants(self) -> None:
         active_rids = set(self.active_reqs.keys())
         running_rids = {r.rid for r in self.running_batch.reqs}
 
-        # sync mode: chunked-resume reqs still live in waiting_queue until C4
-        # deletes the retention. Relax waiting ∩ active here: any rid in the
-        # intersection must be a chunked-resume req.
-        intersection_rids = waiting_rids & active_rids
-        for rid in intersection_rids:
-            assert self.active_reqs[
-                rid
-            ].has_pending_chunk, (
-                f"{rid} in both waiting and active but not chunked-resume"
-            )
+        # sync mode: waiting_queue and active_reqs are strictly disjoint
+        # (C4 removed chunked-resume retention; chunked-resume now lives in
+        # active_reqs only).
+        assert not waiting_rids & active_rids, (
+            f"waiting_queue and active_reqs must be disjoint (sync mode); "
+            f"overlap: {waiting_rids & active_rids}"
+        )
 
         assert (
             running_rids <= active_rids
         ), f"running not subset of active: {running_rids - active_rids}"
 
     def chunked_reqs(self) -> Iterable[Req]:
-        """active_reqs 中 has_pending_chunk=True 的派生 view。
-        Single-flight 不变量（Q5，§7-Q5）：len(list(chunked_reqs())) <= 1，
-        在 _get_new_batch_prefill_raw 顶端断言（C3 引入）。"""
+        """Active reqs currently in mid-prefill (`has_pending_chunk=True`).
+
+        Derived view over `active_reqs` — no separate storage. Single-flight
+        invariant (Q5): `len(list(chunked_reqs())) <= 1` at any iter
+        boundary; asserted at the entry of the inline chunked admission
+        block in `_get_new_batch_prefill_raw`.
+
+        Iteration semantics: returns a fresh generator each call; consume
+        once or wrap in `list(...)`. Callers that mutate `active_reqs`
+        during iteration must `list(...)` first.
+        """
         return (r for r in self.active_reqs.values() if r.has_pending_chunk)
 
     def init_chunked_prefill(self):
@@ -2797,13 +2820,18 @@ def _get_new_batch_prefill_raw(
         if len(can_run_list) == 0:
             return None
 
-        # audit A1: mark newly-admitted reqs as active. Filter chunked-resume
-        # re-admit (already active from a prior iter) — _activate's internal
-        # assert would trip otherwise. Filter can be removed once C3+C4 move
-        # chunked-resume out of the main admission loop.
+        # audit A1: mark newly-admitted reqs as active. Post-C3/C4 the main
+        # admission loop (the for-loop over waiting_queue above) only
+        # produces brand-new admissions. The inline chunked admission block
+        # also appends to `can_run_list` for chunked-resume re-admit, and
+        # those reqs are already in active_reqs from a prior iter (the
+        # inline block does NOT call _activate). Skip them here so the
+        # strict `_activate` assert (post-C7) catches accidental
+        # double-admission for everything else.
         for req in can_run_list:
-            if req.rid not in self.active_reqs:
-                self._activate(req)
+            if req.rid in self.active_reqs:
+                continue
+            self._activate(req)
 
         # audit H2: retention removed. chunked-resume reqs are no longer
         # anchored in waiting_queue — they live in active_reqs and are
@@ -3842,14 +3870,18 @@ def pause_generation(self, recv_req: PauseGenerationReqInput):
 
                 self.running_batch.batch_is_full = False
 
-            # Chunked-resume reqs in waiting_queue still hold their row + KV +
-            # radix lock_ref from prior admissions. Without explicit release,
-            # pause(retract)'s 'flush_cache can succeed' contract (see
-            # PauseGenerationReqInput docstring) is violated. Release in-place
-            # and reset their chunked state so continue_generation re-prefills
-            # them from origin_input_ids.
-            for req in self.waiting_queue:
-                if req.has_pending_chunk and req.req_pool_idx is not None:
+            # Chunked-resume reqs still hold their row + KV + radix lock_ref
+            # from prior admissions. Without explicit release, pause(retract)'s
+            # 'flush_cache can succeed' contract (see PauseGenerationReqInput
+            # docstring) is violated. Release in-place and reset their chunked
+            # state so continue_generation re-prefills them from
+            # origin_input_ids.
+            #
+            # audit C7: chunked-resume lives in active_reqs (post-C4),
+            # iterate chunked_reqs() directly. list(...) because we mutate
+            # active_reqs via _deactivate inside the loop.
+            for req in list(self.chunked_reqs()):
+                if req.req_pool_idx is not None:
                     # Disagg-prefill: signal the decode side that the send was
                     # retracted and drop our sender ref so re-prefill rebuilds
                     # the bootstrap state. start_send_idx / tmp_end_idx are
@@ -3864,9 +3896,16 @@ def pause_generation(self, recv_req: PauseGenerationReqInput):
                     release_kv_cache(req, self.tree_cache, is_insert=False)
                     req.reset_for_retract()
                     # audit D7: chunked-resume req released via reset_for_retract
-                    # stays in waiting_queue for re-prefill but no longer holds
-                    # row/KV, so it leaves the active set.
+                    # no longer holds row/KV, so it leaves the active set.
                     self._deactivate(req)
+                    # TODO(post-refactor follow-up): plan §10 flag — after
+                    # reset_for_retract, this req is NOT re-enqueued to
+                    # waiting_queue. Either the design relies on the original
+                    # reference staying in waiting_queue (but C4 removed
+                    # retention!) or this is a pre-existing latent bug from
+                    # before the refactor. Investigate separately. See
+                    # agent-drafts/2026-05-25-waiting-queue-refactor-plan.md
+                    # §10.
 
     def continue_generation(self, recv_req: ContinueGenerationReqInput):
         if recv_req.torch_empty_cache:

From a1e67f78b28c477a7c223bd3fa1bfaccb4edc306 Mon Sep 17 00:00:00 2001
From: fzyzcjy <ch271828n@outlook.com>
Date: Wed, 27 May 2026 15:08:46 +0800
Subject: [PATCH 49/52] Fix: Gate Scheduler._activate to sync-mode non-DLLM
 only (C8)

Review of C1-C7 revealed two P0 bugs and one P1:

1. _activate fired unconditionally in _get_new_batch_prefill_raw,
   enrolling disagg PREFILL and DLLM reqs into active_reqs. Neither
   path has a corresponding _deactivate (disagg PREFILL uses
   process_batch_result_disagg_prefill; DLLM uses dllm/mixin paths),
   leaking active_reqs entries indefinitely and crashing abort_all
   via the new stashed-chunked assert (C6).

2. flush_cache cleared tree cache / pool but not active_reqs,
   leaving stale dict entries pointing at freed req_pool_idx.

Fix: gate _activate at the helper itself (single point of control)
to enforce the "sync-mode non-DLLM only" invariant that the plan +
audit always assumed but code didn't enforce. flush_cache now calls
active_reqs.clear() so the dict is reset alongside the other
ownership pools.

Also: rewrite two stale comments referencing pre-C4 waiting_queue
retention.

Part of waiting_queue refactor chain, commit 8/7 (post-review fix).
---
 python/sglang/srt/managers/scheduler.py | 38 ++++++++++++++++++-------
 1 file changed, 27 insertions(+), 11 deletions(-)

diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py
index 1267b8f603ca..6924e6f23610 100644
--- a/python/sglang/srt/managers/scheduler.py
+++ b/python/sglang/srt/managers/scheduler.py
@@ -1040,11 +1040,23 @@ def init_running_status(self):
         self._engine_paused = False
 
     def _activate(self, req: Req) -> None:
-        """Mark req as entering active lifecycle (initial admission).
-
-        Caller must ensure req.rid is not already in active_reqs (chunked-resume
-        re-admit is filtered at the call site). See refactor plan §C1.
+        """Mark req as entering active lifecycle.
+
+        Gated: only sync-mode non-DLLM reqs enter active_reqs. Disagg
+        PREFILL/DECODE reqs are owned by their respective queues
+        (disagg_*_queue); DLLM reqs are owned by dllm_manager.staging_queue.
+        See plan §2 Scope (Q1=(c)) and audit §1 总览.
+
+        Without this gate, _activate would enroll disagg PREFILL / DLLM
+        admits (they share _get_new_batch_prefill_raw with sync mode) into
+        active_reqs, but their finish paths don't call _deactivate, leading
+        to memory leak + abort_all crash on the active-segment stashed-
+        chunked assert (C6).
         """
+        if self.disaggregation_mode != DisaggregationMode.NULL:
+            return
+        if req.is_dllm():
+            return
         assert req.rid not in self.active_reqs, f"already active: {req.rid}"
         self.active_reqs[req.rid] = req
 
@@ -1122,11 +1134,13 @@ def init_chunked_prefill(self):
             self.chunked_prefill_size = None
         elif self.chunked_prefill_size is not None and self.chunked_prefill_size <= 0:
             self.chunked_prefill_size = None
-        # Chunked-resume tracking is now per-req (Req.has_pending_chunk +
-        # pending_middle_outputs counter); the scheduler no longer holds a global pointer.
-        # Stage A stashes any waiting_queue req with has_pending_chunk; cache
-        # impls bound row reads by kv_committed_len so a stash after
-        # init_next_round_input is safe without the old gate.
+        # Chunked-resume tracking: per-Req (has_pending_chunk +
+        # pending_middle_outputs). After the C1-C7 refactor, chunked-resume
+        # reqs live exclusively in `active_reqs` (not waiting_queue); Stage A
+        # iterates `chunked_reqs()` derived from active_reqs. The inline
+        # chunked admission block at the top of `_get_new_batch_prefill_raw`
+        # re-admits them each iter. See agent-drafts/
+        # 2026-05-25-waiting-queue-refactor-plan.md.
         self.is_mixed_chunk = (
             self.chunked_prefill_size is not None
             and self.server_args.enable_mixed_chunk
@@ -2502,8 +2516,9 @@ def get_next_batch_to_run(self) -> Optional[ScheduleBatch]:
             # Defensive exclude_chunked_req: the merge step above already
             # drops chunked-resume reqs from last_batch, so running_batch
             # shouldn't normally hold one. Keep the flag set so any leak in
-            # that invariant doesn't survive here; the dropped req still
-            # has its waiting_queue retention to re-admit next iter.
+            # that invariant doesn't survive here; the dropped req remains
+            # in active_reqs (post-C4) and is re-admitted next iter via the
+            # inline chunked admission block in _get_new_batch_prefill_raw.
             self.running_batch.filter_batch(
                 exclude_chunked_req=True,
                 exclude_in_flight_other_mb=self._in_flight_other_mb_rids(),
@@ -3537,6 +3552,7 @@ def flush_cache(self, empty_cache: bool = True):
             self.last_batch = None
             self.tree_cache.reset()
             self.req_to_token_pool.clear()
+            self.active_reqs.clear()  # audit: keep parallel to req_to_token_pool reset (C8)
             self.token_to_kv_pool_allocator.clear()
             self.grammar_manager.clear()
             self.metrics_reporter.reset_metrics()

From 404bdb7f104c20b12e8db28045d5d70939a3ca23 Mon Sep 17 00:00:00 2001
From: fzyzcjy <ch271828n@outlook.com>
Date: Wed, 27 May 2026 15:59:34 +0800
Subject: [PATCH 50/52] Refactor: Replace inline chunked budget bookkeeping
 with add_one_req (C9)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

C3 inlined the body of upstream's `PrefillAdder.add_chunked_req` into
`_get_new_batch_prefill_raw` to avoid resurrecting the special method.
But `add_one_req` already supports chunked-resume via its `is_resume`
path (`has_pending_chunk and not is_dllm`), which gates:
- budget_prefix=0 (no prefix double-count)
- skip _req_inc_lock_ref (already held from prior admission)
- update has_pending_chunk = truncated

So the inline manual budget code was a copy of logic that
`add_one_req` already encapsulates. C9 replaces the ~30-line inline
block with a single `adder.add_one_req(chunked_req, ...)` call;
chunked admission still runs BEFORE the main waiting_queue loop so it
skips LoRA drainer / hicache prefetch checks that don't apply to
in-flight chunked.

Removes scheduler.py access to PrefillAdder protected methods
(`_get_dllm_remain_tokens`, `_update_prefill_budget`) — these stay
encapsulated. Behavior change: `prefill_delayer_single_pass` /
`prefill_max_requests` / `dsa_prefill_cp_in_seq_split` early-exit
gates now apply to chunked too. Safe in practice: chunked runs first
so can_run_list is empty for `_max_requests` / `cp_in_seq_split`
checks; prefill_delayer blocking chunked just delays one iter.

Part of waiting_queue refactor chain, commit 9/7.
---
 python/sglang/srt/managers/scheduler.py | 83 +++++++------------------
 1 file changed, 22 insertions(+), 61 deletions(-)

diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py
index 6924e6f23610..825b9b5a6be8 100644
--- a/python/sglang/srt/managers/scheduler.py
+++ b/python/sglang/srt/managers/scheduler.py
@@ -156,7 +156,6 @@
     ScheduleBatch,
 )
 from sglang.srt.managers.schedule_policy import (
-    CLIP_MAX_NEW_TOKENS,
     AddReqResult,
     PrefillAdder,
     SchedulePolicy,
@@ -2602,11 +2601,14 @@ def get_new_batch_prefill(self) -> Optional[ScheduleBatch]:
     def _get_new_batch_prefill_raw(
         self, prefill_delayer_single_pass: Optional[PrefillDelayerSinglePassExecutor]
     ) -> Optional[ScheduleBatch]:
-        # Chunked-resume admission: handled by the inline block below
-        # (`chunked_reqs()` filter + adder bookkeeping). The main
-        # waiting_queue loop further down admits ONLY truly-waiting reqs (no
-        # has_pending_chunk paths). See agent-drafts/
-        # 2026-05-25-waiting-queue-refactor-plan.md §C3.
+        # Chunked-resume admission: handled by the small block at the top of this
+        # method, which feeds the single chunked-resume req (if any) through
+        # `adder.add_one_req`. PrefillAdder.add_one_req detects chunked-resume via
+        # the `is_resume` flag (has_pending_chunk and not is_dllm) and handles all
+        # budget bookkeeping in one place — no special add_chunked_req method
+        # resurrected. The main waiting_queue loop below admits ONLY truly-waiting
+        # reqs. See agent-drafts/2026-05-25-waiting-queue-refactor-plan.md §C3 (and
+        # C9 follow-up).
         # Check if the grammar is ready in the grammar queue
         if self.grammar_manager.has_waiting_grammars():
             ready_grammar_requests = self.grammar_manager.get_ready_grammar_requests()
@@ -2689,64 +2691,23 @@ def _get_new_batch_prefill_raw(
             waiting_queue_len=len(self.waiting_queue),
         )
 
-        # Inline chunked admission (Plan §C3, do NOT recreate
-        # PrefillAdder.add_chunked_req per §10 decision). The chunked-resume
-        # req keeps its row/KV/lock_ref from prior admission, so
-        # init_next_round_input must NOT re-match prefix (no tree_cache arg,
-        # H7 elimination). Budget bookkeeping is inlined here — minor LOC;
-        # intentionally not extracted. Mirrors the body of upstream/main
-        # PrefillAdder.add_chunked_req.
         if chunked_in_active:
             chunked_req = chunked_in_active[0]
+            # No tree_cache: chunked-resume MUST NOT re-match prefix (H7).
+            # Its row + KV + lock_ref are already held from prior admission.
             chunked_req.init_next_round_input()
-            if adder.dllm_config is not None:
-                _rem_tokens = adder._get_dllm_remain_tokens()
-            else:
-                _rem_tokens = min(adder.rem_chunk_tokens, int(adder.rem_total_tokens))
-                if adder.is_hybrid_swa:
-                    # alloc_extend needs extend_num_tokens + page_size per
-                    # request, so reserve one page here to avoid OOM.
-                    _rem_tokens = min(
-                        _rem_tokens, int(adder.rem_swa_tokens) - adder.page_size
-                    )
-                # The chunked_req must be added to the list; otherwise, it
-                # will cause a memory leak. Therefore, in certain cases where
-                # _rem_tokens <= 0, it should be replaced with
-                # rem_chunk_tokens. Under hybrid_swa with no room, skip this
-                # iter — the chunked req stays in active_reqs and is retried
-                # next iter (mirrors upstream `return req`).
-                if _rem_tokens <= 0:
-                    if adder.is_hybrid_swa:
-                        _rem_tokens = None
-                    else:
-                        _rem_tokens = adder.rem_chunk_tokens
-
-            if _rem_tokens is not None:
-                truncated = chunked_req.extend_input_len > _rem_tokens
-                chunked_req.set_extend_input_len(
-                    min(chunked_req.extend_input_len, _rem_tokens)
-                )
-                chunked_req.fill_ids = chunked_req.fill_ids[
-                    : len(chunked_req.prefix_indices) + chunked_req.extend_input_len
-                ]
-                adder.can_run_list.append(chunked_req)
-                adder._update_prefill_budget(
-                    0,
-                    chunked_req.extend_input_len,
-                    (
-                        min(
-                            chunked_req.sampling_params.max_new_tokens,
-                            CLIP_MAX_NEW_TOKENS,
-                        )
-                        if not truncated
-                        else 0
-                    ),
-                )
-                # has_pending_chunk: persistent flag carrying chunked-resume
-                # state across iters. When truncated=False, this was the last
-                # chunk — clear the flag so the req exits chunked_reqs().
-                if not chunked_req.is_dllm():
-                    chunked_req.has_pending_chunk = truncated
+            # Use the standard adder.add_one_req — its `is_resume` branch
+            # (schedule_policy.py:811) handles chunked-resume correctly:
+            # - budget_prefix=0 (don't double-count prefix)
+            # - skip _req_inc_lock_ref (already held)
+            # - update has_pending_chunk = truncated
+            # By running BEFORE the main waiting_queue loop, the chunked req
+            # also skips LoRA drainer / hicache prefetch checks that the
+            # main loop applies to fresh reqs.
+            adder.add_one_req(
+                chunked_req,
+                truncation_align_size=self.truncation_align_size,
+            )
 
         if self.enable_lora:
             running_loras = {req.lora_id for req in self.running_batch.reqs}

From d5bf8baab40408fe933cafb695c86c33846fa8ba Mon Sep 17 00:00:00 2001
From: fzyzcjy <ch271828n@outlook.com>
Date: Wed, 27 May 2026 16:30:30 +0800
Subject: [PATCH 51/52] Refactor: Clean cross-file chunked-in-waiting refs +
 fix disagg PREFILL leak (C10)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two motivations:

1. BUG: C8's `_activate` gate excluded ALL disagg modes, but disagg
   PREFILL shares _get_new_batch_prefill_raw with sync — chunked-resume
   reqs were admitted, then orphaned (out of waiting_queue per C4, not
   in active_reqs per C8), leaking row + KV + lock_ref. Fix: gate to
   DECODE only (which has its own prealloc/transfer queue ownership),
   then wire _deactivate at disagg/prefill.py's three release_kv_cache
   sites and migrate its Stage A loop to chunked_reqs().

2. CLEANUP: post-C4 chunked-resume never lives in waiting_queue, but
   several supporting files still split waiting_queue by
   has_pending_chunk (schedule_policy.py 3 sites,
   pool_stats_observer.py, invariant_checker.py, several stale
   comments). Revert/migrate to read active_reqs.

DECODE mode is still excluded from active_reqs (Q1=(c)); only PREFILL
is now correctly tracked.

Part of waiting_queue refactor chain, commit 10/7.
---
 python/sglang/srt/disaggregation/decode.py    |  2 +-
 python/sglang/srt/disaggregation/prefill.py   | 18 ++++++----
 python/sglang/srt/managers/schedule_batch.py  |  2 +-
 python/sglang/srt/managers/schedule_policy.py | 33 ++++---------------
 python/sglang/srt/managers/scheduler.py       | 20 ++++-------
 .../scheduler_components/invariant_checker.py | 13 ++++----
 .../pool_stats_observer.py                    |  9 ++---
 7 files changed, 40 insertions(+), 57 deletions(-)

diff --git a/python/sglang/srt/disaggregation/decode.py b/python/sglang/srt/disaggregation/decode.py
index b552b66e487f..afad8b4adfc0 100644
--- a/python/sglang/srt/disaggregation/decode.py
+++ b/python/sglang/srt/disaggregation/decode.py
@@ -1654,7 +1654,7 @@ def get_next_disagg_decode_batch_to_run(
         # Process pending prebuilt batch: output processing + filter + merge
         new_prebuilt_batch = self.get_new_prebuilt_batch()
         if new_prebuilt_batch:
-            assert not any(r.has_pending_chunk for r in self.waiting_queue)
+            # C10: dead assert removed — post-C4 chunked-resume not in waiting_queue.
             self.batch_result_processor.process_batch_result_prebuilt(
                 new_prebuilt_batch
             )
diff --git a/python/sglang/srt/disaggregation/prefill.py b/python/sglang/srt/disaggregation/prefill.py
index ecf4bd784863..b4272ccebb57 100644
--- a/python/sglang/srt/disaggregation/prefill.py
+++ b/python/sglang/srt/disaggregation/prefill.py
@@ -556,6 +556,8 @@ def process_batch_result_disagg_prefill(
                         # This can happen if the grammar is not set correctly or the token is invalid.
                         error_message = f"Grammar accept_token failed for req {req.rid} with token {next_token_id}: {e}"
                         release_kv_cache(req, self.tree_cache)
+                        # audit D-prefill-1: disagg PREFILL release path
+                        self._deactivate(req)
                         prepare_abort(
                             req,
                             error_message,
@@ -640,6 +642,8 @@ def process_disagg_prefill_inflight_queue(
                 undone_reqs.append(req)
             elif poll == KVPoll.Success:  # transfer done
                 release_kv_cache(req, self.tree_cache)  # unlock the tree
+                # audit D-prefill-2: disagg PREFILL release path
+                self._deactivate(req)
                 req.finished_reason = FINISH_LENGTH(length=0)
                 # FIXME: clean up req's data in transfer engine
                 if hasattr(req.disagg_kv_sender, "clear"):
@@ -655,6 +659,8 @@ def process_disagg_prefill_inflight_queue(
                 logger.warning(error_message)
                 req.time_stats.trace_ctx.abort(abort_info={"reason": error_message})
                 release_kv_cache(req, self.tree_cache)  # unlock the tree
+                # audit D-prefill-3: disagg PREFILL release path
+                self._deactivate(req)
                 prepare_abort(
                     req, error_message, status_code=HTTPStatus.INTERNAL_SERVER_ERROR
                 )
@@ -725,10 +731,10 @@ def get_transferred_rids(self: Scheduler) -> List[str]:
         return transferred_rids
 
     def process_prefill_chunk(self: Scheduler) -> None:
-        # Per-req stash for any in-flight chunked-resume reqs (now sitting in
-        # the waiting_queue with has_pending_chunk == True).
-        for req in self.waiting_queue:
-            if req.has_pending_chunk and not req.is_dllm():
+        # audit C10: disagg PREFILL chunked-resume now lives in active_reqs
+        # (same as sync mode post-C4); iterate chunked_reqs() view.
+        for req in self.chunked_reqs():
+            if not req.is_dllm():
                 maybe_cache_unfinished_req(req, self.tree_cache, chunked=True)
                 if self.enable_overlap:
                     # Delay KV transfer to process_batch_result_disagg_prefill
@@ -746,8 +752,8 @@ def process_prefill_chunk(self: Scheduler) -> None:
             # Drop chunked-resume reqs from last_batch — running_batch runs
             # decode forward and admitting a mid-prefill req there breaks
             # shape + KV accounting. The dropped reqs stay in
-            # self.waiting_queue (chunked-resume retention) and re-enter via
-            # the next iter's Stage A stash + admission cycle.
+            # self.active_reqs and re-enter via the next iter's Stage A
+            # stash + admission cycle.
             self.last_batch.filter_batch(exclude_chunked_req=True)
             if self.last_batch.batch_size() < last_bs:
                 self.running_batch.batch_is_full = False
diff --git a/python/sglang/srt/managers/schedule_batch.py b/python/sglang/srt/managers/schedule_batch.py
index 11d70b306970..dd059014038c 100755
--- a/python/sglang/srt/managers/schedule_batch.py
+++ b/python/sglang/srt/managers/schedule_batch.py
@@ -1351,7 +1351,7 @@ def reset_for_retract(self):
         # Disagg-prefill send-side bookkeeping. The pre-v2 retract path never
         # ran against a req that had started sending (retract only touched
         # running_batch), so these stayed at init values. After v2 added
-        # pause(retract) coverage for waiting chunked-resume reqs, a retracted
+        # pause(retract) coverage for active chunked-resume reqs, a retracted
         # disagg-prefill req's stale start_send_idx would index garbage in the
         # new row on re-prefill.
         self.start_send_idx = 0
diff --git a/python/sglang/srt/managers/schedule_policy.py b/python/sglang/srt/managers/schedule_policy.py
index fb5892135a91..7654c4c230f4 100644
--- a/python/sglang/srt/managers/schedule_policy.py
+++ b/python/sglang/srt/managers/schedule_policy.py
@@ -234,13 +234,8 @@ def _compute_prefix_matches(
         temporary_deprioritized: Set[int] = set()
         self.waiting_queue_radix_tree.reset()
 
+        # C10: chunked-resume no longer in waiting_queue (post-C4); revert to main-upstream sort.
         for r in waiting_queue:
-            if r.has_pending_chunk:
-                # Chunked-resume reqs already have prefix_indices + last_node
-                # set by the prior chunk's Stage A stash, plus an inc'd
-                # lock_ref on last_node. Re-running match_prefix here would
-                # overwrite both, leaving the prior inc unbalanced.
-                continue
             prefix_ids = r.origin_input_ids + r.output_ids
             extra_key = r.extra_key
             match_result = match_prefix_for_req(self.tree_cache, r, prefix_ids)
@@ -283,19 +278,12 @@ def _sort_by_longest_prefix(
         waiting_queue: List[Req], temporary_deprioritized: Set[int]
     ) -> None:
         """Sorts the waiting queue based on the longest prefix match."""
-        # Chunked-resume reqs sort first: their prefix_indices length only
-        # reflects the chunks already prefilled (kv_committed_len), not the
-        # full prompt prefix they could have hit had they been fresh. Without
-        # this floor, a fresh req with a long cached prefix outranks them
-        # every iter, starving them under tight budget.
+        # C10: chunked-resume no longer in waiting_queue (post-C4); revert to main-upstream sort.
         waiting_queue.sort(
             key=lambda r: (
-                0 if r.has_pending_chunk else 1,
-                (
-                    -len(r.prefix_indices)
-                    if r.rid not in temporary_deprioritized
-                    else float("inf")
-                ),
+                -len(r.prefix_indices)
+                if r.rid not in temporary_deprioritized
+                else float("inf")
             )
         )
 
@@ -304,15 +292,9 @@ def _sort_by_dfs_weight(
         waiting_queue: List[Req], tree_cache: BasePrefixCache
     ) -> None:
         """Sorts the waiting queue based on a depth-first search weighting."""
-        # Pull chunked-resume reqs out before DFS — their last_node points at
-        # a mid-chunk stash node with weight 1 (no siblings share it), which
-        # otherwise drops them to a low DFS priority and starves them under
-        # tight budget. They go back to the front of the queue afterwards.
-        chunked_reqs = [req for req in waiting_queue if req.has_pending_chunk]
-        non_chunked_reqs = [req for req in waiting_queue if not req.has_pending_chunk]
-
+        # C10: chunked-resume no longer in waiting_queue (post-C4); revert to main-upstream sort.
         last_node_to_reqs = defaultdict(list)
-        for req in non_chunked_reqs:
+        for req in waiting_queue:
             last_node_to_reqs[req.last_node].append(req)
 
         node_to_weight = defaultdict(int)
@@ -321,7 +303,6 @@ def _sort_by_dfs_weight(
         SchedulePolicy._calc_weight(tree_cache.root_node, node_to_weight)
 
         waiting_queue.clear()
-        waiting_queue.extend(chunked_reqs)
         SchedulePolicy._get_dfs_priority(
             tree_cache.root_node,
             node_to_weight,
diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py
index 825b9b5a6be8..f0acc401857a 100644
--- a/python/sglang/srt/managers/scheduler.py
+++ b/python/sglang/srt/managers/scheduler.py
@@ -609,7 +609,7 @@ def __init__(
             max_total_num_tokens=self.max_total_num_tokens,
             get_last_batch=lambda: self.last_batch,
             get_running_batch=lambda: self.running_batch,
-            get_waiting_queue=lambda: self.waiting_queue,
+            get_active_reqs=lambda: self.active_reqs,
         )
 
         self.invariant_checker = SchedulerInvariantChecker(
@@ -627,7 +627,7 @@ def __init__(
             pool_stats_observer=self.pool_stats_observer,
             get_last_batch=lambda: self.last_batch,
             get_running_batch=lambda: self.running_batch,
-            get_waiting_queue=lambda: self.waiting_queue,
+            get_active_reqs=lambda: self.active_reqs,
         )
 
         self.kv_events_publisher = SchedulerKvEventsPublisher(
@@ -1041,18 +1041,12 @@ def init_running_status(self):
     def _activate(self, req: Req) -> None:
         """Mark req as entering active lifecycle.
 
-        Gated: only sync-mode non-DLLM reqs enter active_reqs. Disagg
-        PREFILL/DECODE reqs are owned by their respective queues
-        (disagg_*_queue); DLLM reqs are owned by dllm_manager.staging_queue.
-        See plan §2 Scope (Q1=(c)) and audit §1 总览.
-
-        Without this gate, _activate would enroll disagg PREFILL / DLLM
-        admits (they share _get_new_batch_prefill_raw with sync mode) into
-        active_reqs, but their finish paths don't call _deactivate, leading
-        to memory leak + abort_all crash on the active-segment stashed-
-        chunked assert (C6).
+        Gated: only sync mode + disagg PREFILL + non-DLLM reqs enter
+        active_reqs. Disagg DECODE has its own prealloc/transfer queue
+        ownership; DLLM has its own staging_queue. See plan §2 Scope and
+        C10 fix plan §2 (disagg PREFILL bug).
         """
-        if self.disaggregation_mode != DisaggregationMode.NULL:
+        if self.disaggregation_mode == DisaggregationMode.DECODE:
             return
         if req.is_dllm():
             return
diff --git a/python/sglang/srt/managers/scheduler_components/invariant_checker.py b/python/sglang/srt/managers/scheduler_components/invariant_checker.py
index 9bebbe1dded9..8a236aba68d1 100644
--- a/python/sglang/srt/managers/scheduler_components/invariant_checker.py
+++ b/python/sglang/srt/managers/scheduler_components/invariant_checker.py
@@ -50,7 +50,7 @@ class SchedulerInvariantChecker:
     pool_stats_observer: SchedulerPoolStatsObserver
     get_last_batch: Callable
     get_running_batch: Callable
-    get_waiting_queue: Callable
+    get_active_reqs: Callable
     count_req_pool_leak_warnings: int = 0
     count_memory_leak_warnings: int = 0
 
@@ -163,20 +163,21 @@ def _get_total_uncached_sizes(
             and not self.get_running_batch().is_empty()
         ):
             req_groups.append(list(self.get_running_batch().reqs))
-        # Chunked-resume reqs in waiting_queue carry uncached tail
+        # Chunked-resume reqs in active_reqs carry uncached tail
         # (kv_committed_len - cache_protected_len, < page_size) that
         # filter_batch just removed from last_batch but haven't been
         # re-admitted to running_batch yet. The leak invariant must count it.
+        # C10: chunked-resume now lives in active_reqs (post-C4).
         seen_ids = {id(req) for group in req_groups for req in group}
-        chunked_in_queue = [
+        chunked_in_active = [
             req
-            for req in self.get_waiting_queue()
+            for req in self.get_active_reqs().values()
             if req.has_pending_chunk
             and req.req_pool_idx is not None
             and id(req) not in seen_ids
         ]
-        if chunked_in_queue:
-            req_groups.append(chunked_in_queue)
+        if chunked_in_active:
+            req_groups.append(chunked_in_active)
 
         full_uncached = 0
         swa_uncached = 0
diff --git a/python/sglang/srt/managers/scheduler_components/pool_stats_observer.py b/python/sglang/srt/managers/scheduler_components/pool_stats_observer.py
index 782147653c20..f01cee22814c 100644
--- a/python/sglang/srt/managers/scheduler_components/pool_stats_observer.py
+++ b/python/sglang/srt/managers/scheduler_components/pool_stats_observer.py
@@ -153,7 +153,7 @@ class SchedulerPoolStatsObserver:
     max_total_num_tokens: int
     get_last_batch: Callable
     get_running_batch: Callable
-    get_waiting_queue: Callable
+    get_active_reqs: Callable
 
     def streaming_session_count(self) -> int:
         return sum(
@@ -164,7 +164,7 @@ def streaming_session_count(self) -> int:
 
     def active_pool_idxs(self) -> set:
         """Pool idxs currently owned by reqs in last_batch / running_batch or
-        held by chunked-resume reqs sitting in waiting_queue.
+        held by chunked-resume reqs in active_reqs.
 
         Used to decide which session slots' KV is owned by batch reqs
         (and thus counted via uncached_size, not session_held).
@@ -176,10 +176,11 @@ def active_pool_idxs(self) -> set:
             for req in batch.reqs:
                 if req.req_pool_idx is not None:
                     idxs.add(req.req_pool_idx)
-        # Chunked-resume reqs in waiting_queue still own their row across iters
+        # Chunked-resume reqs in active_reqs still own their row across iters
         # (filter_batch may have just moved them out of last_batch but they
         # haven't yet been re-admitted to running_batch).
-        for req in self.get_waiting_queue():
+        # C10: chunked-resume now lives in active_reqs (post-C4).
+        for req in self.get_active_reqs().values():
             if req.has_pending_chunk and req.req_pool_idx is not None:
                 idxs.add(req.req_pool_idx)
         return idxs

From e6a9f0771225a5251cc8f9eb694a8215dfb3f853 Mon Sep 17 00:00:00 2001
From: fzyzcjy <ch271828n@outlook.com>
Date: Wed, 27 May 2026 16:41:55 +0800
Subject: [PATCH 52/52] Fix: abort_request stashed-chunked disagg PREFILL
 cleanup (C11)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

C10 narrowed _activate's gate to DECODE-only, so disagg PREFILL
chunked-resume reqs now enter active_reqs and can be reached by the
abort_request active-segment stashed-chunked branch (C6). But that branch
only does release_kv_cache + _deactivate — missing two pieces of
disagg PREFILL cleanup that pause_generation(retract) does correctly:

1. disagg_kv_sender.abort() — without this, the peer decode node
   waits forever for the remaining chunks (hang).
2. release_req_to_metadata_buffer() — metadata buffer slot leak.

Mirrors pause_generation(retract) PREFILL handling and abort_request
waiting-segment PREFILL handling.

Also: clean stale "assert above" comment in disagg/decode.py
(the assert was deleted in C10).

Part of waiting_queue refactor chain, commit 11/7.
---
 python/sglang/srt/disaggregation/decode.py |  6 +++---
 python/sglang/srt/managers/scheduler.py    | 21 +++++++++++++++++++++
 2 files changed, 24 insertions(+), 3 deletions(-)

diff --git a/python/sglang/srt/disaggregation/decode.py b/python/sglang/srt/disaggregation/decode.py
index afad8b4adfc0..9f9046a2d050 100644
--- a/python/sglang/srt/disaggregation/decode.py
+++ b/python/sglang/srt/disaggregation/decode.py
@@ -1660,9 +1660,9 @@ def get_next_disagg_decode_batch_to_run(
             )
             # Defensive: chunked prefill is a prefill-side concept; decode-side
             # prebuilt batches shouldn't carry has_pending_chunk reqs. The
-            # assert above already guards waiting_queue; this flag protects
-            # against any future code that would route a chunked req through
-            # the disagg decode path.
+            # waiting_queue invariant is checked by _assert_invariants in sync
+            # mode; this flag protects against any future code that would route
+            # a chunked req through the disagg decode path.
             new_prebuilt_batch.filter_batch(exclude_chunked_req=True)
             if not new_prebuilt_batch.is_empty():
                 if self.running_batch.is_empty():
diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py
index f0acc401857a..f8b65e5ca070 100644
--- a/python/sglang/srt/managers/scheduler.py
+++ b/python/sglang/srt/managers/scheduler.py
@@ -3779,7 +3779,28 @@ def abort_request(self, recv_req: AbortReq):
                     f"req_pool_idx={req.req_pool_idx}"
                 )
                 if self.disaggregation_mode != DisaggregationMode.DECODE:
+                    # C11: disagg PREFILL stashed-chunked req has already been
+                    # sending KV chunks to the peer decode node. Signal abort so
+                    # the peer doesn't wait forever for the remaining chunks.
+                    # Mirrors pause_generation(retract) PREFILL handling
+                    # (scheduler.py pause section).
+                    if (
+                        self.disaggregation_mode == DisaggregationMode.PREFILL
+                        and req.disagg_kv_sender is not None
+                    ):
+                        if hasattr(req.disagg_kv_sender, "abort"):
+                            req.disagg_kv_sender.abort()
+                        req.disagg_kv_sender = None
+
                     release_kv_cache(req, self.tree_cache, is_insert=False)
+
+                    # C11: PREFILL mode also needs to release the metadata buffer
+                    # slot. Mirrors abort_request waiting-segment PREFILL handling.
+                    if self.disaggregation_mode == DisaggregationMode.PREFILL:
+                        release_req_to_metadata_buffer(
+                            req, self.req_to_metadata_buffer_idx_allocator
+                        )
+
                     req.has_pending_chunk = False
                     req.pending_middle_outputs = 0
                     self._deactivate(req)