pyc96 · pyc96 · May 26, 2026 · May 26, 2026
diff --git a/python/sglang/srt/arg_groups/speculative_hook.py b/python/sglang/srt/arg_groups/speculative_hook.py
@@ -237,10 +237,30 @@ def _handle_frozen_kv_mtp(server_args: "ServerArgs") -> None:
             "Max running requests is reset to 48 for speculative decoding. You can override this by explicitly setting --max-running-requests."
         )
 
-    server_args.disable_overlap_schedule = True
-    logger.warning(
-        "Overlap scheduler is disabled when using Frozen-KV MTP speculative decoding (spec v2 is not supported yet)."
-    )
+    # Spec V2 (overlap scheduling) for FROZEN_KV_MTP is experimental.
+    # The V2 worker (``FrozenKVMTPWorkerV2``) is wired through the
+    # dispatcher, but ferrying ``next_draft_input`` between iterations
+    # runs into several scheduler-side assumptions about populated draft
+    # inputs; these need additional plumbing (idle-batch DraftInput shape,
+    # empty-bonus-tokens stash) before the path is production-ready.
+    # Force V1 by default and require an explicit env var
+    # (``SGLANG_FROZEN_KV_MTP_EXPERIMENTAL_V2=1``) to opt into V2.
+    import os
+
+    if os.environ.get("SGLANG_FROZEN_KV_MTP_EXPERIMENTAL_V2", "0") != "1":
+        server_args.disable_overlap_schedule = True
+        logger.warning(
+            "Overlap scheduler is disabled for Frozen-KV MTP (spec V2 is "
+            "experimental; set SGLANG_FROZEN_KV_MTP_EXPERIMENTAL_V2=1 to "
+            "opt into FrozenKVMTPWorkerV2)."
+        )
+    else:
+        logger.warning(
+            "EXPERIMENTAL: Frozen-KV MTP V2 (overlap scheduling) is enabled "
+            "via SGLANG_FROZEN_KV_MTP_EXPERIMENTAL_V2=1. The path is not "
+            "production-ready: the per-iteration draft-input ferry has "
+            "known gaps with empty/idle batches and may crash."
+        )
 
     if server_args.enable_mixed_chunk:
         server_args.enable_mixed_chunk = False

diff --git a/python/sglang/srt/speculative/eagle_info.py b/python/sglang/srt/speculative/eagle_info.py
@@ -296,14 +296,18 @@ def verify(
             )
 
         # Apply penalty
+        # None-safe: under spec-V2/overlap the penalizer is None, because
+        # sampling and the penalizer cumulate step run on the schedule stream.
+        _penalizer_v2 = sampling_info.penalizer_orchestrator
         if (
-            sampling_info.penalizer_orchestrator.is_required
-            or sampling_info.logit_bias is not None
-        ):
+            _penalizer_v2 is not None and _penalizer_v2.is_required
+        ) or sampling_info.logit_bias is not None:
             # This is a relaxed version of penalties for speculative decoding.
-            sampling_info.penalizer_orchestrator.apply(
-                logits_output.next_token_logits, repeat=self.draft_token_num
-            )
+            if _penalizer_v2 is not None and _penalizer_v2.is_required:
+                _penalizer_v2.apply(
+                    logits_output.next_token_logits,
+                    repeat=self.draft_token_num,
+                )
             if sampling_info.logit_bias is not None:
                 logits_output.next_token_logits.add_(
                     torch.repeat_interleave(
@@ -443,9 +447,7 @@ def verify(
                     try:
                         req.grammar.accept_token(id)
                     except ValueError as e:
-                        logger.info(
-                            f"{i=}, {req=}\n" f"{accept_index=}\n" f"{predict=}\n"
-                        )
+                        logger.info(f"{i=}, {req=}\n{accept_index=}\n{predict=}\n")
                         raise e
                     req.update_finish_state()
                 if req.finished():

diff --git a/python/sglang/srt/speculative/frozen_kv_mtp_worker.py b/python/sglang/srt/speculative/frozen_kv_mtp_worker.py
@@ -136,8 +136,10 @@ def __init__(
         self.hot_token_id = None
 
         with (
-            empty_context()
-        ), speculative_moe_backend_context(), speculative_moe_a2a_backend_context():
+            empty_context(),
+            speculative_moe_backend_context(),
+            speculative_moe_a2a_backend_context(),
+        ):
             super().__init__(
                 server_args=server_args,
                 gpu_id=gpu_id,
@@ -398,8 +400,9 @@ def _run_assistant_seed_step(
                 forward_batch.mm_input_embeds = mm_input_embeds
             self._set_positions(forward_batch)
             self._init_frozen_kv_metadata(forward_batch)
-            with self._target_kv_pool_view(forward_batch), forward_context(
-                ForwardContext(attn_backend=self.draft_attn_backend)
+            with (
+                self._target_kv_pool_view(forward_batch),
+                forward_context(ForwardContext(attn_backend=self.draft_attn_backend)),
             ):
                 logits_output = self.draft_model_runner.forward(
                     forward_batch, skip_attn_backend_init=True
@@ -582,10 +585,18 @@ def draft(self, batch: ScheduleBatch):
         spec_info = batch.spec_info
         assert isinstance(spec_info, FrozenKVMTPDraftInput)
 
-        if batch.sampling_info.penalizer_orchestrator.is_required:
-            batch.sampling_info.penalizer_orchestrator.cumulate_output_tokens(
-                spec_info.bonus_tokens.to(torch.int64)
-            )
+        # Under spec-V2 / overlap scheduling, sampling (and therefore the
+        # penalizer's cumulate step) runs on the schedule stream via
+        # ``EagleDraftInputV2Mixin.prepare_for_decode``, so the worker side
+        # skips the cumulate when sampling_info or its orchestrator is None.
+        _sampling_info = batch.sampling_info
+        _penalizer = (
+            _sampling_info.penalizer_orchestrator
+            if _sampling_info is not None
+            else None
+        )
+        if _penalizer is not None and _penalizer.is_required:
+            _penalizer.cumulate_output_tokens(spec_info.bonus_tokens.to(torch.int64))
 
         spec_info.capture_hidden_mode = CaptureHiddenMode.LAST
         spec_info.num_tokens_per_req = self.topk
@@ -682,8 +693,9 @@ def draft_forward(
             forward_batch.spec_info.hidden_states = hidden_states
             self._set_positions(forward_batch)
 
-            with self._target_kv_pool_view(forward_batch), forward_context(
-                ForwardContext(attn_backend=self.draft_attn_backend)
+            with (
+                self._target_kv_pool_view(forward_batch),
+                forward_context(ForwardContext(attn_backend=self.draft_attn_backend)),
             ):
                 logits_output = self.draft_model_runner.forward(
                     forward_batch, skip_attn_backend_init=True