From bfc40cc88f9ad89eac78b5cc1717156824177d1f Mon Sep 17 00:00:00 2001
From: hnyls2002 <lsyincs@gmail.com>
Date: Tue, 28 Apr 2026 11:33:53 -0700
Subject: [PATCH 1/3] support asymmetric pd-mtp via decode-spec-algo flag

---
 python/sglang/srt/managers/scheduler.py       | 24 ++++++++---
 python/sglang/srt/server_args.py              | 18 +++++++++
 .../8-gpu-models/test_dsv4_pd_disagg_nixl.py  | 40 +++++++++----------
 3 files changed, 54 insertions(+), 28 deletions(-)

diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py
index 91ad8c04d4cb..c283d6749711 100644
--- a/python/sglang/srt/managers/scheduler.py
+++ b/python/sglang/srt/managers/scheduler.py
@@ -946,19 +946,31 @@ def init_disaggregation(self):
             self.req_to_metadata_buffer_idx_allocator = ReqToMetadataIdxAllocator(
                 buffer_size
             )
+            # Asymmetric P/D: prefill itself runs no spec module but the
+            # decode node does. Match the decode-side hidden-state buffer
+            # so the cross-node transfer aligns; prefill leaves the buffer
+            # zero-initialized and decode treats it as mock conditioning.
+            decode_spec_algo = (
+                self.server_args.disaggregation_decode_speculative_algorithm
+            )
+            decode_side_needs_hidden = decode_spec_algo in (
+                "EAGLE",
+                "EAGLE3",
+                "STANDALONE",
+            )
+            local_spec_needs_hidden = (
+                self.spec_algorithm.is_eagle() or self.spec_algorithm.is_standalone()
+            )
+            needs_hidden = local_spec_needs_hidden or decode_side_needs_hidden
             self.disagg_metadata_buffers = MetadataBuffers(
                 buffer_size,
                 hidden_size=(
                     model_config.spec_hidden_size
-                    if self.spec_algorithm.is_eagle()
-                    or self.spec_algorithm.is_standalone()
+                    if needs_hidden
                     else 16  # minimal padding size for RDMA
                 ),
                 hidden_states_dtype=(
-                    model_config.dtype
-                    if self.spec_algorithm.is_eagle()
-                    or self.spec_algorithm.is_standalone()
-                    else torch.float32
+                    model_config.dtype if needs_hidden else torch.float32
                 ),
                 custom_mem_pool=self.token_to_kv_pool_allocator.get_kvcache().maybe_get_custom_mem_pool(),
             )
diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py
index 41a93d8bc66c..82e77cea04bb 100644
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -665,6 +665,11 @@ class ServerArgs:
     disaggregation_bootstrap_port: int = 8998
     disaggregation_decode_tp: Optional[int] = None
     disaggregation_decode_dp: Optional[int] = None
+    # Tells a non-spec prefill node which speculative algorithm the decode
+    # node runs, so prefill sizes its metadata hidden-state buffer to match
+    # the decode-side spec module. Decode reads its own --speculative-*
+    # args and ignores this. Allowed values match --speculative-algorithm.
+    disaggregation_decode_speculative_algorithm: Optional[str] = None
     disaggregation_prefill_pp: Optional[int] = 1
     disaggregation_ib_device: Optional[str] = None
     disaggregation_decode_enable_offload_kvcache: bool = False
@@ -5030,6 +5035,19 @@ def add_cli_args(parser: argparse.ArgumentParser):
             default=ServerArgs.disaggregation_decode_dp,
             help="Decode dp size. If not set, it matches the dp size of the current engine. This is only set on the prefill server.",
         )
+        parser.add_argument(
+            "--disaggregation-decode-speculative-algorithm",
+            type=str,
+            default=ServerArgs.disaggregation_decode_speculative_algorithm,
+            choices=["EAGLE", "EAGLE3", "STANDALONE"],
+            help="Speculative algorithm running on the decode node. Set on a "
+            "prefill node that itself does not run a draft model so the "
+            "metadata hidden-state buffer is sized to match the decode-side "
+            "spec module. Decode bootstraps spec_info from a zero-init "
+            "buffer (mock) and recovers real conditioning after the first "
+            "verify; only the first decode iteration's accept length is "
+            "affected.",
+        )
         parser.add_argument(
             "--disaggregation-prefill-pp",
             type=int,
diff --git a/test/registered/8-gpu-models/test_dsv4_pd_disagg_nixl.py b/test/registered/8-gpu-models/test_dsv4_pd_disagg_nixl.py
index 15a4c4f4bd21..b179502e273d 100644
--- a/test/registered/8-gpu-models/test_dsv4_pd_disagg_nixl.py
+++ b/test/registered/8-gpu-models/test_dsv4_pd_disagg_nixl.py
@@ -1,21 +1,23 @@
 """DSv4 Flash PD-disaggregation test with NIXL transfer backend.
 
+Asymmetric P/D config: prefill runs no spec module, decode runs EAGLE
+MTP. Prefill uses --disaggregation-decode-speculative-algorithm so it
+sizes the metadata hidden-state buffer to match the decode-side spec
+module; the buffer is shipped zero-initialized (prefill has no draft
+model to populate it). Decode treats the zeros as mock conditioning
+for the first draft step. Verified spec decoding makes those bad
+drafts get rejected by target; from the second iteration onward, real
+target hidden flows through normally. Amortized cost is ~1 wasted
+draft step per request, < 1% throughput hit on long generations.
+
 Topology (1 H200 node, 8 GPUs total):
-  - Prefill: GPU 0-3, tp=4 — pure TP, **no EP** (no deepep), no DP
-    attention. Optimized for throughput on long prompts; each rank
-    holds the full MoE weights, no all-to-all dispatch traffic.
-    Spec config matches decode (PD ferry currently assumes symmetric
-    spec on both sides) so the prefill -> decode metadata buffer
-    is sized correctly for the spec module's hidden shape.
+  - Prefill: GPU 0-3, tp=4 — pure TP, no EP (no deepep), no DP
+    attention, no draft model. Optimized for prompt-side throughput.
   - Decode:  GPU 4-7, tp=4 dp=4 enable-dp-attention + deepep + EAGLE
-    MTP — optimized for low-latency decode with spec decoding and
-    expert parallelism.
+    MTP — optimized for low-latency decode.
   - Mini load balancer fronting both.
 
-Both sides use DSv4 Flash FP8 weights. Transfer backend is NIXL
-(the focus of recent nixl/conn.py forward-delta work; this test is
-the e2e check that the generic `send_state` / shared buffer-pool
-changes do not break PD).
+Both sides use DSv4 Flash FP8 weights. Transfer backend is NIXL.
 """
 
 import unittest
@@ -65,9 +67,9 @@ def setUpClass(cls):
 
     @classmethod
     def start_prefill(cls):
-        # Prefill: TP=4 (no EP, no DP attention). EAGLE config mirrors decode
-        # so the metadata buffer is sized for the spec module's hidden shape;
-        # PD ferry currently assumes both sides agree on the spec algorithm.
+        # Prefill: TP=4, no EP, no DP attention, no spec module. The decode
+        # node runs EAGLE MTP; --disaggregation-decode-speculative-algorithm
+        # tells prefill to size the metadata hidden-state buffer to match.
         prefill_args = [
             "--trust-remote-code",
             "--disaggregation-mode",
@@ -86,14 +88,8 @@ def start_prefill(cls):
             "4",
             "--disaggregation-decode-dp",
             "4",
-            "--speculative-algorithm",
+            "--disaggregation-decode-speculative-algorithm",
             "EAGLE",
-            "--speculative-num-steps",
-            "3",
-            "--speculative-eagle-topk",
-            "1",
-            "--speculative-num-draft-tokens",
-            "4",
             *cls.transfer_backend,
             *cls.rdma_devices,
         ]

From a47e7f55570e15343da06d84f08a6eb0013907dd Mon Sep 17 00:00:00 2001
From: hnyls2002 <lsyincs@gmail.com>
Date: Tue, 28 Apr 2026 11:43:14 -0700
Subject: [PATCH 2/3] drop server arg; always size pd hidden buffer to
 spec_hidden_size

---
 python/sglang/srt/managers/scheduler.py       | 49 ++++++-------------
 python/sglang/srt/server_args.py              | 18 -------
 .../8-gpu-models/test_dsv4_pd_disagg_nixl.py  | 25 +++++-----
 3 files changed, 27 insertions(+), 65 deletions(-)

diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py
index c283d6749711..a0d51f4d64d1 100644
--- a/python/sglang/srt/managers/scheduler.py
+++ b/python/sglang/srt/managers/scheduler.py
@@ -892,18 +892,16 @@ def init_disaggregation(self):
             self.req_to_metadata_buffer_idx_allocator = ReqToMetadataIdxAllocator(
                 buffer_size
             )
+            # Always allocate the spec hidden buffer at full size so the
+            # decode side aligns with prefill regardless of which side
+            # actually runs the spec module (asymmetric P/D). When neither
+            # side runs spec the buffer is unused but the allocation cost
+            # is bounded (~few MB) and the alternative would be a wire
+            # protocol mismatch in asymmetric configs.
             self.disagg_metadata_buffers = MetadataBuffers(
                 buffer_size,
-                hidden_size=(
-                    model_config.spec_hidden_size
-                    if self.spec_algorithm.is_eagle()
-                    else 16  # minimal padding size for RDMA
-                ),
-                hidden_states_dtype=(
-                    model_config.dtype
-                    if self.spec_algorithm.is_eagle()
-                    else torch.float32
-                ),
+                hidden_size=model_config.spec_hidden_size,
+                hidden_states_dtype=model_config.dtype,
                 custom_mem_pool=self.token_to_kv_pool_allocator.get_kvcache().maybe_get_custom_mem_pool(),
             )
 
@@ -946,32 +944,15 @@ def init_disaggregation(self):
             self.req_to_metadata_buffer_idx_allocator = ReqToMetadataIdxAllocator(
                 buffer_size
             )
-            # Asymmetric P/D: prefill itself runs no spec module but the
-            # decode node does. Match the decode-side hidden-state buffer
-            # so the cross-node transfer aligns; prefill leaves the buffer
-            # zero-initialized and decode treats it as mock conditioning.
-            decode_spec_algo = (
-                self.server_args.disaggregation_decode_speculative_algorithm
-            )
-            decode_side_needs_hidden = decode_spec_algo in (
-                "EAGLE",
-                "EAGLE3",
-                "STANDALONE",
-            )
-            local_spec_needs_hidden = (
-                self.spec_algorithm.is_eagle() or self.spec_algorithm.is_standalone()
-            )
-            needs_hidden = local_spec_needs_hidden or decode_side_needs_hidden
+            # Always allocate the spec hidden buffer at full size; see the
+            # matching comment on the decode branch above. When prefill has
+            # no spec module of its own, the buffer stays zero-initialized
+            # and decode treats it as mock conditioning for the first draft
+            # step (verified spec keeps the output token correct).
             self.disagg_metadata_buffers = MetadataBuffers(
                 buffer_size,
-                hidden_size=(
-                    model_config.spec_hidden_size
-                    if needs_hidden
-                    else 16  # minimal padding size for RDMA
-                ),
-                hidden_states_dtype=(
-                    model_config.dtype if needs_hidden else torch.float32
-                ),
+                hidden_size=model_config.spec_hidden_size,
+                hidden_states_dtype=model_config.dtype,
                 custom_mem_pool=self.token_to_kv_pool_allocator.get_kvcache().maybe_get_custom_mem_pool(),
             )
 
diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py
index 82e77cea04bb..41a93d8bc66c 100644
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -665,11 +665,6 @@ class ServerArgs:
     disaggregation_bootstrap_port: int = 8998
     disaggregation_decode_tp: Optional[int] = None
     disaggregation_decode_dp: Optional[int] = None
-    # Tells a non-spec prefill node which speculative algorithm the decode
-    # node runs, so prefill sizes its metadata hidden-state buffer to match
-    # the decode-side spec module. Decode reads its own --speculative-*
-    # args and ignores this. Allowed values match --speculative-algorithm.
-    disaggregation_decode_speculative_algorithm: Optional[str] = None
     disaggregation_prefill_pp: Optional[int] = 1
     disaggregation_ib_device: Optional[str] = None
     disaggregation_decode_enable_offload_kvcache: bool = False
@@ -5035,19 +5030,6 @@ def add_cli_args(parser: argparse.ArgumentParser):
             default=ServerArgs.disaggregation_decode_dp,
             help="Decode dp size. If not set, it matches the dp size of the current engine. This is only set on the prefill server.",
         )
-        parser.add_argument(
-            "--disaggregation-decode-speculative-algorithm",
-            type=str,
-            default=ServerArgs.disaggregation_decode_speculative_algorithm,
-            choices=["EAGLE", "EAGLE3", "STANDALONE"],
-            help="Speculative algorithm running on the decode node. Set on a "
-            "prefill node that itself does not run a draft model so the "
-            "metadata hidden-state buffer is sized to match the decode-side "
-            "spec module. Decode bootstraps spec_info from a zero-init "
-            "buffer (mock) and recovers real conditioning after the first "
-            "verify; only the first decode iteration's accept length is "
-            "affected.",
-        )
         parser.add_argument(
             "--disaggregation-prefill-pp",
             type=int,
diff --git a/test/registered/8-gpu-models/test_dsv4_pd_disagg_nixl.py b/test/registered/8-gpu-models/test_dsv4_pd_disagg_nixl.py
index b179502e273d..e11f3eb2d9b1 100644
--- a/test/registered/8-gpu-models/test_dsv4_pd_disagg_nixl.py
+++ b/test/registered/8-gpu-models/test_dsv4_pd_disagg_nixl.py
@@ -1,14 +1,14 @@
 """DSv4 Flash PD-disaggregation test with NIXL transfer backend.
 
 Asymmetric P/D config: prefill runs no spec module, decode runs EAGLE
-MTP. Prefill uses --disaggregation-decode-speculative-algorithm so it
-sizes the metadata hidden-state buffer to match the decode-side spec
-module; the buffer is shipped zero-initialized (prefill has no draft
-model to populate it). Decode treats the zeros as mock conditioning
-for the first draft step. Verified spec decoding makes those bad
-drafts get rejected by target; from the second iteration onward, real
-target hidden flows through normally. Amortized cost is ~1 wasted
-draft step per request, < 1% throughput hit on long generations.
+MTP. The metadata hidden-state buffer is allocated at full
+spec_hidden_size on both sides unconditionally; prefill has no draft
+model to populate it, so the wire data stays zero, and decode treats
+the zeros as mock conditioning for the first draft step. Verified
+spec decoding makes those bad drafts get rejected by target; from
+iteration 2 onward, real target hidden flows through normally.
+Amortized cost is ~1 wasted draft step per request, < 1% throughput
+hit on long generations.
 
 Topology (1 H200 node, 8 GPUs total):
   - Prefill: GPU 0-3, tp=4 — pure TP, no EP (no deepep), no DP
@@ -67,9 +67,10 @@ def setUpClass(cls):
 
     @classmethod
     def start_prefill(cls):
-        # Prefill: TP=4, no EP, no DP attention, no spec module. The decode
-        # node runs EAGLE MTP; --disaggregation-decode-speculative-algorithm
-        # tells prefill to size the metadata hidden-state buffer to match.
+        # Prefill: TP=4, no EP, no DP attention, no spec module. The
+        # metadata hidden-state buffer is sized to spec_hidden_size on
+        # both sides automatically (see scheduler.py), so no flag tells
+        # prefill about the decode-side spec config.
         prefill_args = [
             "--trust-remote-code",
             "--disaggregation-mode",
@@ -88,8 +89,6 @@ def start_prefill(cls):
             "4",
             "--disaggregation-decode-dp",
             "4",
-            "--disaggregation-decode-speculative-algorithm",
-            "EAGLE",
             *cls.transfer_backend,
             *cls.rdma_devices,
         ]

From deecaaef85f635c41a07bf051b413fc61e93119d Mon Sep 17 00:00:00 2001
From: hnyls2002 <lsyincs@gmail.com>
Date: Tue, 28 Apr 2026 12:29:26 -0700
Subject: [PATCH 3/3] trim comments

---
 python/sglang/srt/managers/scheduler.py       | 15 +++-----
 .../8-gpu-models/test_dsv4_pd_disagg_nixl.py  | 34 +++----------------
 2 files changed, 9 insertions(+), 40 deletions(-)

diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py
index a0d51f4d64d1..4f510b61cd7b 100644
--- a/python/sglang/srt/managers/scheduler.py
+++ b/python/sglang/srt/managers/scheduler.py
@@ -892,12 +892,8 @@ def init_disaggregation(self):
             self.req_to_metadata_buffer_idx_allocator = ReqToMetadataIdxAllocator(
                 buffer_size
             )
-            # Always allocate the spec hidden buffer at full size so the
-            # decode side aligns with prefill regardless of which side
-            # actually runs the spec module (asymmetric P/D). When neither
-            # side runs spec the buffer is unused but the allocation cost
-            # is bounded (~few MB) and the alternative would be a wire
-            # protocol mismatch in asymmetric configs.
+            # Full-size buffer on both sides so the wire layout aligns
+            # under asymmetric P/D where one side may not run spec.
             self.disagg_metadata_buffers = MetadataBuffers(
                 buffer_size,
                 hidden_size=model_config.spec_hidden_size,
@@ -944,11 +940,8 @@ def init_disaggregation(self):
             self.req_to_metadata_buffer_idx_allocator = ReqToMetadataIdxAllocator(
                 buffer_size
             )
-            # Always allocate the spec hidden buffer at full size; see the
-            # matching comment on the decode branch above. When prefill has
-            # no spec module of its own, the buffer stays zero-initialized
-            # and decode treats it as mock conditioning for the first draft
-            # step (verified spec keeps the output token correct).
+            # See decode branch above. Asymmetric P/D: a prefill without a spec
+            # module ships zeros, which decode uses as mock first-step conditioning.
             self.disagg_metadata_buffers = MetadataBuffers(
                 buffer_size,
                 hidden_size=model_config.spec_hidden_size,
diff --git a/test/registered/8-gpu-models/test_dsv4_pd_disagg_nixl.py b/test/registered/8-gpu-models/test_dsv4_pd_disagg_nixl.py
index e11f3eb2d9b1..970868d6e35c 100644
--- a/test/registered/8-gpu-models/test_dsv4_pd_disagg_nixl.py
+++ b/test/registered/8-gpu-models/test_dsv4_pd_disagg_nixl.py
@@ -1,24 +1,7 @@
-"""DSv4 Flash PD-disaggregation test with NIXL transfer backend.
-
-Asymmetric P/D config: prefill runs no spec module, decode runs EAGLE
-MTP. The metadata hidden-state buffer is allocated at full
-spec_hidden_size on both sides unconditionally; prefill has no draft
-model to populate it, so the wire data stays zero, and decode treats
-the zeros as mock conditioning for the first draft step. Verified
-spec decoding makes those bad drafts get rejected by target; from
-iteration 2 onward, real target hidden flows through normally.
-Amortized cost is ~1 wasted draft step per request, < 1% throughput
-hit on long generations.
-
-Topology (1 H200 node, 8 GPUs total):
-  - Prefill: GPU 0-3, tp=4 — pure TP, no EP (no deepep), no DP
-    attention, no draft model. Optimized for prompt-side throughput.
-  - Decode:  GPU 4-7, tp=4 dp=4 enable-dp-attention + deepep + EAGLE
-    MTP — optimized for low-latency decode.
-  - Mini load balancer fronting both.
-
-Both sides use DSv4 Flash FP8 weights. Transfer backend is NIXL.
-"""
+"""DSv4 Flash PD-disagg with NIXL backend, asymmetric: prefill is pure
+TP with no spec module, decode runs EAGLE MTP. Prefill ships a
+zero-initialized hidden buffer; decode uses it as mock first-step
+conditioning, and verification keeps the output correct."""
 
 import unittest
 from types import SimpleNamespace
@@ -40,9 +23,7 @@
 
 DSV4_FLASH_ENV = {
     "SGLANG_DSV4_FP4_EXPERTS": "0",
-    # Decode side runs MTP with num_draft_tokens=4 → dispatch input scales
-    # by ~4x, so default 256 overflows once cuda-graph-max-bs * draft > 256.
-    # 1024 covers bs=128 * 4 with headroom (no-op on prefill which has no EP).
+    # MTP num_draft_tokens=4 scales dispatch by ~4x; default 256 overflows at bs=128.
     "SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK": "1024",
     "SGLANG_JIT_DEEPGEMM_PRECOMPILE": "0",
 }
@@ -67,10 +48,6 @@ def setUpClass(cls):
 
     @classmethod
     def start_prefill(cls):
-        # Prefill: TP=4, no EP, no DP attention, no spec module. The
-        # metadata hidden-state buffer is sized to spec_hidden_size on
-        # both sides automatically (see scheduler.py), so no flag tells
-        # prefill about the decode-side spec config.
         prefill_args = [
             "--trust-remote-code",
             "--disaggregation-mode",
@@ -102,7 +79,6 @@ def start_prefill(cls):
 
     @classmethod
     def start_decode(cls):
-        # Decode: TP=4 + DP=4 attention + deepep EP + EAGLE MTP.
         decode_args = [
             "--trust-remote-code",
             "--disaggregation-mode",