From bfc40cc88f9ad89eac78b5cc1717156824177d1f Mon Sep 17 00:00:00 2001 From: hnyls2002 Date: Tue, 28 Apr 2026 11:33:53 -0700 Subject: [PATCH 1/3] support asymmetric pd-mtp via decode-spec-algo flag --- python/sglang/srt/managers/scheduler.py | 24 ++++++++--- python/sglang/srt/server_args.py | 18 +++++++++ .../8-gpu-models/test_dsv4_pd_disagg_nixl.py | 40 +++++++++---------- 3 files changed, 54 insertions(+), 28 deletions(-) diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py index 91ad8c04d4cb..c283d6749711 100644 --- a/python/sglang/srt/managers/scheduler.py +++ b/python/sglang/srt/managers/scheduler.py @@ -946,19 +946,31 @@ def init_disaggregation(self): self.req_to_metadata_buffer_idx_allocator = ReqToMetadataIdxAllocator( buffer_size ) + # Asymmetric P/D: prefill itself runs no spec module but the + # decode node does. Match the decode-side hidden-state buffer + # so the cross-node transfer aligns; prefill leaves the buffer + # zero-initialized and decode treats it as mock conditioning. + decode_spec_algo = ( + self.server_args.disaggregation_decode_speculative_algorithm + ) + decode_side_needs_hidden = decode_spec_algo in ( + "EAGLE", + "EAGLE3", + "STANDALONE", + ) + local_spec_needs_hidden = ( + self.spec_algorithm.is_eagle() or self.spec_algorithm.is_standalone() + ) + needs_hidden = local_spec_needs_hidden or decode_side_needs_hidden self.disagg_metadata_buffers = MetadataBuffers( buffer_size, hidden_size=( model_config.spec_hidden_size - if self.spec_algorithm.is_eagle() - or self.spec_algorithm.is_standalone() + if needs_hidden else 16 # minimal padding size for RDMA ), hidden_states_dtype=( - model_config.dtype - if self.spec_algorithm.is_eagle() - or self.spec_algorithm.is_standalone() - else torch.float32 + model_config.dtype if needs_hidden else torch.float32 ), custom_mem_pool=self.token_to_kv_pool_allocator.get_kvcache().maybe_get_custom_mem_pool(), ) diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index 41a93d8bc66c..82e77cea04bb 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -665,6 +665,11 @@ class ServerArgs: disaggregation_bootstrap_port: int = 8998 disaggregation_decode_tp: Optional[int] = None disaggregation_decode_dp: Optional[int] = None + # Tells a non-spec prefill node which speculative algorithm the decode + # node runs, so prefill sizes its metadata hidden-state buffer to match + # the decode-side spec module. Decode reads its own --speculative-* + # args and ignores this. Allowed values match --speculative-algorithm. + disaggregation_decode_speculative_algorithm: Optional[str] = None disaggregation_prefill_pp: Optional[int] = 1 disaggregation_ib_device: Optional[str] = None disaggregation_decode_enable_offload_kvcache: bool = False @@ -5030,6 +5035,19 @@ def add_cli_args(parser: argparse.ArgumentParser): default=ServerArgs.disaggregation_decode_dp, help="Decode dp size. If not set, it matches the dp size of the current engine. This is only set on the prefill server.", ) + parser.add_argument( + "--disaggregation-decode-speculative-algorithm", + type=str, + default=ServerArgs.disaggregation_decode_speculative_algorithm, + choices=["EAGLE", "EAGLE3", "STANDALONE"], + help="Speculative algorithm running on the decode node. Set on a " + "prefill node that itself does not run a draft model so the " + "metadata hidden-state buffer is sized to match the decode-side " + "spec module. Decode bootstraps spec_info from a zero-init " + "buffer (mock) and recovers real conditioning after the first " + "verify; only the first decode iteration's accept length is " + "affected.", + ) parser.add_argument( "--disaggregation-prefill-pp", type=int, diff --git a/test/registered/8-gpu-models/test_dsv4_pd_disagg_nixl.py b/test/registered/8-gpu-models/test_dsv4_pd_disagg_nixl.py index 15a4c4f4bd21..b179502e273d 100644 --- a/test/registered/8-gpu-models/test_dsv4_pd_disagg_nixl.py +++ b/test/registered/8-gpu-models/test_dsv4_pd_disagg_nixl.py @@ -1,21 +1,23 @@ """DSv4 Flash PD-disaggregation test with NIXL transfer backend. +Asymmetric P/D config: prefill runs no spec module, decode runs EAGLE +MTP. Prefill uses --disaggregation-decode-speculative-algorithm so it +sizes the metadata hidden-state buffer to match the decode-side spec +module; the buffer is shipped zero-initialized (prefill has no draft +model to populate it). Decode treats the zeros as mock conditioning +for the first draft step. Verified spec decoding makes those bad +drafts get rejected by target; from the second iteration onward, real +target hidden flows through normally. Amortized cost is ~1 wasted +draft step per request, < 1% throughput hit on long generations. + Topology (1 H200 node, 8 GPUs total): - - Prefill: GPU 0-3, tp=4 — pure TP, **no EP** (no deepep), no DP - attention. Optimized for throughput on long prompts; each rank - holds the full MoE weights, no all-to-all dispatch traffic. - Spec config matches decode (PD ferry currently assumes symmetric - spec on both sides) so the prefill -> decode metadata buffer - is sized correctly for the spec module's hidden shape. + - Prefill: GPU 0-3, tp=4 — pure TP, no EP (no deepep), no DP + attention, no draft model. Optimized for prompt-side throughput. - Decode: GPU 4-7, tp=4 dp=4 enable-dp-attention + deepep + EAGLE - MTP — optimized for low-latency decode with spec decoding and - expert parallelism. + MTP — optimized for low-latency decode. - Mini load balancer fronting both. -Both sides use DSv4 Flash FP8 weights. Transfer backend is NIXL -(the focus of recent nixl/conn.py forward-delta work; this test is -the e2e check that the generic `send_state` / shared buffer-pool -changes do not break PD). +Both sides use DSv4 Flash FP8 weights. Transfer backend is NIXL. """ import unittest @@ -65,9 +67,9 @@ def setUpClass(cls): @classmethod def start_prefill(cls): - # Prefill: TP=4 (no EP, no DP attention). EAGLE config mirrors decode - # so the metadata buffer is sized for the spec module's hidden shape; - # PD ferry currently assumes both sides agree on the spec algorithm. + # Prefill: TP=4, no EP, no DP attention, no spec module. The decode + # node runs EAGLE MTP; --disaggregation-decode-speculative-algorithm + # tells prefill to size the metadata hidden-state buffer to match. prefill_args = [ "--trust-remote-code", "--disaggregation-mode", @@ -86,14 +88,8 @@ def start_prefill(cls): "4", "--disaggregation-decode-dp", "4", - "--speculative-algorithm", + "--disaggregation-decode-speculative-algorithm", "EAGLE", - "--speculative-num-steps", - "3", - "--speculative-eagle-topk", - "1", - "--speculative-num-draft-tokens", - "4", *cls.transfer_backend, *cls.rdma_devices, ] From a47e7f55570e15343da06d84f08a6eb0013907dd Mon Sep 17 00:00:00 2001 From: hnyls2002 Date: Tue, 28 Apr 2026 11:43:14 -0700 Subject: [PATCH 2/3] drop server arg; always size pd hidden buffer to spec_hidden_size --- python/sglang/srt/managers/scheduler.py | 49 ++++++------------- python/sglang/srt/server_args.py | 18 ------- .../8-gpu-models/test_dsv4_pd_disagg_nixl.py | 25 +++++----- 3 files changed, 27 insertions(+), 65 deletions(-) diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py index c283d6749711..a0d51f4d64d1 100644 --- a/python/sglang/srt/managers/scheduler.py +++ b/python/sglang/srt/managers/scheduler.py @@ -892,18 +892,16 @@ def init_disaggregation(self): self.req_to_metadata_buffer_idx_allocator = ReqToMetadataIdxAllocator( buffer_size ) + # Always allocate the spec hidden buffer at full size so the + # decode side aligns with prefill regardless of which side + # actually runs the spec module (asymmetric P/D). When neither + # side runs spec the buffer is unused but the allocation cost + # is bounded (~few MB) and the alternative would be a wire + # protocol mismatch in asymmetric configs. self.disagg_metadata_buffers = MetadataBuffers( buffer_size, - hidden_size=( - model_config.spec_hidden_size - if self.spec_algorithm.is_eagle() - else 16 # minimal padding size for RDMA - ), - hidden_states_dtype=( - model_config.dtype - if self.spec_algorithm.is_eagle() - else torch.float32 - ), + hidden_size=model_config.spec_hidden_size, + hidden_states_dtype=model_config.dtype, custom_mem_pool=self.token_to_kv_pool_allocator.get_kvcache().maybe_get_custom_mem_pool(), ) @@ -946,32 +944,15 @@ def init_disaggregation(self): self.req_to_metadata_buffer_idx_allocator = ReqToMetadataIdxAllocator( buffer_size ) - # Asymmetric P/D: prefill itself runs no spec module but the - # decode node does. Match the decode-side hidden-state buffer - # so the cross-node transfer aligns; prefill leaves the buffer - # zero-initialized and decode treats it as mock conditioning. - decode_spec_algo = ( - self.server_args.disaggregation_decode_speculative_algorithm - ) - decode_side_needs_hidden = decode_spec_algo in ( - "EAGLE", - "EAGLE3", - "STANDALONE", - ) - local_spec_needs_hidden = ( - self.spec_algorithm.is_eagle() or self.spec_algorithm.is_standalone() - ) - needs_hidden = local_spec_needs_hidden or decode_side_needs_hidden + # Always allocate the spec hidden buffer at full size; see the + # matching comment on the decode branch above. When prefill has + # no spec module of its own, the buffer stays zero-initialized + # and decode treats it as mock conditioning for the first draft + # step (verified spec keeps the output token correct). self.disagg_metadata_buffers = MetadataBuffers( buffer_size, - hidden_size=( - model_config.spec_hidden_size - if needs_hidden - else 16 # minimal padding size for RDMA - ), - hidden_states_dtype=( - model_config.dtype if needs_hidden else torch.float32 - ), + hidden_size=model_config.spec_hidden_size, + hidden_states_dtype=model_config.dtype, custom_mem_pool=self.token_to_kv_pool_allocator.get_kvcache().maybe_get_custom_mem_pool(), ) diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index 82e77cea04bb..41a93d8bc66c 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -665,11 +665,6 @@ class ServerArgs: disaggregation_bootstrap_port: int = 8998 disaggregation_decode_tp: Optional[int] = None disaggregation_decode_dp: Optional[int] = None - # Tells a non-spec prefill node which speculative algorithm the decode - # node runs, so prefill sizes its metadata hidden-state buffer to match - # the decode-side spec module. Decode reads its own --speculative-* - # args and ignores this. Allowed values match --speculative-algorithm. - disaggregation_decode_speculative_algorithm: Optional[str] = None disaggregation_prefill_pp: Optional[int] = 1 disaggregation_ib_device: Optional[str] = None disaggregation_decode_enable_offload_kvcache: bool = False @@ -5035,19 +5030,6 @@ def add_cli_args(parser: argparse.ArgumentParser): default=ServerArgs.disaggregation_decode_dp, help="Decode dp size. If not set, it matches the dp size of the current engine. This is only set on the prefill server.", ) - parser.add_argument( - "--disaggregation-decode-speculative-algorithm", - type=str, - default=ServerArgs.disaggregation_decode_speculative_algorithm, - choices=["EAGLE", "EAGLE3", "STANDALONE"], - help="Speculative algorithm running on the decode node. Set on a " - "prefill node that itself does not run a draft model so the " - "metadata hidden-state buffer is sized to match the decode-side " - "spec module. Decode bootstraps spec_info from a zero-init " - "buffer (mock) and recovers real conditioning after the first " - "verify; only the first decode iteration's accept length is " - "affected.", - ) parser.add_argument( "--disaggregation-prefill-pp", type=int, diff --git a/test/registered/8-gpu-models/test_dsv4_pd_disagg_nixl.py b/test/registered/8-gpu-models/test_dsv4_pd_disagg_nixl.py index b179502e273d..e11f3eb2d9b1 100644 --- a/test/registered/8-gpu-models/test_dsv4_pd_disagg_nixl.py +++ b/test/registered/8-gpu-models/test_dsv4_pd_disagg_nixl.py @@ -1,14 +1,14 @@ """DSv4 Flash PD-disaggregation test with NIXL transfer backend. Asymmetric P/D config: prefill runs no spec module, decode runs EAGLE -MTP. Prefill uses --disaggregation-decode-speculative-algorithm so it -sizes the metadata hidden-state buffer to match the decode-side spec -module; the buffer is shipped zero-initialized (prefill has no draft -model to populate it). Decode treats the zeros as mock conditioning -for the first draft step. Verified spec decoding makes those bad -drafts get rejected by target; from the second iteration onward, real -target hidden flows through normally. Amortized cost is ~1 wasted -draft step per request, < 1% throughput hit on long generations. +MTP. The metadata hidden-state buffer is allocated at full +spec_hidden_size on both sides unconditionally; prefill has no draft +model to populate it, so the wire data stays zero, and decode treats +the zeros as mock conditioning for the first draft step. Verified +spec decoding makes those bad drafts get rejected by target; from +iteration 2 onward, real target hidden flows through normally. +Amortized cost is ~1 wasted draft step per request, < 1% throughput +hit on long generations. Topology (1 H200 node, 8 GPUs total): - Prefill: GPU 0-3, tp=4 — pure TP, no EP (no deepep), no DP @@ -67,9 +67,10 @@ def setUpClass(cls): @classmethod def start_prefill(cls): - # Prefill: TP=4, no EP, no DP attention, no spec module. The decode - # node runs EAGLE MTP; --disaggregation-decode-speculative-algorithm - # tells prefill to size the metadata hidden-state buffer to match. + # Prefill: TP=4, no EP, no DP attention, no spec module. The + # metadata hidden-state buffer is sized to spec_hidden_size on + # both sides automatically (see scheduler.py), so no flag tells + # prefill about the decode-side spec config. prefill_args = [ "--trust-remote-code", "--disaggregation-mode", @@ -88,8 +89,6 @@ def start_prefill(cls): "4", "--disaggregation-decode-dp", "4", - "--disaggregation-decode-speculative-algorithm", - "EAGLE", *cls.transfer_backend, *cls.rdma_devices, ] From deecaaef85f635c41a07bf051b413fc61e93119d Mon Sep 17 00:00:00 2001 From: hnyls2002 Date: Tue, 28 Apr 2026 12:29:26 -0700 Subject: [PATCH 3/3] trim comments --- python/sglang/srt/managers/scheduler.py | 15 +++----- .../8-gpu-models/test_dsv4_pd_disagg_nixl.py | 34 +++---------------- 2 files changed, 9 insertions(+), 40 deletions(-) diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py index a0d51f4d64d1..4f510b61cd7b 100644 --- a/python/sglang/srt/managers/scheduler.py +++ b/python/sglang/srt/managers/scheduler.py @@ -892,12 +892,8 @@ def init_disaggregation(self): self.req_to_metadata_buffer_idx_allocator = ReqToMetadataIdxAllocator( buffer_size ) - # Always allocate the spec hidden buffer at full size so the - # decode side aligns with prefill regardless of which side - # actually runs the spec module (asymmetric P/D). When neither - # side runs spec the buffer is unused but the allocation cost - # is bounded (~few MB) and the alternative would be a wire - # protocol mismatch in asymmetric configs. + # Full-size buffer on both sides so the wire layout aligns + # under asymmetric P/D where one side may not run spec. self.disagg_metadata_buffers = MetadataBuffers( buffer_size, hidden_size=model_config.spec_hidden_size, @@ -944,11 +940,8 @@ def init_disaggregation(self): self.req_to_metadata_buffer_idx_allocator = ReqToMetadataIdxAllocator( buffer_size ) - # Always allocate the spec hidden buffer at full size; see the - # matching comment on the decode branch above. When prefill has - # no spec module of its own, the buffer stays zero-initialized - # and decode treats it as mock conditioning for the first draft - # step (verified spec keeps the output token correct). + # See decode branch above. Asymmetric P/D: prefill without a + # spec module ships zeros, decode mocks first-step conditioning. self.disagg_metadata_buffers = MetadataBuffers( buffer_size, hidden_size=model_config.spec_hidden_size, diff --git a/test/registered/8-gpu-models/test_dsv4_pd_disagg_nixl.py b/test/registered/8-gpu-models/test_dsv4_pd_disagg_nixl.py index e11f3eb2d9b1..970868d6e35c 100644 --- a/test/registered/8-gpu-models/test_dsv4_pd_disagg_nixl.py +++ b/test/registered/8-gpu-models/test_dsv4_pd_disagg_nixl.py @@ -1,24 +1,7 @@ -"""DSv4 Flash PD-disaggregation test with NIXL transfer backend. - -Asymmetric P/D config: prefill runs no spec module, decode runs EAGLE -MTP. The metadata hidden-state buffer is allocated at full -spec_hidden_size on both sides unconditionally; prefill has no draft -model to populate it, so the wire data stays zero, and decode treats -the zeros as mock conditioning for the first draft step. Verified -spec decoding makes those bad drafts get rejected by target; from -iteration 2 onward, real target hidden flows through normally. -Amortized cost is ~1 wasted draft step per request, < 1% throughput -hit on long generations. - -Topology (1 H200 node, 8 GPUs total): - - Prefill: GPU 0-3, tp=4 — pure TP, no EP (no deepep), no DP - attention, no draft model. Optimized for prompt-side throughput. - - Decode: GPU 4-7, tp=4 dp=4 enable-dp-attention + deepep + EAGLE - MTP — optimized for low-latency decode. - - Mini load balancer fronting both. - -Both sides use DSv4 Flash FP8 weights. Transfer backend is NIXL. -""" +"""DSv4 Flash PD-disagg with NIXL backend, asymmetric: prefill is pure +TP with no spec module, decode runs EAGLE MTP. Prefill ships a zero- +init hidden buffer; decode mocks first-step conditioning, verify keeps +the output correct.""" import unittest from types import SimpleNamespace @@ -40,9 +23,7 @@ DSV4_FLASH_ENV = { "SGLANG_DSV4_FP4_EXPERTS": "0", - # Decode side runs MTP with num_draft_tokens=4 → dispatch input scales - # by ~4x, so default 256 overflows once cuda-graph-max-bs * draft > 256. - # 1024 covers bs=128 * 4 with headroom (no-op on prefill which has no EP). + # MTP num_draft_tokens=4 scales dispatch by ~4x; 256 overflows at bs=128. "SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK": "1024", "SGLANG_JIT_DEEPGEMM_PRECOMPILE": "0", } @@ -67,10 +48,6 @@ def setUpClass(cls): @classmethod def start_prefill(cls): - # Prefill: TP=4, no EP, no DP attention, no spec module. The - # metadata hidden-state buffer is sized to spec_hidden_size on - # both sides automatically (see scheduler.py), so no flag tells - # prefill about the decode-side spec config. prefill_args = [ "--trust-remote-code", "--disaggregation-mode", @@ -102,7 +79,6 @@ def start_prefill(cls): @classmethod def start_decode(cls): - # Decode: TP=4 + DP=4 attention + deepep EP + EAGLE MTP. decode_args = [ "--trust-remote-code", "--disaggregation-mode",