From 18986f4fa5ee4fcdd5778010e2434682f188f7be Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Wed, 17 Dec 2025 23:20:18 +0000 Subject: [PATCH 1/5] partial revert Signed-off-by: Lucas Wilkinson --- vllm/config/compilation.py | 48 ++++++++++++-------------------------- 1 file changed, 15 insertions(+), 33 deletions(-) diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index 4a98494b3c7b..6155e67422bd 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -958,44 +958,26 @@ def set_splitting_ops_for_v1( self.cudagraph_mode = CUDAGraphMode.FULL self.splitting_ops = [] - # split MoE ops for cudagraph - moe_ops = [ - "vllm::moe_forward", - "vllm::moe_forward_shared", - ] + # Disable CUDA graphs for DeepEP high-throughput with DP > 1 backend = all2all_backend or envs.VLLM_ALL2ALL_BACKEND dp_size = data_parallel_size if data_parallel_size is not None else 1 - need_moe_splitting = ( + if ( backend == "deepep_high_throughput" and dp_size > 1 - # pure attn-fusion without inductor partition deliberately disables - # piecewise graphs and MoE splitting. - and not ( - self.pass_config.fuse_attn_quant - and not self.use_inductor_graph_partition + and self.cudagraph_mode != CUDAGraphMode.NONE + ): + # TODO: Piecewise Cuda graph might be enabled + # if torch compile cache key issue fixed + # See https://github.com/vllm-project/vllm/pull/25093 + logger.info( + "WideEP: Disabling CUDA Graphs since DeepEP high-throughput " + "kernels are optimized for prefill and are incompatible with " + "CUDA Graphs. " + "In order to use CUDA Graphs for decode-optimized workloads, " + "use --all2all-backend with another option, such as " + "deepep_low_latency, pplx, or allgather_reducescatter." 
) - ) - - if need_moe_splitting and self.cudagraph_mode != CUDAGraphMode.NONE: - # if we just initialized default splitting_ops for this config, - # automatically append the MoE ops - if added_default_splitting_ops: - for op in moe_ops: - if op not in self.splitting_ops: - self.splitting_ops.append(op) - - # make sure MoE ops are split out - if not any(op in self.splitting_ops for op in moe_ops): - self.cudagraph_mode = CUDAGraphMode.NONE - logger.warning_once( - "DeepEP high throughput backend with data_parallel_size > 1 " - "requires splitting MoE ops from cudagraphs. Please ensure " - "'vllm::moe_forward' or 'vllm::moe_forward_shared' are " - "present in CompilationConfig.splitting_ops." - ) - elif self.cudagraph_mode.has_full_cudagraphs(): - # fall back to piecewise when MoE splitting is required. - self.cudagraph_mode = CUDAGraphMode.PIECEWISE + self.cudagraph_mode = CUDAGraphMode.NONE def set_splitting_ops_for_attn_fusion(self): assert self.pass_config.fuse_attn_quant From ef34bf9cecf01ab673ef8dd7de4bbd22043ca6ea Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Wed, 17 Dec 2025 23:22:12 +0000 Subject: [PATCH 2/5] wip Signed-off-by: Lucas Wilkinson --- tests/compile/test_config.py | 38 ------------------------------------ 1 file changed, 38 deletions(-) diff --git a/tests/compile/test_config.py b/tests/compile/test_config.py index 04bb56ecb647..6435d87ba763 100644 --- a/tests/compile/test_config.py +++ b/tests/compile/test_config.py @@ -233,24 +233,6 @@ def test_splitting_ops_dynamic(): assert config.compilation_config.cudagraph_mode == CUDAGraphMode.PIECEWISE -def test_moe_splitting_ops_deepep_ht_piecewise(): - # Non-inductor, non-attn-fusion case: DeepEP HT with dp>1 - # should add MoE ops to splitting_ops on top of attention ops. 
- config = VllmConfig( - parallel_config=ParallelConfig( - all2all_backend="deepep_high_throughput", - data_parallel_size=8, - ), - compilation_config=CompilationConfig( - mode=CompilationMode.VLLM_COMPILE, - ), - ) - splitting_ops = config.compilation_config.splitting_ops - assert splitting_ops is not None - assert "vllm::moe_forward" in splitting_ops - assert "vllm::moe_forward_shared" in splitting_ops - - def test_moe_splitting_ops_deepep_ht_inductor_partition(): # Inductor partition case: user-provided splitting_ops should be # preserved and MoE ops should be appended for DeepEP HT with dp>1. @@ -277,26 +259,6 @@ def test_moe_splitting_ops_deepep_ht_inductor_partition(): ] -def test_moe_splitting_ops_deepep_ht_attn_fusion_no_inductor(): - # Pure attn-fusion case without inductor partition: even with - # DeepEP HT and dp>1, we should not re-enable piecewise compilation - # or add MoE ops into splitting_ops. - config = VllmConfig( - parallel_config=ParallelConfig( - all2all_backend="deepep_high_throughput", - data_parallel_size=8, - ), - compilation_config=CompilationConfig( - mode=CompilationMode.VLLM_COMPILE, - pass_config={"fuse_attn_quant": True, "eliminate_noops": True}, - custom_ops=["+quant_fp8"], - cudagraph_mode=CUDAGraphMode.PIECEWISE, - ), - ) - assert config.compilation_config.splitting_ops == [] - assert config.compilation_config.cudagraph_mode == CUDAGraphMode.FULL - - def test_should_split(): import torch From 417e0dc02b5a476b8886d83619e10dc2ea5fadcf Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Wed, 17 Dec 2025 18:30:11 -0500 Subject: [PATCH 3/5] Update vllm/config/compilation.py Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Signed-off-by: Lucas Wilkinson --- vllm/config/compilation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index 6155e67422bd..488d2dae8e9b 100644 --- a/vllm/config/compilation.py +++ 
b/vllm/config/compilation.py @@ -970,7 +970,7 @@ def set_splitting_ops_for_v1( # if torch compile cache key issue fixed # See https://github.com/vllm-project/vllm/pull/25093 logger.info( - "WideEP: Disabling CUDA Graphs since DeepEP high-throughput " + "DeepEP: Disabling CUDA Graphs since DeepEP high-throughput " "kernels are optimized for prefill and are incompatible with " "CUDA Graphs. " "In order to use CUDA Graphs for decode-optimized workloads, " From 9388297e6cbdbd3e2ec70bde0764f3d9e30bca60 Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Wed, 17 Dec 2025 23:34:43 +0000 Subject: [PATCH 4/5] remove added Signed-off-by: Lucas Wilkinson --- vllm/config/compilation.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index 488d2dae8e9b..6a43ffbd93d5 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -915,8 +915,6 @@ def set_splitting_ops_for_v1( "mode is CompilationMode.VLLM_COMPILE" ) - added_default_splitting_ops = False - if self.pass_config.fuse_attn_quant and not self.use_inductor_graph_partition: self.set_splitting_ops_for_attn_fusion() else: @@ -930,7 +928,6 @@ def set_splitting_ops_for_v1( # for details. Make a copy to avoid mutating the class-level # list via reference. 
self.splitting_ops = list(self._attention_ops) - added_default_splitting_ops = True elif len(self.splitting_ops) == 0: if ( self.cudagraph_mode == CUDAGraphMode.PIECEWISE From ee7bd133f3383c27e76889cedf0ce978ed904a2c Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Wed, 17 Dec 2025 23:38:12 +0000 Subject: [PATCH 5/5] cleanup Signed-off-by: Lucas Wilkinson --- vllm/config/compilation.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index 6a43ffbd93d5..c3cbc4647a4d 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -955,7 +955,7 @@ def set_splitting_ops_for_v1( self.cudagraph_mode = CUDAGraphMode.FULL self.splitting_ops = [] - # Disable CUDA graphs for DeepEP high-throughput with DP > 1 + # Disable CUDA graphs for DeepEP high-throughput since it's not CG compatible backend = all2all_backend or envs.VLLM_ALL2ALL_BACKEND dp_size = data_parallel_size if data_parallel_size is not None else 1 if ( @@ -967,9 +967,8 @@ def set_splitting_ops_for_v1( # if torch compile cache key issue fixed # See https://github.com/vllm-project/vllm/pull/25093 logger.info( - "DeepEP: Disabling CUDA Graphs since DeepEP high-throughput " - "kernels are optimized for prefill and are incompatible with " - "CUDA Graphs. " + "DeepEP: Disabling CUDA Graphs since DeepEP high-throughput kernels " + "are optimized for prefill and are incompatible with CUDA Graphs. " "In order to use CUDA Graphs for decode-optimized workloads, " "use --all2all-backend with another option, such as " "deepep_low_latency, pplx, or allgather_reducescatter."