From 18986f4fa5ee4fcdd5778010e2434682f188f7be Mon Sep 17 00:00:00 2001
From: Lucas Wilkinson <lwilkins@redhat.com>
Date: Wed, 17 Dec 2025 23:20:18 +0000
Subject: [PATCH 1/5] partial revert

Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
---
 vllm/config/compilation.py | 48 ++++++++++++--------------------------
 1 file changed, 15 insertions(+), 33 deletions(-)

diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py
index 4a98494b3c7b..6155e67422bd 100644
--- a/vllm/config/compilation.py
+++ b/vllm/config/compilation.py
@@ -958,44 +958,26 @@ def set_splitting_ops_for_v1(
                     self.cudagraph_mode = CUDAGraphMode.FULL
                 self.splitting_ops = []
 
-        # split MoE ops for cudagraph
-        moe_ops = [
-            "vllm::moe_forward",
-            "vllm::moe_forward_shared",
-        ]
+        # Disable CUDA graphs for DeepEP high-throughput with DP > 1
         backend = all2all_backend or envs.VLLM_ALL2ALL_BACKEND
         dp_size = data_parallel_size if data_parallel_size is not None else 1
-        need_moe_splitting = (
+        if (
             backend == "deepep_high_throughput"
             and dp_size > 1
-            # pure attn-fusion without inductor partition deliberately disables
-            # piecewise graphs and MoE splitting.
-            and not (
-                self.pass_config.fuse_attn_quant
-                and not self.use_inductor_graph_partition
+            and self.cudagraph_mode != CUDAGraphMode.NONE
+        ):
+            # TODO: Piecewise Cuda graph might be enabled
+            # if torch compile cache key issue fixed
+            # See https://github.com/vllm-project/vllm/pull/25093
+            logger.info(
+                "WideEP: Disabling CUDA Graphs since DeepEP high-throughput "
+                "kernels are optimized for prefill and are incompatible with "
+                "CUDA Graphs. "
+                "In order to use CUDA Graphs for decode-optimized workloads, "
+                "use --all2all-backend with another option, such as "
+                "deepep_low_latency, pplx, or allgather_reducescatter."
             )
-        )
-
-        if need_moe_splitting and self.cudagraph_mode != CUDAGraphMode.NONE:
-            # if we just initialized default splitting_ops for this config,
-            # automatically append the MoE ops
-            if added_default_splitting_ops:
-                for op in moe_ops:
-                    if op not in self.splitting_ops:
-                        self.splitting_ops.append(op)
-
-            # make sure MoE ops are split out
-            if not any(op in self.splitting_ops for op in moe_ops):
-                self.cudagraph_mode = CUDAGraphMode.NONE
-                logger.warning_once(
-                    "DeepEP high throughput backend with data_parallel_size > 1 "
-                    "requires splitting MoE ops from cudagraphs. Please ensure "
-                    "'vllm::moe_forward' or 'vllm::moe_forward_shared' are "
-                    "present in CompilationConfig.splitting_ops."
-                )
-            elif self.cudagraph_mode.has_full_cudagraphs():
-                # fall back to piecewise when MoE splitting is required.
-                self.cudagraph_mode = CUDAGraphMode.PIECEWISE
+            self.cudagraph_mode = CUDAGraphMode.NONE
 
     def set_splitting_ops_for_attn_fusion(self):
         assert self.pass_config.fuse_attn_quant

From ef34bf9cecf01ab673ef8dd7de4bbd22043ca6ea Mon Sep 17 00:00:00 2001
From: Lucas Wilkinson <lwilkins@redhat.com>
Date: Wed, 17 Dec 2025 23:22:12 +0000
Subject: [PATCH 2/5] wip

Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
---
 tests/compile/test_config.py | 38 ------------------------------------
 1 file changed, 38 deletions(-)

diff --git a/tests/compile/test_config.py b/tests/compile/test_config.py
index 04bb56ecb647..6435d87ba763 100644
--- a/tests/compile/test_config.py
+++ b/tests/compile/test_config.py
@@ -233,24 +233,6 @@ def test_splitting_ops_dynamic():
     assert config.compilation_config.cudagraph_mode == CUDAGraphMode.PIECEWISE
 
 
-def test_moe_splitting_ops_deepep_ht_piecewise():
-    # Non-inductor, non-attn-fusion case: DeepEP HT with dp>1
-    # should add MoE ops to splitting_ops on top of attention ops.
-    config = VllmConfig(
-        parallel_config=ParallelConfig(
-            all2all_backend="deepep_high_throughput",
-            data_parallel_size=8,
-        ),
-        compilation_config=CompilationConfig(
-            mode=CompilationMode.VLLM_COMPILE,
-        ),
-    )
-    splitting_ops = config.compilation_config.splitting_ops
-    assert splitting_ops is not None
-    assert "vllm::moe_forward" in splitting_ops
-    assert "vllm::moe_forward_shared" in splitting_ops
-
-
 def test_moe_splitting_ops_deepep_ht_inductor_partition():
     # Inductor partition case: user-provided splitting_ops should be
     # preserved and MoE ops should be appended for DeepEP HT with dp>1.
@@ -277,26 +259,6 @@ def test_moe_splitting_ops_deepep_ht_inductor_partition():
     ]
 
 
-def test_moe_splitting_ops_deepep_ht_attn_fusion_no_inductor():
-    # Pure attn-fusion case without inductor partition: even with
-    # DeepEP HT and dp>1, we should not re-enable piecewise compilation
-    # or add MoE ops into splitting_ops.
-    config = VllmConfig(
-        parallel_config=ParallelConfig(
-            all2all_backend="deepep_high_throughput",
-            data_parallel_size=8,
-        ),
-        compilation_config=CompilationConfig(
-            mode=CompilationMode.VLLM_COMPILE,
-            pass_config={"fuse_attn_quant": True, "eliminate_noops": True},
-            custom_ops=["+quant_fp8"],
-            cudagraph_mode=CUDAGraphMode.PIECEWISE,
-        ),
-    )
-    assert config.compilation_config.splitting_ops == []
-    assert config.compilation_config.cudagraph_mode == CUDAGraphMode.FULL
-
-
 def test_should_split():
     import torch
 

From 417e0dc02b5a476b8886d83619e10dc2ea5fadcf Mon Sep 17 00:00:00 2001
From: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
Date: Wed, 17 Dec 2025 18:30:11 -0500
Subject: [PATCH 3/5] Update vllm/config/compilation.py

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Signed-off-by: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
---
 vllm/config/compilation.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py
index 6155e67422bd..488d2dae8e9b 100644
--- a/vllm/config/compilation.py
+++ b/vllm/config/compilation.py
@@ -970,7 +970,7 @@ def set_splitting_ops_for_v1(
             # if torch compile cache key issue fixed
             # See https://github.com/vllm-project/vllm/pull/25093
             logger.info(
-                "WideEP: Disabling CUDA Graphs since DeepEP high-throughput "
+                "DeepEP: Disabling CUDA Graphs since DeepEP high-throughput "
                 "kernels are optimized for prefill and are incompatible with "
                 "CUDA Graphs. "
                 "In order to use CUDA Graphs for decode-optimized workloads, "

From 9388297e6cbdbd3e2ec70bde0764f3d9e30bca60 Mon Sep 17 00:00:00 2001
From: Lucas Wilkinson <lwilkins@redhat.com>
Date: Wed, 17 Dec 2025 23:34:43 +0000
Subject: [PATCH 4/5] remove added

Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
---
 vllm/config/compilation.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py
index 488d2dae8e9b..6a43ffbd93d5 100644
--- a/vllm/config/compilation.py
+++ b/vllm/config/compilation.py
@@ -915,8 +915,6 @@ def set_splitting_ops_for_v1(
             "mode is CompilationMode.VLLM_COMPILE"
         )
 
-        added_default_splitting_ops = False
-
         if self.pass_config.fuse_attn_quant and not self.use_inductor_graph_partition:
             self.set_splitting_ops_for_attn_fusion()
         else:
@@ -930,7 +928,6 @@ def set_splitting_ops_for_v1(
                 # for details. Make a copy to avoid mutating the class-level
                 # list via reference.
                 self.splitting_ops = list(self._attention_ops)
-                added_default_splitting_ops = True
             elif len(self.splitting_ops) == 0:
                 if (
                     self.cudagraph_mode == CUDAGraphMode.PIECEWISE

From ee7bd133f3383c27e76889cedf0ce978ed904a2c Mon Sep 17 00:00:00 2001
From: Lucas Wilkinson <lwilkins@redhat.com>
Date: Wed, 17 Dec 2025 23:38:12 +0000
Subject: [PATCH 5/5] cleanup

Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
---
 vllm/config/compilation.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py
index 6a43ffbd93d5..c3cbc4647a4d 100644
--- a/vllm/config/compilation.py
+++ b/vllm/config/compilation.py
@@ -955,7 +955,7 @@ def set_splitting_ops_for_v1(
                     self.cudagraph_mode = CUDAGraphMode.FULL
                 self.splitting_ops = []
 
-        # Disable CUDA graphs for DeepEP high-throughput with DP > 1
+        # Disable CUDA graphs for DeepEP high-throughput since it's not CG compatible
         backend = all2all_backend or envs.VLLM_ALL2ALL_BACKEND
         dp_size = data_parallel_size if data_parallel_size is not None else 1
         if (
@@ -967,9 +967,8 @@ def set_splitting_ops_for_v1(
             # if torch compile cache key issue fixed
             # See https://github.com/vllm-project/vllm/pull/25093
             logger.info(
-                "DeepEP: Disabling CUDA Graphs since DeepEP high-throughput "
-                "kernels are optimized for prefill and are incompatible with "
-                "CUDA Graphs. "
+                "DeepEP: Disabling CUDA Graphs since DeepEP high-throughput kernels "
+                "are optimized for prefill and are incompatible with CUDA Graphs. "
                 "In order to use CUDA Graphs for decode-optimized workloads, "
                 "use --all2all-backend with another option, such as "
                 "deepep_low_latency, pplx, or allgather_reducescatter."