From 8ab651f129635eba2b5a85d8801b57c6c28145e1 Mon Sep 17 00:00:00 2001
From: chx96642264 <chenhaoxuan2@h-partners.com>
Date: Sat, 28 Mar 2026 18:15:34 +0800
Subject: [PATCH 1/2] Allow NPU to use piecewise CUDA graph when it is
 explicitly enabled

---
 python/sglang/srt/server_args.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py
index c770f3d161f4..673d1eb75c4c 100644
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -629,6 +629,7 @@ class ServerArgs:
     enable_single_batch_overlap: bool = False
     tbo_token_distribution_threshold: float = 0.48
     enable_torch_compile: bool = False
+    enable_piecewise_cuda_graph: bool = False
     disable_piecewise_cuda_graph: bool = False
     enforce_piecewise_cuda_graph: bool = False
     enable_torch_compile_debug_mode: bool = False
@@ -1117,6 +1118,10 @@ def _handle_piecewise_cuda_graph(self):
         if self.enable_eplb or self.expert_distribution_recorder_mode is not None:
             self.disable_piecewise_cuda_graph = True
 
+        # An explicit enable_piecewise_cuda_graph (e.g. on NPU) clears the disable flag set above
+        if self.enable_piecewise_cuda_graph:
+            self.disable_piecewise_cuda_graph = False
+
     def _handle_gpu_memory_settings(self, gpu_mem):
         """
         Configure GPU memory-dependent settings including
@@ -5384,6 +5389,11 @@ def add_cli_args(parser: argparse.ArgumentParser):
             action="store_true",
             help="Enable debug mode for torch compile",
         )
+        parser.add_argument(
+            "--enable-piecewise-cuda-graph",
+            action="store_true",
+            help="Optimize the model with piecewise cuda graph for extend/prefill only.",
+        )
         parser.add_argument(
             "--disable-piecewise-cuda-graph",
             action="store_true",

From e11ed5393fe53d236131b3c967665756aa263ffe Mon Sep 17 00:00:00 2001
From: chx96642264 <chenhaoxuan2@h-partners.com>
Date: Sat, 28 Mar 2026 18:23:44 +0800
Subject: [PATCH 2/2] Allow NPU to use piecewise CUDA graph when it is
 explicitly enabled (part 2: reuse the existing CLI flag)

---
 python/sglang/srt/server_args.py | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py
index 673d1eb75c4c..e7db9bf9f579 100644
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -5389,11 +5389,6 @@ def add_cli_args(parser: argparse.ArgumentParser):
             action="store_true",
             help="Enable debug mode for torch compile",
         )
-        parser.add_argument(
-            "--enable-piecewise-cuda-graph",
-            action="store_true",
-            help="Optimize the model with piecewise cuda graph for extend/prefill only.",
-        )
         parser.add_argument(
             "--disable-piecewise-cuda-graph",
             action="store_true",
@@ -5401,8 +5396,8 @@ def add_cli_args(parser: argparse.ArgumentParser):
         )
         parser.add_argument(
             "--enable-piecewise-cuda-graph",
-            action=DeprecatedAction,
-            help="Deprecated: Piecewise cuda graph is enabled by default. Use --enforce-piecewise-cuda-graph to skip auto-disable conditions.",
+            action="store_true",
+            help="Optimize the model with piecewise cuda graph for extend/prefill only.",
         )
         parser.add_argument(
             "--enforce-piecewise-cuda-graph",