From 8ab651f129635eba2b5a85d8801b57c6c28145e1 Mon Sep 17 00:00:00 2001 From: chx96642264 Date: Sat, 28 Mar 2026 18:15:34 +0800 Subject: [PATCH 1/2] NPU can use piecewise CUDA graph when it is explicitly enabled --- python/sglang/srt/server_args.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index c770f3d161f4..673d1eb75c4c 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -629,6 +629,7 @@ class ServerArgs: enable_single_batch_overlap: bool = False tbo_token_distribution_threshold: float = 0.48 enable_torch_compile: bool = False + enable_piecewise_cuda_graph: bool = False disable_piecewise_cuda_graph: bool = False enforce_piecewise_cuda_graph: bool = False enable_torch_compile_debug_mode: bool = False @@ -1117,6 +1118,10 @@ def _handle_piecewise_cuda_graph(self): if self.enable_eplb or self.expert_distribution_recorder_mode is not None: self.disable_piecewise_cuda_graph = True + # NPU can use piecewise CUDA graph when it is explicitly enabled + if self.enable_piecewise_cuda_graph: + self.disable_piecewise_cuda_graph = False + def _handle_gpu_memory_settings(self, gpu_mem): """ Configure GPU memory-dependent settings including @@ -5384,6 +5389,11 @@ def add_cli_args(parser: argparse.ArgumentParser): action="store_true", help="Enable debug mode for torch compile", ) + parser.add_argument( + "--enable-piecewise-cuda-graph", + action="store_true", + help="Optimize the model with piecewise cuda graph for extend/prefill only.", + ) parser.add_argument( "--disable-piecewise-cuda-graph", action="store_true", From e11ed5393fe53d236131b3c967665756aa263ffe Mon Sep 17 00:00:00 2001 From: chx96642264 Date: Sat, 28 Mar 2026 18:23:44 +0800 Subject: [PATCH 2/2] NPU can use piecewise CUDA graph when it is explicitly enabled (part 2) --- python/sglang/srt/server_args.py | 9 ++------- 1 file changed, 2 insertions(+), 7 
deletions(-) diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index 673d1eb75c4c..e7db9bf9f579 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -5389,11 +5389,6 @@ def add_cli_args(parser: argparse.ArgumentParser): action="store_true", help="Enable debug mode for torch compile", ) - parser.add_argument( - "--enable-piecewise-cuda-graph", - action="store_true", - help="Optimize the model with piecewise cuda graph for extend/prefill only.", - ) parser.add_argument( "--disable-piecewise-cuda-graph", action="store_true", @@ -5401,8 +5396,8 @@ def add_cli_args(parser: argparse.ArgumentParser): ) parser.add_argument( "--enable-piecewise-cuda-graph", - action=DeprecatedAction, - help="Deprecated: Piecewise cuda graph is enabled by default. Use --enforce-piecewise-cuda-graph to skip auto-disable conditions.", + action="store_true", + help="Optimize the model with piecewise cuda graph for extend/prefill only.", ) parser.add_argument( "--enforce-piecewise-cuda-graph",