diff --git a/python/sglang/srt/models/deepseek_v2.py b/python/sglang/srt/models/deepseek_v2.py index 584b15bf40a5..154a6de7f5fd 100644 --- a/python/sglang/srt/models/deepseek_v2.py +++ b/python/sglang/srt/models/deepseek_v2.py @@ -787,12 +787,13 @@ def forward( gemm_output_zero_allocator: BumpAllocator = None, ) -> torch.Tensor: if not self._enable_a2a_moe: - DUAL_STREAM_TOKEN_THRESHOLD = 1024 + from sglang.srt.model_executor.cuda_graph_runner import get_is_capture_mode + if ( self.alt_stream is not None and self.num_fused_shared_experts == 0 and hidden_states.shape[0] > 0 - and hidden_states.shape[0] <= DUAL_STREAM_TOKEN_THRESHOLD + and get_is_capture_mode() ): return self.forward_normal_dual_stream( hidden_states,