diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index de7f06468d9c..6b253168c40d 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -883,6 +883,12 @@ def __post_init__(self): # Handle memory-related, chunked prefill, and CUDA graph batch size configurations. self._handle_gpu_memory_settings(gpu_mem) + # enforce_disable_flashinfer_allreduce_fusion must be set before + # _handle_model_specific_adjustments, which auto-enables the fusion + # for several SM90/SM100 MoE arches. + if self.enable_deterministic_inference: + self.enforce_disable_flashinfer_allreduce_fusion = True + # Apply model-specific adjustments. self._handle_model_specific_adjustments() @@ -4020,6 +4026,12 @@ def _handle_deterministic_inference(self): ) self.enable_aiter_allreduce_fusion = False + if self.enable_flashinfer_allreduce_fusion: + logger.warning( + "Disable --enable-flashinfer-allreduce-fusion because deterministic inference is enabled." + ) + self.enable_flashinfer_allreduce_fusion = False + # Check sampling backend self.sampling_backend = "pytorch" logger.warning(