diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py
index 5abee0a3744b..552067c3db7e 100644
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -1284,20 +1284,6 @@ def _handle_model_specific_adjustments(self):
             # Set moe backend for DeepSeek
             if is_sm100_supported():
                 quant_method = get_quantization_config(hf_config)
-                quant_cfg = getattr(hf_config, "quantization_config", None) or {}
-                config_groups = quant_cfg.get("config_groups", {})
-                group0 = config_groups.get("group_0", {})
-                weights_cfg = group0.get("weights", {})
-                # this also apply to kimi k2.5
-                # since it follow the compressed tensor int4 recipe
-                # but not kimi k2 instruct or 0905 instruct.
-                is_kimi_k2_k25_thinking_int4 = (
-                    quant_method == "compressed-tensors"
-                    and weights_cfg.get("num_bits") == 4
-                    and weights_cfg.get("group_size") == 32
-                    and weights_cfg.get("strategy") == "group"
-                    and weights_cfg.get("type") == "int"
-                )
                 if self.quantization is None:
                     # Default DeepSeek V3/R1 native FP8 when not explicitly set,
                     # Because we need this condition for an assertion in
@@ -1312,20 +1298,12 @@ def _handle_model_specific_adjustments(self):
                 if (
                     self.moe_a2a_backend == "none"
                     and self.moe_runner_backend == "auto"
-                    and (
-                        self.quantization in ["fp8", "modelopt_fp8", "modelopt_fp4"]
-                        or is_kimi_k2_k25_thinking_int4
-                    )
+                    and self.quantization in ["fp8", "modelopt_fp8", "modelopt_fp4"]
                 ):
                     self.moe_runner_backend = "flashinfer_trtllm"
-                    if is_kimi_k2_k25_thinking_int4:
-                        logger.info(
-                            "Use flashinfer_trtllm as MoE runner backend on Blackwell for Kimi K2 / K2.5 thinking int4"
-                        )
-                    else:
-                        logger.info(
-                            "Use flashinfer_trtllm as MoE runner backend on sm100 for DeepseekV3ForCausalLM"
-                        )
+                    logger.info(
+                        "Use flashinfer_trtllm as MoE runner backend on sm100 for DeepseekV3ForCausalLM"
+                    )
             elif is_hip():
                 if not self.enable_dp_attention and self.nnodes == 1:
                     # TODO (Hubert): Put this back later