diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index 4d860505ca4d..933c36fe8bbd 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -1411,15 +1411,10 @@ def _set_default_nsa_kv_cache_dtype(self, major: int, quantization: str) -> str: ) if self.kv_cache_dtype == "auto": - # TODO: Temporarily set default dtype on B200 as bfloat16 to avoid performance regression. - # TODO: Remove this after the performance regression is fixed. (Ref: https://github.com/sgl-project/sglang/issues/21291) - if quantization == "modelopt_fp4" and major >= 10 and self.dp_size > 1: + if major >= 10: self.kv_cache_dtype = "fp8_e4m3" else: self.kv_cache_dtype = "bfloat16" - # self.kv_cache_dtype = ( - # "fp8_e4m3" if (major >= 10 and self.dp_size > 1) else "bfloat16" - # ) logger.warning( f"Setting KV cache dtype to {self.kv_cache_dtype} for DeepSeek DSA on SM{major} device." ) @@ -1450,7 +1445,7 @@ def _set_default_nsa_backends(self, kv_cache_dtype: str, major: int) -> str: self.nsa_prefill_backend = "tilelang" self.nsa_decode_backend = "tilelang" elif kv_cache_dtype == "fp8_e4m3": - if self.dp_size == 1 and major >= 10: + if major >= 10: self.nsa_prefill_backend = "trtllm" self.nsa_decode_backend = "trtllm" else: