Commit 2dec7c1

[Bugfix][CUDA] fixes CUDA FP8 kv cache dtype supported (#21420)
Signed-off-by: elvischenv <[email protected]>
1 parent 08d2bd7 commit 2dec7c1

1 file changed (+13 −13)

vllm/platforms/cuda.py

Lines changed: 13 additions & 13 deletions
@@ -456,6 +456,19 @@ def stateless_init_device_torch_dist_pg(
     def device_count(cls) -> int:
         return cuda_device_count_stateless()
 
+    @classmethod
+    def is_kv_cache_dtype_supported(cls, kv_cache_dtype: str) -> bool:
+        fp8_attention = kv_cache_dtype.startswith("fp8")
+        will_use_fa = (not envs.is_set("VLLM_ATTENTION_BACKEND")
+                       ) or envs.VLLM_ATTENTION_BACKEND == "FLASH_ATTN_VLLM_V1"
+        supported = False
+        if cls.is_device_capability(100):
+            supported = True
+        elif fp8_attention and will_use_fa:
+            from vllm.attention.utils.fa_utils import flash_attn_supports_fp8
+            supported = flash_attn_supports_fp8()
+        return supported
+
 
 # NVML utils
 # Note that NVML is not affected by `CUDA_VISIBLE_DEVICES`,
@@ -583,19 +596,6 @@ def is_fully_connected(cls, physical_device_ids: list[int]) -> bool:
                 " not found. Assuming no NVLink available.")
             return False
 
-    @classmethod
-    def is_kv_cache_dtype_supported(cls, kv_cache_dtype: str) -> bool:
-        fp8_attention = kv_cache_dtype.startswith("fp8")
-        will_use_fa = (not envs.is_set("VLLM_ATTENTION_BACKEND")
-                       ) or envs.VLLM_ATTENTION_BACKEND == "FLASH_ATTN_VLLM_V1"
-        supported = False
-        if cls.is_device_capability(100):
-            supported = True
-        elif fp8_attention and will_use_fa:
-            from vllm.attention.utils.fa_utils import flash_attn_supports_fp8
-            supported = flash_attn_supports_fp8()
-        return supported
-
 
 # Autodetect either NVML-enabled or non-NVML platform
 # based on whether NVML is available.
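
For readers who want to see what the relocated hook reports on their own hardware, here is a small illustrative probe. It assumes only the public `from vllm.platforms import current_platform` import and the documented `--kv-cache-dtype` string values (`fp8`, `fp8_e4m3`, `fp8_e5m2`); reading the first hunk as moving the method onto the shared CUDA base class (next to `device_count`) is my interpretation of the diff, and the snippet is a sketch around this commit's method, not part of the commit itself.

    # Illustrative probe (not part of this commit): ask the active vLLM
    # platform whether an FP8 KV-cache dtype is supported on this GPU.
    from vllm.platforms import current_platform

    for kv_cache_dtype in ("fp8", "fp8_e4m3", "fp8_e5m2"):
        # Per the diff above, is_kv_cache_dtype_supported() now sits beside
        # device_count() rather than in the NVML-specific section of cuda.py.
        ok = current_platform.is_kv_cache_dtype_supported(kv_cache_dtype)
        print(f"{kv_cache_dtype}: {'supported' if ok else 'not supported'}")

As the moved code shows, the check passes unconditionally when cls.is_device_capability(100) is true (compute capability 10.0 devices), and otherwise, for fp8* dtypes, it defers to flash_attn_supports_fp8() when the FLASH_ATTN_VLLM_V1 backend would be used.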
