diff --git a/vllm/model_executor/kernels/linear/scaled_mm/pytorch.py b/vllm/model_executor/kernels/linear/scaled_mm/pytorch.py index 2fb6e87413aa..235756729c7f 100644 --- a/vllm/model_executor/kernels/linear/scaled_mm/pytorch.py +++ b/vllm/model_executor/kernels/linear/scaled_mm/pytorch.py @@ -89,13 +89,8 @@ def is_supported( if not current_platform.is_rocm(): return False, "requires ROCm." - from vllm.platforms.rocm import on_mi3xx - - if not on_mi3xx(): - return False, "requires MI3xx." - - if compute_capability is not None and compute_capability < 94: - return False, "requires compute capability 94 and above." + if not current_platform.supports_fp8(): + return False, "requires FP8-capable GPU." return True, None diff --git a/vllm/model_executor/kernels/linear/scaled_mm/rocm.py b/vllm/model_executor/kernels/linear/scaled_mm/rocm.py index c8370dff512c..fba01c0b760e 100644 --- a/vllm/model_executor/kernels/linear/scaled_mm/rocm.py +++ b/vllm/model_executor/kernels/linear/scaled_mm/rocm.py @@ -79,10 +79,8 @@ def is_supported( if not current_platform.is_rocm(): return False, "requires ROCm." - from vllm.platforms.rocm import on_mi3xx - - if not on_mi3xx(): - return False, "requires MI3xx." + if not current_platform.supports_fp8(): + return False, "requires FP8-capable GPU." if not envs.VLLM_ROCM_USE_SKINNY_GEMM: return False, "requires VLLM_ROCM_USE_SKINNY_GEMM to be enabled." diff --git a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py index e2b5a8f6764e..89cd6be24798 100644 --- a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py @@ -916,17 +916,7 @@ def _supports_quant_scheme( weight_key: QuantKey | None, activation_key: QuantKey | None, ) -> bool: - p = current_platform - if p.is_rocm(): - from vllm.platforms.rocm import on_gfx9 - - is_rocm_on_gfx9 = on_gfx9() - else: - is_rocm_on_gfx9 = False - - device_supports_fp8 = is_rocm_on_gfx9 or ( - p.is_cuda() and p.has_device_capability((8, 9)) - ) + device_supports_fp8 = current_platform.supports_fp8() SUPPORTED_W_A_FP8 = [ (kFp8Static128BlockSym, kFp8Dynamic128Sym), diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index dccdc52bc4a9..20fcbc52d66b 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -1921,19 +1921,7 @@ def _supports_quant_scheme( weight_key: QuantKey | None, activation_key: QuantKey | None, ) -> bool: - p = current_platform - if p.is_rocm(): - from vllm.platforms.rocm import on_gfx9 - - is_rocm_on_gfx9 = on_gfx9() - else: - is_rocm_on_gfx9 = False - - device_supports_fp8 = ( - is_rocm_on_gfx9 - or (p.is_cuda() and p.has_device_capability((8, 9))) - or p.is_xpu() - ) + device_supports_fp8 = current_platform.supports_fp8() if not device_supports_fp8: return (weight_key, activation_key) == (None, None)