diff --git a/vllm/model_executor/layers/fused_moe/flashinfer_cutedsl_moe.py b/vllm/model_executor/layers/fused_moe/flashinfer_cutedsl_moe.py index 2ad949577664..73515450a9a3 100644 --- a/vllm/model_executor/layers/fused_moe/flashinfer_cutedsl_moe.py +++ b/vllm/model_executor/layers/fused_moe/flashinfer_cutedsl_moe.py @@ -19,7 +19,7 @@ kNvfp4Dynamic, kNvfp4Static, ) -from vllm.platforms import current_platform +from vllm.platforms import is_blackwell_cuda from vllm.utils.flashinfer import ( flashinfer_cutedsl_grouped_gemm_nt_masked, scaled_fp4_grouped_quantize, @@ -54,8 +54,8 @@ def activation_format() -> mk.FusedMoEActivationFormat: @staticmethod def _supports_current_device() -> bool: - p = current_platform - return p.is_cuda() and p.is_device_capability_family(100) + """Supports Blackwell-family GPUs (SM100/110/120).""" + return is_blackwell_cuda() @staticmethod def _supports_no_act_and_mul() -> bool: diff --git a/vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py b/vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py index a066535c51eb..1950ccc06aff 100644 --- a/vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py +++ b/vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py @@ -19,7 +19,7 @@ kFp8Static128BlockSym, kFp8StaticTensorSym, ) -from vllm.platforms import current_platform +from vllm.platforms import is_blackwell_cuda from vllm.utils.torch_utils import direct_register_custom_op # @@ -28,9 +28,8 @@ def _supports_current_device() -> bool: - """Supports only Blackwell-family GPUs.""" - p = current_platform - return p.is_cuda() and p.is_device_capability_family(100) + """Supports Blackwell-family GPUs (SM100/110/120).""" + return is_blackwell_cuda() def _supports_no_act_and_mul() -> bool: diff --git a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py index 4783ca5e0e05..45e7e9c4b48f 100644 --- 
a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py +++ b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py @@ -22,7 +22,7 @@ kNvfp4Dynamic, kNvfp4Static, ) -from vllm.platforms import current_platform +from vllm.platforms import is_blackwell_cuda if TYPE_CHECKING: from vllm.model_executor.layers.fused_moe.oracle.nvfp4 import ( @@ -42,9 +42,8 @@ def _supports_current_device() -> bool: - """Supports only Blackwell-family GPUs.""" - p = current_platform - return p.is_cuda() and p.is_device_capability_family(100) + """Supports Blackwell-family GPUs (SM100/110/120).""" + return is_blackwell_cuda() def _supports_no_act_and_mul() -> bool: diff --git a/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py b/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py index 56c90aa86426..79be00d0f743 100644 --- a/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py +++ b/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py @@ -6,7 +6,7 @@ from vllm import envs from vllm.logger import init_logger -from vllm.platforms import current_platform +from vllm.platforms import current_platform, is_blackwell_cuda from vllm.utils.math_utils import round_up logger = init_logger(__name__) @@ -169,10 +169,7 @@ def get_flashinfer_moe_backend() -> FlashinferMoeBackend: flashinfer_moe_backend = envs.VLLM_FLASHINFER_MOE_BACKEND if flashinfer_moe_backend in backend_map: - if ( - flashinfer_moe_backend == "latency" - and not current_platform.is_device_capability_family(100) - ): + if flashinfer_moe_backend == "latency" and not is_blackwell_cuda(): logger.info_once( "Flashinfer TRTLLM MOE backend is only supported on " "SM100 and later, using CUTLASS backend instead", diff --git a/vllm/platforms/__init__.py b/vllm/platforms/__init__.py index a0e5af1aba51..058165842c4f 100644 --- a/vllm/platforms/__init__.py +++ b/vllm/platforms/__init__.py @@ -276,4 +276,27 @@ def __setattr__(name: str, value): raise 
AttributeError(f"No attribute named '{name}' exists in {__name__}.") -__all__ = ["Platform", "PlatformEnum", "current_platform", "CpuArchEnum", "_init_trace"] +def is_blackwell_cuda() -> bool: + """Check if running on a Blackwell-family CUDA GPU (SM100/110/120). + + This includes: + - SM100: Data center Blackwell (B100, B200) + - SM110: Future Blackwell variant + - SM120: Consumer Blackwell (RTX 5090) + - SM121: DGX Spark (GB10) + """ + # Resolve current_platform lazily via this module's __getattr__ hook + p = __getattr__("current_platform") + return p.is_cuda() and any( + p.is_device_capability_family(sm) for sm in (100, 110, 120) + ) + + +__all__ = [ + "Platform", + "PlatformEnum", + "current_platform", + "CpuArchEnum", + "_init_trace", + "is_blackwell_cuda", +]