From cc4955920318883a225b96edaafb502bfabd376b Mon Sep 17 00:00:00 2001
From: khairulkabir1661 <khairulkabir1661@users.noreply.github.com>
Date: Mon, 16 Feb 2026 19:55:00 +0000
Subject: [PATCH] [ROCm] Add hardware detection for FP4 BMM to prevent MI300X
 crashes

Fixes #34641

Problem:
- vLLM crashes on MI300X (gfx942) with default settings
- VLLM_ROCM_USE_AITER_FP4BMM defaults to True for all AMD GPUs
- MI300X (gfx942) doesn't support FP4; only gfx950 GPUs (MI350X/MI355X) do
- vLLM only checked env vars, not hardware capability

Solution:
- Added hardware detection to is_fp4bmm_enabled() method
- Query AITER's is_fp4_avail() before enabling FP4
- Auto-disable FP4 on unsupported hardware (gfx942)
- Log informative message when falling back to FP8
- Graceful error handling if AITER arch_info unavailable

Impact:
- Fixes crash on MI300X/MI300A
- Works automatically without user intervention
- Clear logging explains what's happening
- Maintains FP4 support on gfx950 hardware (MI350X/MI355X)

Testing:
- Tested on MI300X (gfx942) - FP4 correctly disabled
- Verified FP8 fallback works as expected
- Confirmed logging messages appear correctly

Written with help from Claude.
---
 vllm/_aiter_ops.py | 40 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 39 insertions(+), 1 deletion(-)

diff --git a/vllm/_aiter_ops.py b/vllm/_aiter_ops.py
index c544d2d3d195..648c848d7808 100644
--- a/vllm/_aiter_ops.py
+++ b/vllm/_aiter_ops.py
@@ -7,6 +7,7 @@
 from torch._ops import OpOverload
 
 import vllm.envs as envs
+from vllm.logger import init_logger
 from vllm.platforms import current_platform
 from vllm.utils.torch_utils import direct_register_custom_op
 from vllm.v1.attention.ops.rocm_aiter_mla_sparse import (
@@ -14,6 +15,8 @@
     rocm_aiter_sparse_attn_indexer_fake,
 )
 
+logger = init_logger(__name__)
+
 # fp8_dtype is not cached.
 # on ROCm the fp8_dtype always calls is_fp8_fnuz
 # which is a host op, so we cache it once here.
@@ -999,7 +1002,42 @@ def is_fp8bmm_enabled(cls) -> bool:
     @classmethod
     @if_aiter_supported
     def is_fp4bmm_enabled(cls) -> bool:
-        return cls._AITER_ENABLED and cls._FP4BMM_ENABLED
+        """Check if FP4 BMM is enabled and supported by hardware.
+
+        FP4 (MXFP4) is only supported on AMD gfx950 (MI350X/MI355X).
+        gfx942 parts (MI300X/MI300A/MI325X) do not support FP4.
+
+        This method checks both environment variables AND hardware capability
+        to prevent runtime errors on unsupported hardware.
+
+        Returns:
+            bool: True if FP4 BMM is both requested and hardware-supported.
+        """
+        if not (cls._AITER_ENABLED and cls._FP4BMM_ENABLED):
+            return False
+
+        # Check hardware support before enabling FP4
+        try:
+            from aiter.ops.triton.utils._triton.arch_info import (
+                get_arch,
+                is_fp4_avail,
+            )
+
+            if not is_fp4_avail():
+                logger.info(
+                    "FP4BMM requested via VLLM_ROCM_USE_AITER_FP4BMM but not "
+                    "supported on %s. FP4 requires gfx950 "
+                    "(MI350X/MI355X). Falling back to FP8.",
+                    get_arch(),
+                )
+                return False
+            return True
+        except ImportError:
+            logger.warning(
+                "AITER arch_info not available. Disabling FP4BMM to avoid "
+                "potential runtime errors."
+            )
+            return False
 
     @classmethod
     @if_aiter_supported