3 files changed: +26 -8 lines changed
File 1 of 3:

@@ -363,25 +363,37 @@ def gen_xqa(
     head_grp_size_: List[int],
     use_sliding_window_: List[bool],
     has_sm90: bool,
+    has_sm100: bool,
+    has_sm120: bool,
 ) -> Iterator[JitSpec]:
     """Generate XQA modules for various configurations."""
-    if not has_sm90:
+    if not has_sm90 and not has_sm100 and not has_sm120:
         return  # XQA requires SM90+
 
+    sm_versions = []
+    if has_sm90:
+        sm_versions.append(90)
+    if has_sm100:
+        sm_versions.append(100)
+    if has_sm120:
+        sm_versions.append(120)
+
     for (
         fp16_input,
         fp8_kv_cache,
         token_per_page,
         head_size,
         head_grp_size,
         use_sliding_window,
+        sm_version,
     ) in product(
         fp16_input_,
         fp8_kv_cache_,
         token_per_page_,
         head_size_,
         head_grp_size_,
         use_sliding_window_,
+        sm_versions,
     ):
         # Skip invalid configurations
         if head_size % 16 != 0 or head_size > 256 or head_size < 16:
@@ -396,6 +408,7 @@ def gen_xqa(
             head_size=head_size,
             head_grp_size=head_grp_size,
             use_sliding_window=use_sliding_window,
+            sm_version=sm_version,
         )
 
 
@@ -527,6 +540,8 @@ def gen_all_modules(
             xqa_head_grp_size_,
             use_sliding_window_,
             has_sm90,
+            has_sm100,
+            has_sm120,
         )
     )
 
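The net effect of this file is that XQA module generation is no longer limited to SM90: every enabled architecture contributes its own entry to the Cartesian product, so one JIT spec is emitted per (configuration, SM version) pair. A minimal sketch of that expansion follows; the availability flags and head sizes are made-up values, and print() stands in for the real yield of gen_xqa_module:

# Illustrative sketch only: shows how the new sm_versions list widens the
# Cartesian product. Flag values and head sizes are assumptions.
from itertools import product

has_sm90, has_sm100, has_sm120 = True, False, True  # assumed build targets

sm_versions = []
if has_sm90:
    sm_versions.append(90)
if has_sm100:
    sm_versions.append(100)
if has_sm120:
    sm_versions.append(120)

for head_size, sm_version in product([128, 256], sm_versions):
    # Mirror of the diff's validity check on head_size.
    if head_size % 16 != 0 or head_size > 256 or head_size < 16:
        continue
    print(f"gen_xqa_module(head_size={head_size}, sm_version={sm_version})")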
File 2 of 3:

@@ -22,8 +22,6 @@
     sm100a_nvcc_flags,
     sm120a_nvcc_flags,
 )
-from ..utils import get_compute_capability
-import torch
 
 xqa_nvcc_flags = [
     "-DNDEBUG=1",
@@ -42,6 +40,7 @@ def gen_xqa_module(
     head_size: int,
     head_grp_size: int,
     use_sliding_window: bool,
+    sm_version: int = 90,
 ) -> JitSpec:
     if fp16_input:
         flag_data_type = ["-DINPUT_FP16=1", "-DDTYPE=__half"]
@@ -72,15 +71,15 @@ def gen_xqa_module(
     else:
         flag_sliding_window = ["-DSLIDING_WINDOW=0"]
 
-    if get_compute_capability(torch.device(device="cuda"))[0] == 10:
+    if sm_version == 100:
         sm_nvcc_flags = sm100a_nvcc_flags
-    elif get_compute_capability(torch.device(device="cuda"))[0] == 12:
+    elif sm_version == 120:
         sm_nvcc_flags = sm120a_nvcc_flags
     else:
         sm_nvcc_flags = sm90a_nvcc_flags
 
     return gen_jit_spec(
-        f"xqa_fp16_input_{fp16_input}_fp8_kv_cache_{fp8_kv_cache}_token_per_page_{token_per_page}_head_size_{head_size}_head_grp_size_{head_grp_size}_use_sliding_window_{use_sliding_window}_sm_{get_compute_capability(torch.device(device='cuda'))[0]}0",
+        f"xqa_fp16_input_{fp16_input}_fp8_kv_cache_{fp8_kv_cache}_token_per_page_{token_per_page}_head_size_{head_size}_head_grp_size_{head_grp_size}_use_sliding_window_{use_sliding_window}_sm_{sm_version}",
         [
             jit_env.FLASHINFER_CSRC_DIR / "xqa/mha.cu",
             jit_env.FLASHINFER_CSRC_DIR / "xqa/mha_sm90.cu",
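With sm_version passed in explicitly, gen_xqa_module no longer queries get_compute_capability at spec-construction time, so flag selection and the spec name become pure functions of the argument and the spec can be built for a GPU that is not present. A hedged sketch of that mapping, using placeholder flag lists and a shortened stand-in name rather than flashinfer's real values:

# Sketch of the selection logic above with placeholder flag lists; the real
# sm90a/sm100a/sm120a flags come from flashinfer's jit core module.
SM_FLAGS = {
    90: ["<sm90a_nvcc_flags>"],    # placeholder
    100: ["<sm100a_nvcc_flags>"],  # placeholder
    120: ["<sm120a_nvcc_flags>"],  # placeholder
}

def select_build_config(sm_version: int = 90) -> tuple:
    # Anything other than 100/120 falls back to the SM90a flags, as in the diff.
    nvcc_flags = SM_FLAGS.get(sm_version, SM_FLAGS[90])
    # The module name embeds the requested sm_version directly, so specs built
    # for different architectures do not collide in the JIT cache.
    name = f"xqa_example_sm_{sm_version}"  # shortened stand-in for the full name
    return name, nvcc_flags

print(select_build_config(120))  # ('xqa_example_sm_120', ['<sm120a_nvcc_flags>'])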
File 3 of 3:

@@ -35,6 +35,7 @@ def get_xqa_module(
     head_size: int,
     head_grp_size: int,
     use_sliding_window: bool,
+    sm_version: int = 90,
 ):
     module = gen_xqa_module(
         fp16_input,
@@ -43,10 +44,11 @@ def get_xqa_module(
         head_size,
         head_grp_size,
         use_sliding_window,
+        sm_version,
     ).build_and_load()
 
     @register_custom_op(
-        f"flashinfer::xqa_fp16_input_{fp16_input}_fp8_kv_cache_{fp8_kv_cache}_token_per_page_{token_per_page}_head_size_{head_size}_head_grp_size_{head_grp_size}_use_sliding_window_{use_sliding_window}",
+        f"flashinfer::xqa_fp16_input_{fp16_input}_fp8_kv_cache_{fp8_kv_cache}_token_per_page_{token_per_page}_head_size_{head_size}_head_grp_size_{head_grp_size}_use_sliding_window_{use_sliding_window}_sm_{sm_version}",
         mutates_args=("output", "scratch"),
     )
     def xqa(
@@ -87,7 +89,7 @@ def xqa(
         )
 
     @register_fake_op(
-        f"flashinfer::xqa_fp16_input_{fp16_input}_fp8_kv_cache_{fp8_kv_cache}_token_per_page_{token_per_page}_head_size_{head_size}_head_grp_size_{head_grp_size}_use_sliding_window_{use_sliding_window}"
+        f"flashinfer::xqa_fp16_input_{fp16_input}_fp8_kv_cache_{fp8_kv_cache}_token_per_page_{token_per_page}_head_size_{head_size}_head_grp_size_{head_grp_size}_use_sliding_window_{use_sliding_window}_sm_{sm_version}"
     )
     def _fake_xqa(
         run_fp8_mha: bool,
@@ -140,13 +142,15 @@ def xqa(
 ) -> None:
     if get_compute_capability(torch.device(device="cuda"))[0] not in [9, 10, 12]:
         raise RuntimeError("XQA is only supported on SM90, SM100, SM120 GPUs")
+    sm_version = int(get_compute_capability(torch.device(device="cuda"))[0] * 10)
     xqa_module = get_xqa_module(
         fp16_input,
         fp8_kv_cache,
         token_per_page,
         head_size,
         head_grp_size,
         use_sliding_window,
+        sm_version,
     )
     xqa_module.xqa(
         run_fp8_mha,
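At runtime the wrapper still inspects the current device, but it now converts the compute-capability major into an sm_version (9 -> 90, 10 -> 100, 12 -> 120) and forwards it, so the op registered as ..._sm_{sm_version} matches the module compiled for that GPU. A hedged sketch of that derivation, using torch.cuda.get_device_capability in place of flashinfer's get_compute_capability helper that the diff calls:

# Sketch only: the whitelist and the *10 conversion mirror the diff; the
# helper name derive_sm_version is hypothetical.
import torch

def derive_sm_version(device: str = "cuda") -> int:
    major, _minor = torch.cuda.get_device_capability(torch.device(device))
    if major not in [9, 10, 12]:
        raise RuntimeError("XQA is only supported on SM90, SM100, SM120 GPUs")
    return major * 10  # 9 -> 90, 10 -> 100, 12 -> 120

# sm_version would then be passed to get_xqa_module(..., sm_version) so the
# registered custom-op name is unique per architecture.
if torch.cuda.is_available():
    print(derive_sm_version())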