flashinfer-ai · yzh119 · Dec 5, 2025 · Dec 4, 2025 · Dec 4, 2025 · yzh119
@@ -3711,10 +3711,10 @@ def generate_files(specs_names):
     ]
     if "CUDA_PATH" in os.environ:
         cmd[0] = os.environ["CUDA_PATH"] + "/bin/" + cmd[0]
-    print('Running command "{}" to build "bin/print_traits.exe":'.format(" ".join(cmd)))
+    # print('Running command "{}" to build "bin/print_traits.exe":'.format(" ".join(cmd)))
     process = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
     output, error = process.communicate()
-    print('Running "bin/print_traits.exe":')
+    # print('Running "bin/print_traits.exe":')
     process = subprocess.Popen(
         "bin/print_traits.exe", stdin=subprocess.PIPE, stdout=subprocess.PIPE
     )

@@ -1901,9 +1901,10 @@ def gen_trtllm_fmha_v2_module() -> JitSpec:
     source_paths = kernel_paths + [binding_source_path]
 
     nvcc_flags = current_compilation_context.get_nvcc_flags_list(
-        supported_major_versions=[10, 11, 12]
+        supported_major_versions=[12]
     )
     nvcc_flags.append(f"-I{jit_env.FLASHINFER_CSRC_DIR / 'fmha_v2'}")
+    nvcc_flags.append("-Wno-deprecated-gpu-targets")
 
     return gen_jit_spec(
         uri,

@@ -3603,6 +3603,8 @@ def fmha_v2_prefill_deepseek(
         If return_lse is True, the output will be a tuple of two tensors, the first is the output tensor, the second is the lse tensor.
         If return_lse is False, the output will be a single tensor.
     """
+    if not is_sm120a_supported(query.device):
+        raise ValueError("fmha_v2_prefill_deepseek is only supported on SM120 GPUs.")
     assert query.shape[3] == 192 and key.shape[3] == 192 and value.shape[3] == 128, (
         "currently only support deepseek r1 192 query and 128 value"
     )

@@ -5,6 +5,7 @@
 
 from flashinfer.prefill import fmha_v2_prefill_deepseek
 from tests.utils_fp8 import to_float8
+from flashinfer.utils import is_sm120a_supported
 
 
 def attention_ref(
@@ -56,6 +57,8 @@ def attention_ref(
 def test_fmha_v2_prefill_deepseek(
     batch_size, num_heads, head_dim_qk, head_dim_v, seq_len, qkv_dtype, o_dtype
 ):
+    if not is_sm120a_supported(torch.device("cuda")):
+        pytest.skip("fmha_v2_prefill_deepseek is only supported on SM120 GPUs.")
     torch.manual_seed(42)
 
     def initialize_tensors(batch_size, num_heads, head_dim_qk, head_dim_v, seq_len):