
Commit 03e7064

byshiue authored and Ransiki committed
[Fix][Chore][Qwen3] fix bug of using fp4 on sm120 (NVIDIA#6065)
Signed-off-by: bhsueh <[email protected]>
Signed-off-by: Ransiki Zhang <[email protected]>
1 parent: a640a66

File tree

4 files changed: +5, −5 lines


cpp/tensorrt_llm/thop/attentionOp.cpp

Lines changed: 2 additions & 1 deletion

@@ -671,7 +671,8 @@ bool attention_supports_nvfp4_output(int64_t const num_heads, int64_t const num_
     bool const use_paged_context_fmha, bool is_mla_enable)
 {
     // Only Blackwell supports NVFP4 output.
-    if (tensorrt_llm::common::getSMVersion() < 100)
+    // SM 120 does not support NVFP4 output.
+    if (tensorrt_llm::common::getSMVersion() < 100 || tensorrt_llm::common::getSMVersion() == 120)
     {
         return false;
     }
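As background on the hunk above: getSMVersion() reports the device compute capability as major × 10 + minor, so 100 is SM100 (data-center Blackwell) and 120 is SM120. Below is a minimal Python sketch of the predicate the new condition implements; supports_nvfp4_output is a hypothetical helper for illustration, not a function in the repository.

def supports_nvfp4_output(sm_version: int) -> bool:
    """Mirror of the C++ guard above (hypothetical helper).

    NVFP4 attention output is only produced on SM100-class Blackwell:
    anything pre-Blackwell (< 100) or exactly SM120 is rejected.
    """
    return sm_version >= 100 and sm_version != 120

# Spot checks against the compute capabilities the diff is concerned with:
assert not supports_nvfp4_output(90)   # Hopper (SM90): too old
assert supports_nvfp4_output(100)      # SM100 Blackwell: supported
assert not supports_nvfp4_output(120)  # SM120: now explicitly excluded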

examples/models/core/qwen/README.md

Lines changed: 1 addition & 1 deletion

@@ -70,7 +70,7 @@ In addition, there are two shared files in the parent folder [`examples`](../../
 | Qwen2.5-72B(-Instruct)| Y | Y | - | Y | Y* | Y | Y | Y | Y | - | Ampere+ |
 | QwQ-32B | Y | Y | - | Y | Y | Y | Y | Y | Y | - | Ampere+ |
 | Qwen3-32B | Y | Y | Y | - | - | - | - | Y | - | Y | Hopper+ |
-| Qwen3-235B-A3B | Y | Y | Y | - | - | - | - | Y | - | Y | Hopper+ |
+| Qwen3-235B-A22B | Y | Y | Y | - | - | - | - | Y | - | Y | Hopper+ |
 
 Please note that Y* sign means that the model does not support all the AWQ + TP combination.
7676

tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 1 addition & 1 deletion

@@ -1844,7 +1844,7 @@ def test_nvfp4(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
             cuda_graph_config=CudaGraphConfig() if cuda_graph else None,
             moe_config=MoeConfig(backend=moe_backend))
 
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4)
         with LLM(
                 f"{llm_models_root()}/Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf",
                 tensor_parallel_size=tp_size,
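For reference, a minimal sketch of how this knob is passed through the LLM API, assuming the standard tensorrt_llm LLM and KvCacheConfig entry points; the model path below is a placeholder, not the checkpoint the test loads.

from tensorrt_llm import LLM
from tensorrt_llm.llmapi import KvCacheConfig

# free_gpu_memory_fraction caps the share of remaining GPU memory that the
# KV cache may claim; the test now reserves 40% instead of 60%, leaving more
# headroom for activations, CUDA graphs, and the NVFP4 weights.
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4)

llm = LLM(model="/path/to/Qwen3-235B-A22B_nvfp4_hf",  # placeholder path
          kv_cache_config=kv_cache_config)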

tests/integration/test_lists/waives.txt

Lines changed: 1 addition & 2 deletions

@@ -399,8 +399,7 @@ examples/test_llama.py::test_llm_llama_v3_1_2nodes_8gpus[llama-3.1-8b-disable_fp
 test_e2e.py::test_openai_multinodes_chat_tp16pp1 SKIP (https://nvbugs/5112075)
 examples/test_qwen.py::test_llm_hf_qwen_quantization_1gpu[qwen2_vl_7b_instruct-fp8-bfloat16] SKIP (https://nvbugs/5322488)
 accuracy/test_cli_flow.py::TestSantacoder::test_auto_dtype SKIP (https://nvbugs/5234043)
-full:B200/accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass] SKIP (https://nvbugs/5355219)
-full:B200/accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm] SKIP (https://nvbugs/5355219)
+full:B200/accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm] SKIP (https://nvbugs/5401163)
 examples/test_llama.py::test_llm_llama_lookahead_xqa_fp8_1gpu[llama-3.1-8b] SKIP (https://nvbugs/5355054)
 examples/test_llama.py::test_llm_llama_lookahead_xqa_fp8_1gpu[llama-3.2-1b] SKIP (https://nvbugs/5355054)
 examples/test_multimodal.py::test_llm_multimodal_general[VILA1.5-3b-pp:1-tp:1-float16-bs:8-cpp_e2e:True-nb:1] SKIP (https://nvbugs/5360086)
