diff --git a/cpp/tensorrt_llm/thop/attentionOp.cpp b/cpp/tensorrt_llm/thop/attentionOp.cpp
index f377220be88..df0effece76 100644
--- a/cpp/tensorrt_llm/thop/attentionOp.cpp
+++ b/cpp/tensorrt_llm/thop/attentionOp.cpp
@@ -671,7 +671,8 @@ bool attention_supports_nvfp4_output(int64_t const num_heads, int64_t const num_
     bool const use_paged_context_fmha, bool is_mla_enable)
 {
     // Only Blackwell supports NVFP4 output.
-    if (tensorrt_llm::common::getSMVersion() < 100)
+    // SM 120 does not support NVFP4 output.
+    if (tensorrt_llm::common::getSMVersion() < 100 || tensorrt_llm::common::getSMVersion() == 120)
     {
         return false;
     }
diff --git a/examples/models/core/qwen/README.md b/examples/models/core/qwen/README.md
index 83e0eab5284..308f009bf1e 100644
--- a/examples/models/core/qwen/README.md
+++ b/examples/models/core/qwen/README.md
@@ -70,7 +70,7 @@ In addition, there are two shared files in the parent folder [`examples`](../../
 | Qwen2.5-72B(-Instruct)| Y | Y | - | Y | Y* | Y | Y | Y | Y | - | Ampere+ |
 | QwQ-32B | Y | Y | - | Y | Y | Y | Y | Y | Y | - | Ampere+ |
 | Qwen3-32B | Y | Y | Y | - | - | - | - | Y | - | Y | Hopper+ |
-| Qwen3-235B-A3B | Y | Y | Y | - | - | - | - | Y | - | Y | Hopper+ |
+| Qwen3-235B-A22B | Y | Y | Y | - | - | - | - | Y | - | Y | Hopper+ |
 
 Please note that Y* sign means that the model does not support all the AWQ + TP combination.
 
diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index fc0ff003cff..45c67a63112 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -1844,7 +1844,7 @@ def test_nvfp4(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
             cuda_graph_config=CudaGraphConfig() if cuda_graph else None,
             moe_config=MoeConfig(backend=moe_backend))
 
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4)
         with LLM(
                 f"{llm_models_root()}/Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf",
                 tensor_parallel_size=tp_size,
diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
index d1ed978c99e..ad8ceb57f2a 100644
--- a/tests/integration/test_lists/waives.txt
+++ b/tests/integration/test_lists/waives.txt
@@ -400,8 +400,7 @@ examples/test_llama.py::test_llm_llama_v3_1_2nodes_8gpus[llama-3.1-8b-disable_fp
 test_e2e.py::test_openai_multinodes_chat_tp16pp1 SKIP (https://nvbugs/5112075)
 examples/test_qwen.py::test_llm_hf_qwen_quantization_1gpu[qwen2_vl_7b_instruct-fp8-bfloat16] SKIP (https://nvbugs/5322488)
 accuracy/test_cli_flow.py::TestSantacoder::test_auto_dtype SKIP (https://nvbugs/5234043)
-full:B200/accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass] SKIP (https://nvbugs/5355219)
-full:B200/accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm] SKIP (https://nvbugs/5355219)
+full:B200/accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm] SKIP (https://nvbugs/5401163)
 examples/test_llama.py::test_llm_llama_lookahead_xqa_fp8_1gpu[llama-3.1-8b] SKIP (https://nvbugs/5355054)
 examples/test_llama.py::test_llm_llama_lookahead_xqa_fp8_1gpu[llama-3.2-1b] SKIP (https://nvbugs/5355054)
examples/test_multimodal.py::test_llm_multimodal_general[VILA1.5-3b-pp:1-tp:1-float16-bs:8-cpp_e2e:True-nb:1] SKIP (https://nvbugs/5360086)