diff --git a/tensorrt_llm/quantization/quantize.py b/tensorrt_llm/quantization/quantize.py
index dfb7ae76b9e..ae6a1b37873 100644
--- a/tensorrt_llm/quantization/quantize.py
+++ b/tensorrt_llm/quantization/quantize.py
@@ -28,14 +28,16 @@ def quantize_layers(
     quant_map,
     preprocess_init_params=None,
 ):
-    exclude_modules = quant_config.exclude_modules or [
-        '*lm_head',
-        '*router',
-        '*vocab_embedding',
-        '*position_embedding',
-        '*block_embedding',
-        '*shared_expert_gate',
-    ]
+    exclude_modules = quant_config.exclude_modules
+    if exclude_modules is None:
+        exclude_modules = [
+            '*lm_head',
+            '*router',
+            '*vocab_embedding',
+            '*position_embedding',
+            '*block_embedding',
+            '*shared_expert_gate',
+        ]
 
     for name, module, parent in model.named_modules_with_parent():
         module_name = name.rsplit('.', 1)[-1]
@@ -244,9 +246,12 @@ def fp8_rowwise_quantize(model, quant_config: QuantConfig):
         Attention: Fp8RowwiseAttention,
     }
 
+    exclude_modules = quant_config.exclude_modules
+    if exclude_modules is None:
+        exclude_modules = []
+    # Always exclude these modules for FP8 rowwise
     exclude_modules = list(
-        set((quant_config.exclude_modules or []) +
-            ['*ln_f', '*ln_embed', '*lm_head']))
+        set(exclude_modules + ['*ln_f', '*ln_embed', '*lm_head']))
 
     def extract_layer_idx(name):
         ss = name.split('.')
diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
index 4e62b50e40d..c72fb4108cd 100644
--- a/tests/integration/test_lists/waives.txt
+++ b/tests/integration/test_lists/waives.txt
@@ -419,7 +419,6 @@ examples/test_multimodal.py::test_llm_multimodal_general[neva-22b-pp:1-tp:1-bflo
 examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_py_session-recurrentgemma-2b-no_paged_cache-disable_quant-float16-disable_attn_plugin-enable_gemm_plugin] SKIP (https://nvbugs/5214221)
 examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_py_session-recurrentgemma-2b-no_paged_cache-disable_quant-float16-enable_attn_plugin-enable_gemm_plugin] SKIP (https://nvbugs/5214221)
 examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_py_session-recurrentgemma-2b-use_paged_cache-disable_quant-float16-enable_attn_plugin-enable_gemm_plugin] SKIP (https://nvbugs/5214221)
-accuracy/test_cli_flow.py::TestGpt2Medium::test_fp8_lm_head SKIP (https://nvbugs/5214229)
 examples/test_multimodal.py::test_llm_multimodal_general[VILA1.5-3b-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5214239)
 examples/test_multimodal.py::test_llm_fp8_multimodal_general[fp8-fp8-scienceqa-Llama-3.2-11B-Vision-Instruct-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False] SKIP (https://nvbugs/5222697)
 examples/test_gpt.py::test_llm_gpt2_santacoder_1node_4gpus[parallel_build-enable_fmha-enable_gemm_plugin-enable_attention_plugin] SKIP (https://nvbugs/5219531)
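
Why replace `or` with an explicit `is None` check: with the old pattern, an explicitly empty `exclude_modules` list is falsy and silently falls back to the default exclusion list, so there was no way to ask for "exclude nothing" (for example, to let `lm_head` be quantized, which is consistent with dropping the `test_fp8_lm_head` waiver above). The sketch below illustrates the two behaviors using hypothetical helper names (`resolve_excludes_old`, `resolve_excludes_new`, `DEFAULT_EXCLUDES`) that are not part of the patch.

```python
# Hypothetical illustration of the None-vs-empty-list semantics; names are not from the patch.
DEFAULT_EXCLUDES = ['*lm_head', '*router', '*vocab_embedding']  # abbreviated

def resolve_excludes_old(user_excludes):
    # Old pattern: any falsy value ([] as well as None) falls back to the defaults.
    return user_excludes or DEFAULT_EXCLUDES

def resolve_excludes_new(user_excludes):
    # New pattern: only None falls back; an empty list means "exclude nothing".
    if user_excludes is None:
        return DEFAULT_EXCLUDES
    return user_excludes

assert resolve_excludes_old(None) == DEFAULT_EXCLUDES
assert resolve_excludes_old([]) == DEFAULT_EXCLUDES    # empty list silently overridden
assert resolve_excludes_new(None) == DEFAULT_EXCLUDES
assert resolve_excludes_new([]) == []                  # empty list respected
assert resolve_excludes_new(['*router']) == ['*router']
```

Note that in `fp8_rowwise_quantize` the result is effectively unchanged, since both None and an empty list end up contributing only the mandatory `*ln_f`, `*ln_embed`, and `*lm_head` exclusions; the user-visible behavior change is in `quantize_layers`, where an empty list previously re-enabled the defaults.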