diff --git a/tests/model_executor/models/test_encoder_quant_config.py b/tests/model_executor/models/test_encoder_quant_config.py new file mode 100644 index 00000000000..80201849863 --- /dev/null +++ b/tests/model_executor/models/test_encoder_quant_config.py @@ -0,0 +1,77 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Regression test for #2686: pre-quantized methods must not apply +quant config to vision / audio encoders. + +For modelopt FP8/FP4/MXFP8 checkpoints the Thinker LM is the only +quantized component. Vision and audio encoder weights are BF16 with no +FP8 scale tensors — passing quant_config to them causes FP8 kernels to +run on BF16 weights, producing garbage embeddings. +""" + +from __future__ import annotations + +from unittest.mock import MagicMock + +import pytest + +from vllm_omni.quantization.component_config import ( + PRE_QUANTIZED_METHODS, + ComponentQuantizationConfig, + resolve_encoder_quant_config, +) + +pytestmark = [pytest.mark.core_model, pytest.mark.cpu] + +# --------------------------------------------------------------------------- +# resolve_encoder_quant_config — the core routing logic for encoder quant +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize("method", sorted(PRE_QUANTIZED_METHODS)) +def test_pre_quantized_returns_none(method: str) -> None: + """visual_quant_config and audio_quant_config must be None for + pre-quantized methods (modelopt, modelopt_fp4, modelopt_mxfp8).""" + mock_config = MagicMock() + mock_config.get_name.return_value = method + + assert resolve_encoder_quant_config(mock_config) is None + + +@pytest.mark.parametrize("method", ["fp8", "awq", "gptq", "bitsandbytes"]) +def test_non_pre_quantized_preserves_config(method: str) -> None: + """Non-pre-quantized methods should pass through the original config.""" + mock_config = MagicMock() + mock_config.get_name.return_value = method + + 
assert resolve_encoder_quant_config(mock_config) is mock_config + + +def test_none_input_returns_none() -> None: + """No quantization → None for encoders.""" + assert resolve_encoder_quant_config(None) is None + + +def test_component_config_passed_through() -> None: + """ComponentQuantizationConfig should be returned as-is so the caller + can call .resolve() with the appropriate prefix.""" + inner = MagicMock() + inner.get_name.return_value = "modelopt" # would be None if not Component + component = ComponentQuantizationConfig( + component_configs={"language_model": inner}, + default_config=None, + ) + + result = resolve_encoder_quant_config(component) + assert result is component + + +# --------------------------------------------------------------------------- +# PRE_QUANTIZED_METHODS constant — exhaustiveness check +# --------------------------------------------------------------------------- + + +def test_pre_quantized_methods_contains_expected() -> None: + """Guard against accidental removal of a known pre-quantized method.""" + expected = {"modelopt", "modelopt_fp4", "modelopt_mxfp8"} + assert PRE_QUANTIZED_METHODS == expected diff --git a/vllm_omni/model_executor/models/qwen2_5_omni/qwen2_5_omni_thinker.py b/vllm_omni/model_executor/models/qwen2_5_omni/qwen2_5_omni_thinker.py index 0307034089c..617f0f9e325 100644 --- a/vllm_omni/model_executor/models/qwen2_5_omni/qwen2_5_omni_thinker.py +++ b/vllm_omni/model_executor/models/qwen2_5_omni/qwen2_5_omni_thinker.py @@ -64,6 +64,10 @@ ) from vllm.sequence import IntermediateTensors +from vllm_omni.quantization.component_config import ( + resolve_encoder_quant_config, +) + try: import flash_attn except (ImportError, ModuleNotFoundError): @@ -359,6 +363,12 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.quant_config = quant_config + # Pre-quantized checkpoints (modelopt NVFP4/FP8/MXFP8) only quantize + # the Thinker LM. 
Vision encoder weights remain in BF16 with no FP8 + # scale tensors; passing quant_config causes FP8 kernels to run on + # BF16 weights, producing garbage embeddings. Keep None for encoders. + visual_quant_config = resolve_encoder_quant_config(quant_config) + with self._mark_tower_model(vllm_config, "audio"): if multimodal_config.get_limit_per_prompt("audio"): self.audio_tower = Qwen2_5OmniAudioEncoder(thinker_config.audio_config) @@ -370,7 +380,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.visual = Qwen2_5_VisionTransformer( vision_config=thinker_config.vision_config, norm_eps=getattr(thinker_config.text_config, "rms_norm_eps", 1e-6), - quant_config=quant_config, + quant_config=visual_quant_config, prefix=maybe_prefix(prefix, "visual"), ) else: diff --git a/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_thinker.py b/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_thinker.py index 671ffb6cb16..d03a96fd85a 100644 --- a/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_thinker.py +++ b/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_thinker.py @@ -119,7 +119,10 @@ from vllm_omni.model_executor.models.qwen2_5_omni.qwen2_5_omni_thinker import ( Qwen2_5OmniConditionalGenerationMixin, ) -from vllm_omni.quantization.component_config import ComponentQuantizationConfig +from vllm_omni.quantization.component_config import ( + PRE_QUANTIZED_METHODS, + ComponentQuantizationConfig, +) try: import flash_attn @@ -1114,21 +1117,24 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.multimodal_config = multimodal_config self.quant_config = quant_config - # Pre-quantized checkpoints (modelopt NVFP4/FP8/MXFP8) quantize the - # entire thinker — audio tower, visual encoder, and language model - # all share the same quant method. Dynamic quantization methods - # (e.g. --quantization fp8) should only target the language model. 
- _PRE_QUANTIZED_METHODS = {"modelopt", "modelopt_fp4", "modelopt_mxfp8"} + # Pre-quantized checkpoints (modelopt NVFP4/FP8/MXFP8) only quantize + # the Thinker LM (language model). Vision and audio encoder weights + # remain in BF16 and have no corresponding scale tensors in the + # checkpoint. Dynamic quantization methods (e.g. --quantization fp8) + # should also only target the language model. if isinstance(quant_config, ComponentQuantizationConfig): audio_quant_config = quant_config.resolve("audio_tower") visual_quant_config = quant_config.resolve("visual") language_quant_config = quant_config.resolve("language_model") elif quant_config is not None: - if quant_config.get_name() in _PRE_QUANTIZED_METHODS: - # Pre-quantized: pass quant_config to all subcomponents. - audio_quant_config = quant_config - visual_quant_config = quant_config + if quant_config.get_name() in PRE_QUANTIZED_METHODS: + # Pre-quantized: only the Thinker LM is quantized. + # Vision/audio encoder weights are BF16 with no FP8 scales; + # passing quant_config to them causes FP8 kernels to run on + # BF16 weights (producing garbage embeddings). Keep None. + audio_quant_config = None + visual_quant_config = None language_quant_config = quant_config else: # Dynamic quantization: scope to language_model only. diff --git a/vllm_omni/quantization/component_config.py b/vllm_omni/quantization/component_config.py index 7986da8850b..f9286079be1 100644 --- a/vllm_omni/quantization/component_config.py +++ b/vllm_omni/quantization/component_config.py @@ -23,6 +23,31 @@ ) +# Pre-quantized checkpoints (modelopt FP8/FP4/MXFP8) only quantize the +# Thinker LM. Vision and audio encoder weights remain in BF16 with no +# corresponding scale tensors in the checkpoint. 
+PRE_QUANTIZED_METHODS: frozenset[str] = frozenset({"modelopt", "modelopt_fp4", "modelopt_mxfp8"})
+
+
+def resolve_encoder_quant_config(
+    quant_config: QuantizationConfig | None,
+) -> QuantizationConfig | None:
+    """Select the quantization config to hand to vision / audio encoders.
+
+    ``None`` and ``ComponentQuantizationConfig`` inputs come back unchanged
+    (the caller resolves per-component configs itself). Any other config is
+    dropped to ``None`` when ``get_name()`` is in ``PRE_QUANTIZED_METHODS``,
+    since those checkpoints keep encoder weights in BF16; else it is kept.
+    """
+    if quant_config is None:
+        return None
+    if isinstance(quant_config, ComponentQuantizationConfig):
+        return quant_config
+    if quant_config.get_name() in PRE_QUANTIZED_METHODS:
+        return None
+    return quant_config
+
+
 class ComponentQuantizationConfig(QuantizationConfig):
     """Routes quantization to different configs by layer prefix."""