vllm-project · lishunyang12 · Apr 14, 2026 · Apr 12, 2026 · Apr 12, 2026 · Apr 14, 2026
@@ -0,0 +1,77 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Regression test for #2686: pre-quantized methods must not apply
+quant config to vision / audio encoders.
+
+For modelopt FP8/FP4/MXFP8 checkpoints the Thinker LM is the only
+quantized component.  Vision and audio encoder weights are BF16 with no
+FP8 scale tensors — passing quant_config to them causes FP8 kernels to
+run on BF16 weights, producing garbage embeddings.
+"""
+
+from __future__ import annotations
+
+from unittest.mock import MagicMock
+
+import pytest
+
+from vllm_omni.quantization.component_config import (
+    PRE_QUANTIZED_METHODS,
+    ComponentQuantizationConfig,
+    resolve_encoder_quant_config,
+)
+
+# Module-level marks: pytest applies every mark in this list to each test
+# collected from this file.
+pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
+
+# ---------------------------------------------------------------------------
+# resolve_encoder_quant_config — the core routing logic for encoder quant
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.parametrize("method", sorted(PRE_QUANTIZED_METHODS))
+def test_pre_quantized_returns_none(method: str) -> None:
+    """Each name in PRE_QUANTIZED_METHODS must resolve to ``None`` so that
+    no quant config reaches the vision / audio encoders."""
+    # Stand-in config whose get_name() reports the parametrized method.
+    quant_config = MagicMock(**{"get_name.return_value": method})
+
+    resolved = resolve_encoder_quant_config(quant_config)
+
+    assert resolved is None
+
+
+@pytest.mark.parametrize("method", ["fp8", "awq", "gptq", "bitsandbytes"])
+def test_non_pre_quantized_preserves_config(method: str) -> None:
+    """A config whose name is not pre-quantized must come back as the very
+    same object that was passed in."""
+    # Stand-in config whose get_name() reports the parametrized method.
+    quant_config = MagicMock(**{"get_name.return_value": method})
+
+    resolved = resolve_encoder_quant_config(quant_config)
+
+    assert resolved is quant_config
+
+
+def test_none_input_returns_none() -> None:
+    """A ``None`` input (no quantization) must resolve to ``None``."""
+    resolved = resolve_encoder_quant_config(None)
+    assert resolved is None
+
+
+def test_component_config_passed_through() -> None:
+    """A ComponentQuantizationConfig must be handed back untouched, leaving
+    the caller free to call .resolve() with the prefix it needs."""
+    # The wrapped config reports a pre-quantized name: passed on its own it
+    # would resolve to None, so the identity check below shows that the
+    # component wrapper is exempt from the pre-quantized lookup.
+    language_model_config = MagicMock(**{"get_name.return_value": "modelopt"})
+    routed = ComponentQuantizationConfig(
+        component_configs={"language_model": language_model_config},
+        default_config=None,
+    )
+
+    assert resolve_encoder_quant_config(routed) is routed
+
+
+# ---------------------------------------------------------------------------
+# PRE_QUANTIZED_METHODS constant — exhaustiveness check
+# ---------------------------------------------------------------------------
+
+
+def test_pre_quantized_methods_contains_expected() -> None:
+    """Pin the exact membership of PRE_QUANTIZED_METHODS: both adding and
+    removing a method name must be a deliberate change to this test."""
+    assert PRE_QUANTIZED_METHODS == {"modelopt", "modelopt_fp4", "modelopt_mxfp8"}
@@ -64,6 +64,10 @@
 )
 from vllm.sequence import IntermediateTensors
 
+from vllm_omni.quantization.component_config import (
+    resolve_encoder_quant_config,
+)
+
 try:
     import flash_attn
 except (ImportError, ModuleNotFoundError):
@@ -359,6 +363,12 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
 
         self.quant_config = quant_config
 
+        # Pre-quantized checkpoints (modelopt NVFP4/FP8/MXFP8) only quantize
+        # the Thinker LM. Vision encoder weights remain in BF16 with no FP8
+        # scale tensors; passing quant_config causes FP8 kernels to run on
+        # BF16 weights, producing garbage embeddings. For those methods the
+        # helper returns None; any other config is passed through unchanged.
+        visual_quant_config = resolve_encoder_quant_config(quant_config)
+
         with self._mark_tower_model(vllm_config, "audio"):
             if multimodal_config.get_limit_per_prompt("audio"):
                 self.audio_tower = Qwen2_5OmniAudioEncoder(thinker_config.audio_config)
@@ -370,7 +380,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
                 self.visual = Qwen2_5_VisionTransformer(
                     vision_config=thinker_config.vision_config,
                     norm_eps=getattr(thinker_config.text_config, "rms_norm_eps", 1e-6),
-                    quant_config=quant_config,
+                    quant_config=visual_quant_config,
                     prefix=maybe_prefix(prefix, "visual"),
                 )
             else:

@@ -119,7 +119,10 @@
 from vllm_omni.model_executor.models.qwen2_5_omni.qwen2_5_omni_thinker import (
     Qwen2_5OmniConditionalGenerationMixin,
 )
-from vllm_omni.quantization.component_config import ComponentQuantizationConfig
+from vllm_omni.quantization.component_config import (
+    PRE_QUANTIZED_METHODS,
+    ComponentQuantizationConfig,
+)
 
 try:
     import flash_attn
@@ -1114,21 +1117,24 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         self.multimodal_config = multimodal_config
         self.quant_config = quant_config
 
-        # Pre-quantized checkpoints (modelopt NVFP4/FP8/MXFP8) quantize the
-        # entire thinker — audio tower, visual encoder, and language model
-        # all share the same quant method.  Dynamic quantization methods
-        # (e.g. --quantization fp8) should only target the language model.
-        _PRE_QUANTIZED_METHODS = {"modelopt", "modelopt_fp4", "modelopt_mxfp8"}
+        # Pre-quantized checkpoints (modelopt NVFP4/FP8/MXFP8) only quantize
+        # the Thinker LM (language model). Vision and audio encoder weights
+        # remain in BF16 and have no corresponding scale tensors in the
+        # checkpoint. Dynamic quantization methods (e.g. --quantization fp8)
+        # should also only target the language model.
 
         if isinstance(quant_config, ComponentQuantizationConfig):
             audio_quant_config = quant_config.resolve("audio_tower")
             visual_quant_config = quant_config.resolve("visual")
             language_quant_config = quant_config.resolve("language_model")
         elif quant_config is not None:
-            if quant_config.get_name() in _PRE_QUANTIZED_METHODS:
-                # Pre-quantized: pass quant_config to all subcomponents.
-                audio_quant_config = quant_config
-                visual_quant_config = quant_config
+            if quant_config.get_name() in PRE_QUANTIZED_METHODS:
+                # Pre-quantized: only the Thinker LM is quantized.
+                # Vision/audio encoder weights are BF16 with no FP8 scales;
+                # passing quant_config to them causes FP8 kernels to run on
+                # BF16 weights (producing garbage embeddings). Keep None.
+                audio_quant_config = None
+                visual_quant_config = None
                 language_quant_config = quant_config
             else:
                 # Dynamic quantization: scope to language_model only.

@@ -23,6 +23,31 @@
     )
 
 
+# Pre-quantized checkpoints (modelopt FP8/FP4/MXFP8) only quantize the
+# Thinker LM.  Vision and audio encoder weights remain in BF16 with no
+# corresponding scale tensors in the checkpoint.
+# Entries are method names as reported by ``QuantizationConfig.get_name()``;
+# membership is what makes resolve_encoder_quant_config() return None.
+PRE_QUANTIZED_METHODS: frozenset[str] = frozenset({"modelopt", "modelopt_fp4", "modelopt_mxfp8"})
+
+
+def resolve_encoder_quant_config(
+    quant_config: QuantizationConfig | None,
+) -> QuantizationConfig | None:
+    """Pick the quantization config to hand to a vision / audio encoder.
+
+    ``None`` and ``ComponentQuantizationConfig`` inputs are returned
+    unchanged, leaving any per-component handling to the caller.  For any
+    other config the result is ``None`` when ``get_name()`` is listed in
+    ``PRE_QUANTIZED_METHODS`` (so quantized kernels are never applied to
+    BF16 encoder weights that lack scale tensors) and the original object
+    otherwise.
+    """
+    # Nothing configured: nothing to resolve.
+    if quant_config is None:
+        return None
+    # Component routing is exempt from the pre-quantized lookup.
+    if isinstance(quant_config, ComponentQuantizationConfig):
+        return quant_config
+    # Pre-quantized checkpoint formats: encoders get no quant config.
+    if quant_config.get_name() in PRE_QUANTIZED_METHODS:
+        return None
+    return quant_config
+
+
 class ComponentQuantizationConfig(QuantizationConfig):
     """Routes quantization to different configs by layer prefix."""