Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
77 changes: 77 additions & 0 deletions tests/model_executor/models/test_encoder_quant_config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Regression test for #2686: pre-quantized methods must not apply
quant config to vision / audio encoders.

For modelopt FP8/FP4/MXFP8 checkpoints the Thinker LM is the only
quantized component. Vision and audio encoder weights are BF16 with no
FP8 scale tensors — passing quant_config to them causes FP8 kernels to
run on BF16 weights, producing garbage embeddings.
"""

from __future__ import annotations

from unittest.mock import MagicMock

import pytest

from vllm_omni.quantization.component_config import (
PRE_QUANTIZED_METHODS,
ComponentQuantizationConfig,
resolve_encoder_quant_config,
)

# Module-wide pytest marks: every test collected from this file is tagged
# with both the ``core_model`` and ``cpu`` markers.
pytestmark = [pytest.mark.core_model, pytest.mark.cpu]

# ---------------------------------------------------------------------------
# resolve_encoder_quant_config — the core routing logic for encoder quant
# ---------------------------------------------------------------------------


@pytest.mark.parametrize("method", sorted(PRE_QUANTIZED_METHODS))
def test_pre_quantized_returns_none(method: str) -> None:
    """Every pre-quantized method name must resolve to ``None``.

    The encoder quant config is built from a mock whose ``get_name()``
    reports one of the names in ``PRE_QUANTIZED_METHODS``; the resolver is
    expected to drop it so encoders receive no quantization config.
    """
    fake_quant_config = MagicMock()
    fake_quant_config.configure_mock(**{"get_name.return_value": method})

    resolved = resolve_encoder_quant_config(fake_quant_config)

    assert resolved is None


@pytest.mark.parametrize("method", ("fp8", "awq", "gptq", "bitsandbytes"))
def test_non_pre_quantized_preserves_config(method: str) -> None:
    """Method names outside ``PRE_QUANTIZED_METHODS`` keep their config.

    The resolver must hand back the very same object it was given, not a
    copy and not ``None``.
    """
    fake_quant_config = MagicMock()
    fake_quant_config.configure_mock(**{"get_name.return_value": method})

    resolved = resolve_encoder_quant_config(fake_quant_config)

    assert resolved is fake_quant_config


def test_none_input_returns_none() -> None:
    """With no quantization configured, encoders also get ``None``."""
    resolved = resolve_encoder_quant_config(None)

    assert resolved is None


def test_component_config_passed_through() -> None:
    """A ``ComponentQuantizationConfig`` is returned untouched.

    The caller is expected to invoke ``.resolve()`` on it with the
    appropriate prefix, so the resolver must not unwrap or drop it.
    """
    # The wrapped config reports a pre-quantized name on purpose: passed to
    # the resolver directly it would yield None, so an identity result below
    # shows the component wrapper is not subject to that rule.
    language_model_config = MagicMock()
    language_model_config.configure_mock(**{"get_name.return_value": "modelopt"})

    routed_config = ComponentQuantizationConfig(
        component_configs={"language_model": language_model_config},
        default_config=None,
    )

    assert resolve_encoder_quant_config(routed_config) is routed_config


# ---------------------------------------------------------------------------
# PRE_QUANTIZED_METHODS constant — exhaustiveness check
# ---------------------------------------------------------------------------


def test_pre_quantized_methods_contains_expected() -> None:
    """Pin the exact contents of ``PRE_QUANTIZED_METHODS``.

    The comparison is an equality check, so it fails both when a known
    pre-quantized method is removed and when a new one is added without
    updating this test.
    """
    assert PRE_QUANTIZED_METHODS == {"modelopt", "modelopt_fp4", "modelopt_mxfp8"}
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,10 @@
)
from vllm.sequence import IntermediateTensors

from vllm_omni.quantization.component_config import (
resolve_encoder_quant_config,
)

try:
import flash_attn
except (ImportError, ModuleNotFoundError):
Expand Down Expand Up @@ -359,6 +363,12 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):

self.quant_config = quant_config

# Pre-quantized checkpoints (modelopt NVFP4/FP8/MXFP8) only quantize
# the Thinker LM. Vision encoder weights remain in BF16 with no FP8
# scale tensors; passing quant_config causes FP8 kernels to run on
# BF16 weights, producing garbage embeddings. Keep None for encoders.
visual_quant_config = resolve_encoder_quant_config(quant_config)

with self._mark_tower_model(vllm_config, "audio"):
if multimodal_config.get_limit_per_prompt("audio"):
self.audio_tower = Qwen2_5OmniAudioEncoder(thinker_config.audio_config)
Expand All @@ -370,7 +380,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
self.visual = Qwen2_5_VisionTransformer(
vision_config=thinker_config.vision_config,
norm_eps=getattr(thinker_config.text_config, "rms_norm_eps", 1e-6),
quant_config=quant_config,
quant_config=visual_quant_config,
prefix=maybe_prefix(prefix, "visual"),
)
else:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,10 @@
from vllm_omni.model_executor.models.qwen2_5_omni.qwen2_5_omni_thinker import (
Qwen2_5OmniConditionalGenerationMixin,
)
from vllm_omni.quantization.component_config import ComponentQuantizationConfig
from vllm_omni.quantization.component_config import (
PRE_QUANTIZED_METHODS,
ComponentQuantizationConfig,
)

try:
import flash_attn
Expand Down Expand Up @@ -1114,21 +1117,24 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
self.multimodal_config = multimodal_config
self.quant_config = quant_config

# Pre-quantized checkpoints (modelopt NVFP4/FP8/MXFP8) quantize the
# entire thinker — audio tower, visual encoder, and language model
# all share the same quant method. Dynamic quantization methods
# (e.g. --quantization fp8) should only target the language model.
_PRE_QUANTIZED_METHODS = {"modelopt", "modelopt_fp4", "modelopt_mxfp8"}
# Pre-quantized checkpoints (modelopt NVFP4/FP8/MXFP8) only quantize
# the Thinker LM (language model). Vision and audio encoder weights
# remain in BF16 and have no corresponding scale tensors in the
# checkpoint. Dynamic quantization methods (e.g. --quantization fp8)
# should also only target the language model.

if isinstance(quant_config, ComponentQuantizationConfig):
audio_quant_config = quant_config.resolve("audio_tower")
visual_quant_config = quant_config.resolve("visual")
language_quant_config = quant_config.resolve("language_model")
elif quant_config is not None:
if quant_config.get_name() in _PRE_QUANTIZED_METHODS:
# Pre-quantized: pass quant_config to all subcomponents.
audio_quant_config = quant_config
visual_quant_config = quant_config
if quant_config.get_name() in PRE_QUANTIZED_METHODS:
# Pre-quantized: only the Thinker LM is quantized.
# Vision/audio encoder weights are BF16 with no FP8 scales;
# passing quant_config to them causes FP8 kernels to run on
# BF16 weights (producing garbage embeddings). Keep None.
audio_quant_config = None
visual_quant_config = None
language_quant_config = quant_config
else:
# Dynamic quantization: scope to language_model only.
Expand Down
25 changes: 25 additions & 0 deletions vllm_omni/quantization/component_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,31 @@
)


# Names of quantization methods that ship as pre-quantized checkpoints
# (modelopt FP8/FP4/MXFP8). In those checkpoints only the Thinker LM is
# quantized; vision and audio encoder weights stay in BF16 and have no
# corresponding scale tensors.
PRE_QUANTIZED_METHODS: frozenset[str] = frozenset(
    (
        "modelopt",
        "modelopt_fp4",
        "modelopt_mxfp8",
    )
)
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We don't have modelopt fp4 and modelopt mxfp8 checkpoint for now.



def resolve_encoder_quant_config(
    quant_config: QuantizationConfig | None,
) -> QuantizationConfig | None:
    """Pick the quantization config to hand to a vision / audio encoder.

    Args:
        quant_config: The model-level quantization config, or ``None`` when
            no quantization is configured.

    Returns:
        ``None`` when ``quant_config`` is ``None`` or when it is a plain
        config whose ``get_name()`` is listed in ``PRE_QUANTIZED_METHODS``
        (so FP8 kernels are never applied to BF16 encoder weights that lack
        scale tensors). Otherwise the input object itself is returned; this
        includes every ``ComponentQuantizationConfig``, which the caller is
        expected to resolve further.
    """
    # Nothing configured: nothing to forward to the encoders.
    if quant_config is None:
        return None

    # Component-routed configs are never inspected by name; the caller
    # decides what each sub-module gets.
    if isinstance(quant_config, ComponentQuantizationConfig):
        return quant_config

    method_name = quant_config.get_name()
    if method_name in PRE_QUANTIZED_METHODS:
        return None
    return quant_config


class ComponentQuantizationConfig(QuantizationConfig):
"""Routes quantization to different configs by layer prefix."""

Expand Down
Loading