Support Qwen3 and Qwen2.5 Omni model quantization#1404
Conversation
Signed-off-by: lvliang-intel <liang1.lv@intel.com>
for more information, see https://pre-commit.ci
Signed-off-by: lvliang-intel <liang1.lv@intel.com>
Thank you for the PR! Could you help verify all inferences (vLLM, Transformers 4, and Transformers 5) before merging?
Quantize and inference tested with Transformers 5.1.0.
vLLM tests are currently blocked because the latest vLLM version depends on an outdated Transformers release. Qwen3-Omni requires Transformers >= 5.1.0 to address several known issues.
Pull request overview
Adds quantization support for the Qwen3-Omni MoE model family by integrating model-specific loading/version gating, calibration forward behavior for thinker/talker, and custom multimodal block discovery.
Changes:
- Added an explicit Transformers version guard for qwen3_omni_moe (see the sketch after this list).
- Introduced Qwen3-Omni processor/template registration and model-specific multimodal block name discovery.
- Implemented a Qwen3-Omni-specific forward path to run thinker (and optionally talker) during calibration.
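A minimal sketch of what such a per-model version gate could look like (the helper and mapping names here are hypothetical; the PR's actual guard lives in auto_round/utils/model.py and may be structured differently):

```python
from packaging import version

import transformers

# Hypothetical mapping of model_type -> minimum required Transformers release.
_MIN_TRANSFORMERS_VERSION = {
    "qwen3_omni_moe": "5.1.0",
}


def check_transformers_version(model_type: str) -> None:
    """Fail fast if the installed Transformers is too old for this model type."""
    required = _MIN_TRANSFORMERS_VERSION.get(model_type)
    if required and version.parse(transformers.__version__) < version.parse(required):
        raise ImportError(
            f"{model_type} requires transformers>={required}, "
            f"but {transformers.__version__} is installed."
        )
```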
Reviewed changes
Copilot reviewed 10 out of 10 changed files in this pull request and generated 4 comments.
| File | Description |
|---|---|
| pyproject.toml | Adds a project-specific word to typos’ allowlist. |
| auto_round/utils/model.py | Adds Transformers version guard and adjusts lm_head discovery logic. |
| auto_round/utils/common.py | Adds _no_split_modules normalization and extends multimodal ignore-key lists (see the sketch after this table). |
| auto_round/special_model_handler.py | Adds Qwen3-Omni special forward + block discovery + ignore-layer rule. |
| auto_round/compressors/shard_writer.py | Improves tie_word_embeddings lookup for nested multimodal configs. |
| auto_round/compressors/mllm/utils.py | Extends multimodal ignore-key list for Qwen3-Omni components. |
| auto_round/compressors/mllm/template.py | Registers a Qwen3-Omni model template with the new processor. |
| auto_round/compressors/mllm/processor.py | Adds a custom processor for Qwen3-Omni chat-template inputs. |
| auto_round/compressors/base.py | Imports the new normalization helper. |
| auto_round/auto_scheme/utils.py | Uses normalized _no_split_modules when dispatching across devices. |
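As an illustration of the `_no_split_modules` normalization listed for auto_round/utils/common.py, here is a minimal sketch; the real helper's name, location, and exact rules are assumptions:

```python
import torch.nn as nn


def normalize_no_split_modules(model: nn.Module) -> list[str]:
    """Collect `_no_split_modules` from a possibly nested multimodal model
    (e.g. a top-level wrapper plus thinker/talker submodels) and return a
    flat, de-duplicated list of module class names for device dispatching."""
    names: set[str] = set()
    candidates = [model] + [getattr(model, attr, None) for attr in ("thinker", "talker")]
    for module in candidates:
        if module is None:
            continue
        names.update(getattr(module, "_no_split_modules", None) or [])
    return sorted(names)
```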
The snippet flagged by the review:

```python
# Use text projection to convert thinker embeddings to talker space
if hasattr(model.talker, "text_projection"):
    # Get thinker embeddings
    thinker_embeds = model.thinker.get_input_embeddings()(input_ids)
    talker_inputs_embeds = model.talker.text_projection(thinker_embeds)
```
This path assumes input_ids is provided; if calibration runs with inputs_embeds (or other modalities without input_ids), this will throw and then be silently ignored (due to the broad except), meaning the talker forward never runs. Consider deriving inputs from inputs_embeds when present, or projecting from thinker_output.hidden_states[-1] (which you already compute) instead of re-embedding input_ids.
Suggested change:

```diff
-# Use text projection to convert thinker embeddings to talker space
-if hasattr(model.talker, "text_projection"):
-    # Get thinker embeddings
-    thinker_embeds = model.thinker.get_input_embeddings()(input_ids)
-    talker_inputs_embeds = model.talker.text_projection(thinker_embeds)
+# Use text projection to convert thinker hidden states to talker space
+if hasattr(model.talker, "text_projection"):
+    # Project thinker hidden states directly into the talker embedding space
+    talker_inputs_embeds = model.talker.text_projection(thinker_hidden)
```
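A hedged sketch of the broader fallback the comment describes, which also covers batches that arrive with inputs_embeds but no input_ids (variable names follow the snippet above and are otherwise assumptions):

```python
# Sketch only: pick the best available source for the talker's input embeddings
# instead of unconditionally re-embedding input_ids.
if hasattr(model.talker, "text_projection"):
    if input_ids is not None:
        # Text tokens available: embed them with the thinker's embedding table.
        thinker_states = model.thinker.get_input_embeddings()(input_ids)
    elif inputs_embeds is not None:
        # Calibration supplied embeddings directly (e.g. a multimodal-only batch).
        thinker_states = inputs_embeds
    else:
        # Fall back to the thinker's last hidden state, already computed above.
        thinker_states = thinker_output.hidden_states[-1]
    talker_inputs_embeds = model.talker.text_projection(thinker_states)
```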
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Signed-off-by: lvliang-intel <liang1.lv@intel.com>
You could update Transformers after installing vLLM.
Qwen2.5-Omni quantize and inference tests pass:

```bash
CUDA_VISIBLE_DEVICES=3 python quantize_qwen25_omni.py --model /mnt/disk2/lvl/Qwen2.5-Omni-3B --output tmp_qwen25_omni_w4a16 --iters 200
CUDA_VISIBLE_DEVICES=6 python run_qwen25_omni.py --model-dir tmp_qwen25_omni_w4a16 --enable-audio-output
```
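For context, a rough sketch of what a script like quantize_qwen25_omni.py might do with auto-round's MLLM flow (the AutoRoundMLLM arguments and the loading path shown here are assumptions, not the actual script from this PR):

```python
from transformers import AutoModel, AutoProcessor, AutoTokenizer

from auto_round import AutoRoundMLLM

model_path = "/mnt/disk2/lvl/Qwen2.5-Omni-3B"  # local checkpoint used above
model = AutoModel.from_pretrained(model_path, torch_dtype="auto", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)

# W4A16-style settings mirroring the command line above (argument names are assumptions).
autoround = AutoRoundMLLM(model, tokenizer, processor=processor,
                          bits=4, group_size=128, iters=200)
autoround.quantize()
autoround.save_quantized("tmp_qwen25_omni_w4a16")
```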
```python
SPECIAL_MULTIMODAL_BLOCK = {"deepseek_vl_v2": _get_deepseek_vl2_multimodal_block}


def _get_qwen2_5_omni_multimodal_block(model, quant_vision=False):
```
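To illustrate the registration pattern quoted above, a sketch of how a Qwen3-Omni entry could be added; the function body and attribute paths are illustrative assumptions, not the PR's actual discovery logic:

```python
def _get_qwen3_omni_moe_multimodal_block(model, quant_vision=False):
    """Return the block paths calibration should iterate over for Qwen3-Omni MoE."""
    blocks = ["thinker.model.layers"]            # language (thinker) decoder blocks
    if quant_vision:
        blocks.append("thinker.visual.blocks")   # optionally include vision blocks
    return blocks


SPECIAL_MULTIMODAL_BLOCK["qwen3_omni_moe"] = _get_qwen3_omni_moe_multimodal_block
```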
Since the code for these two models has grown to 300+ lines, it’s making the main file quite cluttered. Shall we refine this file later?
Sure, we will refactor this file later.
Awesome work, Liang Ge!
vLLM inference test with the quantized Qwen2.5-Omni model; accuracy is good.

```bash
CUDA_VISIBLE_DEVICES=5 python run_qwen25_omni_vllm.py --model-dir ./tmp_qwen25_omni_w4a16
```
vLLM inference test with the quantized Qwen3-Omni model; accuracy is not good. This looks like a vLLM issue, since the Transformers inference test is good for Qwen3-Omni.

```bash
CUDA_VISIBLE_DEVICES=5 python run_qwen3_omni_vllm.py --model-dir ./tmp_qwen3_omni_w4a16
```
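For a quick text-only sanity check with plain vLLM, independent of the full run_qwen3_omni_vllm.py script (a minimal sketch; the real script also exercises audio/vision inputs and may rely on vllm-omni):

```python
from vllm import LLM, SamplingParams

# Load the quantized checkpoint produced above.
llm = LLM(model="./tmp_qwen3_omni_w4a16", trust_remote_code=True)
params = SamplingParams(temperature=0.0, max_tokens=64)

outputs = llm.generate(["What is 2 + 3? Answer with just the number."], params)
print(outputs[0].outputs[0].text)
```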
Verified with vllm-omni (based on PR vllm-project/vllm-omni#1777 and some adaptations).

1. Quantized Qwen2.5-Omni inference test (verify_qwen25_omni_w4a16.py)
   - [INFO] Running 7 prompts (3 text + 4 multimodal)...
   - [INFO] Inference completed in 93.6s
   - Text outputs: 3
     - Q1: What is 2 + 3? Answer with just the number.
     - Q2: Briefly describe what a neural network is in one sentence.
     - Q3: Translate 'Hello, how are you?' to Chinese.
   - [OK] Quantized model produced 3 text outputs
   - Multimodal test results: [OK] image: This image shows four different luxury cars - a white Rolls Royce, a Mercedes-Benz GLE SUV, a red Ferrari Portofino M, …
   - [INFO] Results saved to output_qwen25_omni_w4a16_verify/

2. Quantized Qwen3-Omni inference test
   - Text outputs: 3
     - Q1: What is 2 + 3? Answer with just the number.
     - Q2: Briefly describe what a neural network is in one sentence.
     - Q3: Translate 'Hello, how are you?' to Chinese.
   - [OK] Quantized model produced 3 text outputs
   - Multimodal test results: [OK] image: The composite image displays four luxury vehicles: a white Rolls-Royce Phantom, a grey Mercedes-Benz GLE SUV in a desert …
   - [INFO] Results saved to output_qwen3_omni_w4a16_verify/
wenhuach21 left a comment:
Great, thanks!
1. Please update the VLMs doc to show that these models are now supported; feel free to add it in another PR.
2. Please help upstream the quantized models to the Intel space.
Will upstream the quantized models to the Intel space once PR vllm-project/vllm-omni#1777 is merged, and will also update the documentation.
Just mentioning this PR in the model card as a requirement is fine.
Description
This update adds quantization support for Qwen3-Omni by integrating a custom MLLM processor and template, implementing dedicated forward logic for thinker/talker calibration, and introducing model-specific block discovery.
Note: This feature requires Transformers >= 5.1.0, as earlier versions contain compatibility issues with Qwen3-Omni.
Type of Change
Related Issues
#1387
Fixes or relates to #
Checklist Before Submitting