Skip to content
Merged
2 changes: 2 additions & 0 deletions docs/user_guide/diffusion/quantization/autoround.md
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,8 @@ At load time:
| Model | HF Checkpoint | Scheme | Group Size | Backend |
|-------|--------------|--------|------------|---------|
| FLUX.1-dev | `vllm-project-org/FLUX.1-dev-AutoRound-w4a16` | W4A16 | 128 | GPTQ-Marlin |
| Qwen2.5-Omni-7B | `Intel/Qwen2.5-Omni-7B-int4-AutoRound` | W4A16 | 128 | GPTQ-Marlin |
| Qwen3-Omni-30B-A3B-Instruct | `Intel/Qwen3-Omni-30B-A3B-Instruct-int4-AutoRound` | W4A16 | 128 | GPTQ-Marlin |

## Creating a Quantized Checkpoint

Expand Down
19 changes: 18 additions & 1 deletion examples/offline_inference/qwen2_5_omni/end2end.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
with the correct prompt format on Qwen2.5-Omni
"""

import json
import os
import time
from typing import NamedTuple
Expand Down Expand Up @@ -289,7 +290,10 @@ def get_audio_query(question: str = None, audio_path: str | None = None, samplin


def main(args):
model_name = "Qwen/Qwen2.5-Omni-7B"
model_name = args.model
quantization_config = None
if args.quantization_config is not None:
quantization_config = json.loads(args.quantization_config)

# Get paths from args
video_path = getattr(args, "video_path", None)
Expand Down Expand Up @@ -320,6 +324,7 @@ def main(args):
query_result = query_func(audio_path=audio_path, sampling_rate=sampling_rate)
else:
query_result = query_func()
args.quantization_config = quantization_config
omni = Omni.from_cli_args(args, model=model_name)
thinker_sampling_params = SamplingParams(
temperature=0.0, # Deterministic - no randomness
Expand Down Expand Up @@ -424,6 +429,18 @@ def main(args):

def parse_args():
parser = FlexibleArgumentParser(description="Demo on using vLLM for offline inference with audio language models")
parser.add_argument(
"--model",
type=str,
default="Qwen/Qwen2.5-Omni-7B",
help="Model name or local path.",
)
parser.add_argument(
"--quantization-config",
type=str,
default=None,
help="Optional JSON string forwarded to Omni(quantization_config=...).",
)
parser.add_argument(
"--query-type",
"-q",
Expand Down
Loading
Loading