Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
3059e27
init and registry
JaredforReal Jan 8, 2026
c0a7684
implement glm_image_transformer.py
JaredforReal Jan 8, 2026
800cea4
update transformer
JaredforReal Jan 9, 2026
8664695
init pipeline_glm_image.py
JaredforReal Jan 9, 2026
b88b4b2
init pipeline_glm_image.py
JaredforReal Jan 9, 2026
b9108f4
remove pre process
JaredforReal Jan 9, 2026
371afd5
add check_input(), implement CFG parallel in diffuse(), align generat…
JaredforReal Jan 9, 2026
3d4f5f2
fix check_input(prompt_embed), add KVCache for Image Edit
JaredforReal Jan 9, 2026
0810dae
print out vllm version
Jan 13, 2026
8e36c51
update model config
tzhouam Jan 13, 2026
7f704d5
update worker
tzhouam Jan 13, 2026
4afb2ff
update one import in AsyncOmniLLM (not finish all, but can run)
tzhouam Jan 13, 2026
cb2e053
update Qwen3 Omni ViT init based on updated interface (the update for…
tzhouam Jan 13, 2026
e052c4a
Remove unnecessary override for OmniRequestState (the update for Omni…
tzhouam Jan 13, 2026
c08dcdd
update model runner dummy run
tzhouam Jan 13, 2026
166fc78
update ar scheduler
tzhouam Jan 13, 2026
4db8f0b
update _preprocess, execute model and sample_tokens for AR Model Runner
tzhouam Jan 13, 2026
63a69a5
debug AR Scheduler
tzhouam Jan 13, 2026
5bcdb43
update OmniGPUModelRunner._update_states
tzhouam Jan 13, 2026
2a0f72f
update the offline LLM request sorting due to changed requested id fo…
tzhouam Jan 14, 2026
f7c8af9
update Qwen3 Omni to fit with the engine core logic
tzhouam Jan 14, 2026
f12e0af
Merge PR #724
tzhouam Jan 14, 2026
e2462d2
update generation model runner
tzhouam Jan 14, 2026
d89e3c4
debug GLM-Image Model
tzhouam Jan 14, 2026
f269e0e
remove deleted args from doc string
tzhouam Jan 14, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions examples/offline_inference/qwen3_omni/end2end.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import numpy as np
import soundfile as sf
from PIL import Image
import vllm
from vllm import SamplingParams
from vllm.assets.audio import AudioAsset
from vllm.assets.image import ImageAsset
Expand Down Expand Up @@ -237,6 +238,7 @@ def get_multi_audios_query() -> QueryResult:

def main(args):
model_name = "Qwen/Qwen3-Omni-30B-A3B-Instruct"
print(f"="*20,"\n",f"vllm version: {vllm.__version__}","\n","="*20)

# Get paths from args
video_path = getattr(args, "video_path", None)
Expand Down Expand Up @@ -302,8 +304,8 @@ def main(args):

sampling_params_list = [
thinker_sampling_params,
talker_sampling_params, # code predictor is integrated into talker for Qwen3 Omni
code2wav_sampling_params,
# talker_sampling_params, # code predictor is integrated into talker for Qwen3 Omni
# code2wav_sampling_params,
Comment on lines 305 to +308
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Provide per-stage sampling params to match 3-stage pipeline

With only thinker_sampling_params in sampling_params_list, the default Qwen3-Omni Instruct pipeline (three stages in vllm_omni/model_executor/stage_configs/qwen3_omni_moe.yaml) will raise a ValueError because Omni._run_generation requires len(sampling_params_list) == len(self.stage_list) (vllm_omni/entrypoints/omni.py). This means running the example with the default stage config now fails before any generation occurs; it only works if users manually supply a single-stage config (e.g., thinking-only), which isn’t the default for this model.

Useful? React with 👍 / 👎.

]

if args.txt_prompts is None:
Expand Down
162 changes: 35 additions & 127 deletions vllm_omni/config/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,10 @@
import vllm.envs as envs
from pydantic import ConfigDict
from pydantic.dataclasses import dataclass
from vllm.attention.backends.registry import AttentionBackendEnum
from vllm.v1.attention.backends.registry import AttentionBackendEnum
from vllm.config import ModelConfig, config
from vllm.config.model import (
_RUNNER_CONVERTS,
_RUNNER_TASKS,
ConvertOption,
ConvertType,
RunnerOption,
TaskOption,
_get_and_verify_dtype,
get_served_model_name,
)
Expand All @@ -31,11 +26,8 @@
from vllm.transformers_utils.gguf_utils import (
maybe_patch_hf_config_from_gguf,
)
from vllm.transformers_utils.utils import (
is_gguf,
maybe_model_redirect,
)

from vllm.transformers_utils.utils import maybe_model_redirect
from vllm.transformers_utils.gguf_utils import is_gguf
import vllm_omni.model_executor.models as me_models

logger = init_logger(__name__)
Expand Down Expand Up @@ -116,7 +108,9 @@ def __post_init__(
video_pruning_rate: float | None,
) -> None:
# Keep set served_model_name before maybe_model_redirect(self.model)
self.served_model_name = get_served_model_name(self.model, self.served_model_name)
self.served_model_name = get_served_model_name(
self.model, self.served_model_name
)
self.model = maybe_model_redirect(self.model)
# The tokenizer is consistent with the model by default.
if self.tokenizer is None:
Expand Down Expand Up @@ -146,14 +140,6 @@ def __post_init__(

self.maybe_pull_model_tokenizer_for_runai(self.model, self.tokenizer)

if (backend := envs.VLLM_ATTENTION_BACKEND) and backend == "FLASHINFER" and find_spec("flashinfer") is None:
raise ValueError(
"VLLM_ATTENTION_BACKEND is set to FLASHINFER, but flashinfer "
"module was not found. See "
"https://github.com/vllm-project/vllm/blob/main/docker/Dockerfile " # noqa: E501
"for instructions on how to install it."
)

if self.override_attention_dtype is not None and not current_platform.is_rocm():
warnings.warn(
"override-attention-dtype is set but not using ROCm platform",
Expand Down Expand Up @@ -181,115 +167,24 @@ def __post_init__(
if dict_overrides:
self._apply_dict_overrides(hf_config, dict_overrides)
self.hf_text_config = self.draw_hf_text_config()
self.attention_chunk_size = getattr(self.hf_text_config, "attention_chunk_size", None)
self.attention_chunk_size = getattr(
self.hf_text_config, "attention_chunk_size", None
)
self.encoder_config = self._get_encoder_config()
# Try to load image processor config, but allow it to fail for stages that don't need it
try:
self.hf_image_processor_config = get_hf_image_processor_config(
self.model, hf_token=self.hf_token, revision=self.revision
)
except (OSError, ValueError, IndexError) as e:
# Some stages (e.g., code2wav, talker) don't need image processor
# Log warning but allow initialization to continue
logger.warning(
f"Failed to load image processor config for model '{self.model}': {e}. "
"This is expected for stages that don't require image processing."
)
self.hf_image_processor_config = None
self.hf_image_processor_config = get_hf_image_processor_config(
self.model, hf_token=self.hf_token, revision=self.revision
)
self.model_arch_config = self.get_model_arch_config()

architectures = self.architectures
registry = self.registry
is_generative_model = registry.is_text_generation_model(architectures, self)
is_pooling_model = registry.is_pooling_model(architectures, self)

def _task_to_convert(task: TaskOption) -> ConvertType:
if task == "embedding" or task == "embed":
return "embed"
if task == "classify":
return "classify"
if task == "reward":
return "reward"
if task == "score":
new_task = self._get_default_pooling_task(architectures)
return "classify" if new_task == "classify" else "embed"

return "none"

if self.task is not None:
runner: RunnerOption = "auto"
convert: ConvertOption = "auto"
msg_prefix = (
"The 'task' option has been deprecated and will be removed in v0.13.0 or v1.0, whichever comes first."
)
msg_hint = "Please remove this option."

is_generative_task = self.task in _RUNNER_TASKS["generate"]
is_pooling_task = self.task in _RUNNER_TASKS["pooling"]

if is_generative_model and is_pooling_model:
if is_generative_task:
runner = "generate"
convert = "auto"
msg_hint = (
"Please replace this option with `--runner "
"generate` to continue using this model "
"as a generative model."
)
elif is_pooling_task:
runner = "pooling"
convert = "auto"
msg_hint = (
"Please replace this option with `--runner "
"pooling` to continue using this model "
"as a pooling model."
)
else: # task == "auto"
pass
elif is_generative_model or is_pooling_model:
if is_generative_task:
runner = "generate"
convert = "auto"
msg_hint = "Please remove this option"
elif is_pooling_task:
runner = "pooling"
convert = _task_to_convert(self.task)
msg_hint = (
"Please replace this option with `--convert "
f"{convert}` to continue using this model "
"as a pooling model."
)
else: # task == "auto"
pass
else:
# Neither generative nor pooling model - try to convert if possible
if is_pooling_task:
runner = "pooling"
convert = _task_to_convert(self.task)
msg_hint = (
"Please replace this option with `--runner pooling "
f"--convert {convert}` to continue using this model "
"as a pooling model."
)
else:
debug_info = {
"architectures": architectures,
"is_generative_model": is_generative_model,
"is_pooling_model": is_pooling_model,
}
raise AssertionError(
"The model should be a generative or "
"pooling model when task is set to "
f"{self.task!r}. Found: {debug_info}"
)

self.runner = runner
self.convert = convert

msg = f"{msg_prefix} {msg_hint}"
warnings.warn(msg, DeprecationWarning, stacklevel=2)

self.runner_type = self._get_runner_type(architectures, self.runner)
self.convert_type = self._get_convert_type(architectures, self.runner_type, self.convert)
self.convert_type = self._get_convert_type(
architectures, self.runner_type, self.convert
)

if self.runner_type == "generate" and not is_generative_model:
generate_converts = _RUNNER_CONVERTS["generate"]
Expand Down Expand Up @@ -325,9 +220,12 @@ def _task_to_convert(task: TaskOption) -> ConvertType:
if getattr(self.pooler_config, k) is None:
setattr(self.pooler_config, k, v)

default_pooling_type = self._model_info.default_pooling_type
if self.pooler_config.pooling_type is None:
self.pooler_config.pooling_type = default_pooling_type
default_seq_pooling_type = self._model_info.default_seq_pooling_type
if self.pooler_config.seq_pooling_type is None:
self.pooler_config.seq_pooling_type = default_seq_pooling_type
default_tok_pooling_type = self._model_info.default_tok_pooling_type
if self.pooler_config.tok_pooling_type is None:
self.pooler_config.tok_pooling_type = default_tok_pooling_type

self.dtype: torch.dtype = _get_and_verify_dtype(
self.model,
Expand All @@ -339,9 +237,17 @@ def _task_to_convert(task: TaskOption) -> ConvertType:

self.original_max_model_len = self.max_model_len
self.max_model_len = self.get_and_verify_max_len(self.max_model_len)

if self.is_encoder_decoder:
self.mm_processor_cache_gb = 0
logger.info("Encoder-decoder model detected, disabling mm processor cache.")

# Init multimodal config if needed
if self._model_info.supports_multimodal:
if mm_encoder_tp_mode == "data" and not self._model_info.supports_multimodal_encoder_tp_data:
if (
mm_encoder_tp_mode == "data"
and not self._model_info.supports_multimodal_encoder_tp_data
):
logger.warning_once(
"This model does not support `--mm-encoder-tp-mode data`. "
"Falling back to `--mm-encoder-tp-mode weights`."
Expand All @@ -363,7 +269,9 @@ def _task_to_convert(task: TaskOption) -> ConvertType:
video_pruning_rate=video_pruning_rate,
)

mm_config_kwargs = {k: v for k, v in mm_config_kwargs.items() if v is not None}
mm_config_kwargs = {
k: v for k, v in mm_config_kwargs.items() if v is not None
}

self.multimodal_config = MultiModalConfig(**mm_config_kwargs)

Expand All @@ -382,7 +290,7 @@ def _task_to_convert(task: TaskOption) -> ConvertType:

# Avoid running try_verify_and_update_config multiple times
self.config_updated = False

self._try_verify_and_update_model_config()
self._verify_quantization()
self._verify_cuda_graph()
self._verify_bnb_config()
64 changes: 56 additions & 8 deletions vllm_omni/core/sched/omni_ar_scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,11 @@ def update_from_output(
pooler_outputs = model_runner_output.pooler_output
num_nans_in_logits = model_runner_output.num_nans_in_logits
kv_connector_output = model_runner_output.kv_connector_output
cudagraph_stats = model_runner_output.cudagraph_stats

perf_stats: PerfStats | None = None
if self.perf_metrics and self.perf_metrics.is_enabled():
perf_stats = self.perf_metrics.get_step_perf_stats_per_gpu(scheduler_output)

outputs: dict[int, list[EngineCoreOutput]] = defaultdict(list)
spec_decoding_stats: SpecDecodingStats | None = None
Expand Down Expand Up @@ -131,26 +136,49 @@ def update_from_output(
spec_decoding_stats,
num_draft_tokens=num_draft_tokens,
num_accepted_tokens=num_accepted,
num_invalid_spec_tokens=scheduler_output.num_invalid_spec_tokens,
request_id=req_id,
)

stopped = False
new_logprobs = None
new_token_ids = generated_token_ids
pooler_output = pooler_outputs[req_index] if pooler_outputs else None
kv_transfer_params = None
status_before_stop = request.status

# Check for stop and update request status.
if new_token_ids:
new_token_ids, stopped = self._update_request_with_output(request, new_token_ids)

# Stop checking for pooler models.
pooler_output = None
if pooler_outputs:
pooler_output = pooler_outputs[req_index]
if pooler_output:
# Note: As we occupied the pooler output, for multimodal outputs, we do not intermediate stop checking for pooler output
if request.output_token_ids:
stopped = check_stop(request, self.max_model_len, pooler_output)

stopped = check_stop(request, self.max_model_len)
routed_experts = None
if stopped:
if self.vllm_config.model_config.enable_return_routed_experts:
kv_blocks = self.kv_cache_manager.get_blocks(request.request_id)
block_ids = kv_blocks.get_block_ids()[0]
num_tokens = request.num_tokens - 1

# compute slot mapping
block_ids_array = np.array(block_ids, dtype=np.int32)
num_blocks = len(block_ids)
block_size = self.block_size

# generate block offsets
block_offsets = np.arange(0, block_size)

# compute slot mapping: slot = block_id * block_size + offset
slot_mapping = (
block_offsets.reshape((1, block_size))
+ block_ids_array.reshape((num_blocks, 1)) * block_size
).flatten()[:num_tokens]

routed_experts = self.routed_experts_reader.get_routed_experts(
indices=slot_mapping
)
kv_transfer_params = self._free_request(request)
if status_before_stop == RequestStatus.RUNNING:
stopped_running_reqs.add(request)
Expand All @@ -165,7 +193,13 @@ def update_from_output(
struct_output_request = request.structured_output_request
assert struct_output_request is not None
assert struct_output_request.grammar is not None
struct_output_request.grammar.accept_tokens(req_id, new_token_ids)
ok = struct_output_request.grammar.accept_tokens(req_id, new_token_ids)
if not ok:
logger.warning(
"Unexpected: grammar rejected tokens %s for request %s.",
new_token_ids,
req_id,
)

if num_nans_in_logits is not None and req_id in num_nans_in_logits:
request.num_nans_in_logits = num_nans_in_logits[req_id]
Expand Down Expand Up @@ -200,7 +234,21 @@ def update_from_output(
if stopped_preempted_reqs:
# This is a rare case and unlikely to impact performance.
self.waiting.remove_requests(stopped_preempted_reqs)


if failed_kv_load_req_ids and not self.recompute_kv_load_failures:
requests = [self.requests[req_id] for req_id in failed_kv_load_req_ids]
self.finish_requests(failed_kv_load_req_ids, RequestStatus.FINISHED_ERROR)
for request in requests:
outputs[request.client_index].append(
EngineCoreOutput(
request_id=request.request_id,
new_token_ids=[],
finish_reason=request.get_finished_reason(),
events=request.take_events(),
trace_headers=request.trace_headers,
num_cached_tokens=request.num_cached_tokens,
)
)
# KV Connector: update state for finished KV Transfers.
if kv_connector_output:
self._update_from_kv_xfer_finished(kv_connector_output)
Expand Down
11 changes: 10 additions & 1 deletion vllm_omni/diffusion/forward_context.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,5 +86,14 @@ def set_forward_context(
attn_metadata=attn_metadata,
split_text_embed_in_sp=split_text_embed_in_sp,
)
# vLLM CustomOp dispatch (e.g. QKVParallelLinear) requires a global
# vLLM config set via set_current_vllm_config().
with override_forward_context(forward_context):
yield
if vllm_config is None:
yield
else:
# Local import to avoid importing vllm.config.vllm at module import time.
from vllm.config.vllm import set_current_vllm_config

with set_current_vllm_config(vllm_config):
yield
Loading