Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 26 additions & 16 deletions vllm_omni/entrypoints/openai/serving_chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -351,14 +351,18 @@ async def create_chat_completion(
tprompt["modalities"] = ["image"]
if negative_prompt is not None:
tprompt["negative_prompt"] = negative_prompt
# GLM-Image's _call_hf_processor expects target_h/target_w in mm_processor_kwargs
# Always attach mm_processor_kwargs (possibly empty) so
# OmniInputPreprocessor._process_text routes through the
# multimodal processor path. Without it, the preprocessor
# falls back to plain _tokenize_prompt and AR-based image-gen
# models like GLM-Image never see their image-generation
# scaffold.
mm_processor_kwargs: dict[str, Any] = {}
if height is not None:
mm_processor_kwargs["target_h"] = height
if width is not None:
mm_processor_kwargs["target_w"] = width
if mm_processor_kwargs:
tprompt["mm_processor_kwargs"] = mm_processor_kwargs
tprompt["mm_processor_kwargs"] = mm_processor_kwargs
if engine_prompt_image is not None:
tprompt["multi_modal_data"] = engine_prompt_image
# Provide multi_modal_uuids so that newer vLLM versions
Expand Down Expand Up @@ -736,6 +740,22 @@ def _apply_request_overrides(
extra_body = getattr(request, "extra_body", {}) or {}
height, width = self._resolve_height_width_from_extra_body(extra_body)

# Fall back to the diffusion stage's default height/width when the user
# did not specify them, so that max_tokens can still be computed for a
# bare curl request (no extra_body). Implicit gate: this only fires when a
# stage in the pipeline declares both height and width in its sampling params
# (e.g. GLM-Image stage-1 yaml); LLM-only / audio pipelines declare neither and are skipped.
if height is None or width is None:
for dp in self.engine_client.default_sampling_params_list or []:
stage_h = getattr(dp, "height", None)
stage_w = getattr(dp, "width", None)
if stage_h is not None and stage_w is not None:
if height is None:
height = stage_h
if width is None:
width = stage_w
break

# Best-effort mode detection from user messages.
# i2i requests include at least one reference image in message content.
_, reference_images = self._extract_diffusion_prompt_and_images_from_messages(request.messages)
Expand All @@ -746,26 +766,16 @@ def _apply_request_overrides(
try:
from vllm_omni.model_executor.stage_input_processors.glm_image import compute_max_tokens

max_tokens = getattr(explicit_fields, "max_tokens", None)
if max_tokens is None:
max_tokens = compute_max_tokens(int(height), int(width), is_i2i=is_img2img)
params.max_tokens = max_tokens
if "max_tokens" not in explicit_fields:
params.max_tokens = compute_max_tokens(int(height), int(width), is_i2i=is_img2img)
# Keep target size in stage-0 sampling params so runner/model can
# build deterministic M-RoPE grids for t2i (no MM features).
extra_args = dict(getattr(params, "extra_args", {}) or {})
extra_args["target_h"] = int(height)
extra_args["target_w"] = int(width)
params.extra_args = extra_args
except (ImportError, ValueError, TypeError) as e:
logger.warning(f"Failed to compute max_tokens: {e}, using default if available")
else:
logger.info(
"[SamplingParams] Skip dynamic max_tokens (height=%s, width=%s, mode=%s, ref_images=%s)",
height,
width,
"i2i" if is_img2img else "t2i",
ref_image_count,
)
logger.warning("Failed to compute max_tokens: %s", e)

return params

Expand Down
6 changes: 5 additions & 1 deletion vllm_omni/inputs/preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,11 @@ def _process_text(
additional_information = parsed_content.get("additional_information")
if additional_information is not None:
inputs["additional_information"] = additional_information
elif mm_processor_kwargs:
elif "mm_processor_kwargs" in parsed_content:
# Presence — not truthiness. An explicitly-set empty dict still
# signals "route through the multimodal processor" (needed for
# AR-based image-gen where the HF processor supplies its own
# defaults and scaffold).
inputs = self._process_multimodal(
prompt_text,
{},
Expand Down