From 38188f0cfbea99d32b37e708a595b5ce8017cdd2 Mon Sep 17 00:00:00 2001 From: Piotr Tarasiewicz Date: Thu, 23 Apr 2026 19:54:29 +0200 Subject: [PATCH 1/4] [Bugfix] GLM-Image: fall back to diffusion stage h/w for AR max_tokens MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PR #2320 (`7e28eda9`) dropped `max_tokens: 1281` from the GLM-Image stage config and moved the compute into `serving_chat._apply_request_overrides`, but gated it on `height is not None and width is not None`. For the recipe's bare-curl request (no `extra_body.height` / `extra_body.width`) the gate skipped the compute, so `SamplingParams.max_tokens` fell through to vLLM's `max_model_len - seq_len` (~131k) and the AR stage's generation budget no longer matched the VQ token layout the parser expects. That request path, which worked before the refactor, has been latently broken since #2320; it surfaced as the IndexError that the deploy-yaml edit in #3034 was working around. Fix: when the user didn't pass h/w, fall back to the diffusion stage's default h/w (GLM-Image stage-1 yaml already declares `height: 1024, width: 1024`), rather than hardcoding a second size default in serving_chat or re-adding the yaml entry. This makes the compute effectively unconditional for AR + image-diffusion pipelines that declare a target size in their sampling params; LLM-only and audio pipelines have neither height nor width in any stage's params and continue to skip the block — no architecture gate needed. Also fix a related bug: `getattr(explicit_fields, "max_tokens", None)` was reading an attribute off a `set[str]` (Pydantic's `model_fields_set`), so it always returned `None` and silently overwrote user-provided `max_tokens`. Replaced with a proper set membership check. 
Signed-off-by: Piotr Tarasiewicz --- vllm_omni/entrypoints/openai/serving_chat.py | 35 +++++++++++--------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py index fb96b397eb7..8451ce6a5ae 100644 --- a/vllm_omni/entrypoints/openai/serving_chat.py +++ b/vllm_omni/entrypoints/openai/serving_chat.py @@ -736,20 +736,33 @@ def _apply_request_overrides( extra_body = getattr(request, "extra_body", {}) or {} height, width = self._resolve_height_width_from_extra_body(extra_body) + # Fall back to the diffusion stage's default h/w when the user didn't + # specify them, so the compute works for the bare-curl request shape + # (no extra_body). Implicit gate: only fires when a stage in the + # pipeline declares height/width in its sampling params (e.g. GLM-Image + # stage-1 yaml); LLM-only / audio pipelines have neither and are skipped. + if height is None or width is None: + for dp in self.engine_client.default_sampling_params_list or []: + stage_h = getattr(dp, "height", None) + stage_w = getattr(dp, "width", None) + if stage_h is not None and stage_w is not None: + if height is None: + height = stage_h + if width is None: + width = stage_w + break + # Best-effort mode detection from user messages. # i2i requests include at least one reference image in message content. 
_, reference_images = self._extract_diffusion_prompt_and_images_from_messages(request.messages) - ref_image_count = len(reference_images) - is_img2img = ref_image_count > 0 + is_img2img = len(reference_images) > 0 if height is not None and width is not None: try: from vllm_omni.model_executor.stage_input_processors.glm_image import compute_max_tokens - max_tokens = getattr(explicit_fields, "max_tokens", None) - if max_tokens is None: - max_tokens = compute_max_tokens(int(height), int(width), is_i2i=is_img2img) - params.max_tokens = max_tokens + if "max_tokens" not in explicit_fields: + params.max_tokens = compute_max_tokens(int(height), int(width), is_i2i=is_img2img) # Keep target size in stage-0 sampling params so runner/model can # build deterministic M-RoPE grids for t2i (no MM features). extra_args = dict(getattr(params, "extra_args", {}) or {}) @@ -757,15 +770,7 @@ def _apply_request_overrides( extra_args["target_w"] = int(width) params.extra_args = extra_args except (ImportError, ValueError, TypeError) as e: - logger.warning(f"Failed to compute max_tokens: {e}, using default if available") - else: - logger.info( - "[SamplingParams] Skip dynamic max_tokens (height=%s, width=%s, mode=%s, ref_images=%s)", - height, - width, - "i2i" if is_img2img else "t2i", - ref_image_count, - ) + logger.warning("Failed to compute max_tokens: %s", e) return params From 5e32dc057479f4abc6221500322dbc51b2234f89 Mon Sep 17 00:00:00 2001 From: Piotr Tarasiewicz Date: Thu, 23 Apr 2026 20:08:42 +0200 Subject: [PATCH 2/4] [Bugfix] GLM-Image: route t2i requests through the multimodal processor (#3034) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit vllm-omni issue #3034: `zai-org/GLM-Image` served via `vllm serve --omni` returns noisy / washed-out images for the minimal curl from the recipe: {"messages":[{"role":"user","content":"A beautiful landscape painting"}]} Root cause: - `OmniOpenAIServingChat` only attached `mm_processor_kwargs` to the 
tprompt when the request explicitly supplied `extra_body.height` / `extra_body.width`. For the bare-curl request the field was omitted entirely. - `OmniInputPreprocessor._process_text` checked `elif mm_processor_kwargs:` (truthiness). With the field omitted the default `{}` was falsy, so the preprocessor fell back to plain `_tokenize_prompt`, skipping the multimodal processor path. - That path is where GLM-Image's HF processor emits its image-generation scaffold `<|image|>PROMPTH Wh w<|dit_token_N|>`. Without the scaffold the AR stage never entered image-generation mode and collapsed to a handful of repeated VQ codes (unique=15 across 1281 positions, no terminal EOS), which the DiT denoised into a uniform / near-white image (mean=249, std=15). Fix (minimal, two one-file changes): - `serving_chat`: always attach `mm_processor_kwargs` (possibly empty) for image-modality requests, so the preprocessor sees it. - `OmniInputPreprocessor._process_text`: switch from truthiness to presence — `"mm_processor_kwargs" in parsed_content`. An explicitly-attached empty dict is now a valid "route through the multimodal processor" signal, matching callers who want the HF processor's defaults to apply. After the fix the AR produces 139 unique tokens with a terminal EOS and the image is a coherent landscape (mean=117, std=71, full 0-255 range). 
Signed-off-by: Piotr Tarasiewicz --- vllm_omni/entrypoints/openai/serving_chat.py | 10 +++++++--- vllm_omni/inputs/preprocess.py | 6 +++++- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py index 8451ce6a5ae..d9c4c0c473a 100644 --- a/vllm_omni/entrypoints/openai/serving_chat.py +++ b/vllm_omni/entrypoints/openai/serving_chat.py @@ -351,14 +351,18 @@ async def create_chat_completion( tprompt["modalities"] = ["image"] if negative_prompt is not None: tprompt["negative_prompt"] = negative_prompt - # GLM-Image's _call_hf_processor expects target_h/target_w in mm_processor_kwargs + # Always attach mm_processor_kwargs (possibly empty) so + # OmniInputPreprocessor._process_text routes through the + # multimodal processor path. Without it, the preprocessor + # falls back to plain _tokenize_prompt and AR-based image-gen + # models like GLM-Image never see their image-generation + # scaffold (vllm-omni issue #3034). mm_processor_kwargs: dict[str, Any] = {} if height is not None: mm_processor_kwargs["target_h"] = height if width is not None: mm_processor_kwargs["target_w"] = width - if mm_processor_kwargs: - tprompt["mm_processor_kwargs"] = mm_processor_kwargs + tprompt["mm_processor_kwargs"] = mm_processor_kwargs if engine_prompt_image is not None: tprompt["multi_modal_data"] = engine_prompt_image # Provide multi_modal_uuids so that newer vLLM versions diff --git a/vllm_omni/inputs/preprocess.py b/vllm_omni/inputs/preprocess.py index cca6ce56870..484f8627f85 100644 --- a/vllm_omni/inputs/preprocess.py +++ b/vllm_omni/inputs/preprocess.py @@ -60,7 +60,11 @@ def _process_text( additional_information = parsed_content.get("additional_information") if additional_information is not None: inputs["additional_information"] = additional_information - elif mm_processor_kwargs: + elif "mm_processor_kwargs" in parsed_content: + # Presence — not truthiness. 
An explicitly-set empty dict still + # signals "route through the multimodal processor" (needed for + # AR-based image-gen where the HF processor supplies its own + # defaults and scaffold, see vllm-omni issue #3034). inputs = self._process_multimodal( prompt_text, {}, From cdc9694846eecb9f5d052aa2fb86ae11b46ec482 Mon Sep 17 00:00:00 2001 From: Piotr Tarasiewicz Date: Thu, 23 Apr 2026 20:11:27 +0200 Subject: [PATCH 3/4] [Misc] GLM-Image: drop issue-tracker references from code comments Comments should explain the invariant, not where to read about it; the PR body / commit log is the right place for issue links. Signed-off-by: Piotr Tarasiewicz --- vllm_omni/entrypoints/openai/serving_chat.py | 2 +- vllm_omni/inputs/preprocess.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py index d9c4c0c473a..8f0622fbd5e 100644 --- a/vllm_omni/entrypoints/openai/serving_chat.py +++ b/vllm_omni/entrypoints/openai/serving_chat.py @@ -356,7 +356,7 @@ async def create_chat_completion( # multimodal processor path. Without it, the preprocessor # falls back to plain _tokenize_prompt and AR-based image-gen # models like GLM-Image never see their image-generation - # scaffold (vllm-omni issue #3034). + # scaffold. mm_processor_kwargs: dict[str, Any] = {} if height is not None: mm_processor_kwargs["target_h"] = height diff --git a/vllm_omni/inputs/preprocess.py b/vllm_omni/inputs/preprocess.py index 484f8627f85..7282d7a520d 100644 --- a/vllm_omni/inputs/preprocess.py +++ b/vllm_omni/inputs/preprocess.py @@ -64,7 +64,7 @@ def _process_text( # Presence — not truthiness. An explicitly-set empty dict still # signals "route through the multimodal processor" (needed for # AR-based image-gen where the HF processor supplies its own - # defaults and scaffold, see vllm-omni issue #3034). + # defaults and scaffold). 
inputs = self._process_multimodal( prompt_text, {}, From 25c4e15e198d3d199ec3b95a9e7141fb439d01c1 Mon Sep 17 00:00:00 2001 From: Piotr Tarasiewicz Date: Thu, 23 Apr 2026 20:11:44 +0200 Subject: [PATCH 4/4] [Misc] GLM-Image: keep ref_image_count as its own local Cosmetic: restore the two-line `ref_image_count = len(reference_images)` / `is_img2img = ref_image_count > 0` shape from the pre-#2320 code to keep the diff against main smaller and match the surrounding style. Signed-off-by: Piotr Tarasiewicz --- vllm_omni/entrypoints/openai/serving_chat.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py index 8f0622fbd5e..d92f296a8d7 100644 --- a/vllm_omni/entrypoints/openai/serving_chat.py +++ b/vllm_omni/entrypoints/openai/serving_chat.py @@ -759,7 +759,8 @@ def _apply_request_overrides( # Best-effort mode detection from user messages. # i2i requests include at least one reference image in message content. _, reference_images = self._extract_diffusion_prompt_and_images_from_messages(request.messages) - is_img2img = len(reference_images) > 0 + ref_image_count = len(reference_images) + is_img2img = ref_image_count > 0 if height is not None and width is not None: try: