From 38188f0cfbea99d32b37e708a595b5ce8017cdd2 Mon Sep 17 00:00:00 2001
From: Piotr Tarasiewicz <ptarasiewicz@nvidia.com>
Date: Thu, 23 Apr 2026 19:54:29 +0200
Subject: [PATCH 1/4] [Bugfix] GLM-Image: fall back to diffusion stage h/w for
 AR max_tokens
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

PR #2320 (`7e28eda9`) dropped `max_tokens: 1281` from the GLM-Image
stage config and moved the compute into
`serving_chat._apply_request_overrides`, but gated it on
`height is not None and width is not None`. For the recipe's bare-curl
request (no `extra_body.height` / `extra_body.width`) the gate skipped
the compute, so `SamplingParams.max_tokens` fell through to vLLM's
`max_model_len - seq_len` (~131k). The AR stage's generation budget
then no longer matched the VQ token layout the parser expects. This
left the pre-refactor path latently broken since #2320, surfacing as
the IndexError that the deploy-yaml edit in #3034 was working around.

Fix: when the user didn't pass h/w, fall back to the diffusion stage's
default h/w (GLM-Image stage-1 yaml already declares
`height: 1024, width: 1024`), rather than hardcoding a second size
default in serving_chat or re-adding the yaml entry. This makes the
compute effectively unconditional for AR + image-diffusion pipelines
that declare a target size in their sampling params; LLM-only and
audio pipelines have neither height nor width in any stage's params
and continue to skip the block — no architecture gate needed.

Also fix a related bug: `getattr(explicit_fields, "max_tokens", None)`
was reading an attribute off a `set[str]` (Pydantic's
`model_fields_set`), so it always returned `None` and the computed
value silently overwrote any user-provided `max_tokens`. Replaced with
a proper set membership check (`"max_tokens" not in explicit_fields`).

Signed-off-by: Piotr Tarasiewicz <ptarasiewicz@nvidia.com>
---
 vllm_omni/entrypoints/openai/serving_chat.py | 35 +++++++++++---------
 1 file changed, 20 insertions(+), 15 deletions(-)

diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py
index fb96b397eb7..8451ce6a5ae 100644
--- a/vllm_omni/entrypoints/openai/serving_chat.py
+++ b/vllm_omni/entrypoints/openai/serving_chat.py
@@ -736,20 +736,33 @@ def _apply_request_overrides(
         extra_body = getattr(request, "extra_body", {}) or {}
         height, width = self._resolve_height_width_from_extra_body(extra_body)
 
+        # Fall back to the diffusion stage's default h/w when the user didn't
+        # specify them, so the compute works for the bare-curl request shape
+        # (no extra_body). Implicit gate: only fires when a stage in the
+        # pipeline declares height/width in its sampling params (e.g. GLM-Image
+        # stage-1 yaml); LLM-only / audio pipelines have neither and are skipped.
+        if height is None or width is None:
+            for dp in self.engine_client.default_sampling_params_list or []:
+                stage_h = getattr(dp, "height", None)
+                stage_w = getattr(dp, "width", None)
+                if stage_h is not None and stage_w is not None:
+                    if height is None:
+                        height = stage_h
+                    if width is None:
+                        width = stage_w
+                    break
+
         # Best-effort mode detection from user messages.
         # i2i requests include at least one reference image in message content.
         _, reference_images = self._extract_diffusion_prompt_and_images_from_messages(request.messages)
-        ref_image_count = len(reference_images)
-        is_img2img = ref_image_count > 0
+        is_img2img = len(reference_images) > 0
 
         if height is not None and width is not None:
             try:
                 from vllm_omni.model_executor.stage_input_processors.glm_image import compute_max_tokens
 
-                max_tokens = getattr(explicit_fields, "max_tokens", None)
-                if max_tokens is None:
-                    max_tokens = compute_max_tokens(int(height), int(width), is_i2i=is_img2img)
-                params.max_tokens = max_tokens
+                if "max_tokens" not in explicit_fields:
+                    params.max_tokens = compute_max_tokens(int(height), int(width), is_i2i=is_img2img)
                 # Keep target size in stage-0 sampling params so runner/model can
                 # build deterministic M-RoPE grids for t2i (no MM features).
                 extra_args = dict(getattr(params, "extra_args", {}) or {})
@@ -757,15 +770,7 @@ def _apply_request_overrides(
                 extra_args["target_w"] = int(width)
                 params.extra_args = extra_args
             except (ImportError, ValueError, TypeError) as e:
-                logger.warning(f"Failed to compute max_tokens: {e}, using default if available")
-        else:
-            logger.info(
-                "[SamplingParams] Skip dynamic max_tokens (height=%s, width=%s, mode=%s, ref_images=%s)",
-                height,
-                width,
-                "i2i" if is_img2img else "t2i",
-                ref_image_count,
-            )
+                logger.warning("Failed to compute max_tokens: %s", e)
 
         return params
 

From 5e32dc057479f4abc6221500322dbc51b2234f89 Mon Sep 17 00:00:00 2001
From: Piotr Tarasiewicz <ptarasiewicz@nvidia.com>
Date: Thu, 23 Apr 2026 20:08:42 +0200
Subject: [PATCH 2/4] [Bugfix] GLM-Image: route t2i requests through the
 multimodal processor (#3034)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

vllm-omni issue #3034: `zai-org/GLM-Image` served via
`vllm serve --omni` returns noisy / washed-out images for the minimal
curl from the recipe:

    {"messages":[{"role":"user","content":"A beautiful landscape painting"}]}

Root cause:

- `OmniOpenAIServingChat` only attached `mm_processor_kwargs` to the
  tprompt when the request explicitly supplied
  `extra_body.height` / `extra_body.width`. For the bare-curl request
  the field was omitted entirely.
- `OmniInputPreprocessor._process_text` checked
  `elif mm_processor_kwargs:` (truthiness). With the field omitted the
  default `{}` was falsy, so the preprocessor fell back to plain
  `_tokenize_prompt`, skipping the multimodal processor path.
- That path is where GLM-Image's HF processor emits its
  image-generation scaffold
  `<|image|>PROMPT<sop>H W<eop><sop>h w<eop><|dit_token_N|>`. Without
  the scaffold the AR stage never entered image-generation mode and
  collapsed to a handful of repeated VQ codes (unique=15 across 1281
  positions, no terminal EOS), which the DiT denoised into a uniform
  / near-white image (mean=249, std=15).

Fix (minimal; one small change in each of two files):

- `serving_chat`: always attach `mm_processor_kwargs` (possibly empty)
  for image-modality requests, so the preprocessor sees it.
- `OmniInputPreprocessor._process_text`: switch from truthiness to
  presence — `"mm_processor_kwargs" in parsed_content`. An
  explicitly-attached empty dict is now a valid "route through the
  multimodal processor" signal, matching callers who want the HF
  processor's defaults to apply.

After the fix the AR produces 139 unique tokens with a terminal EOS
and the image is a coherent landscape (mean=117, std=71, full
0-255 range).

Signed-off-by: Piotr Tarasiewicz <ptarasiewicz@nvidia.com>
---
 vllm_omni/entrypoints/openai/serving_chat.py | 10 +++++++---
 vllm_omni/inputs/preprocess.py               |  6 +++++-
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py
index 8451ce6a5ae..d9c4c0c473a 100644
--- a/vllm_omni/entrypoints/openai/serving_chat.py
+++ b/vllm_omni/entrypoints/openai/serving_chat.py
@@ -351,14 +351,18 @@ async def create_chat_completion(
                     tprompt["modalities"] = ["image"]
                 if negative_prompt is not None:
                     tprompt["negative_prompt"] = negative_prompt
-                # GLM-Image's _call_hf_processor expects target_h/target_w in mm_processor_kwargs
+                # Always attach mm_processor_kwargs (possibly empty) so
+                # OmniInputPreprocessor._process_text routes through the
+                # multimodal processor path. Without it, the preprocessor
+                # falls back to plain _tokenize_prompt and AR-based image-gen
+                # models like GLM-Image never see their image-generation
+                # scaffold (vllm-omni issue #3034).
                 mm_processor_kwargs: dict[str, Any] = {}
                 if height is not None:
                     mm_processor_kwargs["target_h"] = height
                 if width is not None:
                     mm_processor_kwargs["target_w"] = width
-                if mm_processor_kwargs:
-                    tprompt["mm_processor_kwargs"] = mm_processor_kwargs
+                tprompt["mm_processor_kwargs"] = mm_processor_kwargs
                 if engine_prompt_image is not None:
                     tprompt["multi_modal_data"] = engine_prompt_image
                     # Provide multi_modal_uuids so that newer vLLM versions
diff --git a/vllm_omni/inputs/preprocess.py b/vllm_omni/inputs/preprocess.py
index cca6ce56870..484f8627f85 100644
--- a/vllm_omni/inputs/preprocess.py
+++ b/vllm_omni/inputs/preprocess.py
@@ -60,7 +60,11 @@ def _process_text(
             additional_information = parsed_content.get("additional_information")
             if additional_information is not None:
                 inputs["additional_information"] = additional_information
-        elif mm_processor_kwargs:
+        elif "mm_processor_kwargs" in parsed_content:
+            # Presence — not truthiness. An explicitly-set empty dict still
+            # signals "route through the multimodal processor" (needed for
+            # AR-based image-gen where the HF processor supplies its own
+            # defaults and scaffold, see vllm-omni issue #3034).
             inputs = self._process_multimodal(
                 prompt_text,
                 {},

From cdc9694846eecb9f5d052aa2fb86ae11b46ec482 Mon Sep 17 00:00:00 2001
From: Piotr Tarasiewicz <ptarasiewicz@nvidia.com>
Date: Thu, 23 Apr 2026 20:11:27 +0200
Subject: [PATCH 3/4] [Misc] GLM-Image: drop issue-tracker references from code
 comments

Comments should explain the invariant, not where to read about it;
the PR body / commit log is the right place for issue links.

Signed-off-by: Piotr Tarasiewicz <ptarasiewicz@nvidia.com>
---
 vllm_omni/entrypoints/openai/serving_chat.py | 2 +-
 vllm_omni/inputs/preprocess.py               | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py
index d9c4c0c473a..8f0622fbd5e 100644
--- a/vllm_omni/entrypoints/openai/serving_chat.py
+++ b/vllm_omni/entrypoints/openai/serving_chat.py
@@ -356,7 +356,7 @@ async def create_chat_completion(
                 # multimodal processor path. Without it, the preprocessor
                 # falls back to plain _tokenize_prompt and AR-based image-gen
                 # models like GLM-Image never see their image-generation
-                # scaffold (vllm-omni issue #3034).
+                # scaffold.
                 mm_processor_kwargs: dict[str, Any] = {}
                 if height is not None:
                     mm_processor_kwargs["target_h"] = height
diff --git a/vllm_omni/inputs/preprocess.py b/vllm_omni/inputs/preprocess.py
index 484f8627f85..7282d7a520d 100644
--- a/vllm_omni/inputs/preprocess.py
+++ b/vllm_omni/inputs/preprocess.py
@@ -64,7 +64,7 @@ def _process_text(
             # Presence — not truthiness. An explicitly-set empty dict still
             # signals "route through the multimodal processor" (needed for
             # AR-based image-gen where the HF processor supplies its own
-            # defaults and scaffold, see vllm-omni issue #3034).
+            # defaults and scaffold).
             inputs = self._process_multimodal(
                 prompt_text,
                 {},

From 25c4e15e198d3d199ec3b95a9e7141fb439d01c1 Mon Sep 17 00:00:00 2001
From: Piotr Tarasiewicz <ptarasiewicz@nvidia.com>
Date: Thu, 23 Apr 2026 20:11:44 +0200
Subject: [PATCH 4/4] [Misc] GLM-Image: keep ref_image_count as its own local

Cosmetic: restore the two-line `ref_image_count = len(reference_images)`
/ `is_img2img = ref_image_count > 0` shape from the pre-#2320 code to
keep the diff against main smaller and match the surrounding style.

Signed-off-by: Piotr Tarasiewicz <ptarasiewicz@nvidia.com>
---
 vllm_omni/entrypoints/openai/serving_chat.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py
index 8f0622fbd5e..d92f296a8d7 100644
--- a/vllm_omni/entrypoints/openai/serving_chat.py
+++ b/vllm_omni/entrypoints/openai/serving_chat.py
@@ -759,7 +759,8 @@ def _apply_request_overrides(
         # Best-effort mode detection from user messages.
         # i2i requests include at least one reference image in message content.
         _, reference_images = self._extract_diffusion_prompt_and_images_from_messages(request.messages)
-        is_img2img = len(reference_images) > 0
+        ref_image_count = len(reference_images)
+        is_img2img = ref_image_count > 0
 
         if height is not None and width is not None:
             try: