ptarasiewiczNV · ptarasiewiczNV · Apr 23, 2026 · Apr 23, 2026 · Apr 23, 2026 · Apr 23, 2026
diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py
@@ -351,14 +351,18 @@ async def create_chat_completion(
                     tprompt["modalities"] = ["image"]
                 if negative_prompt is not None:
                     tprompt["negative_prompt"] = negative_prompt
-                # GLM-Image's _call_hf_processor expects target_h/target_w in mm_processor_kwargs
+                # Always attach mm_processor_kwargs (possibly empty) so
+                # OmniInputPreprocessor._process_text routes through the
+                # multimodal processor path. Without it, the preprocessor
+                # falls back to plain _tokenize_prompt and AR-based image-gen
+                # models like GLM-Image never see their image-generation
+                # scaffold.
                 mm_processor_kwargs: dict[str, Any] = {}
                 if height is not None:
                     mm_processor_kwargs["target_h"] = height
                 if width is not None:
                     mm_processor_kwargs["target_w"] = width
-                if mm_processor_kwargs:
-                    tprompt["mm_processor_kwargs"] = mm_processor_kwargs
+                tprompt["mm_processor_kwargs"] = mm_processor_kwargs
                 if engine_prompt_image is not None:
                     tprompt["multi_modal_data"] = engine_prompt_image
                     # Provide multi_modal_uuids so that newer vLLM versions
@@ -736,6 +740,22 @@ def _apply_request_overrides(
         extra_body = getattr(request, "extra_body", {}) or {}
         height, width = self._resolve_height_width_from_extra_body(extra_body)
 
+        # Fall back to the diffusion stage's default h/w when the user didn't
+        # specify them, so the max_tokens/target-size computation below works for
+        # the bare-curl request shape (no extra_body). Implicit gate: only fires
+        # when a stage declares both height and width in its sampling params (e.g.
+        # GLM-Image stage-1 yaml); LLM-only / audio pipelines have neither and are skipped.
+        if height is None or width is None:
+            for dp in self.engine_client.default_sampling_params_list or []:
+                stage_h = getattr(dp, "height", None)
+                stage_w = getattr(dp, "width", None)
+                if stage_h is not None and stage_w is not None:
+                    if height is None:
+                        height = stage_h
+                    if width is None:
+                        width = stage_w
+                    break
+
         # Best-effort mode detection from user messages.
         # i2i requests include at least one reference image in message content.
         _, reference_images = self._extract_diffusion_prompt_and_images_from_messages(request.messages)
@@ -746,26 +766,16 @@ def _apply_request_overrides(
             try:
                 from vllm_omni.model_executor.stage_input_processors.glm_image import compute_max_tokens
 
-                max_tokens = getattr(explicit_fields, "max_tokens", None)
-                if max_tokens is None:
-                    max_tokens = compute_max_tokens(int(height), int(width), is_i2i=is_img2img)
-                params.max_tokens = max_tokens
+                if "max_tokens" not in explicit_fields:
+                    params.max_tokens = compute_max_tokens(int(height), int(width), is_i2i=is_img2img)
                 # Keep target size in stage-0 sampling params so runner/model can
                 # build deterministic M-RoPE grids for t2i (no MM features).
                 extra_args = dict(getattr(params, "extra_args", {}) or {})
                 extra_args["target_h"] = int(height)
                 extra_args["target_w"] = int(width)
                 params.extra_args = extra_args
             except (ImportError, ValueError, TypeError) as e:
-                logger.warning(f"Failed to compute max_tokens: {e}, using default if available")
-        else:
-            logger.info(
-                "[SamplingParams] Skip dynamic max_tokens (height=%s, width=%s, mode=%s, ref_images=%s)",
-                height,
-                width,
-                "i2i" if is_img2img else "t2i",
-                ref_image_count,
-            )
+                logger.warning("Failed to compute max_tokens: %s", e)
 
         return params
 

diff --git a/vllm_omni/inputs/preprocess.py b/vllm_omni/inputs/preprocess.py
@@ -60,7 +60,11 @@ def _process_text(
             additional_information = parsed_content.get("additional_information")
             if additional_information is not None:
                 inputs["additional_information"] = additional_information
-        elif mm_processor_kwargs:
+        elif "mm_processor_kwargs" in parsed_content:
+            # Presence — not truthiness. An explicitly-set empty dict still
+            # signals "route through the multimodal processor" (needed for
+            # AR-based image-gen where the HF processor supplies its own
+            # defaults and scaffold).
             inputs = self._process_multimodal(
                 prompt_text,
                 {},