diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py index fb96b397eb7..d92f296a8d7 100644 --- a/vllm_omni/entrypoints/openai/serving_chat.py +++ b/vllm_omni/entrypoints/openai/serving_chat.py @@ -351,14 +351,18 @@ async def create_chat_completion( tprompt["modalities"] = ["image"] if negative_prompt is not None: tprompt["negative_prompt"] = negative_prompt - # GLM-Image's _call_hf_processor expects target_h/target_w in mm_processor_kwargs + # Always attach mm_processor_kwargs (possibly empty) so + # OmniInputPreprocessor._process_text routes through the + # multimodal processor path. Without it, the preprocessor + # falls back to plain _tokenize_prompt and AR-based image-gen + # models like GLM-Image never see their image-generation + # scaffold. mm_processor_kwargs: dict[str, Any] = {} if height is not None: mm_processor_kwargs["target_h"] = height if width is not None: mm_processor_kwargs["target_w"] = width - if mm_processor_kwargs: - tprompt["mm_processor_kwargs"] = mm_processor_kwargs + tprompt["mm_processor_kwargs"] = mm_processor_kwargs if engine_prompt_image is not None: tprompt["multi_modal_data"] = engine_prompt_image # Provide multi_modal_uuids so that newer vLLM versions @@ -736,6 +740,22 @@ def _apply_request_overrides( extra_body = getattr(request, "extra_body", {}) or {} height, width = self._resolve_height_width_from_extra_body(extra_body) + # Fall back to the diffusion stage's default h/w when the user didn't + # specify them, so the compute works for the bare-curl request shape + # (no extra_body). Implicit gate: only fires when a stage in the + # pipeline declares height/width in its sampling params (e.g. GLM-Image + # stage-1 yaml); LLM-only / audio pipelines have neither and are skipped. + if height is None or width is None: + for dp in self.engine_client.default_sampling_params_list or []: + stage_h = getattr(dp, "height", None) + stage_w = getattr(dp, "width", None) + if stage_h is not None and stage_w is not None: + if height is None: + height = stage_h + if width is None: + width = stage_w + break + # Best-effort mode detection from user messages. # i2i requests include at least one reference image in message content. _, reference_images = self._extract_diffusion_prompt_and_images_from_messages(request.messages) @@ -746,10 +766,8 @@ def _apply_request_overrides( try: from vllm_omni.model_executor.stage_input_processors.glm_image import compute_max_tokens - max_tokens = getattr(explicit_fields, "max_tokens", None) - if max_tokens is None: - max_tokens = compute_max_tokens(int(height), int(width), is_i2i=is_img2img) - params.max_tokens = max_tokens + if "max_tokens" not in explicit_fields: + params.max_tokens = compute_max_tokens(int(height), int(width), is_i2i=is_img2img) # Keep target size in stage-0 sampling params so runner/model can # build deterministic M-RoPE grids for t2i (no MM features). extra_args = dict(getattr(params, "extra_args", {}) or {}) @@ -757,15 +775,7 @@ def _apply_request_overrides( extra_args["target_w"] = int(width) params.extra_args = extra_args except (ImportError, ValueError, TypeError) as e: - logger.warning(f"Failed to compute max_tokens: {e}, using default if available") - else: - logger.info( - "[SamplingParams] Skip dynamic max_tokens (height=%s, width=%s, mode=%s, ref_images=%s)", - height, - width, - "i2i" if is_img2img else "t2i", - ref_image_count, - ) + logger.warning("Failed to compute max_tokens: %s", e) return params diff --git a/vllm_omni/inputs/preprocess.py b/vllm_omni/inputs/preprocess.py index cca6ce56870..7282d7a520d 100644 --- a/vllm_omni/inputs/preprocess.py +++ b/vllm_omni/inputs/preprocess.py @@ -60,7 +60,11 @@ def _process_text( additional_information = parsed_content.get("additional_information") if additional_information is not None: inputs["additional_information"] = additional_information - elif mm_processor_kwargs: + elif "mm_processor_kwargs" in parsed_content: + # Presence — not truthiness. An explicitly-set empty dict still + # signals "route through the multimodal processor" (needed for + # AR-based image-gen where the HF processor supplies its own + # defaults and scaffold). inputs = self._process_multimodal( prompt_text, {},