From fc54ff98c17b3905094d90d9dba61278d7886062 Mon Sep 17 00:00:00 2001 From: JaredforReal Date: Thu, 15 Jan 2026 10:57:35 +0800 Subject: [PATCH 01/53] refine glm-image implementation Signed-off-by: JaredforReal --- .../models/glm_image/glm_image_transformer.py | 21 ++++++------------- .../models/glm_image/pipeline_glm_image.py | 5 ++--- 2 files changed, 8 insertions(+), 18 deletions(-) diff --git a/vllm_omni/diffusion/models/glm_image/glm_image_transformer.py b/vllm_omni/diffusion/models/glm_image/glm_image_transformer.py index 09f7b17e133..d783a11b319 100644 --- a/vllm_omni/diffusion/models/glm_image/glm_image_transformer.py +++ b/vllm_omni/diffusion/models/glm_image/glm_image_transformer.py @@ -8,8 +8,8 @@ import torch import torch.nn as nn from diffusers.models.attention import FeedForward -from diffusers.models.transformers.transformer_glm_image import GlmImageCombinedTimestepSizeEmbeddings from diffusers.models.modeling_outputs import Transformer2DModelOutput +from diffusers.models.transformers.transformer_glm_image import GlmImageCombinedTimestepSizeEmbeddings from vllm.logger import init_logger from vllm.model_executor.layers.linear import QKVParallelLinear from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -17,7 +17,6 @@ from vllm_omni.diffusion.attention.layer import Attention from vllm_omni.diffusion.cache.base import CachedTransformer from vllm_omni.diffusion.data import OmniDiffusionConfig -from vllm_omni.diffusion.layers.rope import RotaryEmbedding logger = init_logger(__name__) @@ -354,8 +353,7 @@ def __init__( nn.Dropout(0.0), ) - # RoPE and attention - self.rope = RotaryEmbedding(is_neox_style=False) + # Attention self.attn = Attention( num_heads=num_heads, head_size=head_dim, @@ -368,7 +366,6 @@ def forward( hidden_states: torch.Tensor, encoder_hidden_states: torch.Tensor, image_rotary_emb: tuple[torch.Tensor, torch.Tensor] | None = None, - attention_mask: torch.Tensor | None = None, kv_cache: GlmImageLayerKVCache | 
None = None, kv_cache_mode: KVCacheMode | None = None, ) -> tuple[torch.Tensor, torch.Tensor]: @@ -379,7 +376,6 @@ def forward( hidden_states: Image hidden states [B, img_seq_len, D] encoder_hidden_states: Text hidden states [B, text_seq_len, D] image_rotary_emb: Tuple of (cos, sin) for RoPE - attention_mask: Optional attention mask for text tokens kv_cache: Optional layer KV cache for image editing kv_cache_mode: Cache mode (WRITE, READ, SKIP) @@ -407,16 +403,13 @@ def forward( # Apply RoPE only to image tokens (not text tokens) if image_rotary_emb is not None: - cos, sin = image_rotary_emb - cos = cos.to(query.dtype) - sin = sin.to(query.dtype) # Only apply RoPE to image part (after text_seq_length) query_img = query[:, text_seq_length:, :, :] key_img = key[:, text_seq_length:, :, :] from diffusers.models.embeddings import apply_rotary_emb - query_img = apply_rotary_emb(query_img,image_rotary_emb, sequence_dim=1, use_real_unbind_dim=-2) - # key_img = self.rope(key_img, cos, sin) - key_img = apply_rotary_emb(key_img,image_rotary_emb, sequence_dim=1, use_real_unbind_dim=-2) + + query_img = apply_rotary_emb(query_img, image_rotary_emb, sequence_dim=1, use_real_unbind_dim=-2) + key_img = apply_rotary_emb(key_img, image_rotary_emb, sequence_dim=1, use_real_unbind_dim=-2) query = torch.cat([query[:, :text_seq_length, :, :], query_img], dim=1) key = torch.cat([key[:, :text_seq_length, :, :], key_img], dim=1) @@ -555,7 +548,7 @@ def __init__( od_config: OmniDiffusionConfig, ): super().__init__() - + patch_size = od_config.tf_model_config.patch_size in_channels = od_config.tf_model_config.in_channels out_channels = od_config.tf_model_config.out_channels @@ -565,8 +558,6 @@ def __init__( condition_dim = od_config.tf_model_config.condition_dim prior_vq_quantizer_codebook_size = od_config.tf_model_config.prior_vq_quantizer_codebook_size text_embed_dim = od_config.tf_model_config.text_embed_dim - - # Get num_layers from config if available model_config = 
od_config.tf_model_config diff --git a/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py b/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py index f582c3b9b69..74a1ecac334 100644 --- a/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py +++ b/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py @@ -73,7 +73,7 @@ def get_glm_image_post_process_func(od_config: OmniDiffusionConfig): image_processor = VaeImageProcessor(vae_scale_factor=vae_scale_factor) def post_process_func(images: PIL.Image.Image): - return images + return image_processor.postprocess(images, output_type="pil") return post_process_func @@ -951,8 +951,7 @@ def forward(self, req: OmniDiffusionRequest) -> DiffusionOutput: latents = latents * latents_std + latents_mean image = self.vae.decode(latents, return_dict=False, generator=generator)[0] - # 9. Post-process - image = self.image_processor.postprocess(image, output_type="pil")[0] + # 9. Leave post-process to vllm-omni pipeline return DiffusionOutput(output=image) From 91bff0639366816426208949584edb102a1d4d86 Mon Sep 17 00:00:00 2001 From: JaredforReal Date: Thu, 15 Jan 2026 12:59:54 +0800 Subject: [PATCH 02/53] implement GLM Image vllm AR Signed-off-by: JaredforReal --- .../models/glm_image/glm_image_ar.py | 1775 +++++++++++++++++ 1 file changed, 1775 insertions(+) create mode 100644 vllm_omni/diffusion/models/glm_image/glm_image_ar.py diff --git a/vllm_omni/diffusion/models/glm_image/glm_image_ar.py b/vllm_omni/diffusion/models/glm_image/glm_image_ar.py new file mode 100644 index 00000000000..98fd1dd19db --- /dev/null +++ b/vllm_omni/diffusion/models/glm_image/glm_image_ar.py @@ -0,0 +1,1775 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# Adapted from +# https://github.com/huggingface/transformers/blob/main/src/transformers/models/glm_image/modeling_glm_image.py +# Copyright 2025 The vLLM team. +# Copyright 2025 The ZhipuAI Team. 
+# Copyright 2025 The HuggingFace Inc. team. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Inference-only GLM-Image model compatible with HuggingFace weights.""" + +from collections.abc import Iterable, Mapping, Sequence +from typing import Annotated, Literal + +import torch +import torch.nn as nn +import torch.nn.functional as F +from transformers import BatchFeature +from transformers.models.glm_image.configuration_glm_image import ( + GlmImageConfig, + GlmImageTextConfig, + GlmImageVisionConfig, + GlmImageVQVAEConfig, +) +from vllm.attention.layer import Attention +from vllm.config import CacheConfig, MultiModalConfig, VllmConfig +from vllm.config.multimodal import BaseDummyOptions +from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size +from vllm.distributed import utils as dist_utils +from vllm.logger import init_logger +from vllm.model_executor.layers.attention.mm_encoder_attention import ( + MMEncoderAttention, +) +from vllm.model_executor.layers.conv import Conv2dLayer +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import ( + ColumnParallelLinear, + MergedColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear, +) +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.rotary_embedding import get_rope +from 
vllm.model_executor.layers.vocab_parallel_embedding import ( + ParallelLMHead, + VocabParallelEmbedding, +) +from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.models.interfaces import ( + SupportsMRoPE, + SupportsMultiModal, + SupportsPP, +) +from vllm.model_executor.models.utils import ( + WeightsMapper, + make_empty_intermediate_tensors_factory, + make_layers, +) +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import ( + MultiModalDataDict, + MultiModalFeatureSpec, + MultiModalFieldConfig, + MultiModalKwargsItems, +) +from vllm.multimodal.parse import MultiModalDataItems +from vllm.multimodal.processing import ( + BaseMultiModalProcessor, + BaseProcessingInfo, + PromptReplacement, + PromptUpdate, + PromptUpdateDetails, +) +from vllm.multimodal.profiling import BaseDummyInputsBuilder +from vllm.sequence import IntermediateTensors +from vllm.utils.tensor_schema import TensorSchema, TensorShape +from vllm.v1.attention.backends.registry import AttentionBackendEnum + +from .vision import get_vit_attn_backend + +logger = init_logger(__name__) + + +# === Multimodal Processing === + + +class GlmImagePixelInputs(TensorSchema): + """ + Schema for GLM-Image pixel inputs. + + Dimensions: + - np: Number of patches (total across all images) + - cpp: channels * patch_size * patch_size + - ni: Number of images + - g: Grid dimensions (3 for temporal, height, width) + """ + + type: Literal["pixel_values"] = "pixel_values" + + pixel_values: Annotated[torch.Tensor, TensorShape("np", "cpp")] + image_grid_thw: Annotated[torch.Tensor, TensorShape("ni", 3)] + + +class GlmImageProcessingInfo(BaseProcessingInfo): + """ + Processing information for GLM-Image model. 
def get_num_image_tokens(
    self,
    *,
    image_width: int,
    image_height: int,
) -> int:
    """
    Return how many patch tokens an image of the given size produces.

    The count is the number of whole ``patch_size`` tiles along the height
    multiplied by the number along the width, where ``patch_size`` is read
    from the HF vision config. Partial tiles at the right/bottom edge are
    dropped by the floor division.
    """
    patch = self.get_hf_config().vision_config.patch_size
    rows, cols = image_height // patch, image_width // patch
    return rows * cols
def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
    """
    Build the dummy prompt: one image placeholder token per requested image.

    The placeholder string is looked up from the tokenizer using the
    config's ``image_token_id`` (167855 when the config does not define
    it). If the lookup raises, or does not yield a non-empty string, the
    literal ``"<|image|>"`` is used instead.

    Args:
        mm_counts: Number of dummy items requested per modality. Only the
            ``"image"`` entry is read; a missing entry counts as 0.

    Returns:
        The placeholder token repeated ``mm_counts["image"]`` times.
    """
    num_images = mm_counts.get("image", 0)

    hf_config = self.info.get_hf_config()
    image_token_id = getattr(hf_config, "image_token_id", 167855)
    tokenizer = self.info.get_tokenizer()

    # Deliberately best-effort: tokenizer implementations differ in what
    # they raise for an id they do not know, and profiling must not fail
    # just because the placeholder string could not be resolved.
    try:
        image_token = tokenizer.convert_ids_to_tokens(image_token_id)
    except Exception:
        image_token = None

    # A tokenizer may also signal an unknown id by returning None instead
    # of raising; multiplying that by num_images would be a TypeError, so
    # fall back to the default placeholder in that case too.
    if not isinstance(image_token, str) or not image_token:
        image_token = "<|image|>"

    return image_token * num_images
+ """ + hf_config = self.info.get_hf_config() + vision_config = hf_config.vision_config + + # Default image size from config + image_size = getattr(vision_config, "image_size", 2048) + width = height = image_size + + num_images = mm_counts.get("image", 0) + + image_overrides = mm_options.get("image") if mm_options else None + + return { + "image": self._get_dummy_images( + width=width, + height=height, + num_images=num_images, + overrides=image_overrides, + ) + } + + +class GlmImageMultiModalProcessor(BaseMultiModalProcessor[GlmImageProcessingInfo]): + """ + Multimodal processor for GLM-Image. + + Handles: + - Image preprocessing and tokenization + - Prompt construction with image placeholders + - Grid dimension calculation for M-RoPE position encoding + """ + + def _call_hf_processor( + self, + prompt: str, + mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object], + tok_kwargs: Mapping[str, object], + ) -> BatchFeature: + """ + Call the HuggingFace processor. + + If no multimodal data is provided (text-to-image mode), + we only tokenize the text. + """ + if not mm_data or not mm_data.get("image"): + # Text-to-image mode: just tokenize the prompt + tokenizer = self.info.get_tokenizer() + prompt_ids = tokenizer.encode(prompt) + return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt") + + # Image-to-image mode: use full processor + return super()._call_hf_processor( + prompt=prompt, + mm_data=mm_data, + mm_kwargs=mm_kwargs, + tok_kwargs=tok_kwargs, + ) + + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + """ + Get the multimodal field configuration. 
class GlmImageVQVAEVectorQuantizer(nn.Module):
    """
    Inference-only vector quantizer for the GLM-Image VQ-VAE.

    Each spatial position of the input is mapped to the codebook entry with
    the highest cosine similarity. Both the input vectors and the codebook
    are L2-normalized before comparison, and the returned quantized tensor
    is built from the *normalized* codebook rows.

    For unit vectors ``||z - e||^2 = 2 - 2 * (z . e)``, so picking the
    entry with the largest dot product is the same as picking the entry
    with the smallest L2 distance; ``argmax`` over the similarity matrix is
    used directly.

    Training-only pieces (commitment loss, straight-through estimator) are
    intentionally absent.

    Args:
        config: Object exposing ``num_embeddings`` (codebook size) and
            ``embed_dim`` (dimension of each codebook vector).
    """

    def __init__(self, config: "GlmImageVQVAEConfig"):
        super().__init__()
        self.num_embeddings = config.num_embeddings
        self.embedding_dim = config.embed_dim

        self.embedding = nn.Embedding(self.num_embeddings, self.embedding_dim)

    def forward(self, hidden_state: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
        """
        Quantize the input hidden states.

        Args:
            hidden_state: Tensor of shape (batch, channels, height, width)
                where ``channels`` must equal ``embed_dim``.

        Returns:
            Tuple of:
                - hidden_state_quant: Quantized tensor, same shape as input
                - min_encoding_indices: Codebook indices of shape
                  (batch * height * width,)

        Raises:
            ValueError: If the channel dimension does not match the
                codebook embedding dimension.
        """
        batch_size, channels, height, width = hidden_state.shape
        if channels != self.embedding_dim:
            # Without this check the mismatch only shows up later as an
            # opaque reshape/view size error.
            raise ValueError(
                f"Expected {self.embedding_dim} channels to match the codebook "
                f"embedding dimension, got {channels}."
            )

        # (batch, channels, height, width) -> (batch * height * width, channels)
        hidden_state_flat = hidden_state.permute(0, 2, 3, 1).reshape(-1, self.embedding_dim)

        # L2-normalize both sides so the dot product is a cosine similarity.
        hidden_state_normalized = F.normalize(hidden_state_flat, p=2, dim=-1)
        embedding_normalized = F.normalize(self.embedding.weight, p=2, dim=-1)

        # (N, D) @ (D, K) -> (N, K); larger value = closer codebook entry.
        similarity = torch.matmul(hidden_state_normalized, embedding_normalized.t())

        # Nearest codebook entry is the one with the highest similarity.
        min_encoding_indices = torch.argmax(similarity, dim=1)

        # Look up the (normalized) codebook vectors for the chosen indices.
        hidden_state_quant = embedding_normalized[min_encoding_indices]

        # (N, D) -> (batch, height, width, D) -> (batch, D, height, width)
        hidden_state_quant = (
            hidden_state_quant.view(batch_size, height, width, self.embedding_dim).permute(0, 3, 1, 2).contiguous()
        )

        return hidden_state_quant, min_encoding_indices
+ + Unlike Chameleon's VQ-VAE which includes a full encoder, GLM-Image's VQ-VAE + only contains: + - quant_conv: Projects from latent_channels to embed_dim + - quantize: Vector quantizer + - post_quant_conv: Projects from embed_dim back to latent_channels + + The encoder functionality is handled by GlmImageVisionModel instead. + + This module is always in eval mode as the VQ-VAE is frozen during inference. + + Args: + config: GlmImageVQVAEConfig + """ + + def __init__(self, config: GlmImageVQVAEConfig): + super().__init__() + self.config = config + + # Vector quantizer + self.quantize = GlmImageVQVAEVectorQuantizer(config) + + # Convolutions for projecting to/from embedding space + # Using vLLM's optimized Conv2dLayer + self.quant_conv = Conv2dLayer( + in_channels=config.latent_channels, + out_channels=config.embed_dim, + kernel_size=1, + stride=1, + padding=0, + bias=True, + ) + self.post_quant_conv = Conv2dLayer( + in_channels=config.embed_dim, + out_channels=config.latent_channels, + kernel_size=1, + stride=1, + padding=0, + bias=True, + ) + + # VQ-VAE is always frozen in GLM-Image + self.eval() + + def encode(self, hidden_states: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + """ + Encode input features into quantized latent codes. + + Args: + hidden_states: Input tensor of shape (batch, latent_channels, height, width) + This is typically the output from GlmImageVisionModel reshaped + into spatial format. 
+ + Returns: + Tuple of: + - quant: Quantized tensor of shape (batch, embed_dim, height, width) + - indices: Codebook indices of shape (batch * height * width,) + """ + # Project to embedding dimension + hidden_states = self.quant_conv(hidden_states) + + # Quantize + quant, indices = self.quantize(hidden_states) + + return quant, indices + + @property + def dtype(self) -> torch.dtype: + """Get the dtype of the model.""" + return self.quant_conv.weight.dtype + + @property + def device(self) -> torch.device: + """Get the device of the model.""" + return self.quant_conv.weight.device + + +# === Vision Model Components === + + +class GlmImageVisionMLP(nn.Module): + """ + MLP module for GLM-Image vision model. + + Uses GELU activation with standard fc1 -> fc2 structure. + Key difference from Glm4vVisionMLP: uses GELU instead of SwiGLU. + """ + + def __init__( + self, + config: GlmImageVisionConfig, + quant_config: QuantizationConfig | None = None, + multimodal_config: MultiModalConfig | None = None, + prefix: str = "", + ) -> None: + super().__init__() + use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data" if multimodal_config else False + self.fc1 = ColumnParallelLinear( + config.hidden_size, + config.intermediate_size, + bias=True, + quant_config=quant_config, + prefix=f"{prefix}.fc1", + disable_tp=use_data_parallel, + ) + self.fc2 = RowParallelLinear( + config.intermediate_size, + config.hidden_size, + bias=True, + quant_config=quant_config, + prefix=f"{prefix}.fc2", + disable_tp=use_data_parallel, + ) + self.act_fn = nn.GELU() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x, _ = self.fc1(x) + x = self.act_fn(x) + x, _ = self.fc2(x) + return x + + +class GlmImageVisionAttention(nn.Module): + """ + Multi-headed attention for GLM-Image vision model. 
+ + Key differences from Glm4vVisionAttention: + - No RoPE - uses learned position embeddings instead + - Uses standard qkv projection (not separate q, k, v) + - attention_bias from config controls bias in linear layers + """ + + def __init__( + self, + config: GlmImageVisionConfig, + quant_config: QuantizationConfig | None = None, + multimodal_config: MultiModalConfig | None = None, + prefix: str = "", + ) -> None: + super().__init__() + use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data" if multimodal_config else False + self.tp_size = 1 if use_data_parallel else get_tensor_model_parallel_world_size() + + self.embed_dim = config.hidden_size + self.num_heads = config.num_heads + self.head_dim = self.embed_dim // self.num_heads + attention_bias = getattr(config, "attention_bias", True) + + self.num_heads_per_partition = dist_utils.divide(self.num_heads, self.tp_size) + + # QKV projection - uses bias based on config + self.qkv = QKVParallelLinear( + hidden_size=self.embed_dim, + head_size=self.head_dim, + total_num_heads=self.num_heads, + total_num_kv_heads=self.num_heads, # No GQA in vision model + bias=attention_bias, + quant_config=quant_config, + prefix=f"{prefix}.qkv", + disable_tp=use_data_parallel, + ) + self.proj = RowParallelLinear( + input_size=self.embed_dim, + output_size=self.embed_dim, + bias=attention_bias, + quant_config=quant_config, + prefix=f"{prefix}.proj", + disable_tp=use_data_parallel, + ) + + # MMEncoderAttention for efficient vision attention + self.attn = MMEncoderAttention( + num_heads=self.num_heads_per_partition, + head_size=self.head_dim, + scale=self.head_dim**-0.5, + multimodal_config=multimodal_config, + ) + + def forward( + self, + hidden_states: torch.Tensor, + cu_seqlens: torch.Tensor, + max_seqlen: int | None = None, + ) -> torch.Tensor: + # hidden_states: [seq_len, hidden_size] (no batch dim) + seq_len = hidden_states.shape[0] + + # QKV projection + qkv, _ = self.qkv(hidden_states) + q, k, v = qkv.chunk(3, dim=-1) 
+ + # Reshape for attention: [seq, hidden] -> [1, seq, heads, head_dim] + q = q.view(seq_len, self.num_heads_per_partition, self.head_dim).unsqueeze(0) + k = k.view(seq_len, self.num_heads_per_partition, self.head_dim).unsqueeze(0) + v = v.view(seq_len, self.num_heads_per_partition, self.head_dim).unsqueeze(0) + + # No RoPE in GLM-Image vision model - position info comes from embeddings + + # Apply attention + attn_output = self.attn( + query=q, + key=k, + value=v, + cu_seqlens=cu_seqlens, + max_seqlen=max_seqlen, + ) + + # Reshape back: [1, seq, heads, head_dim] -> [seq, hidden] + attn_output = attn_output.view(seq_len, -1) + + # Output projection + output, _ = self.proj(attn_output) + return output + + +class GlmImageVisionPatchEmbed(nn.Module): + """ + Patch embedding for GLM-Image vision model. + + Key difference from Glm4vVisionPatchEmbed: + - Uses 2D convolution (no temporal dimension) + - GLM-Image processes single images, not videos + """ + + def __init__(self, config: GlmImageVisionConfig) -> None: + super().__init__() + self.patch_size = config.patch_size + self.in_channels = config.in_channels + self.embed_dim = config.hidden_size + + # 2D convolution for patch embedding + self.proj = Conv2dLayer( + in_channels=self.in_channels, + out_channels=self.embed_dim, + kernel_size=self.patch_size, + stride=self.patch_size, + padding=0, + bias=True, + ) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + """ + Args: + hidden_states: Packed pixel values of shape + [total_patches, in_channels * patch_size * patch_size] + + Returns: + Patch embeddings of shape [total_patches, embed_dim] + """ + target_dtype = self.proj.weight.dtype + # Reshape from [N, C*P*P] to [N, C, P, P] + hidden_states = hidden_states.view(-1, self.in_channels, self.patch_size, self.patch_size) + # Conv2d and flatten: [N, C, P, P] -> [N, embed_dim, 1, 1] -> [N, embed_dim] + hidden_states = self.proj(hidden_states.to(dtype=target_dtype)).view(-1, self.embed_dim) + return 
hidden_states + + +class GlmImageVisionEmbeddings(nn.Module): + """ + Vision embeddings for GLM-Image. + + Uses learned 2D position embeddings with bilinear interpolation + for variable resolution support. + + Key difference from Glm4vVisionEmbeddings: + - Uses bilinear interpolation (not bicubic) for position embedding adaptation + """ + + def __init__(self, config: GlmImageVisionConfig) -> None: + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.image_size = config.image_size + self.patch_size = config.patch_size + + self.num_patches = (self.image_size // self.patch_size) ** 2 + self.num_positions = self.num_patches + self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim) + + # GLM-Image uses bilinear, Glm4v uses bicubic + self.interpolation_mode = "bilinear" + + def forward( + self, + embeddings: torch.Tensor, + lengths: list[int] | torch.Tensor, + image_shapes: torch.Tensor, + h_coords: torch.Tensor, + w_coords: torch.Tensor, + ) -> torch.Tensor: + """ + Add adapted position embeddings to patch embeddings. 
+ + Args: + embeddings: Patch embeddings [total_seq, embed_dim] + lengths: Sequence length for each image + image_shapes: [num_images, 3] with (t, h, w) for each image + h_coords: Height coordinates for each patch [total_seq] + w_coords: Width coordinates for each patch [total_seq] + + Returns: + Embeddings with position encoding added [total_seq, embed_dim] + """ + pos_embed_weight = self.position_embedding.weight + hidden_size = pos_embed_weight.shape[1] + total_seq = h_coords.shape[0] + device = pos_embed_weight.device + + # Handle empty sequence case + if total_seq == 0: + adapted_pos_embed = torch.empty(0, hidden_size, device=device, dtype=pos_embed_weight.dtype) + else: + # Convert to tensors if needed + if isinstance(lengths, list): + lengths = torch.tensor(lengths, device=device, dtype=torch.long) + if not isinstance(image_shapes, torch.Tensor): + image_shapes = torch.tensor(image_shapes, device=device, dtype=torch.long) + + # Prepare 2D position embedding for interpolation + orig_size_sq = pos_embed_weight.shape[0] + orig_size = int(orig_size_sq**0.5) + pos_embed_2d = ( + pos_embed_weight.view(orig_size, orig_size, hidden_size) + .permute(2, 0, 1) # [H, W, C] -> [C, H, W] + .unsqueeze(0) # [1, C, H, W] + .to(device=device, dtype=torch.float32) + ) + + # Calculate target dimensions for each patch + target_h = torch.cat([image_shapes[i, 1].repeat(lengths[i]) for i in range(len(lengths))]).to( + device=device, dtype=torch.float32 + ) + target_w = torch.cat([image_shapes[i, 2].repeat(lengths[i]) for i in range(len(lengths))]).to( + device=device, dtype=torch.float32 + ) + + # Normalize coordinates to [-1, 1] for grid_sample + h_coords = h_coords.to(device=device, dtype=torch.float32) + w_coords = w_coords.to(device=device, dtype=torch.float32) + norm_w = ((w_coords + 0.5) / target_w) * 2 - 1 + norm_h = ((h_coords + 0.5) / target_h) * 2 - 1 + + # Create sampling grid [1, total_seq, 1, 2] + grid = torch.stack((norm_w, norm_h), dim=-1).unsqueeze(0).unsqueeze(2) + 
+ # Bilinear interpolation (GLM-Image uses bilinear, not bicubic) + interpolated_embed = F.grid_sample( + pos_embed_2d, + grid, + mode=self.interpolation_mode, + align_corners=False, + padding_mode="border", + ) + + # Reshape: [1, C, total_seq, 1] -> [total_seq, C] + adapted_pos_embed = (interpolated_embed.squeeze(0).squeeze(-1).permute(1, 0)).to(pos_embed_weight.dtype) + + # Add position embedding to patch embeddings + embeddings = embeddings + adapted_pos_embed.to(embeddings.device) + return embeddings + + +class GlmImageVisionBlock(nn.Module): + """ + Transformer block for GLM-Image vision model. + + Key differences from Glm4vVisionBlock: + - Uses LayerNorm instead of RMSNorm + - No RoPE position embeddings (handled in GlmImageVisionEmbeddings) + - Uses GELU MLP instead of SwiGLU + """ + + def __init__( + self, + config: GlmImageVisionConfig, + quant_config: QuantizationConfig | None = None, + multimodal_config: MultiModalConfig | None = None, + prefix: str = "", + ) -> None: + super().__init__() + self.norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.attn = GlmImageVisionAttention( + config, + quant_config=quant_config, + multimodal_config=multimodal_config, + prefix=f"{prefix}.attn", + ) + self.mlp = GlmImageVisionMLP( + config, + quant_config=quant_config, + multimodal_config=multimodal_config, + prefix=f"{prefix}.mlp", + ) + + def forward( + self, + hidden_states: torch.Tensor, + cu_seqlens: torch.Tensor, + max_seqlen: int | None = None, + ) -> torch.Tensor: + # Pre-norm attention + residual = hidden_states + hidden_states = self.norm1(hidden_states) + hidden_states = self.attn( + hidden_states, + cu_seqlens=cu_seqlens, + max_seqlen=max_seqlen, + ) + hidden_states = residual + hidden_states + + # Pre-norm MLP + residual = hidden_states + hidden_states = self.norm2(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + 
hidden_states + + return hidden_states + + +class GlmImageVisionModel(nn.Module): + """ + Vision encoder for GLM-Image. + + Key differences from Glm4vVisionTransformer: + - No RoPE - uses learned position embeddings with bilinear interpolation + - No merger, downsample, or post-processing layers + - Uses LayerNorm instead of RMSNorm in blocks + - No temporal dimension (images only, no video) + """ + + def __init__( + self, + config: GlmImageVisionConfig, + quant_config: QuantizationConfig | None = None, + multimodal_config: MultiModalConfig | None = None, + prefix: str = "", + ) -> None: + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.num_heads = config.num_heads + self.head_dim = self.hidden_size // self.num_heads + self.patch_size = config.patch_size + self.spatial_merge_size = config.spatial_merge_size + + # Patch embedding + self.patch_embed = GlmImageVisionPatchEmbed(config) + + # Position embeddings + self.embeddings = GlmImageVisionEmbeddings(config) + + # Transformer blocks + self.blocks = nn.ModuleList( + [ + GlmImageVisionBlock( + config, + quant_config=quant_config, + multimodal_config=multimodal_config, + prefix=f"{prefix}.blocks.{i}", + ) + for i in range(config.depth) + ] + ) + + # Attention backend selection + self.attn_backend = get_vit_attn_backend( + head_size=self.head_dim, + dtype=torch.get_default_dtype(), + attn_backend_override=(multimodal_config.mm_encoder_attn_backend if multimodal_config else None), + ) + + @property + def dtype(self) -> torch.dtype: + return self.patch_embed.proj.weight.dtype + + @property + def device(self) -> torch.device: + return self.patch_embed.proj.weight.device + + def compute_position_ids(self, grid_thw: torch.Tensor) -> torch.Tensor: + """ + Compute position IDs for each patch based on grid dimensions. 
+ + Args: + grid_thw: [num_images, 3] with (t, h, w) for each image + + Returns: + Position IDs [total_patches, 2] with (h_pos, w_pos) for each patch + """ + pos_ids = [] + for t, h, w in grid_thw: + # Create h and w position grids + hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w) + wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1) + + # Reshape for spatial merge + hpos_ids = ( + hpos_ids.reshape( + h // self.spatial_merge_size, + self.spatial_merge_size, + w // self.spatial_merge_size, + self.spatial_merge_size, + ) + .permute(0, 2, 1, 3) + .flatten() + ) + + wpos_ids = ( + wpos_ids.reshape( + h // self.spatial_merge_size, + self.spatial_merge_size, + w // self.spatial_merge_size, + self.spatial_merge_size, + ) + .permute(0, 2, 1, 3) + .flatten() + ) + + # Stack and repeat for temporal dimension + pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1)) + + return torch.cat(pos_ids, dim=0) + + def compute_attn_mask_seqlen( + self, + cu_seqlens: torch.Tensor, + ) -> int | None: + """Compute max sequence length for flash attention.""" + if ( + self.attn_backend == AttentionBackendEnum.FLASH_ATTN + or self.attn_backend == AttentionBackendEnum.ROCM_AITER_FA + ): + return (cu_seqlens[1:] - cu_seqlens[:-1]).max().item() + return None + + def forward( + self, + pixel_values: torch.Tensor, + grid_thw: torch.Tensor, + ) -> torch.Tensor: + """ + Forward pass through vision encoder. 
class GlmImageTextMLP(nn.Module):
    """
    Gated feed-forward block of the GLM-Image text model.

    A single fused projection produces the gate and up halves, ``SiluAndMul``
    combines them, and a row-parallel projection maps the result back to
    ``hidden_size``.
    """

    def __init__(
        self,
        hidden_size: int,
        intermediate_size: int,
        hidden_act: str,
        quant_config: QuantizationConfig | None = None,
        bias: bool = False,
        prefix: str = "",
    ) -> None:
        super().__init__()
        # Both projections share the bias flag and quantization settings.
        shared = {"bias": bias, "quant_config": quant_config}

        self.gate_up_proj = MergedColumnParallelLinear(
            input_size=hidden_size,
            output_sizes=[intermediate_size, intermediate_size],
            prefix=f"{prefix}.gate_up_proj",
            **shared,
        )
        self.down_proj = RowParallelLinear(
            input_size=intermediate_size,
            output_size=hidden_size,
            prefix=f"{prefix}.down_proj",
            **shared,
        )

        if hidden_act != "silu":
            raise ValueError(f"Unsupported activation: {hidden_act}. Only silu is supported for GLM-Image.")

        # Imported locally; the original author notes this avoids a circular import.
        from vllm.model_executor.layers.activation import SiluAndMul

        self.act_fn = SiluAndMul()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the fused gate/up projection, the gated activation, then the down projection."""
        # The parallel linear layers return a pair; only the first item is used.
        fused, _ = self.gate_up_proj(x)
        projected, _ = self.down_proj(self.act_fn(fused))
        return projected
+ """ + + def __init__( + self, + config: GlmImageTextConfig, + hidden_size: int, + num_heads: int, + num_kv_heads: int, + max_position_embeddings: int = 32768, + quant_config: QuantizationConfig | None = None, + bias: bool = True, + cache_config: CacheConfig | None = None, + prefix: str = "", + ) -> None: + super().__init__() + self.hidden_size = hidden_size + tp_size = get_tensor_model_parallel_world_size() + self.total_num_heads = num_heads + assert self.total_num_heads % tp_size == 0 + self.num_heads = self.total_num_heads // tp_size + self.total_num_kv_heads = num_kv_heads + if self.total_num_kv_heads >= tp_size: + assert self.total_num_kv_heads % tp_size == 0 + else: + assert tp_size % self.total_num_kv_heads == 0 + self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) + self.head_dim = hidden_size // self.total_num_heads + self.q_size = self.num_heads * self.head_dim + self.kv_size = self.num_kv_heads * self.head_dim + self.scaling = self.head_dim**-0.5 + self.max_position_embeddings = max_position_embeddings + + self.qkv_proj = QKVParallelLinear( + hidden_size=hidden_size, + head_size=self.head_dim, + total_num_heads=self.total_num_heads, + total_num_kv_heads=self.total_num_kv_heads, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.qkv_proj", + ) + self.o_proj = RowParallelLinear( + input_size=self.total_num_heads * self.head_dim, + output_size=hidden_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.o_proj", + ) + + # M-RoPE for 3D position encoding + rope_parameters = getattr(config, "rope_parameters", None) + self.rotary_emb = get_rope( + self.head_dim, + max_position=max_position_embeddings, + rope_parameters=rope_parameters, + ) + + self.attn = Attention( + self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + cache_config=cache_config, + quant_config=quant_config, + prefix=f"{prefix}.attn", + ) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + ) 
-> torch.Tensor: + qkv, _ = self.qkv_proj(hidden_states) + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + q, k = self.rotary_emb(positions, q, k) + attn_output = self.attn(q, k, v) + output, _ = self.o_proj(attn_output) + return output + + +class GlmImageTextDecoderLayer(nn.Module): + """ + Decoder layer for GLM-Image text model. + + Key difference from standard LLaMA-style decoder: + - Uses 4 RMSNorm layers instead of 2: + - input_layernorm: before self-attention + - post_self_attn_layernorm: after self-attention, before residual add + - post_attention_layernorm: before MLP + - post_mlp_layernorm: after MLP, before residual add + """ + + def __init__( + self, + config: GlmImageTextConfig, + cache_config: CacheConfig | None = None, + quant_config: QuantizationConfig | None = None, + prefix: str = "", + ) -> None: + super().__init__() + self.hidden_size = config.hidden_size + max_position_embeddings = getattr(config, "max_position_embeddings", 32768) + attention_bias = getattr(config, "attention_bias", True) + + self.self_attn = GlmImageTextAttention( + config=config, + hidden_size=self.hidden_size, + num_heads=config.num_attention_heads, + num_kv_heads=getattr(config, "num_key_value_heads", config.num_attention_heads), + max_position_embeddings=max_position_embeddings, + quant_config=quant_config, + bias=attention_bias, + cache_config=cache_config, + prefix=f"{prefix}.self_attn", + ) + self.mlp = GlmImageTextMLP( + hidden_size=self.hidden_size, + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act, + quant_config=quant_config, + bias=False, + prefix=f"{prefix}.mlp", + ) + + # GLM-Image uses 4 RMSNorm layers per decoder layer + self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_self_attn_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_mlp_layernorm = 
RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + residual: torch.Tensor | None, + ) -> tuple[torch.Tensor, torch.Tensor]: + # Save residual for first add + if residual is None: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + else: + hidden_states, residual = self.input_layernorm(hidden_states, residual) + + # Self Attention + hidden_states = self.self_attn(positions=positions, hidden_states=hidden_states) + + # Post self-attention norm and residual add + hidden_states = self.post_self_attn_layernorm(hidden_states) + hidden_states = residual + hidden_states + + # MLP + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = self.post_mlp_layernorm(hidden_states) + hidden_states = residual + hidden_states + + # Return hidden_states and None for residual (already added) + return hidden_states, None + + +class GlmImageTextModel(nn.Module): + """ + Text model (language backbone) for GLM-Image. + + This is the decoder-only transformer that generates discrete image tokens. + Uses M-RoPE (3D position encoding) for multimodal position awareness. 
def forward(
    self,
    input_ids: torch.Tensor | None,
    positions: torch.Tensor,
    intermediate_tensors: IntermediateTensors | None = None,
    inputs_embeds: torch.Tensor | None = None,
) -> torch.Tensor | IntermediateTensors:
    """
    Run this pipeline rank's slice of the decoder stack.

    Args:
        input_ids: Token IDs; used on the first rank when ``inputs_embeds``
            is not given.
        positions: Position IDs forwarded to every decoder layer.
        intermediate_tensors: Output of the previous pipeline rank; required
            on every rank except the first.
        inputs_embeds: Pre-computed embeddings; take precedence over
            ``input_ids`` on the first rank.

    Returns:
        Final-normed hidden states on the last rank, otherwise an
        ``IntermediateTensors`` with ``hidden_states`` and ``residual``.
    """
    pp_group = get_pp_group()

    if pp_group.is_first_rank:
        hidden_states = inputs_embeds if inputs_embeds is not None else self.embed_tokens(input_ids)
        residual = None
    else:
        assert intermediate_tensors is not None
        hidden_states = intermediate_tensors["hidden_states"]
        residual = intermediate_tensors["residual"]

    for layer in self.layers[self.start_layer : self.end_layer]:
        hidden_states, residual = layer(positions, hidden_states, residual)

    if not pp_group.is_last_rank:
        # The decoder layers of this model fold the skip connection into
        # hidden_states and hand back residual=None. The "residual" entry
        # advertised by make_empty_intermediate_tensors is a tensor, so ship
        # an explicit zero tensor instead of None. The next rank adds it to
        # hidden_states inside its first input_layernorm call, which leaves
        # the values unchanged.
        # NOTE(review): assumes the fused add-norm computes
        # hidden_states + residual before normalising - confirm against RMSNorm.
        if residual is None:
            residual = torch.zeros_like(hidden_states)
        return IntermediateTensors({"hidden_states": hidden_states, "residual": residual})

    return self.norm(hidden_states)
def forward(
    self,
    input_ids: torch.Tensor | None,
    positions: torch.Tensor,
    intermediate_tensors: IntermediateTensors | None = None,
    inputs_embeds: torch.Tensor | None = None,
    pixel_values: torch.Tensor | None = None,
    image_grid_thw: torch.Tensor | None = None,
) -> torch.Tensor | IntermediateTensors:
    """
    Forward pass through the GLM-Image model.

    For image-to-image generation the source images are encoded, tokenized
    with the VQ-VAE, and the resulting token IDs overwrite the image
    placeholder positions in ``input_ids`` before embedding lookup.

    Args:
        input_ids: Input token IDs; may be None when ``inputs_embeds`` or
            ``intermediate_tensors`` is supplied.
        positions: Position IDs, shape (3, seq_len) for M-RoPE
        intermediate_tensors: For pipeline parallelism; when given, every
            other input except ``positions`` is ignored.
        inputs_embeds: Pre-computed embeddings (optional). When given they are
            used as-is and the image path is skipped, because rewritten
            ``input_ids`` could no longer influence the result.
        pixel_values: Source image pixels (for image-to-image)
        image_grid_thw: Grid dimensions for source images

    Returns:
        Hidden states or intermediate tensors for PP

    Raises:
        ValueError: If the number of VQ-VAE tokens differs from the number of
            image placeholder positions in ``input_ids``.
    """
    # Non-first pipeline ranks only continue from the received activations.
    if intermediate_tensors is not None:
        return self.language_model(
            input_ids=None,
            positions=positions,
            intermediate_tensors=intermediate_tensors,
            inputs_embeds=None,
        )

    has_images = pixel_values is not None and image_grid_thw is not None
    # Only rewrite token IDs when they will actually be embedded below.
    if has_images and inputs_embeds is None and input_ids is not None:
        placeholder_mask = input_ids == self.image_token_id
        # Skip the vision encoder and VQ-VAE when there is nothing to fill in.
        if placeholder_mask.any():
            image_features = self.get_image_features(pixel_values, image_grid_thw)
            image_tokens = self.get_image_tokens(image_features, image_grid_thw)
            image_tokens = image_tokens.to(device=input_ids.device, dtype=input_ids.dtype).reshape(-1)

            num_slots = int(placeholder_mask.sum())
            if image_tokens.numel() != num_slots:
                raise ValueError(
                    f"Image token count ({image_tokens.numel()}) does not match "
                    f"the number of image placeholders ({num_slots})."
                )

            # Clone so the caller's tensor is left untouched.
            input_ids = input_ids.clone()
            input_ids[placeholder_mask] = image_tokens

    if inputs_embeds is None:
        inputs_embeds = self.get_input_embeddings()(input_ids)
        input_ids = None

    return self.language_model(
        input_ids=input_ids,
        positions=positions,
        intermediate_tensors=intermediate_tensors,
        inputs_embeds=inputs_embeds,
    )
+ + This is the main entry point for GLM-Image in vLLM. It wraps: + - GlmImageModel (Vision Encoder + VQ-VAE + Text Model) + - LM Head for token prediction + + Supports: + - Multimodal inputs (images for image-to-image generation) + - M-RoPE (3D position encoding) for multimodal generation + - Pipeline Parallelism + - Image-to-Image and Text-to-Image generation + """ + + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": ["gate_up_proj"], + } + + # Weight mapping from HuggingFace to vLLM format + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_prefix={ + "lm_head.": "lm_head.", + "model.language_model.": "model.language_model.", + "model.visual.": "model.visual.", + "model.vqmodel.": "model.vqmodel.", + } + ) + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: + super().__init__() + config: GlmImageConfig = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + + self.config = config + self.vllm_config = vllm_config + + # Main model (Vision + VQ-VAE + Text) + self.model = GlmImageModel( + vllm_config=vllm_config, + prefix=f"{prefix}.model" if prefix else "model", + ) + + # LM head for token prediction + # GLM-Image outputs to vision_vocab_size (16512) not full vocab + self.lm_head = ParallelLMHead( + config.text_config.vision_vocab_size, + config.text_config.hidden_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.lm_head" if prefix else "lm_head", + ) + + # Logits processor + self.logits_processor = LogitsProcessor( + config.text_config.vision_vocab_size, + soft_cap=None, + ) + + self.make_empty_intermediate_tensors = self.model.make_empty_intermediate_tensors + + def get_input_embeddings(self) -> VocabParallelEmbedding: + return self.model.get_input_embeddings() + + def get_image_features( + self, + pixel_values: torch.Tensor, + image_grid_thw: torch.Tensor, + ) -> torch.Tensor: + """Extract image features using vision encoder.""" + return 
def get_mrope_input_positions(
    self,
    input_tokens: list[int],
    mm_features: list[MultiModalFeatureSpec],
) -> tuple[torch.Tensor, int]:
    """
    Compute M-RoPE position IDs for GLM-Image generation.

    Every token outside an image region gets the same running position on all
    three axes. An image-start marker is numbered like any other token; the
    ``h * w`` tokens after it share one temporal index while their height and
    width indices follow the row and column inside the grid, offset by the
    running position. After an image the running position advances by
    ``max(h, w)``.

    Args:
        input_tokens: List of input token IDs
        mm_features: Multimodal feature specifications

    Returns:
        Tuple of (position_ids [3, seq_len], mrope_position_delta)
    """
    gathered = MultiModalFeatureSpec.gather_kwargs(
        mm_features,
        {"image_grid_thw"},
    )
    grids = [grid.tolist() for grid in gathered.get("image_grid_thw", [])]

    image_start_id = self.config.image_start_token_id
    num_tokens = len(input_tokens)

    if not grids:
        # Pure text - the three axes are identical.
        llm_positions = torch.arange(num_tokens).view(1, -1).expand(3, -1)
    else:
        segments: list[torch.Tensor] = []
        next_pos = 0
        next_image = 0
        cursor = 0

        while cursor < num_tokens:
            opens_image = input_tokens[cursor] == image_start_id and next_image < len(grids)

            # Start markers, end markers and text all take one position.
            segments.append(torch.tensor([[next_pos], [next_pos], [next_pos]]))
            next_pos += 1
            cursor += 1

            if not opens_image:
                continue

            _, rows, cols = grids[next_image]
            num_image_tokens = rows * cols

            temporal = torch.full((num_image_tokens,), next_pos)
            # Row index repeats across a row; column index cycles per row.
            height = torch.arange(rows).repeat_interleave(cols) + next_pos
            width = torch.arange(cols).repeat(rows) + next_pos
            segments.append(torch.stack([temporal, height, width], dim=0))

            # Jump over the image tokens just covered.
            cursor += num_image_tokens
            next_pos += max(rows, cols)
            next_image += 1

        llm_positions = torch.cat(segments, dim=1)

    mrope_position_delta = (llm_positions.max() + 1 - num_tokens).item()
    return llm_positions, mrope_position_delta
def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
    """
    Load weights from HuggingFace checkpoint.

    Separate ``q_proj``/``k_proj``/``v_proj`` and ``gate_proj``/``up_proj``
    tensors are routed into the fused ``qkv_proj`` / ``gate_up_proj``
    parameters with their shard id. Every other tensor is copied into the
    parameter of the same name. Checkpoint names with no matching parameter
    are skipped.

    Args:
        weights: Iterable of ``(checkpoint_name, tensor)`` pairs.

    Returns:
        Names of the model parameters that actually received a weight.
    """
    stacked_params_mapping = [
        # (param_name, shard_name, shard_id)
        ("qkv_proj", "q_proj", "q"),
        ("qkv_proj", "k_proj", "k"),
        ("qkv_proj", "v_proj", "v"),
        ("gate_up_proj", "gate_proj", 0),
        ("gate_up_proj", "up_proj", 1),
    ]

    params_dict = dict(self.named_parameters(remove_duplicate=False))
    loaded_params: set[str] = set()

    for name, loaded_weight in weights:
        # Compare whole dotted components, not substrings: "up_proj" is a
        # substring of "gate_up_proj", so substring matching would rewrite an
        # already-fused checkpoint name into "gate_gate_up_proj" and drop it.
        components = name.split(".")

        target_name = name
        shard_id = None
        is_stacked = False
        for param_name, weight_name, candidate_shard in stacked_params_mapping:
            if weight_name not in components:
                continue
            fused_name = ".".join(param_name if part == weight_name else part for part in components)
            # Only redirect when the fused parameter exists; otherwise fall
            # back to loading under the checkpoint's own name.
            if fused_name in params_dict:
                target_name, shard_id, is_stacked = fused_name, candidate_shard, True
            break

        if target_name not in params_dict:
            continue

        param = params_dict[target_name]
        weight_loader = getattr(param, "weight_loader", None)
        if weight_loader is None:
            weight_loader = default_weight_loader

        if is_stacked:
            weight_loader(param, loaded_weight, shard_id)
        else:
            weight_loader(param, loaded_weight)

        # Record only names that exist and were actually written.
        loaded_params.add(target_name)

    return loaded_params
a/vllm_omni/diffusion/models/glm_image/glm_image_ar.py b/vllm_omni/model_executor/models/glm_image/glm_image_ar.py similarity index 100% rename from vllm_omni/diffusion/models/glm_image/glm_image_ar.py rename to vllm_omni/model_executor/models/glm_image/glm_image_ar.py diff --git a/vllm_omni/model_executor/stage_configs/glm_image.yaml b/vllm_omni/model_executor/stage_configs/glm_image.yaml new file mode 100644 index 00000000000..e1ad7ddae76 --- /dev/null +++ b/vllm_omni/model_executor/stage_configs/glm_image.yaml @@ -0,0 +1 @@ +# init placeholder diff --git a/vllm_omni/model_executor/stage_configs/glm_image_muilticonnector.yaml b/vllm_omni/model_executor/stage_configs/glm_image_muilticonnector.yaml new file mode 100644 index 00000000000..e1ad7ddae76 --- /dev/null +++ b/vllm_omni/model_executor/stage_configs/glm_image_muilticonnector.yaml @@ -0,0 +1 @@ +# init placeholder diff --git a/vllm_omni/model_executor/stage_input_processors/glm_image.py b/vllm_omni/model_executor/stage_input_processors/glm_image.py new file mode 100644 index 00000000000..e1ad7ddae76 --- /dev/null +++ b/vllm_omni/model_executor/stage_input_processors/glm_image.py @@ -0,0 +1 @@ +# init placeholder From b02a12a3d80f8ed44223d4935de64ee585dada59 Mon Sep 17 00:00:00 2001 From: JaredforReal Date: Thu, 15 Jan 2026 06:30:47 +0000 Subject: [PATCH 04/53] revert attention_mask in GlmImageAttention forward() Signed-off-by: JaredforReal --- vllm_omni/diffusion/models/glm_image/glm_image_transformer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm_omni/diffusion/models/glm_image/glm_image_transformer.py b/vllm_omni/diffusion/models/glm_image/glm_image_transformer.py index d783a11b319..f3f8f98ff17 100644 --- a/vllm_omni/diffusion/models/glm_image/glm_image_transformer.py +++ b/vllm_omni/diffusion/models/glm_image/glm_image_transformer.py @@ -366,6 +366,7 @@ def forward( hidden_states: torch.Tensor, encoder_hidden_states: torch.Tensor, image_rotary_emb: tuple[torch.Tensor, torch.Tensor] | None = 
None, + attention_mask: torch.Tensor | None = None, kv_cache: GlmImageLayerKVCache | None = None, kv_cache_mode: KVCacheMode | None = None, ) -> tuple[torch.Tensor, torch.Tensor]: From d5697906333726c70b619943ff970df086a4af30 Mon Sep 17 00:00:00 2001 From: JaredforReal Date: Thu, 15 Jan 2026 06:40:18 +0000 Subject: [PATCH 05/53] init and registry Signed-off-by: JaredforReal --- vllm_omni/model_executor/models/glm_image/__init__.py | 3 +++ vllm_omni/model_executor/models/registry.py | 5 +++++ 2 files changed, 8 insertions(+) create mode 100644 vllm_omni/model_executor/models/glm_image/__init__.py diff --git a/vllm_omni/model_executor/models/glm_image/__init__.py b/vllm_omni/model_executor/models/glm_image/__init__.py new file mode 100644 index 00000000000..d37044c09f1 --- /dev/null +++ b/vllm_omni/model_executor/models/glm_image/__init__.py @@ -0,0 +1,3 @@ +from .glm_image_ar import GlmImageForConditionalGeneration + +__all__ = ["GlmImageForConditionalGeneration"] diff --git a/vllm_omni/model_executor/models/registry.py b/vllm_omni/model_executor/models/registry.py index 56bceae41ab..72a371cec4f 100644 --- a/vllm_omni/model_executor/models/registry.py +++ b/vllm_omni/model_executor/models/registry.py @@ -48,6 +48,11 @@ "qwen3_omni_code2wav", "Qwen3OmniMoeCode2Wav", ), + "GlmImageForConditionalGeneration": ( + "glm_image", + "glm_image_ar", + "GlmImageForConditionalGeneration", + ), } _VLLM_OMNI_MODELS = { From 234e49ff1bc106becafef5cd5cd4348f097fc7de Mon Sep 17 00:00:00 2001 From: JaredforReal Date: Thu, 15 Jan 2026 15:20:21 +0800 Subject: [PATCH 06/53] implement stage config and stage input processor Signed-off-by: JaredforReal --- .../models/glm_image/pipeline_glm_image.py | 34 +++- .../stage_configs/glm_image.yaml | 71 ++++++- .../glm_image_muilticonnector.yaml | 87 +++++++- .../stage_input_processors/glm_image.py | 189 +++++++++++++++++- 4 files changed, 370 insertions(+), 11 deletions(-) diff --git a/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py 
b/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py index 74a1ecac334..20c06d30b88 100644 --- a/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py +++ b/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py @@ -855,14 +855,32 @@ def forward(self, req: OmniDiffusionRequest) -> DiffusionOutput: if req.seed is not None: generator = torch.Generator(device=self.device).manual_seed(req.seed) - # 1. Generate prior tokens with AR model - logger.info("Generating prior tokens with AR model...") - prior_token_id, prior_token_image_ids = self.generate_prior_tokens( - prompt=prompt, - image=condition_images, - height=height, - width=width, - ) + # 1. Get prior tokens - either from external source (multistage) or generate internally + # Check if prior_token_ids are provided externally (from AR stage in multistage mode) + external_prior_tokens = req.extra.get("prior_token_ids") if req.extra else None + external_prior_image_ids = req.extra.get("prior_token_image_ids") if req.extra else None + + if external_prior_tokens is not None: + # Multistage mode: use externally provided prior tokens from vLLM AR stage + logger.info("Using externally provided prior tokens from AR stage...") + prior_token_id = external_prior_tokens + if isinstance(prior_token_id, list): + prior_token_id = torch.tensor(prior_token_id, dtype=torch.long, device=self.device) + elif isinstance(prior_token_id, torch.Tensor): + prior_token_id = prior_token_id.to(device=self.device, dtype=torch.long) + # Ensure shape is [1, num_tokens] for batch processing + if prior_token_id.dim() == 1: + prior_token_id = prior_token_id.unsqueeze(0) + prior_token_image_ids = external_prior_image_ids + else: + # Single-stage mode: generate prior tokens with internal AR model + logger.info("Generating prior tokens with AR model...") + prior_token_id, prior_token_image_ids = self.generate_prior_tokens( + prompt=prompt, + image=condition_images, + height=height, + width=width, + ) # 2. 
Encode prompt for glyph embeddings logger.info("Encoding prompt...") diff --git a/vllm_omni/model_executor/stage_configs/glm_image.yaml b/vllm_omni/model_executor/stage_configs/glm_image.yaml index e1ad7ddae76..21ee05f3a4f 100644 --- a/vllm_omni/model_executor/stage_configs/glm_image.yaml +++ b/vllm_omni/model_executor/stage_configs/glm_image.yaml @@ -1 +1,70 @@ -# init placeholder +# Stage config for running GLM-Image with 2-stage architecture +# Stage 0: AR Model (vLLM implementation) - generates prior_token_ids +# Stage 1: Diffusion (DiT + VAE) - denoising and image decoding + +# The following config is designed for H100-80G GPUs. +stage_args: + # Stage 0: AR Model (GlmImageForConditionalGeneration) + # This stage uses the vLLM-optimized AR model to generate prior tokens + # for conditioning the diffusion process. + - stage_id: 0 + stage_type: llm + runtime: + process: true + devices: "0" + max_batch_size: 1 + engine_args: + model_stage: ar + model_arch: GlmImageForConditionalGeneration + worker_cls: vllm_omni.worker.gpu_ar_worker.GPUARWorker + scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler + gpu_memory_utilization: 0.6 + enforce_eager: false + trust_remote_code: true + engine_output_type: token_ids # Output prior_token_ids for diffusion stage + enable_prefix_caching: false + max_num_batched_tokens: 32768 + hf_config_name: vision_language_encoder # Subfolder in model path + final_output: false # AR is not the final output + default_sampling_params: + temperature: 0.0 + top_p: 1.0 + top_k: -1 + max_tokens: 16384 # Support up to 2048x2048 images (64x64 tokens * 4 = 16384) + seed: 42 + detokenize: false + + # Stage 1: Diffusion (DiT + VAE) + # This stage receives prior_token_ids from AR and performs denoising + VAE decode + - stage_id: 1 + stage_type: diffusion + runtime: + process: true + devices: "1" # Can use different GPU, or same GPU if memory allows + max_batch_size: 1 + engine_args: + model_arch: GlmImagePipeline + # Diffusion-specific 
parameters + num_gpus: 1 + cfg_parallel_size: 1 # Set to 2 for CFG parallelism on 2 GPUs + engine_input_source: [0] # Input from AR stage + custom_process_input_func: vllm_omni.model_executor.stage_input_processors.glm_image.ar2diffusion + final_output: true + final_output_type: image + default_sampling_params: + num_inference_steps: 50 + guidance_scale: 1.5 + height: 1024 + width: 1024 + +# Top-level runtime config +runtime: + enabled: true + defaults: + window_size: -1 # Trigger downstream only after full upstream completion + max_inflight: 1 # Process serially within each stage + + edges: + - from: 0 # AR → Diffusion: trigger after AR completes + to: 1 + window_size: -1 diff --git a/vllm_omni/model_executor/stage_configs/glm_image_muilticonnector.yaml b/vllm_omni/model_executor/stage_configs/glm_image_muilticonnector.yaml index e1ad7ddae76..e5ee76e6a54 100644 --- a/vllm_omni/model_executor/stage_configs/glm_image_muilticonnector.yaml +++ b/vllm_omni/model_executor/stage_configs/glm_image_muilticonnector.yaml @@ -1 +1,86 @@ -# init placeholder +# Stage config for running GLM-Image with 2-stage architecture (MultiConnector version) +# Stage 0: AR Model (vLLM implementation) - generates prior_token_ids +# Stage 1: Diffusion (DiT + VAE) - denoising and image decoding +# +# This config uses OmniConnectors for inter-stage communication, +# enabling efficient tensor transfer between stages on different processes/nodes. + +# The following config is designed for multi-GPU setups (e.g., 2x H100-80G). +stage_args: + # Stage 0: AR Model (GlmImageForConditionalGeneration) + # This stage uses the vLLM-optimized AR model to generate prior tokens + # for conditioning the diffusion process. 
+ - stage_id: 0 + stage_type: llm + runtime: + process: true + devices: "0" + max_batch_size: 1 + engine_args: + model_stage: ar + model_arch: GlmImageForConditionalGeneration + worker_cls: vllm_omni.worker.gpu_ar_worker.GPUARWorker + scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler + gpu_memory_utilization: 0.6 + enforce_eager: false + trust_remote_code: true + engine_output_type: token_ids # Output prior_token_ids for diffusion stage + enable_prefix_caching: false + max_num_batched_tokens: 32768 + hf_config_name: vision_language_encoder # Subfolder in model path + final_output: false # AR is not the final output + default_sampling_params: + temperature: 0.0 + top_p: 1.0 + top_k: -1 + max_tokens: 16384 # Support up to 2048x2048 images + seed: 42 + detokenize: false + + # Stage 1: Diffusion (DiT + VAE) + # This stage receives prior_token_ids from AR and performs denoising + VAE decode + - stage_id: 1 + stage_type: diffusion + runtime: + process: true + devices: "1" # Use separate GPU for diffusion + max_batch_size: 1 + engine_args: + model_arch: GlmImagePipeline + # Diffusion-specific parameters + num_gpus: 1 + cfg_parallel_size: 1 # Set to 2 for CFG parallelism + engine_input_source: [0] # Input from AR stage + custom_process_input_func: vllm_omni.model_executor.stage_input_processors.glm_image.ar2diffusion + final_output: true + final_output_type: image + default_sampling_params: + num_inference_steps: 50 + guidance_scale: 1.5 + height: 1024 + width: 1024 + +# Top-level runtime config with MultiConnector support +runtime: + enabled: true + defaults: + window_size: -1 # Trigger downstream only after full upstream completion + max_inflight: 1 # Process serially within each stage + + edges: + - from: 0 # AR → Diffusion + to: 1 + window_size: -1 + +# OmniConnector configuration for efficient inter-stage tensor transfer +connectors: + - type: tensor_transfer + source_stage: 0 + target_stage: 1 + # Transfer prior_token_ids efficiently between 
stages + fields: + - name: prior_token_ids + dtype: int64 + - name: prior_token_image_ids + dtype: int64 + optional: true diff --git a/vllm_omni/model_executor/stage_input_processors/glm_image.py b/vllm_omni/model_executor/stage_input_processors/glm_image.py index e1ad7ddae76..4ea7e66c83c 100644 --- a/vllm_omni/model_executor/stage_input_processors/glm_image.py +++ b/vllm_omni/model_executor/stage_input_processors/glm_image.py @@ -1 +1,188 @@ -# init placeholder +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Stage input processor for GLM-Image: AR → Diffusion transition.""" + +from math import sqrt +from typing import Any + +import torch +from vllm.inputs import TextPrompt +from vllm.logger import init_logger + +from vllm_omni.inputs.data import OmniTokensPrompt + +logger = init_logger(__name__) + + +def _upsample_token_ids(token_ids: torch.Tensor, token_h: int, token_w: int) -> torch.Tensor: + """Upsample token IDs by 2x using nearest neighbor interpolation. + + GLM-Image AR model generates tokens at 32x downsampling, but DiT expects + 16x downsampling, so we need to upsample by 2x. + + Args: + token_ids: Prior token IDs of shape [num_tokens] + token_h: Height in token space (at 32x downsampling) + token_w: Width in token space (at 32x downsampling) + + Returns: + Upsampled token IDs of shape [num_tokens * 4] + """ + token_ids = token_ids.view(1, 1, token_h, token_w) + token_ids = torch.nn.functional.interpolate(token_ids.float(), scale_factor=2, mode="nearest").to(dtype=torch.long) + token_ids = token_ids.view(-1) + return token_ids + + +def _parse_generated_tokens( + token_ids: list[int], + height: int, + width: int, + factor: int = 32, +) -> tuple[torch.Tensor, int, int]: + """Parse AR-generated tokens to extract prior_token_ids. 
+ + The AR model generates tokens in a specific format: + - For text-to-image: small_image_tokens + large_image_tokens + EOS + - For image-to-image: large_image_tokens + EOS + + We need to extract the large_image_tokens and upsample them. + + Args: + token_ids: Generated token IDs from AR model + height: Target image height + width: Target image width + factor: Downsampling factor (default 32 for AR output) + + Returns: + Tuple of (upsampled_prior_token_ids, pixel_height, pixel_width) + """ + # Calculate token dimensions + token_h = height // factor + token_w = width // factor + large_image_tokens = token_h * token_w + + # Calculate small image dimensions (used in text-to-image) + ratio = token_h / token_w + prev_token_h = int(sqrt(ratio) * (factor // 2)) + prev_token_w = int(sqrt(1 / ratio) * (factor // 2)) + small_image_tokens = prev_token_h * prev_token_w + + # Determine if this is text-to-image (has small + large) or image-to-image (large only) + total_expected_t2i = small_image_tokens + large_image_tokens + 1 # +1 for EOS + total_expected_i2i = large_image_tokens + 1 + + token_tensor = torch.tensor(token_ids, dtype=torch.long) + + if len(token_ids) >= total_expected_t2i: + # Text-to-image: extract large image tokens after small image tokens + large_start = small_image_tokens + large_end = large_start + large_image_tokens + prior_token_ids_d32 = token_tensor[large_start:large_end] + elif len(token_ids) >= total_expected_i2i: + # Image-to-image: large image tokens are at the beginning + prior_token_ids_d32 = token_tensor[:large_image_tokens] + else: + # Fallback: use whatever tokens we have + logger.warning( + f"Unexpected token count: {len(token_ids)}, expected at least {total_expected_i2i}. Using available tokens." 
+ ) + prior_token_ids_d32 = token_tensor[:large_image_tokens] + + # Upsample from 32x to 16x + prior_token_ids = _upsample_token_ids(prior_token_ids_d32, token_h, token_w) + + return prior_token_ids, height, width + + +def ar2diffusion( + stage_list: list[Any], + engine_input_source: list[int], + prompt: OmniTokensPrompt | TextPrompt | list | None = None, + requires_multimodal_data: bool = False, +) -> list[dict[str, Any]]: + """ + Process AR stage outputs to create Diffusion stage inputs. + + This function bridges the AR model (which generates prior_token_ids) and + the Diffusion pipeline (which uses them for conditioned denoising). + + Workflow: + 1. Extract generated token_ids from AR stage output + 2. Parse and upsample prior_token_ids (32x → 16x) + 3. Package into diffusion request format with original prompt info + + Args: + stage_list: List of stage objects containing outputs + engine_input_source: Source stage IDs (typically [0] for AR stage) + prompt: Original prompt data (contains height, width, prompt text, images) + requires_multimodal_data: Whether to pass multimodal data (condition images) + + Returns: + List of dicts containing diffusion request parameters + """ + if not engine_input_source: + raise ValueError("engine_input_source cannot be empty") + + source_stage_id = engine_input_source[0] + if source_stage_id >= len(stage_list): + raise IndexError(f"Invalid stage_id: {source_stage_id}") + + if stage_list[source_stage_id].engine_outputs is None: + raise RuntimeError(f"Stage {source_stage_id} has no outputs yet") + + ar_outputs = stage_list[source_stage_id].engine_outputs + diffusion_inputs = [] + + # Normalize prompt to list + if not isinstance(prompt, list): + prompt = [prompt] if prompt is not None else [{}] + + for i, ar_output in enumerate(ar_outputs): + output = ar_output.outputs[0] + generated_token_ids = output.token_ids + + # Get original prompt info + original_prompt = prompt[i] if i < len(prompt) else {} + if isinstance(original_prompt, 
(OmniTokensPrompt, TextPrompt)): + original_prompt = dict(original_prompt) if hasattr(original_prompt, "__iter__") else {} + + # Extract dimensions from original prompt or use defaults + height = original_prompt.get("height", 1024) + width = original_prompt.get("width", 1024) + text_prompt = original_prompt.get("prompt", "") + + # Parse and upsample prior tokens + prior_token_ids, pixel_h, pixel_w = _parse_generated_tokens(generated_token_ids, height, width) + + # Build diffusion input + # The diffusion stage expects these in OmniDiffusionRequest format + diffusion_input = { + "prompt": text_prompt, + "height": pixel_h, + "width": pixel_w, + "extra": { + "prior_token_ids": prior_token_ids, + # Pass condition image info for image-to-image mode + "prior_token_image_ids": output.multimodal_output.get("prior_token_image_ids") + if hasattr(output, "multimodal_output") and output.multimodal_output + else None, + }, + } + + # Include multimodal data (condition images) if required + if requires_multimodal_data: + mm_data = original_prompt.get("multi_modal_data") + if mm_data: + diffusion_input["pil_image"] = mm_data.get("image") + + # Copy other relevant parameters from original prompt + for key in ["seed", "num_inference_steps", "guidance_scale", "negative_prompt"]: + if key in original_prompt: + diffusion_input[key] = original_prompt[key] + + diffusion_inputs.append(diffusion_input) + + logger.debug(f"ar2diffusion: processed {len(ar_outputs)} AR outputs → {len(diffusion_inputs)} diffusion inputs") + + return diffusion_inputs From 2320cf811dcb4ccdd17bd1993bf0e41de5fbed66 Mon Sep 17 00:00:00 2001 From: JaredforReal Date: Thu, 15 Jan 2026 16:22:31 +0800 Subject: [PATCH 07/53] fix image2image error Signed-off-by: JaredforReal --- .../diffusion/models/glm_image/__init__.py | 4 +-- .../models/glm_image/pipeline_glm_image.py | 29 +++++++++++++++---- vllm_omni/diffusion/registry.py | 1 + 3 files changed, 27 insertions(+), 7 deletions(-) diff --git 
a/vllm_omni/diffusion/models/glm_image/__init__.py b/vllm_omni/diffusion/models/glm_image/__init__.py index ac7a98fa743..fc8256d8de6 100644 --- a/vllm_omni/diffusion/models/glm_image/__init__.py +++ b/vllm_omni/diffusion/models/glm_image/__init__.py @@ -9,7 +9,7 @@ from vllm_omni.diffusion.models.glm_image.pipeline_glm_image import ( GlmImagePipeline, get_glm_image_post_process_func, - # get_glm_image_pre_process_func, + get_glm_image_pre_process_func, ) __all__ = [ @@ -17,5 +17,5 @@ "GlmImagePipeline", "GlmImageTransformer2DModel", "get_glm_image_post_process_func", - # "get_glm_image_pre_process_func", + "get_glm_image_pre_process_func", ] diff --git a/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py b/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py index 20c06d30b88..2eac7345c53 100644 --- a/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py +++ b/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py @@ -441,15 +441,34 @@ def generate_prior_tokens( input_length = inputs["input_ids"].shape[-1] # Process condition images if provided + # prior_token_image_ids should be a LIST of tensors, one per condition image prior_token_image_ids = None if image is not None and existing_grid is not None: prior_token_image_embed = self.vision_language_encoder.get_image_features( inputs["pixel_values"], existing_grid ) prior_token_image_embed = torch.cat(prior_token_image_embed, dim=0) - prior_token_image_ids = self.vision_language_encoder.get_image_tokens( + # get_image_tokens returns a flat tensor, we need to split it per image + flat_prior_token_image_ids = self.vision_language_encoder.get_image_tokens( prior_token_image_embed, existing_grid ) + # Split by image grid sizes and convert to list + # Each image has t*h*w tokens, we need to split and reshape + split_sizes = (existing_grid.prod(dim=-1)).tolist() + prior_token_image_ids_list = torch.split(flat_prior_token_image_ids, split_sizes, dim=0) + # Convert to list and add batch dimension for 
each, then upsample + prior_token_image_ids = [] + for i, token_ids in enumerate(prior_token_image_ids_list): + grid_t, grid_h, grid_w = existing_grid[i].tolist() + # Reshape to [1, t*h*w] then upsample like the main prior_token_ids + token_ids = token_ids.view(1, -1) + # Upsample 2x (from d32 to d64) + token_ids_2d = token_ids.view(1, 1, grid_h, grid_w) + token_ids_upsampled = torch.nn.functional.interpolate( + token_ids_2d.float(), scale_factor=2, mode="nearest" + ).to(dtype=torch.long) + token_ids_upsampled = token_ids_upsampled.view(1, -1) + prior_token_image_ids.append(token_ids_upsampled) # Generate with AR model outputs = self.vision_language_encoder.generate( @@ -634,7 +653,7 @@ def diffuse( timestep=timestep, target_size=target_size, crop_coords=crop_coords, - kv_caches=kv_caches, + kv_cache=kv_caches, return_dict=False, )[0].float() else: @@ -647,7 +666,7 @@ def diffuse( timestep=timestep, target_size=target_size, crop_coords=crop_coords, - kv_caches=kv_caches, + kv_cache=kv_caches, return_dict=False, )[0].float() @@ -690,7 +709,7 @@ def diffuse( timestep=timestep, target_size=target_size, crop_coords=crop_coords, - kv_caches=kv_caches, + kv_cache=kv_caches, return_dict=False, )[0].float() @@ -763,7 +782,7 @@ def _prepare_condition_image_kv_cache( timestep=torch.zeros((1,), device=self.device), target_size=torch.tensor([condition_image.shape[-2:]], device=self.device, dtype=prompt_embeds.dtype), crop_coords=torch.zeros((1, 2), device=self.device, dtype=prompt_embeds.dtype), - kv_caches=kv_caches, + kv_cache=kv_caches, return_dict=False, ) diff --git a/vllm_omni/diffusion/registry.py b/vllm_omni/diffusion/registry.py index e566ca66cfa..9a507122024 100644 --- a/vllm_omni/diffusion/registry.py +++ b/vllm_omni/diffusion/registry.py @@ -137,6 +137,7 @@ def initialize_model( "QwenImageEditPlusPipeline": "get_qwen_image_edit_plus_pre_process_func", "LongCatImageEditPipeline": "get_longcat_image_edit_pre_process_func", "QwenImageLayeredPipeline": 
"get_qwen_image_layered_pre_process_func", + "GlmImagePipeline": "get_glm_image_pre_process_func", "WanPipeline": "get_wan22_pre_process_func", "WanImageToVideoPipeline": "get_wan22_i2v_pre_process_func", } From 6b9b4869aad1fa0a68b38287fcffcfddd05b1ecf Mon Sep 17 00:00:00 2001 From: JaredforReal Date: Thu, 15 Jan 2026 16:23:01 +0800 Subject: [PATCH 08/53] implement a pre processor func Signed-off-by: JaredforReal --- .../models/glm_image/pipeline_glm_image.py | 85 +++++++++++++++++-- 1 file changed, 80 insertions(+), 5 deletions(-) diff --git a/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py b/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py index 2eac7345c53..9ab67985616 100644 --- a/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py +++ b/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py @@ -56,6 +56,73 @@ logger = logging.getLogger(__name__) +def get_glm_image_pre_process_func(od_config: OmniDiffusionConfig): + """Get pre-processing function for GLM-Image pipeline. + + Pre-processes condition images before they are sent to the pipeline. + This is called by DiffusionEngine before batching requests. 
+ """ + model_name = od_config.model + if os.path.exists(model_name): + model_path = model_name + else: + model_path = download_weights_from_hf_specific(model_name, None, ["*"]) + + vae_config_path = os.path.join(model_path, "vae/config.json") + with open(vae_config_path) as f: + vae_config = json.load(f) + block_out_channels = vae_config.get("block_out_channels", [128, 256, 512, 512]) + vae_scale_factor = 2 ** (len(block_out_channels) - 1) + + image_processor = VaeImageProcessor(vae_scale_factor=vae_scale_factor) + # GLM-Image uses patch_size=2 for transformer + patch_size = 2 + + def pre_process_func(requests: list[OmniDiffusionRequest]): + """Pre-process condition images for Image Edit mode.""" + for req in requests: + images = req.pil_image + if images is None: + # Text-to-image mode, no preprocessing needed + continue + + if not isinstance(images, list): + images = [images] + + preprocessed = [] + height, width = None, None + + for img in images: + if isinstance(img, PIL.Image.Image): + img_h, img_w = img.size[::-1] # PIL is (width, height) + else: + img_h, img_w = img.shape[:2] + + # Align to multiple of vae_scale_factor * patch_size + multiple_of = vae_scale_factor * patch_size + img_h = (img_h // multiple_of) * multiple_of + img_w = (img_w // multiple_of) * multiple_of + + processed = image_processor.preprocess(img, height=img_h, width=img_w) + preprocessed.append(processed) + + # Use first image dimensions as default + if height is None: + height, width = img_h, img_w + + # Store in request + req.preprocessed_image = preprocessed + req.prompt_image = images # Keep original PIL images + if req.height is None: + req.height = height + if req.width is None: + req.width = width + + return requests + + return pre_process_func + + def get_glm_image_post_process_func(od_config: OmniDiffusionConfig): """Get post-processing function for GLM-Image pipeline.""" model_name = od_config.model @@ -849,12 +916,20 @@ def forward(self, req: OmniDiffusionRequest) -> 
DiffusionOutput: prompt_embeds = req.prompt_embeds if isinstance(req.prompt_embeds, torch.Tensor) else None # Get condition images for Image Edit mode - condition_images = req.pil_image - if condition_images is not None and not isinstance(condition_images, list): - condition_images = [condition_images] + # Check if pre-processing was already done by DiffusionEngine + if hasattr(req, "preprocessed_image") and req.preprocessed_image is not None: + # Use pre-processed images from pre_process_func + preprocessed_images = req.preprocessed_image + condition_images = req.prompt_image if hasattr(req, "prompt_image") else req.pil_image + img_height = req.height + img_width = req.width + else: + # Fallback: preprocess in pipeline (for backward compatibility / debug) + condition_images = req.pil_image + if condition_images is not None and not isinstance(condition_images, list): + condition_images = [condition_images] + preprocessed_images, img_height, img_width = self._preprocess_condition_images(condition_images) - # Preprocess condition images and get dimensions - preprocessed_images, img_height, img_width = self._preprocess_condition_images(condition_images) is_image_edit = preprocessed_images is not None # Use image dimensions as default if available From 2d92e22b20c74415b7e4f3902cfa3ecf47f63095 Mon Sep 17 00:00:00 2001 From: JaredforReal Date: Thu, 15 Jan 2026 16:33:59 +0800 Subject: [PATCH 09/53] fix image2image error Signed-off-by: JaredforReal --- .../models/glm_image/pipeline_glm_image.py | 209 +++++------------- 1 file changed, 59 insertions(+), 150 deletions(-) diff --git a/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py b/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py index 9ab67985616..53fea3d79e4 100644 --- a/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py +++ b/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py @@ -17,7 +17,6 @@ import os import re from collections.abc import Iterable -from math import sqrt import numpy 
as np import PIL.Image @@ -358,45 +357,41 @@ def check_inputs( # ==================== AR Stage Methods ==================== @staticmethod - def _build_image_grid_thw( - token_h: int, - token_w: int, - prev_token_h: int, - prev_token_w: int, - existing_grid: torch.Tensor | None = None, - device: torch.device | None = None, - ) -> torch.Tensor: - """Build image grid tensor for AR model.""" - if existing_grid is None or existing_grid.numel() == 0: - return torch.tensor( - [ - [1, token_h, token_w], - [1, prev_token_h, prev_token_w], - ], - device=device, - ) - else: - return torch.cat( - [existing_grid.to(device), torch.tensor([[1, token_h, token_w]], device=device)], - dim=0, - ) + def _compute_generation_params( + image_grid_thw: torch.Tensor, + is_text_to_image: bool, + ) -> tuple[int, int, int, int]: + """ + Compute AR generation parameters from image grid. - @staticmethod - def _calculate_ar_generation_params( - token_h: int, token_w: int, prev_token_h: int, prev_token_w: int, is_text_to_image: bool - ) -> tuple[int, int]: - """Calculate AR generation parameters.""" - large_image_tokens = token_h * token_w - small_image_tokens = prev_token_h * prev_token_w + Args: + image_grid_thw: Image grid tensor of shape [N, 3] where each row is [t, h, w] + is_text_to_image: Whether this is text-to-image (vs image-to-image) - if is_text_to_image: - max_new_tokens = small_image_tokens + large_image_tokens + 1 - large_image_start_offset = small_image_tokens - else: - max_new_tokens = large_image_tokens + 1 + Returns: + Tuple of (max_new_tokens, large_image_start_offset, target_grid_h, target_grid_w) + """ + grid_sizes = [] + grid_hw = [] + + for i in range(image_grid_thw.shape[0]): + t, h, w = image_grid_thw[i].tolist() + grid_sizes.append(int(h * w)) + grid_hw.append((int(h), int(w))) + + if not is_text_to_image: + # Image-to-image: only generate target image tokens + max_new_tokens = grid_sizes[-1] + 1 large_image_start_offset = 0 + target_grid_h, target_grid_w = grid_hw[-1] 
+ else: + # Text-to-image: generate both small preview and large target + total_tokens = sum(grid_sizes) + max_new_tokens = total_tokens + 1 + large_image_start_offset = sum(grid_sizes[1:]) + target_grid_h, target_grid_w = grid_hw[0] - return max_new_tokens, large_image_start_offset + return max_new_tokens, large_image_start_offset, target_grid_h, target_grid_w @staticmethod def _extract_large_image_tokens( @@ -418,28 +413,6 @@ def _upsample_token_ids(token_ids: torch.Tensor, token_h: int, token_w: int) -> token_ids = token_ids.view(1, -1) return token_ids - @staticmethod - def _build_prompt_with_shape( - prompt: str, - height: int, - width: int, - is_text_to_image: bool, - factor: int = 32, - ) -> tuple[str, int, int, int, int]: - """Build prompt with shape information for AR model.""" - token_h = height // factor - token_w = width // factor - ratio = token_h / token_w - prev_token_h = int(sqrt(ratio) * (factor // 2)) - prev_token_w = int(sqrt(1 / ratio) * (factor // 2)) - - if is_text_to_image: - expanded_prompt = f"{prompt}{token_h} {token_w}{prev_token_h} {prev_token_w}" - else: - expanded_prompt = f"{prompt}{token_h} {token_w}" - - return expanded_prompt, token_h, token_w, prev_token_h, prev_token_w - @torch.inference_mode() def generate_prior_tokens( self, @@ -448,7 +421,7 @@ def generate_prior_tokens( width: int, image: list[PIL.Image.Image] | None = None, factor: int = 32, - ) -> tuple[torch.Tensor, torch.Tensor | None, int, int]: + ) -> tuple[torch.Tensor, list[torch.Tensor] | None]: """ Generate prior tokens using the AR model. 
@@ -460,74 +433,59 @@ def generate_prior_tokens( factor: Token factor (default 32) Returns: - Tuple of (prior_token_ids, prior_token_image_ids, pixel_height, pixel_width) + Tuple of (prior_token_ids, prior_token_image_ids) + prior_token_image_ids is a list of tensors, one per condition image """ device = self.vision_language_encoder.device height = (height // factor) * factor width = (width // factor) * factor is_text_to_image = image is None or len(image) == 0 - expanded_prompt, token_h, token_w, prev_h, prev_w = self._build_prompt_with_shape( - prompt, height, width, is_text_to_image - ) - # Build message content content = [] if image is not None: for img in image: content.append({"type": "image", "image": img}) - content.append({"type": "text", "text": expanded_prompt}) + content.append({"type": "text", "text": prompt}) messages = [{"role": "user", "content": content}] - # Apply chat template + # Apply chat template - processor will handle target dimensions and build grid inputs = self.processor.apply_chat_template( messages, - add_generation_prompt=True, tokenize=True, + target_h=height, + target_w=width, return_dict=True, return_tensors="pt", - ) + ).to(device) - # Build image grid - existing_grid = inputs.get("image_grid_thw") - inputs["image_grid_thw"] = self._build_image_grid_thw( - token_h, - token_w, - prev_h, - prev_w, - existing_grid=existing_grid if not is_text_to_image else None, - device=device, - ) + image_grid_thw = inputs.get("image_grid_thw") - max_new_tokens, large_image_offset = self._calculate_ar_generation_params( - token_h, token_w, prev_h, prev_w, is_text_to_image + # Compute generation parameters from the full grid + max_new_tokens, large_image_offset, token_h, token_w = self._compute_generation_params( + image_grid_thw=image_grid_thw, is_text_to_image=is_text_to_image ) - large_image_tokens = token_h * token_w - - inputs = inputs.to(device) - input_length = inputs["input_ids"].shape[-1] # Process condition images if provided - # 
prior_token_image_ids should be a LIST of tensors, one per condition image + # Use image_grid_thw[:-1] to exclude the target image grid (last entry) prior_token_image_ids = None - if image is not None and existing_grid is not None: + if image is not None and image_grid_thw is not None and len(image_grid_thw) > 1: + # Get features only for condition images (exclude target image grid) + condition_grid = image_grid_thw[:-1] prior_token_image_embed = self.vision_language_encoder.get_image_features( - inputs["pixel_values"], existing_grid + inputs["pixel_values"], condition_grid ) prior_token_image_embed = torch.cat(prior_token_image_embed, dim=0) - # get_image_tokens returns a flat tensor, we need to split it per image flat_prior_token_image_ids = self.vision_language_encoder.get_image_tokens( - prior_token_image_embed, existing_grid + prior_token_image_embed, condition_grid ) # Split by image grid sizes and convert to list - # Each image has t*h*w tokens, we need to split and reshape - split_sizes = (existing_grid.prod(dim=-1)).tolist() + split_sizes = (condition_grid.prod(dim=-1)).tolist() prior_token_image_ids_list = torch.split(flat_prior_token_image_ids, split_sizes, dim=0) - # Convert to list and add batch dimension for each, then upsample + # Convert to list with upsampling prior_token_image_ids = [] for i, token_ids in enumerate(prior_token_image_ids_list): - grid_t, grid_h, grid_w = existing_grid[i].tolist() - # Reshape to [1, t*h*w] then upsample like the main prior_token_ids + grid_t, grid_h, grid_w = condition_grid[i].tolist() token_ids = token_ids.view(1, -1) # Upsample 2x (from d32 to d64) token_ids_2d = token_ids.view(1, 1, grid_h, grid_w) @@ -545,8 +503,9 @@ def generate_prior_tokens( ) # Extract and upsample tokens + large_image_tokens = token_h * token_w prior_token_ids_d32 = self._extract_large_image_tokens( - outputs, input_length, large_image_offset, large_image_tokens + outputs, inputs["input_ids"].shape[-1], large_image_offset, large_image_tokens 
) prior_token_ids = self._upsample_token_ids(prior_token_ids_d32, token_h, token_w) @@ -855,48 +814,6 @@ def _prepare_condition_image_kv_cache( return kv_caches - def _preprocess_condition_images( - self, - images: list[PIL.Image.Image] | PIL.Image.Image | None, - ) -> tuple[list[torch.Tensor] | None, int | None, int | None]: - """ - Preprocess condition images for Image Edit mode. - - Args: - images: Input images (PIL or list of PIL) - - Returns: - Tuple of (preprocessed_images, height, width) - """ - if images is None: - return None, None, None - - if not isinstance(images, list): - images = [images] - - preprocessed = [] - height, width = None, None - - for img in images: - if isinstance(img, PIL.Image.Image): - img_h, img_w = img.size[::-1] - else: - img_h, img_w = img.shape[:2] - - # Align to multiple of vae_scale_factor * patch_size - multiple_of = self.vae_scale_factor * self._patch_size - img_h = (img_h // multiple_of) * multiple_of - img_w = (img_w // multiple_of) * multiple_of - - processed = self.image_processor.preprocess(img, height=img_h, width=img_w) - preprocessed.append(processed) - - # Use first image dimensions as default - if height is None: - height, width = img_h, img_w - - return preprocessed, height, width - @torch.inference_mode() def forward(self, req: OmniDiffusionRequest) -> DiffusionOutput: """ @@ -916,19 +833,11 @@ def forward(self, req: OmniDiffusionRequest) -> DiffusionOutput: prompt_embeds = req.prompt_embeds if isinstance(req.prompt_embeds, torch.Tensor) else None # Get condition images for Image Edit mode - # Check if pre-processing was already done by DiffusionEngine - if hasattr(req, "preprocessed_image") and req.preprocessed_image is not None: - # Use pre-processed images from pre_process_func - preprocessed_images = req.preprocessed_image - condition_images = req.prompt_image if hasattr(req, "prompt_image") else req.pil_image - img_height = req.height - img_width = req.width - else: - # Fallback: preprocess in pipeline (for 
backward compatibility / debug) - condition_images = req.pil_image - if condition_images is not None and not isinstance(condition_images, list): - condition_images = [condition_images] - preprocessed_images, img_height, img_width = self._preprocess_condition_images(condition_images) + # Use pre-processed images from pre_process_func + preprocessed_images = req.preprocessed_image + condition_images = req.prompt_image if hasattr(req, "prompt_image") else req.pil_image + img_height = req.height + img_width = req.width is_image_edit = preprocessed_images is not None From fbb2ac8c7c8306d9efd4b9d0938e84840d4837e3 Mon Sep 17 00:00:00 2001 From: JaredforReal Date: Thu, 15 Jan 2026 18:10:36 +0800 Subject: [PATCH 10/53] update stage config Signed-off-by: JaredforReal --- .../model_executor/stage_configs/glm_image.yaml | 15 ++++++++++++++- .../stage_configs/glm_image_muilticonnector.yaml | 15 ++++++++++++++- 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/vllm_omni/model_executor/stage_configs/glm_image.yaml b/vllm_omni/model_executor/stage_configs/glm_image.yaml index 21ee05f3a4f..8ba307a8253 100644 --- a/vllm_omni/model_executor/stage_configs/glm_image.yaml +++ b/vllm_omni/model_executor/stage_configs/glm_image.yaml @@ -2,7 +2,6 @@ # Stage 0: AR Model (vLLM implementation) - generates prior_token_ids # Stage 1: Diffusion (DiT + VAE) - denoising and image decoding -# The following config is designed for H100-80G GPUs. 
stage_args: # Stage 0: AR Model (GlmImageForConditionalGeneration) # This stage uses the vLLM-optimized AR model to generate prior tokens @@ -22,10 +21,12 @@ stage_args: enforce_eager: false trust_remote_code: true engine_output_type: token_ids # Output prior_token_ids for diffusion stage + distributed_executor_backend: "mp" enable_prefix_caching: false max_num_batched_tokens: 32768 hf_config_name: vision_language_encoder # Subfolder in model path final_output: false # AR is not the final output + is_comprehension: true default_sampling_params: temperature: 0.0 top_p: 1.0 @@ -44,14 +45,26 @@ stage_args: max_batch_size: 1 engine_args: model_arch: GlmImagePipeline + model_stage: dit # Diffusion-specific parameters num_gpus: 1 cfg_parallel_size: 1 # Set to 2 for CFG parallelism on 2 GPUs + enforce_eager: true + trust_remote_code: true + engine_output_type: image # Final output is image + distributed_executor_backend: "mp" + enable_prefix_caching: false engine_input_source: [0] # Input from AR stage custom_process_input_func: vllm_omni.model_executor.stage_input_processors.glm_image.ar2diffusion final_output: true final_output_type: image + is_comprehension: false default_sampling_params: + temperature: 0.0 + top_p: 1.0 + top_k: -1 + seed: 42 + detokenize: true num_inference_steps: 50 guidance_scale: 1.5 height: 1024 diff --git a/vllm_omni/model_executor/stage_configs/glm_image_muilticonnector.yaml b/vllm_omni/model_executor/stage_configs/glm_image_muilticonnector.yaml index e5ee76e6a54..7d209d6e3c6 100644 --- a/vllm_omni/model_executor/stage_configs/glm_image_muilticonnector.yaml +++ b/vllm_omni/model_executor/stage_configs/glm_image_muilticonnector.yaml @@ -5,7 +5,6 @@ # This config uses OmniConnectors for inter-stage communication, # enabling efficient tensor transfer between stages on different processes/nodes. -# The following config is designed for multi-GPU setups (e.g., 2x H100-80G). 
stage_args: # Stage 0: AR Model (GlmImageForConditionalGeneration) # This stage uses the vLLM-optimized AR model to generate prior tokens @@ -25,10 +24,12 @@ stage_args: enforce_eager: false trust_remote_code: true engine_output_type: token_ids # Output prior_token_ids for diffusion stage + distributed_executor_backend: "mp" enable_prefix_caching: false max_num_batched_tokens: 32768 hf_config_name: vision_language_encoder # Subfolder in model path final_output: false # AR is not the final output + is_comprehension: true default_sampling_params: temperature: 0.0 top_p: 1.0 @@ -47,14 +48,26 @@ stage_args: max_batch_size: 1 engine_args: model_arch: GlmImagePipeline + model_stage: dit # Diffusion-specific parameters num_gpus: 1 cfg_parallel_size: 1 # Set to 2 for CFG parallelism + enforce_eager: true + trust_remote_code: true + engine_output_type: image # Final output is image + distributed_executor_backend: "mp" + enable_prefix_caching: false engine_input_source: [0] # Input from AR stage custom_process_input_func: vllm_omni.model_executor.stage_input_processors.glm_image.ar2diffusion final_output: true final_output_type: image + is_comprehension: false default_sampling_params: + temperature: 0.0 + top_p: 1.0 + top_k: -1 + seed: 42 + detokenize: true num_inference_steps: 50 guidance_scale: 1.5 height: 1024 From 0e5366f0a4d0a8233c670a76af266ff6355735d7 Mon Sep 17 00:00:00 2001 From: JaredforReal Date: Thu, 15 Jan 2026 18:51:43 +0800 Subject: [PATCH 11/53] implement example offline end2end files Signed-off-by: JaredforReal --- .../offline_inference/glm_image/README.md | 138 ++++++ .../offline_inference/glm_image/end2end.py | 402 ++++++++++++++++++ .../offline_inference/glm_image/run_i2i.sh | 93 ++++ .../offline_inference/glm_image/run_t2i.sh | 87 ++++ 4 files changed, 720 insertions(+) create mode 100644 examples/offline_inference/glm_image/README.md create mode 100644 examples/offline_inference/glm_image/end2end.py create mode 100755 
examples/offline_inference/glm_image/run_i2i.sh create mode 100755 examples/offline_inference/glm_image/run_t2i.sh diff --git a/examples/offline_inference/glm_image/README.md b/examples/offline_inference/glm_image/README.md new file mode 100644 index 00000000000..73ae0e046dd --- /dev/null +++ b/examples/offline_inference/glm_image/README.md @@ -0,0 +1,138 @@ +# GLM-Image Multistage End-to-End Inference + +This example demonstrates how to run GLM-Image with the vLLM-Omni multistage architecture. + +## Architecture + +GLM-Image uses a 2-stage pipeline: + +``` +┌─────────────────────────────────────────────────────────────┐ +│ GLM-Image Pipeline │ +├─────────────────────────────────────────────────────────────┤ +│ │ +│ Stage 0 (AR Model) Stage 1 (Diffusion) │ +│ ┌─────────────────┐ ┌─────────────────────┐ │ +│ │ vLLM-optimized │ │ GlmImagePipeline │ │ +│ │ GlmImageFor │ prior │ ┌───────────────┐ │ │ +│ │ Conditional │──tokens───►│ │ DiT Denoiser │ │ │ +│ │ Generation │ │ └───────────────┘ │ │ +│ │ (9B AR model) │ │ │ │ │ +│ └─────────────────┘ │ ▼ │ │ +│ ▲ │ ┌───────────────┐ │ │ +│ │ │ │ VAE Decode │──┼──► Image +│ Text/Image │ └───────────────┘ │ │ +│ Input └─────────────────────┘ │ +│ │ +└─────────────────────────────────────────────────────────────┘ +``` + +## Features + +- **vLLM-optimized AR**: Uses PagedAttention and tensor parallelism for faster prior token generation +- **Flexible deployment**: AR and Diffusion stages can run on different GPUs +- **Text-to-Image**: Generate images from text descriptions +- **Image-to-Image**: Edit existing images with text prompts + +## Usage + +### Text-to-Image + +```bash +python end2end.py \ + --model-path /path/to/glm-image \ + --config-path ../../../vllm_omni/model_executor/stage_configs/glm_image.yaml \ + --prompt "A beautiful sunset over the ocean with sailing boats" \ + --height 1024 \ + --width 1024 \ + --output output_t2i.png +``` + +### Image-to-Image (Image Editing) + +```bash +python end2end.py \ + --model-path
/path/to/glm-image \ + --config-path ../../../vllm_omni/model_executor/stage_configs/glm_image.yaml \ + --prompt "Transform this scene into a winter wonderland" \ + --image input.png \ + --output output_i2i.png +``` + +### With Custom Parameters + +```bash +python end2end.py \ + --model-path /path/to/glm-image \ + --config-path ../../../vllm_omni/model_executor/stage_configs/glm_image.yaml \ + --prompt "A photorealistic cat sitting on a window sill" \ + --height 1024 \ + --width 1024 \ + --num-inference-steps 50 \ + --guidance-scale 1.5 \ + --seed 42 \ + --output output.png +``` + +## Shell Scripts + +### Run Text-to-Image + +```bash +./run_t2i.sh +``` + +### Run Image-to-Image + +```bash +./run_i2i.sh --image /path/to/input.png +``` + +## Stage Configuration + +The stage config (`glm_image.yaml`) defines: + +- **Stage 0 (AR)**: Uses `GPUARWorker` with vLLM engine + + - Model: `GlmImageForConditionalGeneration` + - Output: `token_ids` (prior tokens) + +- **Stage 1 (Diffusion)**: Uses diffusion engine + - Model: `GlmImagePipeline` + - Output: Generated image + +See `vllm_omni/model_executor/stage_configs/glm_image.yaml` for full configuration. + +## Comparison with Single-Stage + +| Aspect | Single-Stage (transformers) | Multistage (vLLM) | +| ----------- | --------------------------- | ------------------- | +| AR Model | transformers native | vLLM PagedAttention | +| Memory | Higher (no KV cache opt) | Lower (optimized) | +| Throughput | Lower | Higher | +| Flexibility | Single GPU | Multi-GPU support | + +## Troubleshooting + +### OOM Error + +Try reducing memory usage: + +```bash +# In glm_image.yaml, adjust: +gpu_memory_utilization: 0.5 # Reduce from 0.6 +``` + +### Slow Initialization + +The first run loads model weights.
Subsequent runs are faster. If stage initialization still times out (e.g. on slow storage), raise the limit: + +```bash +--stage-init-timeout 900 # Increase timeout for slow storage +``` + +## Requirements + +- vLLM-Omni with GLM-Image support +- CUDA-capable GPU (recommended: H100/A100 with 80GB) +- GLM-Image model weights diff --git a/examples/offline_inference/glm_image/end2end.py b/examples/offline_inference/glm_image/end2end.py new file mode 100644 index 00000000000..bc0ecb68f7e --- /dev/null +++ b/examples/offline_inference/glm_image/end2end.py @@ -0,0 +1,402 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +End-to-end offline inference example for GLM-Image with multistage architecture. + +This script tests the multistage pipeline where: +- Stage 0 (AR): vLLM-optimized GlmImageForConditionalGeneration generates prior_token_ids +- Stage 1 (Diffusion): GlmImagePipeline performs DiT denoising + VAE decode + +Usage (text-to-image): + python end2end.py \ + --model-path /path/to/glm-image \ + --config-path /path/to/glm_image.yaml \ + --prompt "A beautiful sunset over the ocean" \ + --output output_t2i.png + +Usage (image-to-image / image edit): + python end2end.py \ + --model-path /path/to/glm-image \ + --config-path /path/to/glm_image.yaml \ + --prompt "Make it look like winter" \ + --image input.png \ + --output output_i2i.png + +Usage (with custom parameters): + python end2end.py \ + --model-path /path/to/glm-image \ + --config-path /path/to/glm_image.yaml \ + --prompt "A cat sitting on a window sill" \ + --height 1024 \ + --width 1024 \ + --num-inference-steps 50 \ + --guidance-scale 1.5 \ + --seed 42 + +For more options, run: + python end2end.py --help +""" + +import argparse +import os +import time +from pathlib import Path + +from PIL import Image + +from vllm_omni.entrypoints.omni import Omni + +# Default stage config path (relative to the repository root) +DEFAULT_CONFIG_PATH = "vllm_omni/model_executor/stage_configs/glm_image.yaml" + +SEED = 42 + + +def
load_image(image_path: str) -> Image.Image: + """Load an image from file path.""" + if not os.path.exists(image_path): + raise FileNotFoundError(f"Image file not found: {image_path}") + return Image.open(image_path).convert("RGB") + + +def save_image(image: Image.Image, output_path: str) -> None: + """Save an image to file path.""" + output_dir = os.path.dirname(output_path) + if output_dir: + os.makedirs(output_dir, exist_ok=True) + image.save(output_path) + print(f"Image saved to: {output_path}") + + +def build_prompt_for_t2i( + prompt: str, + height: int = 1024, + width: int = 1024, +) -> dict: + """ + Build prompt dict for text-to-image generation. + + Args: + prompt: Text description for image generation + height: Target image height + width: Target image width + + Returns: + Dict containing prompt and generation parameters + """ + return { + "prompt": prompt, + "height": height, + "width": width, + } + + +def build_prompt_for_i2i( + prompt: str, + image: Image.Image, + height: int | None = None, + width: int | None = None, +) -> dict: + """ + Build prompt dict for image-to-image generation. 
+ + Args: + prompt: Text description for image editing + image: Source image for editing + height: Target image height (default: use source image size) + width: Target image width (default: use source image size) + + Returns: + Dict containing prompt, image, and generation parameters + """ + # Use source image dimensions if not specified + if height is None: + height = image.height + if width is None: + width = image.width + + return { + "prompt": prompt, + "multi_modal_data": { + "image": image, + }, + "height": height, + "width": width, + } + + +def main(args: argparse.Namespace) -> None: + """Main entry point for GLM-Image end-to-end inference.""" + print("=" * 60) + print("GLM-Image Multistage End-to-End Inference") + print("=" * 60) + + # Validate arguments + if not args.model_path: + raise ValueError("--model-path is required") + + if not args.prompt: + raise ValueError("--prompt is required") + + # Determine config path + config_path = args.config_path + if config_path is None: + # Try to find default config + if os.path.exists(DEFAULT_CONFIG_PATH): + config_path = DEFAULT_CONFIG_PATH + else: + # Try relative to script location + script_dir = Path(__file__).parent.parent.parent.parent + config_path = script_dir / "vllm_omni/model_executor/stage_configs/glm_image.yaml" + if not config_path.exists(): + raise FileNotFoundError( + f"Stage config not found. Please specify --config-path. 
Tried: {DEFAULT_CONFIG_PATH}" + ) + config_path = str(config_path) + + print(f"Model path: {args.model_path}") + print(f"Config path: {config_path}") + print(f"Prompt: {args.prompt}") + + # Load source image for image-to-image mode + source_image = None + if args.image: + print(f"Source image: {args.image}") + source_image = load_image(args.image) + print(f" Image size: {source_image.width}x{source_image.height}") + + # Build prompt based on mode + if source_image is not None: + # Image-to-image mode + prompt_dict = build_prompt_for_i2i( + prompt=args.prompt, + image=source_image, + height=args.height, + width=args.width, + ) + mode = "image-to-image" + else: + # Text-to-image mode + prompt_dict = build_prompt_for_t2i( + prompt=args.prompt, + height=args.height or 1024, + width=args.width or 1024, + ) + mode = "text-to-image" + + print(f"Mode: {mode}") + print(f"Target size: {prompt_dict.get('height', 1024)}x{prompt_dict.get('width', 1024)}") + + # Add generation parameters to prompt + prompt_dict["seed"] = args.seed + prompt_dict["num_inference_steps"] = args.num_inference_steps + prompt_dict["guidance_scale"] = args.guidance_scale + + if args.negative_prompt: + prompt_dict["negative_prompt"] = args.negative_prompt + + # Initialize Omni with multistage config + print("\nInitializing Omni with multistage pipeline...") + start_time = time.time() + + omni = Omni( + model=args.model_path, + stage_configs_path=config_path, + log_stats=args.enable_stats, + stage_init_timeout=args.stage_init_timeout, + ) + + init_time = time.time() - start_time + print(f"Initialization completed in {init_time:.2f}s") + + # Prepare prompts (support batch generation) + prompts = [prompt_dict for _ in range(args.num_prompts)] + + # No explicit sampling_params for diffusion - parameters are in prompt_dict + # For multistage, the AR stage may need sampling params + from vllm import SamplingParams + + ar_sampling_params = SamplingParams( + temperature=0.0, + top_p=1.0, + top_k=-1, + 
max_tokens=args.max_tokens, + seed=args.seed, + detokenize=False, + ) + + # For multistage, we may need multiple sampling_params (one per LLM stage) + # For GLM-Image: Stage 0 (AR) is LLM, Stage 1 (Diffusion) uses diffusion_kwargs + sampling_params_list = [ar_sampling_params] + + # Run generation + print(f"\nGenerating {args.num_prompts} image(s)...") + gen_start_time = time.time() + + output_dir = os.path.dirname(args.output) if args.output else "outputs" + if output_dir: + os.makedirs(output_dir, exist_ok=True) + + output_count = 0 + for stage_outputs in omni.generate(prompts, sampling_params_list, py_generator=True): + if stage_outputs.final_output_type == "image": + for output in stage_outputs.request_output: + request_id = output.request_id + + # Get generated images + images = output.images if hasattr(output, "images") else [] + if not images and hasattr(output, "multimodal_output"): + images = output.multimodal_output.get("images", []) + + # Save each generated image + for idx, img in enumerate(images): + if args.num_prompts == 1 and len(images) == 1: + output_path = args.output + else: + base, ext = os.path.splitext(args.output) + output_path = f"{base}_{request_id}_{idx}{ext}" + + if isinstance(img, Image.Image): + save_image(img, output_path) + else: + print(f"Warning: Unexpected image type for request {request_id}: {type(img)}") + + output_count += 1 + + elif stage_outputs.final_output_type == "text": + # AR stage output (intermediate, for debugging) + if args.verbose: + for output in stage_outputs.request_output: + print(f"AR output for request {output.request_id}:") + print(f" Token count: {len(output.outputs[0].token_ids)}") + + gen_time = time.time() - gen_start_time + print(f"\nGeneration completed in {gen_time:.2f}s") + print(f"Generated {output_count} image(s)") + + # Cleanup + omni.close() + print("\nDone!") + + +def parse_args() -> argparse.Namespace: + """Parse command line arguments.""" + parser = argparse.ArgumentParser( + 
description="GLM-Image Multistage End-to-End Inference", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=__doc__, + ) + + # Required arguments + parser.add_argument( + "--model-path", + type=str, + required=True, + help="Path to GLM-Image model directory or HuggingFace model ID", + ) + parser.add_argument( + "--prompt", + type=str, + required=True, + help="Text prompt for image generation", + ) + + # Optional arguments + parser.add_argument( + "--config-path", + type=str, + default=None, + help="Path to stage config YAML file (default: auto-detect)", + ) + parser.add_argument( + "--image", + type=str, + default=None, + help="Path to source image for image-to-image mode", + ) + parser.add_argument( + "--output", + type=str, + default="output_glm_image.png", + help="Output image path (default: output_glm_image.png)", + ) + parser.add_argument( + "--negative-prompt", + type=str, + default=None, + help="Negative prompt for classifier-free guidance", + ) + + # Generation parameters + parser.add_argument( + "--height", + type=int, + default=None, + help="Output image height (default: 1024 for t2i, source size for i2i)", + ) + parser.add_argument( + "--width", + type=int, + default=None, + help="Output image width (default: 1024 for t2i, source size for i2i)", + ) + parser.add_argument( + "--num-inference-steps", + type=int, + default=50, + help="Number of diffusion denoising steps (default: 50)", + ) + parser.add_argument( + "--guidance-scale", + type=float, + default=1.5, + help="Classifier-free guidance scale (default: 1.5)", + ) + parser.add_argument( + "--seed", + type=int, + default=SEED, + help=f"Random seed for reproducibility (default: {SEED})", + ) + parser.add_argument( + "--max-tokens", + type=int, + default=16384, + help="Maximum tokens for AR generation (default: 16384)", + ) + + # Batch processing + parser.add_argument( + "--num-prompts", + type=int, + default=1, + help="Number of images to generate (default: 1)", + ) + + # Runtime options 
+ parser.add_argument( + "--enable-stats", + action="store_true", + default=False, + help="Enable statistics logging", + ) + parser.add_argument( + "--stage-init-timeout", + type=int, + default=600, + help="Timeout for stage initialization in seconds (default: 600)", + ) + parser.add_argument( + "--verbose", + "-v", + action="store_true", + default=False, + help="Enable verbose output", + ) + + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + main(args) diff --git a/examples/offline_inference/glm_image/run_i2i.sh b/examples/offline_inference/glm_image/run_i2i.sh new file mode 100755 index 00000000000..f81b157b0c8 --- /dev/null +++ b/examples/offline_inference/glm_image/run_i2i.sh @@ -0,0 +1,93 @@ +#!/bin/bash +# SPDX-License-Identifier: Apache-2.0 +# Run GLM-Image image-to-image (editing) with multistage pipeline + +set -e + +# Default values +MODEL_PATH="${MODEL_PATH:-/path/to/glm-image}" +CONFIG_PATH="${CONFIG_PATH:-vllm_omni/model_executor/stage_configs/glm_image.yaml}" +PROMPT="${PROMPT:-Transform this image into an oil painting style}" +INPUT_IMAGE="" +OUTPUT="${OUTPUT:-output_i2i.png}" +NUM_STEPS="${NUM_STEPS:-50}" +GUIDANCE="${GUIDANCE:-1.5}" +SEED="${SEED:-42}" + +# Parse command line arguments +while [[ $# -gt 0 ]]; do + case $1 in + --model-path) + MODEL_PATH="$2" + shift 2 + ;; + --config-path) + CONFIG_PATH="$2" + shift 2 + ;; + --prompt) + PROMPT="$2" + shift 2 + ;; + --image) + INPUT_IMAGE="$2" + shift 2 + ;; + --output) + OUTPUT="$2" + shift 2 + ;; + --num-steps) + NUM_STEPS="$2" + shift 2 + ;; + --guidance) + GUIDANCE="$2" + shift 2 + ;; + --seed) + SEED="$2" + shift 2 + ;; + *) + echo "Unknown option: $1" + exit 1 + ;; + esac +done + +# Check if input image is provided +if [ -z "${INPUT_IMAGE}" ]; then + echo "Error: --image is required for image-to-image mode" + echo "Usage: ./run_i2i.sh --image /path/to/input.png [--prompt \"edit instruction\"]" + exit 1 +fi + +if [ ! 
-f "${INPUT_IMAGE}" ]; then + echo "Error: Input image not found: ${INPUT_IMAGE}" + exit 1 +fi + +echo "==============================================" +echo "GLM-Image Image-to-Image Generation" +echo "==============================================" +echo "Model: ${MODEL_PATH}" +echo "Config: ${CONFIG_PATH}" +echo "Input: ${INPUT_IMAGE}" +echo "Prompt: ${PROMPT}" +echo "Output: ${OUTPUT}" +echo "Steps: ${NUM_STEPS}" +echo "Guidance: ${GUIDANCE}" +echo "Seed: ${SEED}" +echo "==============================================" + +python end2end.py \ + --model-path "${MODEL_PATH}" \ + --config-path "${CONFIG_PATH}" \ + --prompt "${PROMPT}" \ + --image "${INPUT_IMAGE}" \ + --output "${OUTPUT}" \ + --num-inference-steps "${NUM_STEPS}" \ + --guidance-scale "${GUIDANCE}" \ + --seed "${SEED}" \ + --verbose diff --git a/examples/offline_inference/glm_image/run_t2i.sh b/examples/offline_inference/glm_image/run_t2i.sh new file mode 100755 index 00000000000..5d249960b8f --- /dev/null +++ b/examples/offline_inference/glm_image/run_t2i.sh @@ -0,0 +1,87 @@ +#!/bin/bash +# SPDX-License-Identifier: Apache-2.0 +# Run GLM-Image text-to-image generation with multistage pipeline + +set -e + +# Default values +MODEL_PATH="${MODEL_PATH:-/path/to/glm-image}" +CONFIG_PATH="${CONFIG_PATH:-vllm_omni/model_executor/stage_configs/glm_image.yaml}" +PROMPT="${PROMPT:-A beautiful mountain landscape with snow-capped peaks and a clear blue lake}" +OUTPUT="${OUTPUT:-output_t2i.png}" +HEIGHT="${HEIGHT:-1024}" +WIDTH="${WIDTH:-1024}" +NUM_STEPS="${NUM_STEPS:-50}" +GUIDANCE="${GUIDANCE:-1.5}" +SEED="${SEED:-42}" + +# Parse command line arguments +while [[ $# -gt 0 ]]; do + case $1 in + --model-path) + MODEL_PATH="$2" + shift 2 + ;; + --config-path) + CONFIG_PATH="$2" + shift 2 + ;; + --prompt) + PROMPT="$2" + shift 2 + ;; + --output) + OUTPUT="$2" + shift 2 + ;; + --height) + HEIGHT="$2" + shift 2 + ;; + --width) + WIDTH="$2" + shift 2 + ;; + --num-steps) + NUM_STEPS="$2" + shift 2 + ;; + --guidance) + 
GUIDANCE="$2" + shift 2 + ;; + --seed) + SEED="$2" + shift 2 + ;; + *) + echo "Unknown option: $1" + exit 1 + ;; + esac +done + +echo "==============================================" +echo "GLM-Image Text-to-Image Generation" +echo "==============================================" +echo "Model: ${MODEL_PATH}" +echo "Config: ${CONFIG_PATH}" +echo "Prompt: ${PROMPT}" +echo "Output: ${OUTPUT}" +echo "Size: ${WIDTH}x${HEIGHT}" +echo "Steps: ${NUM_STEPS}" +echo "Guidance: ${GUIDANCE}" +echo "Seed: ${SEED}" +echo "==============================================" + +python end2end.py \ + --model-path "${MODEL_PATH}" \ + --config-path "${CONFIG_PATH}" \ + --prompt "${PROMPT}" \ + --output "${OUTPUT}" \ + --height "${HEIGHT}" \ + --width "${WIDTH}" \ + --num-inference-steps "${NUM_STEPS}" \ + --guidance-scale "${GUIDANCE}" \ + --seed "${SEED}" \ + --verbose From dc8f2c2b9591ae69acbfe38885f1bdf8de5e3a57 Mon Sep 17 00:00:00 2001 From: JaredforReal Date: Thu, 15 Jan 2026 11:29:47 +0000 Subject: [PATCH 12/53] modify dit configs Signed-off-by: JaredforReal --- vllm_omni/model_executor/stage_configs/glm_image.yaml | 5 ----- .../stage_configs/glm_image_muilticonnector.yaml | 5 ----- 2 files changed, 10 deletions(-) diff --git a/vllm_omni/model_executor/stage_configs/glm_image.yaml b/vllm_omni/model_executor/stage_configs/glm_image.yaml index 8ba307a8253..59c6a5252d4 100644 --- a/vllm_omni/model_executor/stage_configs/glm_image.yaml +++ b/vllm_omni/model_executor/stage_configs/glm_image.yaml @@ -44,21 +44,16 @@ stage_args: devices: "1" # Can use different GPU, or same GPU if memory allows max_batch_size: 1 engine_args: - model_arch: GlmImagePipeline model_stage: dit # Diffusion-specific parameters num_gpus: 1 - cfg_parallel_size: 1 # Set to 2 for CFG parallelism on 2 GPUs enforce_eager: true trust_remote_code: true - engine_output_type: image # Final output is image distributed_executor_backend: "mp" - enable_prefix_caching: false engine_input_source: [0] # Input from AR stage 
custom_process_input_func: vllm_omni.model_executor.stage_input_processors.glm_image.ar2diffusion final_output: true final_output_type: image - is_comprehension: false default_sampling_params: temperature: 0.0 top_p: 1.0 diff --git a/vllm_omni/model_executor/stage_configs/glm_image_muilticonnector.yaml b/vllm_omni/model_executor/stage_configs/glm_image_muilticonnector.yaml index 7d209d6e3c6..c32b1cd3d07 100644 --- a/vllm_omni/model_executor/stage_configs/glm_image_muilticonnector.yaml +++ b/vllm_omni/model_executor/stage_configs/glm_image_muilticonnector.yaml @@ -47,21 +47,16 @@ stage_args: devices: "1" # Use separate GPU for diffusion max_batch_size: 1 engine_args: - model_arch: GlmImagePipeline model_stage: dit # Diffusion-specific parameters num_gpus: 1 - cfg_parallel_size: 1 # Set to 2 for CFG parallelism enforce_eager: true trust_remote_code: true - engine_output_type: image # Final output is image distributed_executor_backend: "mp" - enable_prefix_caching: false engine_input_source: [0] # Input from AR stage custom_process_input_func: vllm_omni.model_executor.stage_input_processors.glm_image.ar2diffusion final_output: true final_output_type: image - is_comprehension: false default_sampling_params: temperature: 0.0 top_p: 1.0 From ad0da646870810a5a73743df22fe1e40a983de93 Mon Sep 17 00:00:00 2001 From: JaredforReal Date: Thu, 15 Jan 2026 19:47:55 +0800 Subject: [PATCH 13/53] fix end2end offline examples Signed-off-by: JaredforReal --- .../offline_inference/glm_image/end2end.py | 17 ++++- vllm_omni/entrypoints/omni_diffusion.py | 66 +++++++++++-------- vllm_omni/entrypoints/omni_stage.py | 5 +- .../stage_configs/glm_image.yaml | 4 +- 4 files changed, 61 insertions(+), 31 deletions(-) diff --git a/examples/offline_inference/glm_image/end2end.py b/examples/offline_inference/glm_image/end2end.py index bc0ecb68f7e..1134f00b424 100644 --- a/examples/offline_inference/glm_image/end2end.py +++ b/examples/offline_inference/glm_image/end2end.py @@ -225,9 +225,20 @@ def 
main(args: argparse.Namespace) -> None: detokenize=False, ) - # For multistage, we may need multiple sampling_params (one per LLM stage) - # For GLM-Image: Stage 0 (AR) is LLM, Stage 1 (Diffusion) uses diffusion_kwargs - sampling_params_list = [ar_sampling_params] + # For diffusion stage, sampling_params contains diffusion-specific parameters + # These are passed as kwargs to the diffusion engine + diffusion_sampling_params = { + "num_inference_steps": args.num_inference_steps, + "guidance_scale": args.guidance_scale, + "height": prompt_dict.get("height", 1024), + "width": prompt_dict.get("width", 1024), + "seed": args.seed, + } + + # For multistage, we need sampling_params for each stage + # Stage 0 (AR): SamplingParams for vLLM + # Stage 1 (Diffusion): dict with diffusion kwargs + sampling_params_list = [ar_sampling_params, diffusion_sampling_params] # Run generation print(f"\nGenerating {args.num_prompts} image(s)...") diff --git a/vllm_omni/entrypoints/omni_diffusion.py b/vllm_omni/entrypoints/omni_diffusion.py index 43c68dc5cdd..f3546cc1b66 100644 --- a/vllm_omni/entrypoints/omni_diffusion.py +++ b/vllm_omni/entrypoints/omni_diffusion.py @@ -51,34 +51,48 @@ def __init__(self, od_config: OmniDiffusionConfig | None = None, **kwargs): self.od_config = od_config - # Diffusers-style models expose `model_index.json` with `_class_name`. - # Bagel models (and other non-diffusers) typically expose `config.json`. 
- try: - config_dict = get_hf_file_to_dict( - "model_index.json", - od_config.model, - ) - od_config.model_class_name = config_dict.get("_class_name", None) + # Allow direct specification of model_class_name via model_arch parameter + # This is useful for multistage pipelines where we know the exact pipeline class + model_arch = kwargs.get("model_arch") + if model_arch and od_config.model_class_name is None: + od_config.model_class_name = model_arch + od_config.tf_model_config = TransformerConfig() od_config.update_multimodal_support() - - tf_config_dict = get_hf_file_to_dict( - "transformer/config.json", - od_config.model, - ) - od_config.tf_model_config = TransformerConfig.from_dict(tf_config_dict) - except (AttributeError, OSError, ValueError): - cfg = get_hf_file_to_dict("config.json", od_config.model) - if cfg is None: - raise ValueError(f"Could not find config.json or model_index.json for model {od_config.model}") - - model_type = cfg.get("model_type") - architectures = cfg.get("architectures") or [] - if model_type == "bagel" or "BagelForConditionalGeneration" in architectures: - od_config.model_class_name = "BagelPipeline" - od_config.tf_model_config = TransformerConfig() + logger.info(f"Using model_arch '{model_arch}' as model_class_name") + elif od_config.model_class_name is None: + # Diffusers-style models expose `model_index.json` with `_class_name`. + # Bagel models (and other non-diffusers) typically expose `config.json`. 
+ try: + config_dict = get_hf_file_to_dict( + "model_index.json", + od_config.model, + ) + od_config.model_class_name = config_dict.get("_class_name", None) od_config.update_multimodal_support() - else: - raise + + tf_config_dict = get_hf_file_to_dict( + "transformer/config.json", + od_config.model, + ) + od_config.tf_model_config = TransformerConfig.from_dict(tf_config_dict) + except (AttributeError, OSError, ValueError): + cfg = get_hf_file_to_dict("config.json", od_config.model) + if cfg is None: + raise ValueError(f"Could not find config.json or model_index.json for model {od_config.model}") + + model_type = cfg.get("model_type") + architectures = cfg.get("architectures") or [] + if model_type == "bagel" or "BagelForConditionalGeneration" in architectures: + od_config.model_class_name = "BagelPipeline" + od_config.tf_model_config = TransformerConfig() + od_config.update_multimodal_support() + elif model_type == "glm-image" or "GlmImageForConditionalGeneration" in architectures: + # GLM-Image model detected + od_config.model_class_name = "GlmImagePipeline" + od_config.tf_model_config = TransformerConfig() + od_config.update_multimodal_support() + else: + raise self.engine: DiffusionEngine = DiffusionEngine.make_engine(od_config) diff --git a/vllm_omni/entrypoints/omni_stage.py b/vllm_omni/entrypoints/omni_stage.py index af6f60f0420..84df3397bbc 100644 --- a/vllm_omni/entrypoints/omni_stage.py +++ b/vllm_omni/entrypoints/omni_stage.py @@ -599,7 +599,10 @@ def _stage_worker( ) try: if stage_type == "diffusion": - engine_args.pop("model_stage") + engine_args.pop("model_stage", None) + # Pass model path to OmniDiffusion if not already in engine_args + if "model" not in engine_args: + engine_args["model"] = model stage_engine = OmniDiffusion(**engine_args) else: # Default to LLM engine diff --git a/vllm_omni/model_executor/stage_configs/glm_image.yaml b/vllm_omni/model_executor/stage_configs/glm_image.yaml index 59c6a5252d4..288657aa01f 100644 --- 
a/vllm_omni/model_executor/stage_configs/glm_image.yaml +++ b/vllm_omni/model_executor/stage_configs/glm_image.yaml @@ -24,7 +24,8 @@ stage_args: distributed_executor_backend: "mp" enable_prefix_caching: false max_num_batched_tokens: 32768 - hf_config_name: vision_language_encoder # Subfolder in model path + # Model path points to the main GLM-Image directory + # vLLM will load GlmImageForConditionalGeneration from it final_output: false # AR is not the final output is_comprehension: true default_sampling_params: @@ -45,6 +46,7 @@ stage_args: max_batch_size: 1 engine_args: model_stage: dit + model_arch: GlmImagePipeline # Required for diffusion model class resolution # Diffusion-specific parameters num_gpus: 1 enforce_eager: true From 974ed2201ab764dbba68c90f4bbe08ddbaa4a7cb Mon Sep 17 00:00:00 2001 From: JaredforReal Date: Thu, 15 Jan 2026 20:02:46 +0800 Subject: [PATCH 14/53] support sub folder and model arch Signed-off-by: JaredforReal --- vllm_omni/entrypoints/omni_diffusion.py | 4 +++- vllm_omni/entrypoints/omni_stage.py | 6 ++++++ vllm_omni/model_executor/stage_configs/glm_image.yaml | 3 +-- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/vllm_omni/entrypoints/omni_diffusion.py b/vllm_omni/entrypoints/omni_diffusion.py index f3546cc1b66..e5f74beb92b 100644 --- a/vllm_omni/entrypoints/omni_diffusion.py +++ b/vllm_omni/entrypoints/omni_diffusion.py @@ -44,6 +44,9 @@ class OmniDiffusion: """ def __init__(self, od_config: OmniDiffusionConfig | None = None, **kwargs): + # Extract model_arch before passing to OmniDiffusionConfig (not a valid config field) + model_arch = kwargs.pop("model_arch", None) + if od_config is None: od_config = OmniDiffusionConfig.from_kwargs(**kwargs) elif isinstance(od_config, dict): @@ -53,7 +56,6 @@ def __init__(self, od_config: OmniDiffusionConfig | None = None, **kwargs): # Allow direct specification of model_class_name via model_arch parameter # This is useful for multistage pipelines where we know the exact pipeline 
class - model_arch = kwargs.get("model_arch") if model_arch and od_config.model_class_name is None: od_config.model_class_name = model_arch od_config.tf_model_config = TransformerConfig() diff --git a/vllm_omni/entrypoints/omni_stage.py b/vllm_omni/entrypoints/omni_stage.py index 84df3397bbc..af947448d1a 100644 --- a/vllm_omni/entrypoints/omni_stage.py +++ b/vllm_omni/entrypoints/omni_stage.py @@ -443,6 +443,12 @@ def _stage_worker( connectors_config = stage_payload.get("connectors_config", {}) stage_type = stage_payload.get("stage_type", "llm") + # Handle model_subdir for models with config in subdirectory (e.g., GLM-Image AR model) + model_subdir = engine_args.pop("model_subdir", None) + if model_subdir: + model = _os.path.join(model, model_subdir) + logger.info(f"Using model subdirectory: {model}") + # Aggregates for running average _agg_total_tokens = 0 _agg_total_gen_time_ms = 0.0 diff --git a/vllm_omni/model_executor/stage_configs/glm_image.yaml b/vllm_omni/model_executor/stage_configs/glm_image.yaml index 288657aa01f..0e543ff7b8b 100644 --- a/vllm_omni/model_executor/stage_configs/glm_image.yaml +++ b/vllm_omni/model_executor/stage_configs/glm_image.yaml @@ -15,6 +15,7 @@ stage_args: engine_args: model_stage: ar model_arch: GlmImageForConditionalGeneration + model_subdir: vision_language_encoder # AR model config.json is in this subdirectory worker_cls: vllm_omni.worker.gpu_ar_worker.GPUARWorker scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler gpu_memory_utilization: 0.6 @@ -24,8 +25,6 @@ stage_args: distributed_executor_backend: "mp" enable_prefix_caching: false max_num_batched_tokens: 32768 - # Model path points to the main GLM-Image directory - # vLLM will load GlmImageForConditionalGeneration from it final_output: false # AR is not the final output is_comprehension: true default_sampling_params: From f5551f2c2c5a156b497f0485d7e4a4953f33e6a6 Mon Sep 17 00:00:00 2001 From: JaredforReal Date: Fri, 16 Jan 2026 10:00:39 +0800 Subject: 
[PATCH 15/53] fix import error Signed-off-by: JaredforReal --- vllm_omni/model_executor/models/glm_image/glm_image_ar.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm_omni/model_executor/models/glm_image/glm_image_ar.py b/vllm_omni/model_executor/models/glm_image/glm_image_ar.py index 98fd1dd19db..6dce7072976 100644 --- a/vllm_omni/model_executor/models/glm_image/glm_image_ar.py +++ b/vllm_omni/model_executor/models/glm_image/glm_image_ar.py @@ -69,6 +69,7 @@ make_empty_intermediate_tensors_factory, make_layers, ) +from vllm.model_executor.models.vision import get_vit_attn_backend from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import ( MultiModalDataDict, @@ -89,8 +90,6 @@ from vllm.utils.tensor_schema import TensorSchema, TensorShape from vllm.v1.attention.backends.registry import AttentionBackendEnum -from .vision import get_vit_attn_backend - logger = init_logger(__name__) From 88439b675445e136eeb4803580bd70971285a273 Mon Sep 17 00:00:00 2001 From: JaredforReal Date: Fri, 16 Jan 2026 10:04:25 +0800 Subject: [PATCH 16/53] tokenizer Signed-off-by: JaredforReal --- vllm_omni/entrypoints/omni_stage.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/vllm_omni/entrypoints/omni_stage.py b/vllm_omni/entrypoints/omni_stage.py index af947448d1a..6f62dff7fbb 100644 --- a/vllm_omni/entrypoints/omni_stage.py +++ b/vllm_omni/entrypoints/omni_stage.py @@ -444,11 +444,27 @@ def _stage_worker( stage_type = stage_payload.get("stage_type", "llm") # Handle model_subdir for models with config in subdirectory (e.g., GLM-Image AR model) + # Also handle tokenizer_subdir for when tokenizer is in a different location than model model_subdir = engine_args.pop("model_subdir", None) + tokenizer_subdir = engine_args.pop("tokenizer_subdir", None) + base_model_path = model # Keep original model path for tokenizer + if model_subdir: model = _os.path.join(model, model_subdir) logger.info(f"Using model subdirectory: 
{model}") + # Set tokenizer path if different from model path + if tokenizer_subdir is not None: + # tokenizer_subdir can be empty string "" to use base_model_path directly + tokenizer_path = _os.path.join(base_model_path, tokenizer_subdir) if tokenizer_subdir else base_model_path + engine_args["tokenizer"] = tokenizer_path + logger.info(f"Using tokenizer from: {tokenizer_path}") + elif model_subdir and "tokenizer" not in engine_args: + # If model is in subdirectory but tokenizer not specified, use base path + # This is common for models like GLM-Image where tokenizer is in root + engine_args["tokenizer"] = base_model_path + logger.info(f"Using tokenizer from base model path: {base_model_path}") + # Aggregates for running average _agg_total_tokens = 0 _agg_total_gen_time_ms = 0.0 From 6676c96f2883391f200be0fe5936dc5f9dea8e09 Mon Sep 17 00:00:00 2001 From: JaredforReal Date: Fri, 16 Jan 2026 10:10:24 +0800 Subject: [PATCH 17/53] fix BaseDummyInputsBuilders Signed-off-by: JaredforReal --- vllm_omni/model_executor/models/glm_image/glm_image_ar.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm_omni/model_executor/models/glm_image/glm_image_ar.py b/vllm_omni/model_executor/models/glm_image/glm_image_ar.py index 6dce7072976..6efeb57a4d6 100644 --- a/vllm_omni/model_executor/models/glm_image/glm_image_ar.py +++ b/vllm_omni/model_executor/models/glm_image/glm_image_ar.py @@ -79,13 +79,13 @@ ) from vllm.multimodal.parse import MultiModalDataItems from vllm.multimodal.processing import ( + BaseDummyInputsBuilder, BaseMultiModalProcessor, BaseProcessingInfo, PromptReplacement, PromptUpdate, PromptUpdateDetails, ) -from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors from vllm.utils.tensor_schema import TensorSchema, TensorShape from vllm.v1.attention.backends.registry import AttentionBackendEnum From 324348cbe3bdfee551335142f1ad02abed9cf61d Mon Sep 17 00:00:00 2001 From: JaredforReal Date: Fri, 16 
Jan 2026 10:12:45 +0800 Subject: [PATCH 18/53] tokenizer sub dir Signed-off-by: JaredforReal --- vllm_omni/model_executor/stage_configs/glm_image.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm_omni/model_executor/stage_configs/glm_image.yaml b/vllm_omni/model_executor/stage_configs/glm_image.yaml index 0e543ff7b8b..d9186769362 100644 --- a/vllm_omni/model_executor/stage_configs/glm_image.yaml +++ b/vllm_omni/model_executor/stage_configs/glm_image.yaml @@ -16,6 +16,7 @@ stage_args: model_stage: ar model_arch: GlmImageForConditionalGeneration model_subdir: vision_language_encoder # AR model config.json is in this subdirectory + tokenizer_subdir: tokenizer # Tokenizer files are in tokenizer/ subdirectory worker_cls: vllm_omni.worker.gpu_ar_worker.GPUARWorker scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler gpu_memory_utilization: 0.6 From c0877a31fd09cfa07119429cc22d31cfb6a95830 Mon Sep 17 00:00:00 2001 From: JaredforReal Date: Fri, 16 Jan 2026 10:22:39 +0800 Subject: [PATCH 19/53] fix text2image Signed-off-by: JaredforReal --- .../models/glm_image/glm_image_ar.py | 22 +++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/vllm_omni/model_executor/models/glm_image/glm_image_ar.py b/vllm_omni/model_executor/models/glm_image/glm_image_ar.py index 6efeb57a4d6..a81c1b53ec6 100644 --- a/vllm_omni/model_executor/models/glm_image/glm_image_ar.py +++ b/vllm_omni/model_executor/models/glm_image/glm_image_ar.py @@ -229,7 +229,15 @@ def get_dummy_mm_data( ) -> MultiModalDataDict: """ Generate dummy multimodal data for profiling. + + Returns empty dict if no images (text-to-image mode). 
""" + num_images = mm_counts.get("image", 0) + + # Text-to-image mode: no multimodal data needed + if num_images == 0: + return {} + hf_config = self.info.get_hf_config() vision_config = hf_config.vision_config @@ -237,8 +245,6 @@ def get_dummy_mm_data( image_size = getattr(vision_config, "image_size", 2048) width = height = image_size - num_images = mm_counts.get("image", 0) - image_overrides = mm_options.get("image") if mm_options else None return { @@ -295,7 +301,13 @@ def _get_mm_fields_config( ) -> Mapping[str, MultiModalFieldConfig]: """ Get the multimodal field configuration. + + Returns empty dict if no image data (text-to-image mode). """ + # Check if we have image data + if "pixel_values" not in hf_inputs: + return {} + return dict( pixel_values=MultiModalFieldConfig.batched("image"), image_grid_thw=MultiModalFieldConfig.batched("image"), @@ -312,7 +324,13 @@ def _get_prompt_updates( GLM-Image replaces each image placeholder with: <|image_start|> + image_tokens + <|image_end|> + + Returns empty list if no images (text-to-image mode). 
""" + # Check if we have any images + if not mm_items.get_count("image", strict=False): + return [] + hf_config = self.info.get_hf_config() # Get special token IDs from config From 277fd0d1ca9f2a4b232e20c09ee5502706a2197b Mon Sep 17 00:00:00 2001 From: JaredforReal Date: Fri, 16 Jan 2026 10:41:09 +0800 Subject: [PATCH 20/53] fix i2i dummy inputs Signed-off-by: JaredforReal --- .../models/glm_image/glm_image_ar.py | 25 ++++++++++++++----- 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/vllm_omni/model_executor/models/glm_image/glm_image_ar.py b/vllm_omni/model_executor/models/glm_image/glm_image_ar.py index a81c1b53ec6..b31c9072af2 100644 --- a/vllm_omni/model_executor/models/glm_image/glm_image_ar.py +++ b/vllm_omni/model_executor/models/glm_image/glm_image_ar.py @@ -138,10 +138,15 @@ def get_hf_processor(self, **kwargs: object): return None def get_supported_mm_limits(self) -> Mapping[str, int | None]: - # GLM-Image supports multiple source images for image-to-image generation - # or no image for text-to-image generation - # None means no limit on the number of images - return {"image": None} + # GLM-Image is an image GENERATION model, not an image understanding model. + # For text-to-image (t2i) mode: no multimodal input is needed + # For image-to-image (i2i) mode: source images are provided as input + # + # Return empty dict to indicate no multimodal inputs are required for + # profiling. Image-to-image mode will be handled dynamically at runtime. + # This prevents vLLM from trying to create dummy image inputs during + # model initialization. + return {} def get_num_image_tokens( self, @@ -198,16 +203,24 @@ def get_image_size_with_most_features(self) -> tuple[int, int]: class GlmImageDummyInputsBuilder(BaseDummyInputsBuilder[GlmImageProcessingInfo]): """ Builds dummy inputs for GLM-Image model profiling. + + GLM-Image is an image GENERATION model. For text-to-image mode, + no multimodal inputs are needed - just a text prompt. 
""" def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: """ - Generate dummy text with image placeholders. + Generate dummy text for profiling. - GLM-Image uses <|image|> as the image placeholder token. + For text-to-image mode (no images), returns a simple text prompt. + For image-to-image mode, includes image placeholders. """ num_images = mm_counts.get("image", 0) + # Text-to-image mode: return a simple text prompt for profiling + if num_images == 0: + return "A beautiful image." + hf_config = self.info.get_hf_config() # Get image token from config or use default image_token_id = getattr(hf_config, "image_token_id", 167855) From 4883184523b1ca578905b4cf97ea430b09044126 Mon Sep 17 00:00:00 2001 From: JaredforReal Date: Fri, 16 Jan 2026 10:45:14 +0800 Subject: [PATCH 21/53] fix gate up load weight Signed-off-by: JaredforReal --- vllm_omni/model_executor/models/glm_image/glm_image_ar.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/vllm_omni/model_executor/models/glm_image/glm_image_ar.py b/vllm_omni/model_executor/models/glm_image/glm_image_ar.py index b31c9072af2..ab840ad4428 100644 --- a/vllm_omni/model_executor/models/glm_image/glm_image_ar.py +++ b/vllm_omni/model_executor/models/glm_image/glm_image_ar.py @@ -1557,7 +1557,10 @@ class GlmImageForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP "k_proj", "v_proj", ], - "gate_up_proj": ["gate_up_proj"], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], } # Weight mapping from HuggingFace to vLLM format From 393dfd2fe8d63d4a086a2214b1a32a454a57ba9f Mon Sep 17 00:00:00 2001 From: JaredforReal Date: Fri, 16 Jan 2026 10:48:57 +0800 Subject: [PATCH 22/53] fix gate up load weight Signed-off-by: JaredforReal --- .../models/glm_image/glm_image_ar.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/vllm_omni/model_executor/models/glm_image/glm_image_ar.py b/vllm_omni/model_executor/models/glm_image/glm_image_ar.py index 
ab840ad4428..56d9e57def9 100644 --- a/vllm_omni/model_executor/models/glm_image/glm_image_ar.py +++ b/vllm_omni/model_executor/models/glm_image/glm_image_ar.py @@ -1785,24 +1785,27 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: for name, loaded_weight in weights: # Handle stacked parameters (QKV, gate_up) + is_stacked = False for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: continue - name = name.replace(weight_name, param_name) - if name not in params_dict: + stacked_name = name.replace(weight_name, param_name) + if stacked_name not in params_dict: break - param = params_dict[name] + param = params_dict[stacked_name] weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight, shard_id) + loaded_params.add(stacked_name) + is_stacked = True break - else: + + if not is_stacked: # Regular weight loading if name not in params_dict: continue param = params_dict[name] weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) - - loaded_params.add(name) + loaded_params.add(name) return loaded_params From aa4b586c2f29e3a7296f8cff48818e4e253380d1 Mon Sep 17 00:00:00 2001 From: JaredforReal Date: Fri, 16 Jan 2026 10:55:02 +0800 Subject: [PATCH 23/53] get transformer/config.json Signed-off-by: JaredforReal --- vllm_omni/entrypoints/omni_diffusion.py | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/vllm_omni/entrypoints/omni_diffusion.py b/vllm_omni/entrypoints/omni_diffusion.py index e5f74beb92b..0b1a1763316 100644 --- a/vllm_omni/entrypoints/omni_diffusion.py +++ b/vllm_omni/entrypoints/omni_diffusion.py @@ -58,7 +58,15 @@ def __init__(self, od_config: OmniDiffusionConfig | None = None, **kwargs): # This is useful for multistage pipelines where we know the exact pipeline class if model_arch and od_config.model_class_name is None: od_config.model_class_name = 
model_arch - od_config.tf_model_config = TransformerConfig() + # Try to load transformer config from transformer/config.json + try: + tf_config_dict = get_hf_file_to_dict( + "transformer/config.json", + od_config.model, + ) + od_config.tf_model_config = TransformerConfig.from_dict(tf_config_dict) + except (AttributeError, OSError, ValueError): + od_config.tf_model_config = TransformerConfig() od_config.update_multimodal_support() logger.info(f"Using model_arch '{model_arch}' as model_class_name") elif od_config.model_class_name is None: @@ -89,9 +97,17 @@ def __init__(self, od_config: OmniDiffusionConfig | None = None, **kwargs): od_config.tf_model_config = TransformerConfig() od_config.update_multimodal_support() elif model_type == "glm-image" or "GlmImageForConditionalGeneration" in architectures: - # GLM-Image model detected + # GLM-Image model detected - load transformer config od_config.model_class_name = "GlmImagePipeline" - od_config.tf_model_config = TransformerConfig() + # Try to load transformer config from transformer/config.json + try: + tf_config_dict = get_hf_file_to_dict( + "transformer/config.json", + od_config.model, + ) + od_config.tf_model_config = TransformerConfig.from_dict(tf_config_dict) + except (AttributeError, OSError, ValueError): + od_config.tf_model_config = TransformerConfig() od_config.update_multimodal_support() else: raise From daa57c5878fce2967e0ef18668c8b5293d831f3e Mon Sep 17 00:00:00 2001 From: JaredforReal Date: Fri, 16 Jan 2026 11:01:31 +0800 Subject: [PATCH 24/53] add glm image mrope Signed-off-by: JaredforReal --- vllm_omni/model_executor/layers/mrope.py | 86 ++++++++++++++++++++++++ 1 file changed, 86 insertions(+) diff --git a/vllm_omni/model_executor/layers/mrope.py b/vllm_omni/model_executor/layers/mrope.py index 9ca6a36e233..b3517f9492c 100644 --- a/vllm_omni/model_executor/layers/mrope.py +++ b/vllm_omni/model_executor/layers/mrope.py @@ -201,6 +201,17 @@ def get_input_positions_tensor( context_len=context_len, 
seq_len=seq_len, ) + elif hf_config.model_type == "glm-image": + # GLM-Image is an image generation model. + # For text-to-image mode (no input images), use simple text-only positions. + # For image-to-image mode, use GLM4V-style position encoding. + return cls._glm_image_get_input_positions_tensor( + input_tokens=input_tokens, + hf_config=hf_config, + image_grid_thw=image_grid_thw, + context_len=context_len, + seq_len=seq_len, + ) else: return cls._vl_get_input_positions_tensor( input_tokens=input_tokens, @@ -313,6 +324,81 @@ def _glm4v_get_input_positions_tensor( mrope_position_delta = (llm_positions.max() + 1 - len(input_tokens)).item() return llm_positions, mrope_position_delta + @classmethod + def _glm_image_get_input_positions_tensor( + cls, + input_tokens: list[int], + hf_config: PretrainedConfig, + image_grid_thw: list[list[int]] | torch.Tensor | None, + context_len: int = 0, + seq_len: int | None = None, + ) -> tuple[torch.Tensor, int]: + """Get mrope input positions for GLM-Image model. + + GLM-Image is an image GENERATION model, not understanding. + - For text-to-image: no input images, just text positions + - For image-to-image: source images have grid positions + + Unlike Qwen2-VL, GLM-Image doesn't have video support. 
+ """ + llm_pos_ids_list: list = [] + + # Check if we have any image inputs (image-to-image mode) + has_image_input = image_grid_thw is not None and len(image_grid_thw) > 0 + + if has_image_input: + # Image-to-image mode: handle source image positions + image_token_id = getattr(hf_config, "image_token_id", None) + spatial_merge_size = getattr(hf_config.vision_config, "spatial_merge_size", 1) + + if isinstance(image_grid_thw, torch.Tensor): + image_grid_thw = image_grid_thw.tolist() + + input_tokens_tensor = torch.tensor(input_tokens) + image_indices = torch.argwhere(input_tokens_tensor == image_token_id).squeeze(1).tolist() + + st = 0 + image_idx = 0 + for i, token_pos in enumerate(image_indices): + # Text before this image + if token_pos > st: + text_len = token_pos - st + st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 + llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx) + st = token_pos + + # Image tokens + if image_idx < len(image_grid_thw): + t, h, w = image_grid_thw[image_idx] + llm_grid_t = t + llm_grid_h = h // spatial_merge_size + llm_grid_w = w // spatial_merge_size + + st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 + t_index = torch.arange(llm_grid_t).view(-1, 1).expand(-1, llm_grid_h * llm_grid_w).flatten() + h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(llm_grid_t, -1, llm_grid_w).flatten() + w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(llm_grid_t, llm_grid_h, -1).flatten() + llm_pos_ids_list.append(torch.stack([t_index, h_index, w_index]) + st_idx) + + num_image_tokens = llm_grid_t * llm_grid_h * llm_grid_w + st += num_image_tokens + image_idx += 1 + + # Remaining text after last image + if st < len(input_tokens): + text_len = len(input_tokens) - st + st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 + llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx) + else: + # Text-to-image mode: simple sequential positions 
+ text_len = len(input_tokens) + llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1)) + + llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1) + llm_positions = llm_positions[:, context_len:seq_len] + mrope_position_delta = (llm_positions.max() + 1 - len(input_tokens)).item() + return llm_positions, mrope_position_delta + @classmethod def _vl_get_input_positions_tensor( cls, From 82460fae7efa9c9caf01f1c085e4ba1608ffe7a3 Mon Sep 17 00:00:00 2001 From: JaredforReal Date: Fri, 16 Jan 2026 11:21:55 +0800 Subject: [PATCH 25/53] fix glm_image spelling Signed-off-by: JaredforReal --- vllm_omni/model_executor/layers/mrope.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm_omni/model_executor/layers/mrope.py b/vllm_omni/model_executor/layers/mrope.py index b3517f9492c..49bd56a65db 100644 --- a/vllm_omni/model_executor/layers/mrope.py +++ b/vllm_omni/model_executor/layers/mrope.py @@ -201,7 +201,7 @@ def get_input_positions_tensor( context_len=context_len, seq_len=seq_len, ) - elif hf_config.model_type == "glm-image": + elif hf_config.model_type == "glm_image": # GLM-Image is an image generation model. # For text-to-image mode (no input images), use simple text-only positions. # For image-to-image mode, use GLM4V-style position encoding. 
From e1d946f38caf694d39bdb082c1519587163b386e Mon Sep 17 00:00:00 2001 From: JaredforReal Date: Fri, 16 Jan 2026 11:30:34 +0800 Subject: [PATCH 26/53] fix Signed-off-by: JaredforReal --- vllm_omni/worker/gpu_ar_model_runner.py | 101 ++++++++++-------------- 1 file changed, 40 insertions(+), 61 deletions(-) diff --git a/vllm_omni/worker/gpu_ar_model_runner.py b/vllm_omni/worker/gpu_ar_model_runner.py index d4e7e195fe8..78f14d8f1e0 100644 --- a/vllm_omni/worker/gpu_ar_model_runner.py +++ b/vllm_omni/worker/gpu_ar_model_runner.py @@ -12,10 +12,15 @@ import numpy as np import torch from vllm.config import CUDAGraphMode +from vllm.distributed.ec_transfer import get_ec_transfer, has_ec_transfer +from vllm.distributed.kv_transfer import get_kv_transfer_group, has_kv_transfer_group from vllm.forward_context import set_forward_context from vllm.logger import init_logger +from vllm.model_executor.layers.fused_moe.routed_experts_capturer import ( + RoutedExpertsCapturer, +) from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput -from vllm.v1.outputs import AsyncModelRunnerOutput +from vllm.v1.outputs import AsyncModelRunnerOutput, make_empty_encoder_model_runner_output from vllm.v1.spec_decode.eagle import EagleProposer from vllm.v1.structured_output.utils import apply_grammar_bitmask from vllm.v1.utils import record_function_or_nullcontext @@ -25,19 +30,12 @@ IntermediateTensors, get_pp_group, get_tp_group, - has_kv_transfer_group, - ) +from vllm.v1.worker.ubatch_utils import maybe_create_ubatch_slices from vllm.v1.worker.utils import is_residual_scattered_for_sp -from vllm.model_executor.layers.fused_moe.routed_experts_capturer import ( - RoutedExpertsCapturer, -) -from vllm.distributed.ec_transfer import get_ec_transfer, has_ec_transfer -from vllm.distributed.kv_transfer import get_kv_transfer_group, has_kv_transfer_group + from vllm_omni.outputs import OmniModelRunnerOutput from vllm_omni.worker.gpu_model_runner import OmniGPUModelRunner -from 
vllm.v1.outputs import make_empty_encoder_model_runner_output -from vllm.v1.worker.ubatch_utils import maybe_create_ubatch_slices logger = init_logger(__name__) @@ -91,10 +89,7 @@ def execute_model( intermediate_tensors: IntermediateTensors | None = None, ) -> OmniModelRunnerOutput | AsyncModelRunnerOutput | IntermediateTensors | None: if self.execute_model_state is not None: - raise RuntimeError( - "State error: sample_tokens() must be called " - "after execute_model() returns None." - ) + raise RuntimeError("State error: sample_tokens() must be called after execute_model() returns None.") if self.vllm_config.model_config.enable_return_routed_experts: capturer = RoutedExpertsCapturer.get_instance() @@ -104,9 +99,7 @@ def execute_model( logger.error("RoutedExpertsCapturer not initialized.") if scheduler_output.preempted_req_ids and has_kv_transfer_group(): - get_kv_transfer_group().handle_preemptions( - scheduler_output.preempted_req_ids - ) + get_kv_transfer_group().handle_preemptions(scheduler_output.preempted_req_ids) num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens with ( @@ -126,8 +119,7 @@ def execute_model( if not num_scheduled_tokens: if ( - self.parallel_config.distributed_executor_backend - == "external_launcher" + self.parallel_config.distributed_executor_backend == "external_launcher" and self.parallel_config.data_parallel_size > 1 ): # this is a corner case when both external launcher @@ -196,9 +188,7 @@ def execute_model( ) num_tokens_padded = batch_desc.num_tokens - num_reqs_padded = ( - batch_desc.num_reqs if batch_desc.num_reqs is not None else num_reqs - ) + num_reqs_padded = batch_desc.num_reqs if batch_desc.num_reqs is not None else num_reqs ubatch_slices, ubatch_slices_padded = maybe_create_ubatch_slices( should_ubatch, num_scheduled_tokens_np, @@ -218,19 +208,17 @@ def execute_model( use_spec_decode = len(scheduler_output.scheduled_spec_decode_tokens) > 0 ubatch_slices_attn = ubatch_slices_padded if pad_attn else 
ubatch_slices - attn_metadata, spec_decode_common_attn_metadata = ( - self._build_attention_metadata( - num_tokens=num_tokens_unpadded, - num_tokens_padded=num_tokens_padded if pad_attn else None, - num_reqs=num_reqs, - num_reqs_padded=num_reqs_padded if pad_attn else None, - max_query_len=max_num_scheduled_tokens, - ubatch_slices=ubatch_slices_attn, - logits_indices=logits_indices, - use_spec_decode=use_spec_decode, - num_scheduled_tokens=scheduler_output.num_scheduled_tokens, - cascade_attn_prefix_lens=cascade_attn_prefix_lens, - ) + attn_metadata, spec_decode_common_attn_metadata = self._build_attention_metadata( + num_tokens=num_tokens_unpadded, + num_tokens_padded=num_tokens_padded if pad_attn else None, + num_reqs=num_reqs, + num_reqs_padded=num_reqs_padded if pad_attn else None, + max_query_len=max_num_scheduled_tokens, + ubatch_slices=ubatch_slices_attn, + logits_indices=logits_indices, + use_spec_decode=use_spec_decode, + num_scheduled_tokens=scheduler_output.num_scheduled_tokens, + cascade_attn_prefix_lens=cascade_attn_prefix_lens, ) ( @@ -240,9 +228,7 @@ def execute_model( intermediate_tensors, model_kwargs, ec_connector_output, - ) = self._preprocess( - scheduler_output, num_tokens_padded, intermediate_tensors - ) + ) = self._preprocess(scheduler_output, num_tokens_padded, intermediate_tensors) # Set cudagraph mode to none if calc_kv_scales is true. 
# KV scales calculation involves dynamic operations that are incompatible @@ -287,10 +273,11 @@ def execute_model( hidden_states = model_output aux_hidden_states = None - multimodal_outputs = model_output.multimodal_outputs - hidden_states = model_output.text_hidden_states + # Extract multimodal outputs if model supports it + # This handles both OmniOutput objects and plain tensors + hidden_states, multimodal_outputs = self.extract_multimodal_outputs(hidden_states) - if multimodal_outputs is not None: + if multimodal_outputs is not None and multimodal_outputs: keys_or_type = ( list(multimodal_outputs.keys()) if isinstance(multimodal_outputs, dict) @@ -329,9 +316,7 @@ def execute_model( sample_hidden_states = hidden_states[logits_indices] if not get_pp_group().is_last_rank: all_gather_tensors = { - "residual": not is_residual_scattered_for_sp( - self.vllm_config, num_tokens_padded - ) + "residual": not is_residual_scattered_for_sp(self.vllm_config, num_tokens_padded) } get_pp_group().send_tensor_dict( hidden_states.tensors, @@ -408,9 +393,7 @@ def sample_tokens( # Apply structured output bitmasks if present. 
if grammar_output is not None: - apply_grammar_bitmask( - scheduler_output, grammar_output, self.input_batch, logits - ) + apply_grammar_bitmask(scheduler_output, grammar_output, self.input_batch, logits) with record_function_or_nullcontext("gpu_model_runner: sample"): sampler_output = self._sample(logits, spec_decode_metadata) @@ -450,23 +433,19 @@ def propose_draft_token_ids(sampled_token_ids): propose_draft_token_ids(sampled_token_ids) elif self.valid_sampled_token_count_event is not None: assert spec_decode_common_attn_metadata is not None - next_token_ids, valid_sampled_tokens_count = ( - self.drafter.prepare_next_token_ids_padded( - spec_decode_common_attn_metadata, - sampled_token_ids, - self.requests, - self.input_batch, - self.discard_request_mask.gpu, - ) - ) - self._copy_valid_sampled_token_count( - next_token_ids, valid_sampled_tokens_count + next_token_ids, valid_sampled_tokens_count = self.drafter.prepare_next_token_ids_padded( + spec_decode_common_attn_metadata, + sampled_token_ids, + self.requests, + self.input_batch, + self.discard_request_mask.gpu, ) + self._copy_valid_sampled_token_count(next_token_ids, valid_sampled_tokens_count) # Since we couldn't run the drafter, # just use zeros for the draft tokens. 
- self._draft_token_ids = torch.zeros( - 1, device=self.device, dtype=torch.int32 - ).expand(len(self.input_batch.req_ids), self.num_spec_tokens) + self._draft_token_ids = torch.zeros(1, device=self.device, dtype=torch.int32).expand( + len(self.input_batch.req_ids), self.num_spec_tokens + ) self._copy_draft_token_ids_to_cpu(scheduler_output, zeros_only=True) else: propose_drafts_after_bookkeeping = input_fits_in_drafter From 468cd04da44615b46a0ad3e74be04502cd276282 Mon Sep 17 00:00:00 2001 From: JaredforReal Date: Fri, 16 Jan 2026 11:34:38 +0800 Subject: [PATCH 27/53] fix compute_logits Signed-off-by: JaredforReal --- vllm_omni/model_executor/models/glm_image/glm_image_ar.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm_omni/model_executor/models/glm_image/glm_image_ar.py b/vllm_omni/model_executor/models/glm_image/glm_image_ar.py index 56d9e57def9..6be85f2a861 100644 --- a/vllm_omni/model_executor/models/glm_image/glm_image_ar.py +++ b/vllm_omni/model_executor/models/glm_image/glm_image_ar.py @@ -1753,6 +1753,7 @@ def forward( def compute_logits( self, hidden_states: torch.Tensor, + **kwargs: object, ) -> torch.Tensor | None: """Compute logits from hidden states.""" logits = self.logits_processor( From 59df49531d2049aadeb750c4d11fe75d8884fa7e Mon Sep 17 00:00:00 2001 From: JaredforReal Date: Fri, 16 Jan 2026 11:54:24 +0800 Subject: [PATCH 28/53] fix glm image stage input processors Signed-off-by: JaredforReal --- .../stage_input_processors/glm_image.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/vllm_omni/model_executor/stage_input_processors/glm_image.py b/vllm_omni/model_executor/stage_input_processors/glm_image.py index 4ea7e66c83c..a135176e5a1 100644 --- a/vllm_omni/model_executor/stage_input_processors/glm_image.py +++ b/vllm_omni/model_executor/stage_input_processors/glm_image.py @@ -144,8 +144,17 @@ def ar2diffusion( # Get original prompt info original_prompt = prompt[i] if i < len(prompt) else {} - if 
isinstance(original_prompt, (OmniTokensPrompt, TextPrompt)): - original_prompt = dict(original_prompt) if hasattr(original_prompt, "__iter__") else {} + # Handle various prompt types - convert to dict for uniform access + # Note: TypedDict (TextPrompt, OmniTokensPrompt) doesn't support isinstance + if isinstance(original_prompt, dict): + pass # Already a dict + elif hasattr(original_prompt, "_asdict"): + # NamedTuple + original_prompt = original_prompt._asdict() + elif hasattr(original_prompt, "__dict__"): + original_prompt = vars(original_prompt) + else: + original_prompt = {} # Extract dimensions from original prompt or use defaults height = original_prompt.get("height", 1024) From 396883cef35967f5a5c19d66c2c11d6c14177a89 Mon Sep 17 00:00:00 2001 From: JaredforReal Date: Fri, 16 Jan 2026 12:19:33 +0800 Subject: [PATCH 29/53] fix stage input Signed-off-by: JaredforReal --- vllm_omni/entrypoints/omni_stage.py | 61 +++++++++++++++---- .../stage_input_processors/glm_image.py | 7 ++- 2 files changed, 56 insertions(+), 12 deletions(-) diff --git a/vllm_omni/entrypoints/omni_stage.py b/vllm_omni/entrypoints/omni_stage.py index 6f62dff7fbb..cc8bbdea3ea 100644 --- a/vllm_omni/entrypoints/omni_stage.py +++ b/vllm_omni/entrypoints/omni_stage.py @@ -788,22 +788,51 @@ def handle_profiler_task(task_type: OmniStageTaskType) -> None: gen_outputs: list[Any] = [] _gen_t0 = _time.time() if stage_type == "diffusion": - # For diffusion, batch_engine_inputs should be prompts (strings) - # Convert to list of strings if needed + # For diffusion, batch_engine_inputs can be: + # 1. Strings (direct prompts) + # 2. 
Dicts with "prompt" and other fields like "extra", "height", "width" + # (from custom_process_input_func like ar2diffusion) + # We need to preserve all fields for proper multistage integration prompts = [] + per_request_kwargs = [] for ein in batch_engine_inputs: if isinstance(ein, str): prompts.append(ein) - elif isinstance(ein, dict) and "prompt" in ein: - prompts.append(ein["prompt"]) + per_request_kwargs.append({}) + elif isinstance(ein, dict): + prompts.append(ein.get("prompt", "")) + # Extract all non-prompt fields as kwargs for this request + req_kwargs = {k: v for k, v in ein.items() if k != "prompt"} + per_request_kwargs.append(req_kwargs) elif hasattr(ein, "prompt"): prompts.append(ein.prompt) + per_request_kwargs.append({}) else: prompts.append(str(ein)) + per_request_kwargs.append({}) # Prepare diffusion kwargs from sampling parameters diffusion_kwargs = prepare_sampling_params(sampling_params, "diffusion") - # Diffusion generate returns results directly, not an iterator - diffusion_results = stage_engine.generate(prompts, **diffusion_kwargs) + # For multistage with extra params (like prior_token_ids), process each request + # with its specific kwargs merged with global diffusion_kwargs + diffusion_results = [] + for i, (prompt, req_kwargs) in enumerate(zip(prompts, per_request_kwargs)): + # Merge global diffusion_kwargs with per-request kwargs + # Per-request kwargs take precedence (they may contain extra, height, width) + merged_kwargs = {**diffusion_kwargs, **req_kwargs} + # Log to verify extra params are being passed + has_extra = "extra" in merged_kwargs + has_prior_tokens = ( + merged_kwargs.get("extra", {}).get("prior_token_ids") is not None if has_extra else False + ) + logger.info( + f"[Diffusion] Request {i}: prompt='{prompt[:30] if prompt else ''}...', " + f"has_extra={has_extra}, has_prior_token_ids={has_prior_tokens}" + ) + result = stage_engine.generate(prompt, **merged_kwargs) + if isinstance(result, list): + 
diffusion_results.extend(result) + else: + diffusion_results.append(result) # Convert to list format compatible with LLM outputs # Ensure each result has a request_id for proper mapping if isinstance(diffusion_results, list): @@ -1248,12 +1277,19 @@ async def generation_single_request(task: dict[str, Any]): ein = ein[0] if stage_type == "diffusion": - # For diffusion, ein should be prompts (strings) - # Convert to string if needed + # For diffusion, ein can be: + # 1. A string (direct prompt) + # 2. A dict with "prompt" and other fields like "extra", "height", "width" + # (from custom_process_input_func like ar2diffusion) + # We need to preserve all fields for proper multistage integration + prompt = "" + per_request_kwargs = {} if isinstance(ein, str): prompt = ein - elif isinstance(ein, dict) and "prompt" in ein: - prompt = ein["prompt"] + elif isinstance(ein, dict): + prompt = ein.get("prompt", "") + # Extract all non-prompt fields as kwargs for this request + per_request_kwargs = {k: v for k, v in ein.items() if k != "prompt"} elif hasattr(ein, "prompt"): prompt = ein.prompt else: @@ -1261,8 +1297,11 @@ async def generation_single_request(task: dict[str, Any]): # Prepare diffusion kwargs from sampling parameters diffusion_kwargs = prepare_sampling_params(sampling_params, "diffusion") + # Merge global diffusion_kwargs with per-request kwargs + # Per-request kwargs take precedence (they may contain extra, height, width) + merged_kwargs = {**diffusion_kwargs, **per_request_kwargs} # AsyncOmniDiffusion.generate returns a single result, not an async generator - gen_output = await stage_engine.generate(prompt=prompt, request_id=rid, **diffusion_kwargs) + gen_output = await stage_engine.generate(prompt=prompt, request_id=rid, **merged_kwargs) _gen_t1 = _time.time() _gen_ms = (_gen_t1 - _gen_t0) * 1000.0 await generation_out_q.put((rid, gen_output, _gen_ms)) diff --git a/vllm_omni/model_executor/stage_input_processors/glm_image.py 
b/vllm_omni/model_executor/stage_input_processors/glm_image.py index a135176e5a1..ac17f3fcfaa 100644 --- a/vllm_omni/model_executor/stage_input_processors/glm_image.py +++ b/vllm_omni/model_executor/stage_input_processors/glm_image.py @@ -191,7 +191,12 @@ def ar2diffusion( diffusion_input[key] = original_prompt[key] diffusion_inputs.append(diffusion_input) + logger.info( + f"ar2diffusion: request {i}: prompt='{text_prompt[:50]}...', " + f"prior_token_ids shape={prior_token_ids.shape}, " + f"height={pixel_h}, width={pixel_w}" + ) - logger.debug(f"ar2diffusion: processed {len(ar_outputs)} AR outputs → {len(diffusion_inputs)} diffusion inputs") + logger.info(f"ar2diffusion: processed {len(ar_outputs)} AR outputs → {len(diffusion_inputs)} diffusion inputs") return diffusion_inputs From 2a36a470fe2a07734411bfc512353fad6c1b09e7 Mon Sep 17 00:00:00 2001 From: JaredforReal Date: Fri, 16 Jan 2026 12:36:39 +0800 Subject: [PATCH 30/53] fix stage input Signed-off-by: JaredforReal --- .../stage_input_processors/glm_image.py | 30 ++++++++++++++----- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/vllm_omni/model_executor/stage_input_processors/glm_image.py b/vllm_omni/model_executor/stage_input_processors/glm_image.py index ac17f3fcfaa..28732300334 100644 --- a/vllm_omni/model_executor/stage_input_processors/glm_image.py +++ b/vllm_omni/model_executor/stage_input_processors/glm_image.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Stage input processor for GLM-Image: AR → Diffusion transition.""" -from math import sqrt from typing import Any import torch @@ -57,16 +56,24 @@ def _parse_generated_tokens( Returns: Tuple of (upsampled_prior_token_ids, pixel_height, pixel_width) """ - # Calculate token dimensions + # Calculate token dimensions for target image token_h = height // factor token_w = width // factor large_image_tokens = token_h * token_w - # Calculate small image dimensions (used in text-to-image) - ratio = token_h 
/ token_w - prev_token_h = int(sqrt(ratio) * (factor // 2)) - prev_token_w = int(sqrt(1 / ratio) * (factor // 2)) - small_image_tokens = prev_token_h * prev_token_w + # Calculate small preview image dimensions (used in text-to-image) + # GLM-Image generates a small preview at 1/4 resolution before the full image + # The preview grid is computed as target_grid / 2 in each dimension + small_token_h = token_h // 2 + small_token_w = token_w // 2 + small_image_tokens = small_token_h * small_token_w + + # Log actual values for debugging + logger.info( + f"_parse_generated_tokens: total_tokens={len(token_ids)}, " + f"large_image_tokens={large_image_tokens} ({token_h}x{token_w}), " + f"small_image_tokens={small_image_tokens} ({small_token_h}x{small_token_w})" + ) # Determine if this is text-to-image (has small + large) or image-to-image (large only) total_expected_t2i = small_image_tokens + large_image_tokens + 1 # +1 for EOS @@ -79,9 +86,11 @@ def _parse_generated_tokens( large_start = small_image_tokens large_end = large_start + large_image_tokens prior_token_ids_d32 = token_tensor[large_start:large_end] + logger.info(f"Text-to-image mode: extracting tokens [{large_start}:{large_end}]") elif len(token_ids) >= total_expected_i2i: # Image-to-image: large image tokens are at the beginning prior_token_ids_d32 = token_tensor[:large_image_tokens] + logger.info(f"Image-to-image mode: extracting tokens [0:{large_image_tokens}]") else: # Fallback: use whatever tokens we have logger.warning( @@ -89,6 +98,13 @@ def _parse_generated_tokens( ) prior_token_ids_d32 = token_tensor[:large_image_tokens] + # Log token value statistics for debugging + logger.info( + f"prior_token_ids_d32: min={prior_token_ids_d32.min().item()}, " + f"max={prior_token_ids_d32.max().item()}, " + f"unique_count={prior_token_ids_d32.unique().numel()}" + ) + # Upsample from 32x to 16x prior_token_ids = _upsample_token_ids(prior_token_ids_d32, token_h, token_w) From 31fa960e43473c696f6f891d070b6fd51d4ed982 Mon 
Sep 17 00:00:00 2001 From: JaredforReal Date: Fri, 16 Jan 2026 12:48:48 +0800 Subject: [PATCH 31/53] debug Signed-off-by: JaredforReal --- .../stage_input_processors/glm_image.py | 34 +++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/vllm_omni/model_executor/stage_input_processors/glm_image.py b/vllm_omni/model_executor/stage_input_processors/glm_image.py index 28732300334..eff1fb197e9 100644 --- a/vllm_omni/model_executor/stage_input_processors/glm_image.py +++ b/vllm_omni/model_executor/stage_input_processors/glm_image.py @@ -68,6 +68,8 @@ def _parse_generated_tokens( small_token_w = token_w // 2 small_image_tokens = small_token_h * small_token_w + token_tensor = torch.tensor(token_ids, dtype=torch.long) + # Log actual values for debugging logger.info( f"_parse_generated_tokens: total_tokens={len(token_ids)}, " @@ -75,11 +77,39 @@ def _parse_generated_tokens( f"small_image_tokens={small_image_tokens} ({small_token_h}x{small_token_w})" ) - # Determine if this is text-to-image (has small + large) or image-to-image (large only) + # Analyze token distribution to find image tokens + # Image tokens should be in range [0, 16384) for VQ codebook + # Text tokens are typically higher values + logger.info( + f"Full sequence stats: min={token_tensor.min().item()}, " + f"max={token_tensor.max().item()}, " + f"unique={token_tensor.unique().numel()}" + ) + + # Look for the actual image tokens - they should be consecutive and in VQ range + # Print first 20 and last 20 tokens to understand the structure + logger.info(f"First 20 tokens: {token_tensor[:20].tolist()}") + logger.info(f"Last 20 tokens: {token_tensor[-20:].tolist()}") + + # The actual structure for text-to-image from vLLM AR should be: + # [small_image_tokens (256)] + [large_image_tokens (1024)] + [EOS] + # Total expected: 256 + 1024 + 1 = 1281 tokens + # But we got 16384 tokens - this suggests the output includes prompt tokens + + # For GLM-Image, the expected structure is that the 
model generates ALL new tokens + # including both small preview and large image tokens + # Since we got 16384 tokens, and 1024*16 = 16384, this might be at 2x downsampling + # Let's try different interpretations + + # Possibility 1: tokens are at 2x scale (64x64 = 4096 for large, 32x32 = 1024 for small) + # Possibility 2: the output is padded or has a different format + # Possibility 3: tokens include repeated EOS or padding + total_expected_t2i = small_image_tokens + large_image_tokens + 1 # +1 for EOS total_expected_i2i = large_image_tokens + 1 - token_tensor = torch.tensor(token_ids, dtype=torch.long) + # Try to detect the end of meaningful tokens by looking for EOS patterns + # EOS token is typically a high value or repeated value at the end if len(token_ids) >= total_expected_t2i: # Text-to-image: extract large image tokens after small image tokens From 1b15a94fdee4f075997943339c31b344ac11d86e Mon Sep 17 00:00:00 2001 From: JaredforReal Date: Fri, 16 Jan 2026 12:58:49 +0800 Subject: [PATCH 32/53] diffusion temperature 1.0 Signed-off-by: JaredforReal --- vllm_omni/model_executor/stage_configs/glm_image.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm_omni/model_executor/stage_configs/glm_image.yaml b/vllm_omni/model_executor/stage_configs/glm_image.yaml index d9186769362..c8cddae9e7c 100644 --- a/vllm_omni/model_executor/stage_configs/glm_image.yaml +++ b/vllm_omni/model_executor/stage_configs/glm_image.yaml @@ -29,7 +29,7 @@ stage_args: final_output: false # AR is not the final output is_comprehension: true default_sampling_params: - temperature: 0.0 + temperature: 1.0 # Must use sampling (not greedy) for image token generation top_p: 1.0 top_k: -1 max_tokens: 16384 # Support up to 2048x2048 images (64x64 tokens * 4 = 16384) From 67ec0afb65616727f619061a9a3d89f6df33b124 Mon Sep 17 00:00:00 2001 From: JaredforReal Date: Fri, 16 Jan 2026 13:18:27 +0800 Subject: [PATCH 33/53] end2end params temp Signed-off-by: JaredforReal --- 
examples/offline_inference/glm_image/end2end.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/examples/offline_inference/glm_image/end2end.py b/examples/offline_inference/glm_image/end2end.py index 1134f00b424..aa7955e44e7 100644 --- a/examples/offline_inference/glm_image/end2end.py +++ b/examples/offline_inference/glm_image/end2end.py @@ -216,8 +216,11 @@ def main(args: argparse.Namespace) -> None: # For multistage, the AR stage may need sampling params from vllm import SamplingParams + # IMPORTANT: GLM-Image AR model requires sampling (not greedy) for proper + # image token generation. Using temperature=0.0 causes degenerate repetitive + # tokens and black images. Must use temperature > 0 (default: 1.0). ar_sampling_params = SamplingParams( - temperature=0.0, + temperature=1.0, # Must use sampling for image token diversity top_p=1.0, top_k=-1, max_tokens=args.max_tokens, From 15c6a36bdce354706afc8043db71aebdf3cd590d Mon Sep 17 00:00:00 2001 From: JaredforReal Date: Fri, 16 Jan 2026 13:38:32 +0800 Subject: [PATCH 34/53] apply_chat_template, prepocessor text Signed-off-by: JaredforReal --- .../offline_inference/glm_image/end2end.py | 10 ++++ vllm_omni/inputs/preprocess.py | 49 ++++++++++++++++ .../models/glm_image/glm_image_ar.py | 58 ++++++++++++++----- 3 files changed, 103 insertions(+), 14 deletions(-) diff --git a/examples/offline_inference/glm_image/end2end.py b/examples/offline_inference/glm_image/end2end.py index aa7955e44e7..1a4a0273829 100644 --- a/examples/offline_inference/glm_image/end2end.py +++ b/examples/offline_inference/glm_image/end2end.py @@ -88,6 +88,11 @@ def build_prompt_for_t2i( "prompt": prompt, "height": height, "width": width, + # Pass target dimensions to AR processor for proper grid token generation + "mm_processor_kwargs": { + "target_h": height, + "target_w": width, + }, } @@ -122,6 +127,11 @@ def build_prompt_for_i2i( }, "height": height, "width": width, + # Pass target dimensions to AR processor for proper 
grid token generation + "mm_processor_kwargs": { + "target_h": height, + "target_w": width, + }, } diff --git a/vllm_omni/inputs/preprocess.py b/vllm_omni/inputs/preprocess.py index 4c1ee5388ec..659d8e9adfe 100644 --- a/vllm_omni/inputs/preprocess.py +++ b/vllm_omni/inputs/preprocess.py @@ -20,6 +20,55 @@ class OmniInputPreprocessor(InputPreprocessor): Supports processing tokens, embeddings, text, and multimodal inputs. """ + def _process_text( + self, + parsed_content: TextPrompt, + tokenization_kwargs: dict[str, Any] | None = None, + *, + mm_uuids: MultiModalUUIDDict | None = None, + ) -> OmniTokenInputs | MultiModalInputs: + """Process text prompts with support for mm_processor_kwargs. + + Override the base class to support passing mm_processor_kwargs even when + there's no multi_modal_data. This is needed for models like GLM-Image + where text-to-image generation requires processor kwargs (target_h, target_w) + to properly format the prompt with grid tokens. + """ + prompt_text = parsed_content["prompt"] + mm_processor_kwargs = parsed_content.get("mm_processor_kwargs") or {} + + inputs: OmniTokenInputs | MultiModalInputs + if multi_modal_data := parsed_content.get("multi_modal_data"): + inputs = self._process_multimodal( + prompt_text, + multi_modal_data, + mm_processor_kwargs, + tokenization_kwargs=tokenization_kwargs, + mm_uuids=mm_uuids, + ) + elif mm_processor_kwargs: + # Handle case where mm_processor_kwargs is provided without multi_modal_data + # This is needed for GLM-Image text-to-image mode where the processor + # needs target_h/target_w to build the prompt with grid tokens + inputs = self._process_multimodal( + prompt_text, + {}, # Empty multi_modal_data + mm_processor_kwargs, + tokenization_kwargs=tokenization_kwargs, + mm_uuids=mm_uuids, + ) + else: + prompt_token_ids = self._tokenize_prompt( + prompt_text, + tokenization_kwargs=tokenization_kwargs, + ) + inputs = token_inputs_omni(prompt_token_ids=prompt_token_ids) + + if cache_salt := 
parsed_content.get("cache_salt"): + inputs["cache_salt"] = cache_salt + + return inputs + def _process_tokens( self, parsed_content: TokensPrompt, diff --git a/vllm_omni/model_executor/models/glm_image/glm_image_ar.py b/vllm_omni/model_executor/models/glm_image/glm_image_ar.py index 6be85f2a861..458013522e5 100644 --- a/vllm_omni/model_executor/models/glm_image/glm_image_ar.py +++ b/vllm_omni/model_executor/models/glm_image/glm_image_ar.py @@ -290,14 +290,40 @@ def _call_hf_processor( """ Call the HuggingFace processor. - If no multimodal data is provided (text-to-image mode), - we only tokenize the text. + For text-to-image mode (no images), we need to: + 1. Build the prompt with target grid dimensions + 2. Build the image_grid_thw tensor for M-RoPE position encoding + + For image-to-image mode, we use the full processor. """ if not mm_data or not mm_data.get("image"): - # Text-to-image mode: just tokenize the prompt - tokenizer = self.info.get_tokenizer() - prompt_ids = tokenizer.encode(prompt) - return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt") + # Text-to-image mode: use GlmImageProcessor with target dimensions + # This is critical - the processor adds grid tokens that tell the model + # what resolution to generate + processor = self.info.get_hf_processor() + if processor is not None: + # Get target dimensions from mm_kwargs or use defaults + target_h = mm_kwargs.get("target_h", 1024) if mm_kwargs else 1024 + target_w = mm_kwargs.get("target_w", 1024) if mm_kwargs else 1024 + + # Build messages format expected by processor + messages = [{"role": "user", "content": [{"type": "text", "text": prompt}]}] + + # Use apply_chat_template which handles target dimensions + hf_inputs = processor.apply_chat_template( + messages, + tokenize=True, + target_h=target_h, + target_w=target_w, + return_dict=True, + return_tensors="pt", + ) + return hf_inputs + else: + # Fallback: just tokenize (this won't work properly for generation) + tokenizer = 
self.info.get_tokenizer() + prompt_ids = tokenizer.encode(prompt) + return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt") # Image-to-image mode: use full processor return super()._call_hf_processor( @@ -315,16 +341,20 @@ def _get_mm_fields_config( """ Get the multimodal field configuration. - Returns empty dict if no image data (text-to-image mode). + For text-to-image: only image_grid_thw is needed (no pixel_values) + For image-to-image: both pixel_values and image_grid_thw are needed """ - # Check if we have image data - if "pixel_values" not in hf_inputs: - return {} + result = {} - return dict( - pixel_values=MultiModalFieldConfig.batched("image"), - image_grid_thw=MultiModalFieldConfig.batched("image"), - ) + # image_grid_thw is needed for both t2i and i2i (for M-RoPE position encoding) + if "image_grid_thw" in hf_inputs: + result["image_grid_thw"] = MultiModalFieldConfig.batched("image") + + # pixel_values only present in image-to-image mode + if "pixel_values" in hf_inputs: + result["pixel_values"] = MultiModalFieldConfig.batched("image") + + return result def _get_prompt_updates( self, From 36bd0f77acc9d2619632fe81c081d30483690258 Mon Sep 17 00:00:00 2001 From: JaredforReal Date: Fri, 16 Jan 2026 13:59:19 +0800 Subject: [PATCH 35/53] get processor config Signed-off-by: JaredforReal --- .../models/glm_image/glm_image_ar.py | 42 +++++++++++++++++-- 1 file changed, 38 insertions(+), 4 deletions(-) diff --git a/vllm_omni/model_executor/models/glm_image/glm_image_ar.py b/vllm_omni/model_executor/models/glm_image/glm_image_ar.py index 458013522e5..0bfecb6a7b5 100644 --- a/vllm_omni/model_executor/models/glm_image/glm_image_ar.py +++ b/vllm_omni/model_executor/models/glm_image/glm_image_ar.py @@ -127,14 +127,48 @@ def get_hf_config(self) -> GlmImageConfig: return self.ctx.get_hf_config(GlmImageConfig) def get_hf_processor(self, **kwargs: object): - # GLM-Image uses a processor similar to Qwen2-VL - # Try to get GlmImageProcessor if available + """Get 
the GlmImageProcessor. + + GLM-Image has a special directory structure where: + - Model (AR) is in: {base}/vision_language_encoder/ + - Processor is in: {base}/processor/ + + Since model_subdir is used to load the AR model, the model_config.model + path points to vision_language_encoder/. We need to go up one level + and into processor/ to load the GlmImageProcessor. + """ + import os + try: from transformers import GlmImageProcessor - return self.ctx.get_hf_processor(GlmImageProcessor, **kwargs) - except ImportError: + # Get the model path from config + model_path = self.ctx.model_config.model + + # Check if we're in a subdirectory (vision_language_encoder) + # and need to go to processor/ instead + if model_path.endswith("vision_language_encoder") or "/vision_language_encoder" in model_path: + # Go up one level and into processor/ + base_path = os.path.dirname(model_path.rstrip("/")) + processor_path = os.path.join(base_path, "processor") + else: + # Try processor subdirectory of current path + processor_path = os.path.join(model_path, "processor") + if not os.path.exists(processor_path): + processor_path = model_path + + # Load processor directly from the correct path + return GlmImageProcessor.from_pretrained( + processor_path, + trust_remote_code=self.ctx.model_config.trust_remote_code, + **kwargs, + ) + except (ImportError, OSError) as e: # Fallback: return None and handle in processor + from vllm.logger import init_logger + + logger = init_logger(__name__) + logger.warning(f"Failed to load GlmImageProcessor: {e}") return None def get_supported_mm_limits(self) -> Mapping[str, int | None]: From 186f14992ddaff563fb85915ca983c3ded551e32 Mon Sep 17 00:00:00 2001 From: JaredforReal Date: Fri, 16 Jan 2026 14:07:55 +0800 Subject: [PATCH 36/53] debug logging Signed-off-by: JaredforReal --- .../models/glm_image/glm_image_ar.py | 20 +++++ .../stage_input_processors/glm_image.py | 87 ++++++++++++------- 2 files changed, 77 insertions(+), 30 deletions(-) diff --git 
a/vllm_omni/model_executor/models/glm_image/glm_image_ar.py b/vllm_omni/model_executor/models/glm_image/glm_image_ar.py index 0bfecb6a7b5..e319359eb93 100644 --- a/vllm_omni/model_executor/models/glm_image/glm_image_ar.py +++ b/vllm_omni/model_executor/models/glm_image/glm_image_ar.py @@ -343,6 +343,10 @@ def _call_hf_processor( # Build messages format expected by processor messages = [{"role": "user", "content": [{"type": "text", "text": prompt}]}] + logger.info("[GLM-Image T2I] Using GlmImageProcessor.apply_chat_template") + logger.info(f"[GLM-Image T2I] target_h={target_h}, target_w={target_w}") + logger.info(f"[GLM-Image T2I] prompt: {prompt[:200]}...") + # Use apply_chat_template which handles target dimensions hf_inputs = processor.apply_chat_template( messages, @@ -352,6 +356,22 @@ def _call_hf_processor( return_dict=True, return_tensors="pt", ) + + # Debug: log the tokenized input + if "input_ids" in hf_inputs: + input_ids = hf_inputs["input_ids"] + if hasattr(input_ids, "shape"): + logger.info(f"[GLM-Image T2I] input_ids shape: {input_ids.shape}") + tokenizer = self.info.get_tokenizer() + if tokenizer is not None and hasattr(input_ids, "__len__"): + # Decode to check the format + ids_list = input_ids[0].tolist() if hasattr(input_ids[0], "tolist") else list(input_ids[0]) + decoded = tokenizer.decode(ids_list) + logger.info(f"[GLM-Image T2I] decoded input: {decoded}") + + if "image_grid_thw" in hf_inputs: + logger.info(f"[GLM-Image T2I] image_grid_thw: {hf_inputs['image_grid_thw']}") + return hf_inputs else: # Fallback: just tokenize (this won't work properly for generation) diff --git a/vllm_omni/model_executor/stage_input_processors/glm_image.py b/vllm_omni/model_executor/stage_input_processors/glm_image.py index eff1fb197e9..79f5d91baf9 100644 --- a/vllm_omni/model_executor/stage_input_processors/glm_image.py +++ b/vllm_omni/model_executor/stage_input_processors/glm_image.py @@ -78,65 +78,92 @@ def _parse_generated_tokens( ) # Analyze token distribution 
to find image tokens - # Image tokens should be in range [0, 16384) for VQ codebook - # Text tokens are typically higher values logger.info( f"Full sequence stats: min={token_tensor.min().item()}, " f"max={token_tensor.max().item()}, " f"unique={token_tensor.unique().numel()}" ) - # Look for the actual image tokens - they should be consecutive and in VQ range # Print first 20 and last 20 tokens to understand the structure logger.info(f"First 20 tokens: {token_tensor[:20].tolist()}") logger.info(f"Last 20 tokens: {token_tensor[-20:].tolist()}") - # The actual structure for text-to-image from vLLM AR should be: - # [small_image_tokens (256)] + [large_image_tokens (1024)] + [EOS] - # Total expected: 256 + 1024 + 1 = 1281 tokens - # But we got 16384 tokens - this suggests the output includes prompt tokens + # Remove EOS token (16385) from the end if present + eos_token_id = 16385 + if len(token_ids) > 0 and token_ids[-1] == eos_token_id: + token_tensor = token_tensor[:-1] + logger.info(f"Removed EOS token, remaining: {len(token_tensor)} tokens") - # For GLM-Image, the expected structure is that the model generates ALL new tokens - # including both small preview and large image tokens - # Since we got 16384 tokens, and 1024*16 = 16384, this might be at 2x downsampling - # Let's try different interpretations + actual_tokens = len(token_tensor) - # Possibility 1: tokens are at 2x scale (64x64 = 4096 for large, 32x32 = 1024 for small) - # Possibility 2: the output is padded or has a different format - # Possibility 3: tokens include repeated EOS or padding - - total_expected_t2i = small_image_tokens + large_image_tokens + 1 # +1 for EOS - total_expected_i2i = large_image_tokens + 1 - - # Try to detect the end of meaningful tokens by looking for EOS patterns - # EOS token is typically a high value or repeated value at the end - - if len(token_ids) >= total_expected_t2i: + if actual_tokens >= small_image_tokens + large_image_tokens: # Text-to-image: extract large image 
tokens after small image tokens large_start = small_image_tokens large_end = large_start + large_image_tokens prior_token_ids_d32 = token_tensor[large_start:large_end] + actual_h, actual_w = token_h, token_w logger.info(f"Text-to-image mode: extracting tokens [{large_start}:{large_end}]") - elif len(token_ids) >= total_expected_i2i: + elif actual_tokens >= large_image_tokens: # Image-to-image: large image tokens are at the beginning prior_token_ids_d32 = token_tensor[:large_image_tokens] + actual_h, actual_w = token_h, token_w logger.info(f"Image-to-image mode: extracting tokens [0:{large_image_tokens}]") else: - # Fallback: use whatever tokens we have - logger.warning( - f"Unexpected token count: {len(token_ids)}, expected at least {total_expected_i2i}. Using available tokens." - ) - prior_token_ids_d32 = token_tensor[:large_image_tokens] + # Insufficient tokens - try to infer the actual grid size + # The model might have generated for a different resolution + import math + + # Try to find a square grid that fits the available tokens + # First check if it matches any of the small+large patterns + for scale in [1, 2, 4]: + test_h = token_h // scale + test_w = token_w // scale + test_small_h = test_h // 2 + test_small_w = test_w // 2 + test_large = test_h * test_w + test_small = test_small_h * test_small_w + + if actual_tokens >= test_small + test_large: + # Found matching grid for t2i + prior_token_ids_d32 = token_tensor[test_small : test_small + test_large] + actual_h, actual_w = test_h, test_w + # Adjust output dimensions + height = test_h * factor + width = test_w * factor + logger.warning(f"Adjusted grid to {test_h}x{test_w} (scale={scale}), output will be {height}x{width}") + break + elif actual_tokens >= test_large: + # Found matching grid for i2i + prior_token_ids_d32 = token_tensor[:test_large] + actual_h, actual_w = test_h, test_w + height = test_h * factor + width = test_w * factor + logger.warning(f"Adjusted grid to {test_h}x{test_w} (scale={scale}), 
output will be {height}x{width}") + break + else: + # Last resort: find closest square grid + sqrt_tokens = int(math.sqrt(actual_tokens)) + actual_h = actual_w = sqrt_tokens + usable_tokens = sqrt_tokens * sqrt_tokens + prior_token_ids_d32 = token_tensor[:usable_tokens] + height = sqrt_tokens * factor + width = sqrt_tokens * factor + logger.error( + f"Could not match grid pattern. Using {sqrt_tokens}x{sqrt_tokens} grid " + f"({usable_tokens} tokens), output will be {height}x{width}. " + f"This likely indicates a prompt format issue." + ) # Log token value statistics for debugging logger.info( - f"prior_token_ids_d32: min={prior_token_ids_d32.min().item()}, " + f"prior_token_ids_d32: shape={prior_token_ids_d32.shape}, " + f"min={prior_token_ids_d32.min().item()}, " f"max={prior_token_ids_d32.max().item()}, " f"unique_count={prior_token_ids_d32.unique().numel()}" ) # Upsample from 32x to 16x - prior_token_ids = _upsample_token_ids(prior_token_ids_d32, token_h, token_w) + prior_token_ids = _upsample_token_ids(prior_token_ids_d32, actual_h, actual_w) return prior_token_ids, height, width From 69197a453777d9957088da0a2eb0110f841f68a3 Mon Sep 17 00:00:00 2001 From: JaredforReal Date: Fri, 16 Jan 2026 14:20:56 +0800 Subject: [PATCH 37/53] use processor.tokenizer Signed-off-by: JaredforReal --- .../models/glm_image/glm_image_ar.py | 14 +++++++++----- .../model_executor/stage_configs/glm_image.yaml | 2 +- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/vllm_omni/model_executor/models/glm_image/glm_image_ar.py b/vllm_omni/model_executor/models/glm_image/glm_image_ar.py index e319359eb93..c4c7204aea6 100644 --- a/vllm_omni/model_executor/models/glm_image/glm_image_ar.py +++ b/vllm_omni/model_executor/models/glm_image/glm_image_ar.py @@ -362,12 +362,16 @@ def _call_hf_processor( input_ids = hf_inputs["input_ids"] if hasattr(input_ids, "shape"): logger.info(f"[GLM-Image T2I] input_ids shape: {input_ids.shape}") - tokenizer = self.info.get_tokenizer() - if 
tokenizer is not None and hasattr(input_ids, "__len__"): - # Decode to check the format + # Use processor's tokenizer (not ByT5Tokenizer from tokenizer/ dir) + # GlmImageProcessor has its own tokenizer with a different vocabulary + if hasattr(processor, "tokenizer") and processor.tokenizer is not None: ids_list = input_ids[0].tolist() if hasattr(input_ids[0], "tolist") else list(input_ids[0]) - decoded = tokenizer.decode(ids_list) - logger.info(f"[GLM-Image T2I] decoded input: {decoded}") + try: + decoded = processor.tokenizer.decode(ids_list) + logger.info(f"[GLM-Image T2I] decoded input: {decoded}") + except Exception as e: + logger.warning(f"[GLM-Image T2I] could not decode: {e}") + logger.info(f"[GLM-Image T2I] first 50 token ids: {ids_list[:50]}") if "image_grid_thw" in hf_inputs: logger.info(f"[GLM-Image T2I] image_grid_thw: {hf_inputs['image_grid_thw']}") diff --git a/vllm_omni/model_executor/stage_configs/glm_image.yaml b/vllm_omni/model_executor/stage_configs/glm_image.yaml index c8cddae9e7c..d26fa2c73ee 100644 --- a/vllm_omni/model_executor/stage_configs/glm_image.yaml +++ b/vllm_omni/model_executor/stage_configs/glm_image.yaml @@ -16,7 +16,7 @@ stage_args: model_stage: ar model_arch: GlmImageForConditionalGeneration model_subdir: vision_language_encoder # AR model config.json is in this subdirectory - tokenizer_subdir: tokenizer # Tokenizer files are in tokenizer/ subdirectory + tokenizer_subdir: processor # Use processor's tokenizer (not ByT5 from tokenizer/) worker_cls: vllm_omni.worker.gpu_ar_worker.GPUARWorker scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler gpu_memory_utilization: 0.6 From c22875b56d5ef5bf07dd73701d5ca6a14a59e3ab Mon Sep 17 00:00:00 2001 From: JaredforReal Date: Fri, 16 Jan 2026 14:30:39 +0800 Subject: [PATCH 38/53] use temperature 0.9 and 0.15 top_p Signed-off-by: JaredforReal --- .../model_executor/models/glm_image/glm_image_ar.py | 9 +++++++++ vllm_omni/model_executor/stage_configs/glm_image.yaml | 4 ++-- 2 
files changed, 11 insertions(+), 2 deletions(-) diff --git a/vllm_omni/model_executor/models/glm_image/glm_image_ar.py b/vllm_omni/model_executor/models/glm_image/glm_image_ar.py index c4c7204aea6..94c0abc069c 100644 --- a/vllm_omni/model_executor/models/glm_image/glm_image_ar.py +++ b/vllm_omni/model_executor/models/glm_image/glm_image_ar.py @@ -1824,6 +1824,15 @@ def forward( Returns: Hidden states or intermediate tensors """ + # Debug logging (first call only) + if not hasattr(self, "_logged_forward"): + self._logged_forward = True + logger.info(f"[GLM-Image Forward] input_ids shape: {input_ids.shape if input_ids is not None else None}") + logger.info(f"[GLM-Image Forward] positions shape: {positions.shape if positions is not None else None}") + logger.info(f"[GLM-Image Forward] pixel_values: {pixel_values is not None}") + logger.info(f"[GLM-Image Forward] image_grid_thw: {image_grid_thw}") + logger.info(f"[GLM-Image Forward] kwargs keys: {list(kwargs.keys())}") + if intermediate_tensors is not None: inputs_embeds = None diff --git a/vllm_omni/model_executor/stage_configs/glm_image.yaml b/vllm_omni/model_executor/stage_configs/glm_image.yaml index d26fa2c73ee..edf7f6219b7 100644 --- a/vllm_omni/model_executor/stage_configs/glm_image.yaml +++ b/vllm_omni/model_executor/stage_configs/glm_image.yaml @@ -29,8 +29,8 @@ stage_args: final_output: false # AR is not the final output is_comprehension: true default_sampling_params: - temperature: 1.0 # Must use sampling (not greedy) for image token generation - top_p: 1.0 + temperature: 0.9 # From model's generation_config.json + top_p: 0.75 # From model's generation_config.json top_k: -1 max_tokens: 16384 # Support up to 2048x2048 images (64x64 tokens * 4 = 16384) seed: 42 From e37bfcb1d3adf9b40836bed2bbfde68c10c7601d Mon Sep 17 00:00:00 2001 From: JaredforReal Date: Fri, 16 Jan 2026 14:45:49 +0800 Subject: [PATCH 39/53] align image_grid_thw with transformers Signed-off-by: JaredforReal --- 
.../models/glm_image/glm_image_ar.py | 87 ++++++++++++++++--- 1 file changed, 74 insertions(+), 13 deletions(-) diff --git a/vllm_omni/model_executor/models/glm_image/glm_image_ar.py b/vllm_omni/model_executor/models/glm_image/glm_image_ar.py index 94c0abc069c..5ed879016f9 100644 --- a/vllm_omni/model_executor/models/glm_image/glm_image_ar.py +++ b/vllm_omni/model_executor/models/glm_image/glm_image_ar.py @@ -1715,7 +1715,9 @@ def get_image_tokens( def get_mrope_input_positions( self, input_tokens: list[int], - mm_features: list[MultiModalFeatureSpec], + mm_features: list[MultiModalFeatureSpec] | None = None, + image_grid_thw: list[list[int]] | None = None, + **kwargs, ) -> tuple[torch.Tensor, int]: """ Compute M-RoPE position IDs for GLM-Image generation. @@ -1727,19 +1729,30 @@ def get_mrope_input_positions( - height: row position in image grid - width: column position in image grid + For text-to-image generation, we also pre-compute positions for the tokens + that will be generated (small image + large image + EOS), similar to how + transformers GLM-Image caches decode positions. + Args: input_tokens: List of input token IDs - mm_features: Multimodal feature specifications + mm_features: Multimodal feature specifications (optional) + image_grid_thw: Pre-extracted image grid dimensions (optional) + **kwargs: Additional arguments (hf_config, video_grid_thw, etc.) 
Returns: - Tuple of (position_ids [3, seq_len], mrope_position_delta) + Tuple of (position_ids [3, seq_len + decode_len], mrope_position_delta) """ - # Gather image grid info from multimodal features - kwargs = MultiModalFeatureSpec.gather_kwargs( - mm_features, - {"image_grid_thw"}, - ) - image_grid_thw = [item.tolist() for item in kwargs.get("image_grid_thw", [])] + # Get image_grid_thw from either the direct arg or mm_features + if image_grid_thw is None and mm_features is not None: + # Gather image grid info from multimodal features + feature_kwargs = MultiModalFeatureSpec.gather_kwargs( + mm_features, + {"image_grid_thw"}, + ) + image_grid_thw = [item.tolist() for item in feature_kwargs.get("image_grid_thw", [])] + + if image_grid_thw is None: + image_grid_thw = [] hf_config = self.config image_start_token_id = hf_config.image_start_token_id @@ -1748,6 +1761,9 @@ def get_mrope_input_positions( seq_len = len(input_tokens) llm_pos_ids_list: list[torch.Tensor] = [] + # Count completed images (have end marker) vs images to generate + num_complete_images = sum(1 for t in input_tokens if t == image_end_token_id) + if image_grid_thw: # Build position IDs considering image regions current_pos = 0 @@ -1757,8 +1773,8 @@ def get_mrope_input_positions( while i < seq_len: token = input_tokens[i] - if token == image_start_token_id and image_idx < len(image_grid_thw): - # Start of image region + if token == image_start_token_id and image_idx < num_complete_images: + # This is a completed image (source image for i2i) # Add position for the start marker llm_pos_ids_list.append(torch.tensor([[current_pos], [current_pos], [current_pos]])) current_pos += 1 @@ -1787,12 +1803,57 @@ def get_mrope_input_positions( i += 1 else: - # Regular text token + # Regular text token (or trailing start marker for generation) llm_pos_ids_list.append(torch.tensor([[current_pos], [current_pos], [current_pos]])) current_pos += 1 i += 1 - llm_positions = torch.cat(llm_pos_ids_list, dim=1) + 
prefill_positions = torch.cat(llm_pos_ids_list, dim=1) + + # Pre-compute decode positions for images that will be generated + # This is critical for text-to-image where we need to generate image tokens + num_decode_grids = len(image_grid_thw) - num_complete_images + + if num_decode_grids > 0: + decode_pos_lists: list[torch.Tensor] = [] + decode_pos = current_pos + + # Process grids in reverse order (last grid first for GLM-Image t2i) + # For t2i with grids [[1,32,32], [1,16,16]]: + # - First generate small image (16x16 = 256 tokens) + # - Then generate large image (32x32 = 1024 tokens) + # - Finally generate EOS + for i in range(1, num_decode_grids + 1): + grid_idx = -i + _, h, w = image_grid_thw[grid_idx] + total_tokens = h * w + + # Build 2D positions for this generated image + h_indices = torch.arange(h).unsqueeze(1).expand(h, w).flatten() + w_indices = torch.arange(w).unsqueeze(0).expand(h, w).flatten() + + decode_t = torch.full((total_tokens,), decode_pos, dtype=torch.long) + decode_h = decode_pos + h_indices + decode_w = decode_pos + w_indices + + decode_pos_lists.append(torch.stack([decode_t, decode_h, decode_w], dim=0)) + decode_pos = decode_pos + max(h, w) + + # Add position for EOS token + decode_pos_lists.append(torch.tensor([[decode_pos], [decode_pos], [decode_pos]])) + + decode_positions = torch.cat(decode_pos_lists, dim=1) + + # Concatenate prefill and decode positions + llm_positions = torch.cat([prefill_positions, decode_positions], dim=1) + + # Log for debugging + logger.info( + f"[GLM-Image M-RoPE] prefill_len={prefill_positions.shape[1]}, " + f"decode_len={decode_positions.shape[1]}, total_len={llm_positions.shape[1]}" + ) + else: + llm_positions = prefill_positions else: # Pure text - all dimensions same llm_positions = torch.arange(seq_len).view(1, -1).expand(3, -1) From ac8a81b4db5d46780d45de4df7dd149767bdfa20 Mon Sep 17 00:00:00 2001 From: JaredforReal Date: Fri, 16 Jan 2026 15:20:23 +0800 Subject: [PATCH 40/53] fix params Signed-off-by: 
JaredforReal --- .../offline_inference/glm_image/end2end.py | 63 ++++++++++++++++--- .../stage_configs/glm_image.yaml | 5 +- .../glm_image_muilticonnector.yaml | 9 +-- 3 files changed, 64 insertions(+), 13 deletions(-) diff --git a/examples/offline_inference/glm_image/end2end.py b/examples/offline_inference/glm_image/end2end.py index 1a4a0273829..65c3653a3d2 100644 --- a/examples/offline_inference/glm_image/end2end.py +++ b/examples/offline_inference/glm_image/end2end.py @@ -51,6 +51,41 @@ SEED = 42 +# GLM-Image special tokens +GLM_IMAGE_EOS_TOKEN_ID = 16385 # eos_token_id from generation_config.json +GLM_IMAGE_VISION_VOCAB_SIZE = 16512 # top_k should be vision_vocab_size + + +def compute_max_tokens(height: int, width: int, factor: int = 32) -> int: + """ + Compute max_new_tokens for GLM-Image AR generation. + + GLM-Image generates tokens in this order for text-to-image: + 1. Small preview image (half resolution in each dimension) + 2. Large target image (full resolution) + 3. EOS token + + Args: + height: Target image height in pixels + width: Target image width in pixels + factor: Downsampling factor (32 for GLM-Image AR output) + + Returns: + Total number of tokens to generate (small + large + EOS) + """ + # Large image tokens (target resolution) + token_h = height // factor + token_w = width // factor + large_tokens = token_h * token_w + + # Small preview tokens (half resolution in each dimension) + small_h = token_h // 2 + small_w = token_w // 2 + small_tokens = small_h * small_w + + # Total: small + large + EOS + return small_tokens + large_tokens + 1 + def load_image(image_path: str) -> Image.Image: """Load an image from file path.""" @@ -226,14 +261,28 @@ def main(args: argparse.Namespace) -> None: # For multistage, the AR stage may need sampling params from vllm import SamplingParams - # IMPORTANT: GLM-Image AR model requires sampling (not greedy) for proper - # image token generation. 
Using temperature=0.0 causes degenerate repetitive - # tokens and black images. Must use temperature > 0 (default: 1.0). + # Compute max_tokens dynamically based on target image size + target_height = prompt_dict.get("height", 1024) + target_width = prompt_dict.get("width", 1024) + calculated_max_tokens = compute_max_tokens(target_height, target_width) + + # Use calculated value unless user explicitly specified a different value + # Default args.max_tokens is 16384 (very large), so prefer calculated value + effective_max_tokens = calculated_max_tokens if args.max_tokens == 16384 else args.max_tokens + + if args.verbose: + print(f"AR max_tokens: {effective_max_tokens} (calculated: {calculated_max_tokens}, arg: {args.max_tokens})") + + # IMPORTANT: GLM-Image AR model requires these exact sampling parameters + # from generation_config.json for proper image token generation. + # - temperature=0.9, top_p=0.75, top_k=16512 (vision_vocab_size) + # - stop_token_ids=[16385] (eos_token_id) is CRITICAL to stop generation ar_sampling_params = SamplingParams( - temperature=1.0, # Must use sampling for image token diversity - top_p=1.0, - top_k=-1, - max_tokens=args.max_tokens, + temperature=0.9, # From generation_config.json + top_p=0.75, # From generation_config.json + top_k=GLM_IMAGE_VISION_VOCAB_SIZE, # 16512, vision vocabulary size + max_tokens=effective_max_tokens, + stop_token_ids=[GLM_IMAGE_EOS_TOKEN_ID], # 16385, CRITICAL for stopping seed=args.seed, detokenize=False, ) diff --git a/vllm_omni/model_executor/stage_configs/glm_image.yaml b/vllm_omni/model_executor/stage_configs/glm_image.yaml index edf7f6219b7..7deca12c9ba 100644 --- a/vllm_omni/model_executor/stage_configs/glm_image.yaml +++ b/vllm_omni/model_executor/stage_configs/glm_image.yaml @@ -31,8 +31,9 @@ stage_args: default_sampling_params: temperature: 0.9 # From model's generation_config.json top_p: 0.75 # From model's generation_config.json - top_k: -1 - max_tokens: 16384 # Support up to 2048x2048 images 
(64x64 tokens * 4 = 16384) + top_k: 16512 # vision_vocab_size from generation_config.json + max_tokens: 1281 # For 1024x1024: small(16x16=256) + large(32x32=1024) + EOS(1) + stop_token_ids: [16385] # eos_token_id from generation_config.json seed: 42 detokenize: false diff --git a/vllm_omni/model_executor/stage_configs/glm_image_muilticonnector.yaml b/vllm_omni/model_executor/stage_configs/glm_image_muilticonnector.yaml index c32b1cd3d07..d1e10cb4065 100644 --- a/vllm_omni/model_executor/stage_configs/glm_image_muilticonnector.yaml +++ b/vllm_omni/model_executor/stage_configs/glm_image_muilticonnector.yaml @@ -31,10 +31,11 @@ stage_args: final_output: false # AR is not the final output is_comprehension: true default_sampling_params: - temperature: 0.0 - top_p: 1.0 - top_k: -1 - max_tokens: 16384 # Support up to 2048x2048 images + temperature: 0.9 # From model's generation_config.json + top_p: 0.75 # From model's generation_config.json + top_k: 16512 # vision_vocab_size from generation_config.json + max_tokens: 1281 # For 1024x1024: small(16x16=256) + large(32x32=1024) + EOS(1) + stop_token_ids: [16385] # eos_token_id from generation_config.json seed: 42 detokenize: false From a6ae872728cfd1e74065e9e3220d1b2599670295 Mon Sep 17 00:00:00 2001 From: JaredforReal Date: Fri, 16 Jan 2026 15:50:24 +0800 Subject: [PATCH 41/53] fix mrope calc Signed-off-by: JaredforReal --- vllm_omni/worker/gpu_model_runner.py | 143 ++++++++++++++++++--------- 1 file changed, 97 insertions(+), 46 deletions(-) diff --git a/vllm_omni/worker/gpu_model_runner.py b/vllm_omni/worker/gpu_model_runner.py index 24d4ffd028e..b59298b9a25 100644 --- a/vllm_omni/worker/gpu_model_runner.py +++ b/vllm_omni/worker/gpu_model_runner.py @@ -8,7 +8,7 @@ from vllm.forward_context import set_forward_context from vllm.logger import init_logger from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding -from vllm.model_executor.models.interfaces import supports_mrope, supports_mm_encoder_only +from 
vllm.model_executor.models.interfaces import supports_mm_encoder_only, supports_mrope from vllm.model_executor.models.interfaces_base import VllmModelForPooling from vllm.sampling_params import SamplingType from vllm.utils.import_utils import LazyLoader @@ -17,6 +17,7 @@ from vllm.v1.worker.gpu_input_batch import CachedRequestState from vllm.v1.worker.gpu_model_runner import GPUModelRunner, IntermediateTensors, PerLayerAttnMetadata from vllm.v1.worker.ubatch_utils import maybe_create_ubatch_slices + from vllm_omni.model_executor.models.output_templates import OmniOutput if TYPE_CHECKING: @@ -116,6 +117,79 @@ def _init_mrope_positions(self, req_state: CachedRequestState): use_audio_in_video=use_audio_in_video, ) + def _calc_mrope_positions(self, scheduler_output: "SchedulerOutput"): + """Calculate M-RoPE positions for scheduled tokens. + + This method overrides the base vLLM implementation to support models + like GLM-Image that pre-compute decode positions with 2D spatial encoding. + + For GLM-Image text-to-image generation: + - Prefill positions: Use pre-computed positions from get_mrope_input_positions + - Decode positions: Also use pre-computed 2D spatial positions instead of + the default linear positions from get_next_input_positions_tensor + + The key difference from vLLM's default behavior: + - Default vLLM: decode positions use linear [N, N+1, N+2, ...] 
for all 3 dims + - GLM-Image needs: temporal=constant, height/width=2D grid pattern + """ + from vllm.multimodal.utils import length_from_prompt_token_ids_or_embeds + + mrope_pos_ptr = 0 + for index, req_id in enumerate(self.input_batch.req_ids): + req = self.requests[req_id] + assert req.mrope_positions is not None + + num_computed_tokens = self.input_batch.num_computed_tokens_cpu[index] + num_scheduled_tokens = scheduler_output.num_scheduled_tokens[req_id] + num_prompt_tokens = length_from_prompt_token_ids_or_embeds(req.prompt_token_ids, req.prompt_embeds) + + if num_computed_tokens + num_scheduled_tokens > num_prompt_tokens: + prompt_part_len = max(0, num_prompt_tokens - num_computed_tokens) + completion_part_len = max(0, num_scheduled_tokens - prompt_part_len) + else: + prompt_part_len = num_scheduled_tokens + completion_part_len = 0 + + assert num_scheduled_tokens == prompt_part_len + completion_part_len + + if prompt_part_len > 0: + # prompt's mrope_positions are pre-computed + dst_start = mrope_pos_ptr + dst_end = mrope_pos_ptr + prompt_part_len + src_start = num_computed_tokens + src_end = num_computed_tokens + prompt_part_len + + self.mrope_positions.np[:, dst_start:dst_end] = req.mrope_positions[:, src_start:src_end] + mrope_pos_ptr += prompt_part_len + + if completion_part_len > 0: + dst_start = mrope_pos_ptr + + # Check if pre-computed decode positions are available + # GLM-Image's get_mrope_input_positions returns positions for + # both prefill and decode phases with proper 2D spatial encoding + total_precomputed = req.mrope_positions.shape[1] + decode_start = num_computed_tokens + prompt_part_len + decode_end = decode_start + completion_part_len + + if decode_end <= total_precomputed: + # Use pre-computed decode positions (for GLM-Image 2D spatial) + self.mrope_positions.np[:, dst_start : dst_start + completion_part_len] = req.mrope_positions[ + :, decode_start:decode_end + ] + else: + # Fallback to default linear positions for text-only generation + 
assert req.mrope_position_delta is not None + MRotaryEmbedding.get_next_input_positions_tensor( + out=self.mrope_positions.np, + out_offset=dst_start, + mrope_position_delta=req.mrope_position_delta, + context_len=num_computed_tokens + prompt_part_len, + num_new_tokens=completion_part_len, + ) + + mrope_pos_ptr += completion_part_len + def _update_states(self, scheduler_output: "SchedulerOutput") -> None: """Update the cached states and the persistent batch with the scheduler output. @@ -248,7 +322,7 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: except Exception as e: logger.error(f"Error decoding additional information: {e}") pass - + if sampling_params and sampling_params.prompt_logprobs is not None: self.num_prompt_logprobs[req_id] = ( self.input_batch.vocab_size @@ -258,11 +332,11 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: # Only relevant for models using M-RoPE (e.g, Qwen2-VL) if self.uses_mrope: self._init_mrope_positions(req_state) - + # Only relevant for models using XD-RoPE (e.g, HunYuan-VL) if self.uses_xdrope_dim > 0: self._init_xdrope_positions(req_state) - + reqs_to_add.append(self.requests[req_id]) # Update the states of the running/resumed requests. @@ -281,14 +355,14 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: resumed_from_preemption = req_id in req_data.resumed_req_ids num_output_tokens = req_data.num_output_tokens[i] req_index = self.input_batch.req_id_to_index.get(req_id) - + if req_state.prev_num_draft_len and self.use_async_scheduling: # prev_num_draft_len is used in async scheduling mode with # spec decode. it indicates if need to update num_computed_tokens # of the request. for example: # fist step: num_computed_tokens = 0, spec_tokens = [], # prev_num_draft_len = 0. - # second step: num_computed_tokens = 100(prompt lenth), + # second step: num_computed_tokens = 100(prompt length), # spec_tokens = [a,b], prev_num_draft_len = 0. 
# third step: num_computed_tokens = 100 + 2, spec_tokens = [c,d], # prev_num_draft_len = 2. @@ -305,7 +379,7 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: num_rejected = req_state.prev_num_draft_len - num_accepted num_computed_tokens -= num_rejected req_state.output_token_ids.extend([-1] * num_accepted) - + # Update the cached states. req_state.num_computed_tokens = num_computed_tokens @@ -327,12 +401,9 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: # failure. Align the cached state. del req_state.output_token_ids[num_output_tokens:] if req_index is not None: - end_idx = ( - self.input_batch.num_prompt_tokens[req_index] - + num_output_tokens - ) + end_idx = self.input_batch.num_prompt_tokens[req_index] + num_output_tokens self.input_batch.num_tokens_no_spec[req_index] = end_idx - + # Update the block IDs. if not resumed_from_preemption: if new_block_ids is not None: @@ -372,15 +443,12 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: # Add new_token_ids to token_ids_cpu. start_token_index = num_computed_tokens end_token_index = num_computed_tokens + len(new_token_ids) - self.input_batch.token_ids_cpu[ - req_index, start_token_index:end_token_index - ] = new_token_ids + self.input_batch.token_ids_cpu[req_index, start_token_index:end_token_index] = new_token_ids self.input_batch.num_tokens_no_spec[req_index] = end_token_index # Add spec_token_ids to token_ids_cpu. self.input_batch.update_req_spec_token_ids(req_state, scheduled_spec_tokens) - # Add the new or resumed requests to the persistent batch. # The smaller empty indices are filled first. for request in reqs_to_add: @@ -457,10 +525,7 @@ def _dummy_run( # mm encoder dummy run may need to add in the future. 
return torch.tensor([]), torch.tensor([]) - assert ( - cudagraph_runtime_mode is None - or cudagraph_runtime_mode.valid_runtime_modes() - ) + assert cudagraph_runtime_mode is None or cudagraph_runtime_mode.valid_runtime_modes() # If cudagraph_mode.decode_mode() == FULL and # cudagraph_mode.separate_routine(). This means that we are using @@ -521,8 +586,7 @@ def _dummy_run( max_num_scheduled_tokens=max_query_len, use_cascade_attn=False, allow_microbatching=allow_microbatching, - force_eager=is_profile - or (cudagraph_runtime_mode == CUDAGraphMode.NONE), + force_eager=is_profile or (cudagraph_runtime_mode == CUDAGraphMode.NONE), # `force_uniform_decode` is used for cudagraph capture; because for # capturing mixed prefill-decode batches, we sometimes use # num_tokens == num_reqs which looks like a uniform decode batch to the @@ -544,9 +608,7 @@ def _dummy_run( ) num_tokens_padded = batch_desc.num_tokens - num_reqs_padded = ( - batch_desc.num_reqs if batch_desc.num_reqs is not None else num_reqs - ) + num_reqs_padded = batch_desc.num_reqs if batch_desc.num_reqs is not None else num_reqs ubatch_slices, ubatch_slices_padded = maybe_create_ubatch_slices( should_ubatch, num_scheduled_tokens, @@ -625,17 +687,13 @@ def _dummy_run( intermediate_tensors = None else: if self.intermediate_tensors is None: - self.intermediate_tensors = ( - self.model.make_empty_intermediate_tensors( - batch_size=self.max_num_tokens, - dtype=self.model_config.dtype, - device=self.device, - ) + self.intermediate_tensors = self.model.make_empty_intermediate_tensors( + batch_size=self.max_num_tokens, + dtype=self.model_config.dtype, + device=self.device, ) - intermediate_tensors = self.sync_and_slice_intermediate_tensors( - num_tokens_padded, None, False - ) + intermediate_tensors = self.sync_and_slice_intermediate_tensors(num_tokens_padded, None, False) if ubatch_slices_padded is not None: # Adjust values to reflect a single ubatch. 
@@ -676,14 +734,8 @@ def _dummy_run( # Therefore only use cudagraphs if the main model uses PIECEWISE # NOTE(lucas): this is a hack, need to clean up. use_cudagraphs = ( - ( - is_graph_capturing - and cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE - ) - or ( - not is_graph_capturing - and cudagraph_runtime_mode != CUDAGraphMode.NONE - ) + (is_graph_capturing and cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE) + or (not is_graph_capturing and cudagraph_runtime_mode != CUDAGraphMode.NONE) ) and not self.speculative_config.enforce_eager # Note(gnovack) - We need to disable cudagraphs for one of the two @@ -721,9 +773,7 @@ def _dummy_run( self.eplb_step(is_dummy=True, is_profile=is_profile) logit_indices = np.cumsum(num_scheduled_tokens) - 1 - logit_indices_device = torch.from_numpy(logit_indices).to( - self.device, non_blocking=True - ) + logit_indices_device = torch.from_numpy(logit_indices).to(self.device, non_blocking=True) return hidden_states, hidden_states[logit_indices_device] def _decode_and_store_request_payloads(self, scheduler_output: "SchedulerOutput") -> None: @@ -1005,9 +1055,10 @@ def _preprocess( except Exception as e: logger.error(f"Error in preprocess for request {req_id}: {e}") import traceback + traceback.print_exc() raise e - #TODO: This is Model Specific Code, need to be generalized in the future ZTC + # TODO: This is Model Specific Code, need to be generalized in the future ZTC # run talker mtp decode if hasattr(self.model, "talker_mtp"): _cudagraph_mode, batch_desc, _, _, _ = self._determine_batch_execution_and_padding( From 48881ed8276f4724046aa50d7d30a32ea22abcce Mon Sep 17 00:00:00 2001 From: JaredforReal Date: Fri, 16 Jan 2026 16:36:51 +0800 Subject: [PATCH 42/53] fix import Signed-off-by: JaredforReal --- vllm_omni/worker/gpu_model_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm_omni/worker/gpu_model_runner.py b/vllm_omni/worker/gpu_model_runner.py index b59298b9a25..93cb3771566 100644 --- 
a/vllm_omni/worker/gpu_model_runner.py +++ b/vllm_omni/worker/gpu_model_runner.py @@ -132,7 +132,7 @@ def _calc_mrope_positions(self, scheduler_output: "SchedulerOutput"): - Default vLLM: decode positions use linear [N, N+1, N+2, ...] for all 3 dims - GLM-Image needs: temporal=constant, height/width=2D grid pattern """ - from vllm.multimodal.utils import length_from_prompt_token_ids_or_embeds + from vllm.utils import length_from_prompt_token_ids_or_embeds mrope_pos_ptr = 0 for index, req_id in enumerate(self.input_batch.req_ids): From 2dda88fd2ead135bfacc0691b88dad70957b8a4c Mon Sep 17 00:00:00 2001 From: JaredforReal Date: Fri, 16 Jan 2026 16:43:42 +0800 Subject: [PATCH 43/53] add debug logging Signed-off-by: JaredforReal --- vllm_omni/worker/gpu_model_runner.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/vllm_omni/worker/gpu_model_runner.py b/vllm_omni/worker/gpu_model_runner.py index 93cb3771566..72f5051abf2 100644 --- a/vllm_omni/worker/gpu_model_runner.py +++ b/vllm_omni/worker/gpu_model_runner.py @@ -106,6 +106,12 @@ def _init_mrope_positions(self, req_state: CachedRequestState): audio_feature_lengths=audio_feature_lengths, use_audio_in_video=use_audio_in_video, ) + logger.info( + f"[M-RoPE Init] prompt_len={len(req_state.prompt_token_ids)}, " + f"mrope_positions_shape={req_state.mrope_positions.shape}, " + f"mrope_position_delta={req_state.mrope_position_delta}, " + f"image_grid_thw={image_grid_thw}" + ) else: req_state.mrope_positions, req_state.mrope_position_delta = MRotaryEmbedding.get_input_positions_tensor( req_state.prompt_token_ids, @@ -177,8 +183,18 @@ def _calc_mrope_positions(self, scheduler_output: "SchedulerOutput"): self.mrope_positions.np[:, dst_start : dst_start + completion_part_len] = req.mrope_positions[ :, decode_start:decode_end ] + logger.debug( + f"[M-RoPE] Using pre-computed decode positions: " + f"decode_start={decode_start}, decode_end={decode_end}, " + f"total_precomputed={total_precomputed}" + ) else: # 
Fallback to default linear positions for text-only generation + logger.warning( + f"[M-RoPE] Falling back to linear positions! " + f"decode_end={decode_end} > total_precomputed={total_precomputed}, " + f"num_prompt_tokens={num_prompt_tokens}, completion_part_len={completion_part_len}" + ) assert req.mrope_position_delta is not None MRotaryEmbedding.get_next_input_positions_tensor( out=self.mrope_positions.np, From e676c9392699cc8a80daa62fa0c724489cb73e2d Mon Sep 17 00:00:00 2001 From: JaredforReal Date: Fri, 16 Jan 2026 16:52:47 +0800 Subject: [PATCH 44/53] more logs Signed-off-by: JaredforReal --- .../models/glm_image/glm_image_ar.py | 42 ++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) diff --git a/vllm_omni/model_executor/models/glm_image/glm_image_ar.py b/vllm_omni/model_executor/models/glm_image/glm_image_ar.py index 5ed879016f9..75ea6fbd902 100644 --- a/vllm_omni/model_executor/models/glm_image/glm_image_ar.py +++ b/vllm_omni/model_executor/models/glm_image/glm_image_ar.py @@ -405,8 +405,17 @@ def _get_mm_fields_config( result = {} # image_grid_thw is needed for both t2i and i2i (for M-RoPE position encoding) + # For text-to-image, we don't have pixel_values but still need image_grid_thw + # Use "image" modality so it gets processed, or use flat for metadata-only fields if "image_grid_thw" in hf_inputs: - result["image_grid_thw"] = MultiModalFieldConfig.batched("image") + # Check if we have pixel_values (image-to-image) or not (text-to-image) + if "pixel_values" in hf_inputs: + # Image-to-image: batch with image modality + result["image_grid_thw"] = MultiModalFieldConfig.batched("image") + else: + # Text-to-image: use flat config to ensure it's passed through + # This is metadata that doesn't depend on actual image data + result["image_grid_thw"] = MultiModalFieldConfig.flat("image", allow_missing=True) # pixel_values only present in image-to-image mode if "pixel_values" in hf_inputs: @@ -1742,6 +1751,12 @@ def get_mrope_input_positions( 
Returns: Tuple of (position_ids [3, seq_len + decode_len], mrope_position_delta) """ + logger.info( + f"[GLM-Image M-RoPE] get_mrope_input_positions called: " + f"input_tokens_len={len(input_tokens)}, mm_features={mm_features is not None}, " + f"image_grid_thw={image_grid_thw}, kwargs_keys={list(kwargs.keys())}" + ) + # Get image_grid_thw from either the direct arg or mm_features if image_grid_thw is None and mm_features is not None: # Gather image grid info from multimodal features @@ -1758,6 +1773,31 @@ def get_mrope_input_positions( image_start_token_id = hf_config.image_start_token_id image_end_token_id = hf_config.image_end_token_id + # For text-to-image: parse grid info from input tokens if not provided + # Input format: "text<sop>H W<eop><sop>h w<eop><|dit_token_16384|>" + # where H W is large image grid (e.g., 32 32) and h w is small image grid (e.g., 16 16) + if not image_grid_thw: + # Try to parse from kwargs (passed from processor) + hf_config_arg = kwargs.get("hf_config") + if hf_config_arg is not None and hasattr(hf_config_arg, "image_grid_thw"): + image_grid_thw = hf_config_arg.image_grid_thw + + # If still empty, try to infer from input tokens + if not image_grid_thw: + # Check if this is a text-to-image request by looking for dit_token + # dit_token_id = image_start_token_id = 16384 + has_dit_token = image_start_token_id in input_tokens + has_end_token = image_end_token_id in input_tokens + + # Text-to-image: has dit_token but no end_token (nothing generated yet) + if has_dit_token and not has_end_token: + # Default grids for text-to-image: large (32x32) and small (16x16) + # These are the standard GLM-Image generation grids + # The actual grid sizes should be parsed from the prompt, but for now use defaults + # TODO: Parse grid sizes from prompt tokens like "32 32" + image_grid_thw = [[1, 32, 32], [1, 16, 16]] + logger.info(f"[GLM-Image M-RoPE] Text-to-image detected, using default grids: {image_grid_thw}") + seq_len = len(input_tokens) llm_pos_ids_list:
list[torch.Tensor] = [] From fc540f720f1d4dd76b7f93a24a5b4b9be30f2e99 Mon Sep 17 00:00:00 2001 From: JaredforReal Date: Fri, 16 Jan 2026 16:56:47 +0800 Subject: [PATCH 45/53] fix config Signed-off-by: JaredforReal --- .../model_executor/models/glm_image/glm_image_ar.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/vllm_omni/model_executor/models/glm_image/glm_image_ar.py b/vllm_omni/model_executor/models/glm_image/glm_image_ar.py index 75ea6fbd902..4c33611a959 100644 --- a/vllm_omni/model_executor/models/glm_image/glm_image_ar.py +++ b/vllm_omni/model_executor/models/glm_image/glm_image_ar.py @@ -406,16 +406,10 @@ def _get_mm_fields_config( # image_grid_thw is needed for both t2i and i2i (for M-RoPE position encoding) # For text-to-image, we don't have pixel_values but still need image_grid_thw - # Use "image" modality so it gets processed, or use flat for metadata-only fields + # Always use batched("image") - the fallback in get_mrope_input_positions will handle + # the case where image_grid_thw is not passed through mm_features if "image_grid_thw" in hf_inputs: - # Check if we have pixel_values (image-to-image) or not (text-to-image) - if "pixel_values" in hf_inputs: - # Image-to-image: batch with image modality - result["image_grid_thw"] = MultiModalFieldConfig.batched("image") - else: - # Text-to-image: use flat config to ensure it's passed through - # This is metadata that doesn't depend on actual image data - result["image_grid_thw"] = MultiModalFieldConfig.flat("image", allow_missing=True) + result["image_grid_thw"] = MultiModalFieldConfig.batched("image") # pixel_values only present in image-to-image mode if "pixel_values" in hf_inputs: From 585cecdce99d31e59e816649899dfb90fa8b745c Mon Sep 17 00:00:00 2001 From: JaredforReal Date: Fri, 16 Jan 2026 17:03:22 +0800 Subject: [PATCH 46/53] more logs Signed-off-by: JaredforReal --- .../models/glm_image/glm_image_ar.py | 8 +++++--- vllm_omni/worker/gpu_model_runner.py | 14 
++++++++++---- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/vllm_omni/model_executor/models/glm_image/glm_image_ar.py b/vllm_omni/model_executor/models/glm_image/glm_image_ar.py index 4c33611a959..9df38a2d0b8 100644 --- a/vllm_omni/model_executor/models/glm_image/glm_image_ar.py +++ b/vllm_omni/model_executor/models/glm_image/glm_image_ar.py @@ -1745,7 +1745,7 @@ def get_mrope_input_positions( Returns: Tuple of (position_ids [3, seq_len + decode_len], mrope_position_delta) """ - logger.info( + logger.warning( f"[GLM-Image M-RoPE] get_mrope_input_positions called: " f"input_tokens_len={len(input_tokens)}, mm_features={mm_features is not None}, " f"image_grid_thw={image_grid_thw}, kwargs_keys={list(kwargs.keys())}" @@ -1790,7 +1790,7 @@ def get_mrope_input_positions( # The actual grid sizes should be parsed from the prompt, but for now use defaults # TODO: Parse grid sizes from prompt tokens like "32 32" image_grid_thw = [[1, 32, 32], [1, 16, 16]] - logger.info(f"[GLM-Image M-RoPE] Text-to-image detected, using default grids: {image_grid_thw}") + logger.warning(f"[GLM-Image M-RoPE] Text-to-image detected, using default grids: {image_grid_thw}") seq_len = len(input_tokens) llm_pos_ids_list: list[torch.Tensor] = [] @@ -1882,15 +1882,17 @@ def get_mrope_input_positions( llm_positions = torch.cat([prefill_positions, decode_positions], dim=1) # Log for debugging - logger.info( + logger.warning( f"[GLM-Image M-RoPE] prefill_len={prefill_positions.shape[1]}, " f"decode_len={decode_positions.shape[1]}, total_len={llm_positions.shape[1]}" ) else: llm_positions = prefill_positions + logger.warning(f"[GLM-Image M-RoPE] No decode grids, prefill_len={prefill_positions.shape[1]}") else: # Pure text - all dimensions same llm_positions = torch.arange(seq_len).view(1, -1).expand(3, -1) + logger.warning(f"[GLM-Image M-RoPE] Pure text mode, positions_len={seq_len}") mrope_position_delta = (llm_positions.max() + 1 - seq_len).item() return llm_positions, 
mrope_position_delta diff --git a/vllm_omni/worker/gpu_model_runner.py b/vllm_omni/worker/gpu_model_runner.py index 72f5051abf2..55d7a5d2e32 100644 --- a/vllm_omni/worker/gpu_model_runner.py +++ b/vllm_omni/worker/gpu_model_runner.py @@ -96,6 +96,12 @@ def _init_mrope_positions(self, req_state: CachedRequestState): use_audio_in_video = bool(use_audio_in_video_value.item()) if supports_mrope(self.model): + logger.warning( + f"[M-RoPE Init] Calling get_mrope_input_positions: " + f"prompt_len={len(req_state.prompt_token_ids)}, " + f"mm_features_count={len(req_state.mm_features) if req_state.mm_features else 0}, " + f"image_grid_thw={image_grid_thw}" + ) req_state.mrope_positions, req_state.mrope_position_delta = self.model.get_mrope_input_positions( req_state.prompt_token_ids, mm_features=req_state.mm_features, @@ -106,13 +112,13 @@ def _init_mrope_positions(self, req_state: CachedRequestState): audio_feature_lengths=audio_feature_lengths, use_audio_in_video=use_audio_in_video, ) - logger.info( - f"[M-RoPE Init] prompt_len={len(req_state.prompt_token_ids)}, " + logger.warning( + f"[M-RoPE Init] Result: " f"mrope_positions_shape={req_state.mrope_positions.shape}, " - f"mrope_position_delta={req_state.mrope_position_delta}, " - f"image_grid_thw={image_grid_thw}" + f"mrope_position_delta={req_state.mrope_position_delta}" ) else: + logger.warning("[M-RoPE Init] Model does not support M-RoPE, using default") req_state.mrope_positions, req_state.mrope_position_delta = MRotaryEmbedding.get_input_positions_tensor( req_state.prompt_token_ids, hf_config=self.model_config.hf_config, From 96208755d15328615f2312042e0ef6a240056e34 Mon Sep 17 00:00:00 2001 From: JaredforReal Date: Fri, 16 Jan 2026 17:19:06 +0800 Subject: [PATCH 47/53] correct text-to-image detection for M-RoPE position computation Signed-off-by: JaredforReal --- .../models/glm_image/glm_image_ar.py | 134 +++++++++++++++--- 1 file changed, 116 insertions(+), 18 deletions(-) diff --git 
a/vllm_omni/model_executor/models/glm_image/glm_image_ar.py b/vllm_omni/model_executor/models/glm_image/glm_image_ar.py index 9df38a2d0b8..8e359a9eb2e 100644 --- a/vllm_omni/model_executor/models/glm_image/glm_image_ar.py +++ b/vllm_omni/model_executor/models/glm_image/glm_image_ar.py @@ -1715,6 +1715,95 @@ def get_image_tokens( """Tokenize image features with VQ-VAE.""" return self.model.get_image_tokens(hidden_states, image_grid_thw) + def _parse_grid_from_tokens( + self, + input_tokens: list[int], + hf_config, + ) -> list[list[int]] | None: + """ + Parse image grid dimensions from prompt tokens. + + For text-to-image, the prompt format is: + "textH Wh w" + + Where: + - is grid_bos_token_id (start of phrase, marks grid dimension start) + - is grid_eos_token_id (end of phrase, marks grid dimension end) + - H W is large image grid (e.g., "32 32" for 1024x1024) + - h w is small image grid (e.g., "16 16" for preview) + - is image_start_token_id (16384, marks start of image generation) + + Returns: + List of grids [[1, H, W], [1, h, w]] or None if parsing fails + """ + try: + # Get special token IDs from config or tokenizer + # We need grid_bos_token_id and grid_eos_token_id + # These are typically and tokens + + # First try to get from hf_config + grid_bos_id = getattr(hf_config, "grid_bos_token_id", None) + grid_eos_id = getattr(hf_config, "grid_eos_token_id", None) + + # If not in config, we need to infer from token patterns + # For GLM-Image, looking at the processor code: + # - grid_bos_token = tokenizer.grid_bos_token + # - grid_eos_token = tokenizer.grid_eos_token + # These are typically single-token markers + + if grid_bos_id is None or grid_eos_id is None: + # Try to find pattern in tokens: look for repeated pattern of + # [marker] [number] [number] [marker] + # where numbers are small positive integers (grid dimensions like 16, 32) + + # Use heuristics: grid dimensions are typically between 8 and 128 + # represented as single tokens that decode to numbers + 
+ # For now, return None and let caller use defaults + logger.warning( + "[GLM-Image M-RoPE] Cannot find grid_bos_token_id/grid_eos_token_id, will use default grids" + ) + return None + + # Find all ... regions + grids = [] + i = 0 + while i < len(input_tokens): + if input_tokens[i] == grid_bos_id: + # Found start of grid region, find end + j = i + 1 + while j < len(input_tokens) and input_tokens[j] != grid_eos_id: + j += 1 + + if j < len(input_tokens): + # Extract tokens between and + grid_tokens = input_tokens[i + 1 : j] + + # These should decode to "H W" format + # For now, we assume they're numeric token IDs that represent the dimensions + # This is a simplification - actual implementation would need tokenizer + + if len(grid_tokens) >= 2: + # Assume first two tokens are H and W values + # This is a heuristic - actual values depend on tokenizer + # For GLM-Image with ChatGLM tokenizer, numbers are tokenized specially + h = grid_tokens[0] if grid_tokens[0] < 256 else 32 # fallback + w = grid_tokens[1] if grid_tokens[1] < 256 else 32 # fallback + grids.append([1, h, w]) + + i = j + 1 + else: + i += 1 + + if len(grids) >= 2: + return grids + + return None + + except Exception as e: + logger.warning(f"[GLM-Image M-RoPE] Error parsing grids from tokens: {e}") + return None + def get_mrope_input_positions( self, input_tokens: list[int], @@ -1745,10 +1834,17 @@ def get_mrope_input_positions( Returns: Tuple of (position_ids [3, seq_len + decode_len], mrope_position_delta) """ + hf_config = self.config + image_start_token_id = hf_config.image_start_token_id + image_end_token_id = hf_config.image_end_token_id + logger.warning( f"[GLM-Image M-RoPE] get_mrope_input_positions called: " f"input_tokens_len={len(input_tokens)}, mm_features={mm_features is not None}, " - f"image_grid_thw={image_grid_thw}, kwargs_keys={list(kwargs.keys())}" + f"image_grid_thw={image_grid_thw}, kwargs_keys={list(kwargs.keys())}, " + f"last_token={input_tokens[-1] if input_tokens else None}, " + 
f"image_start_token_id={image_start_token_id}, " + f"image_end_token_id={image_end_token_id}" ) # Get image_grid_thw from either the direct arg or mm_features @@ -1763,13 +1859,9 @@ def get_mrope_input_positions( if image_grid_thw is None: image_grid_thw = [] - hf_config = self.config - image_start_token_id = hf_config.image_start_token_id - image_end_token_id = hf_config.image_end_token_id - # For text-to-image: parse grid info from input tokens if not provided - # Input format: "textH Wh w<|dit_token_16384|>" - # where H W is large image grid (e.g., 32 32) and h w is small image grid (e.g., 16 16) + # Input format: "textH Wh w" where =image_start_token_id=16384 + # For 1024x1024: H=32, W=32 (large), h=16, w=16 (small preview) if not image_grid_thw: # Try to parse from kwargs (passed from processor) hf_config_arg = kwargs.get("hf_config") @@ -1778,19 +1870,25 @@ def get_mrope_input_positions( # If still empty, try to infer from input tokens if not image_grid_thw: - # Check if this is a text-to-image request by looking for dit_token - # dit_token_id = image_start_token_id = 16384 - has_dit_token = image_start_token_id in input_tokens + # Check if this is a text-to-image request: + # - Prompt ends with image_start_token_id (16384, the token for image generation) + # - No image_end_token_id (16385) in prompt (no completed images) + prompt_ends_with_start = len(input_tokens) > 0 and input_tokens[-1] == image_start_token_id has_end_token = image_end_token_id in input_tokens - # Text-to-image: has dit_token but no end_token (nothing generated yet) - if has_dit_token and not has_end_token: - # Default grids for text-to-image: large (32x32) and small (16x16) - # These are the standard GLM-Image generation grids - # The actual grid sizes should be parsed from the prompt, but for now use defaults - # TODO: Parse grid sizes from prompt tokens like "32 32" - image_grid_thw = [[1, 32, 32], [1, 16, 16]] - logger.warning(f"[GLM-Image M-RoPE] Text-to-image detected, using default 
grids: {image_grid_thw}") + # Text-to-image: ends with start token but no end token + if prompt_ends_with_start and not has_end_token: + # Parse grid dimensions from prompt tokens + # Format: ... H W h w + # We need to find the grid_bos_token () and grid_eos_token () + # and extract the dimensions between them + image_grid_thw = self._parse_grid_from_tokens(input_tokens, hf_config) + if image_grid_thw: + logger.warning(f"[GLM-Image M-RoPE] Text-to-image detected, parsed grids: {image_grid_thw}") + else: + # Fallback to default 1024x1024 grids if parsing fails + image_grid_thw = [[1, 32, 32], [1, 16, 16]] + logger.warning(f"[GLM-Image M-RoPE] Text-to-image, using default grids: {image_grid_thw}") seq_len = len(input_tokens) llm_pos_ids_list: list[torch.Tensor] = [] From 0dad1617729b529e14c3df5f21a67083a633003e Mon Sep 17 00:00:00 2001 From: JaredforReal Date: Fri, 16 Jan 2026 17:31:54 +0800 Subject: [PATCH 48/53] override config detection Signed-off-by: JaredforReal --- vllm_omni/worker/gpu_model_runner.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/vllm_omni/worker/gpu_model_runner.py b/vllm_omni/worker/gpu_model_runner.py index 55d7a5d2e32..087a6f44e2d 100644 --- a/vllm_omni/worker/gpu_model_runner.py +++ b/vllm_omni/worker/gpu_model_runner.py @@ -40,6 +40,22 @@ def __init__(self, *args, **kwargs): self._omni_num_scheduled_tokens_np: np.ndarray | None = None self._omni_last_model_output: object | None = None + # Override uses_mrope for models that use M-RoPE but vLLM's default + # detection fails (e.g., GLM-Image has mrope_section in text_config, + # but vLLM only checks top-level hf_config.rope_parameters) + if not self.uses_mrope: + hf_config = self.model_config.hf_config + # Check text_config for nested configs (like GLM-Image) + text_config = getattr(hf_config, "text_config", None) + if text_config is not None: + rope_params = getattr(text_config, "rope_parameters", None) + if rope_params is not None and rope_params.get("mrope_section") 
is not None: + self.uses_mrope = True + logger.info( + f"[OmniGPUModelRunner] Enabling M-RoPE for model_type={hf_config.model_type} " + f"(detected mrope_section in text_config)" + ) + def load_model(self, *args, **kwargs) -> None: super().load_model(*args, **kwargs) # TODO move this model specific logic to a separate class From ed232c7d4b525531dbd3ab5dd80f6bf4a25a2b7e Mon Sep 17 00:00:00 2001 From: JaredforReal Date: Fri, 16 Jan 2026 17:37:08 +0800 Subject: [PATCH 49/53] use a straight detection of mrope_section Signed-off-by: JaredforReal --- vllm_omni/worker/gpu_model_runner.py | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/vllm_omni/worker/gpu_model_runner.py b/vllm_omni/worker/gpu_model_runner.py index 087a6f44e2d..8a24b65a0d1 100644 --- a/vllm_omni/worker/gpu_model_runner.py +++ b/vllm_omni/worker/gpu_model_runner.py @@ -41,20 +41,17 @@ def __init__(self, *args, **kwargs): self._omni_last_model_output: object | None = None # Override uses_mrope for models that use M-RoPE but vLLM's default - # detection fails (e.g., GLM-Image has mrope_section in text_config, - # but vLLM only checks top-level hf_config.rope_parameters) + # detection fails. GLM-Image uses M-RoPE (mrope_section in config.json) + # but transformers ignores it with warning: + # "Unrecognized keys in `rope_parameters` for 'rope_type'='default': {'mrope_section'}" + # So we hardcode the detection based on model_type. 
if not self.uses_mrope: hf_config = self.model_config.hf_config - # Check text_config for nested configs (like GLM-Image) - text_config = getattr(hf_config, "text_config", None) - if text_config is not None: - rope_params = getattr(text_config, "rope_parameters", None) - if rope_params is not None and rope_params.get("mrope_section") is not None: - self.uses_mrope = True - logger.info( - f"[OmniGPUModelRunner] Enabling M-RoPE for model_type={hf_config.model_type} " - f"(detected mrope_section in text_config)" - ) + model_type = getattr(hf_config, "model_type", None) + # GLM-Image uses M-RoPE with mrope_section: [8, 12, 12] + if model_type in ("glm_image",): + self.uses_mrope = True + logger.info(f"[OmniGPUModelRunner] Enabling M-RoPE for model_type={model_type}") def load_model(self, *args, **kwargs) -> None: super().load_model(*args, **kwargs) From eff86c1ffd48153497111a328d8ab6e3fadc9850 Mon Sep 17 00:00:00 2001 From: JaredforReal Date: Fri, 16 Jan 2026 18:07:09 +0800 Subject: [PATCH 50/53] use get_model Signed-off-by: JaredforReal --- vllm_omni/worker/gpu_model_runner.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/vllm_omni/worker/gpu_model_runner.py b/vllm_omni/worker/gpu_model_runner.py index 8a24b65a0d1..4c42054b24d 100644 --- a/vllm_omni/worker/gpu_model_runner.py +++ b/vllm_omni/worker/gpu_model_runner.py @@ -108,14 +108,17 @@ def _init_mrope_positions(self, req_state: CachedRequestState): if use_audio_in_video_value is not None: use_audio_in_video = bool(use_audio_in_video_value.item()) - if supports_mrope(self.model): + # Get unwrapped model - self.model may be wrapped in CUDAGraphWrapper + # after load_model(), which would break the isinstance check in supports_mrope() + model = self.get_model() + if supports_mrope(model): logger.warning( f"[M-RoPE Init] Calling get_mrope_input_positions: " f"prompt_len={len(req_state.prompt_token_ids)}, " f"mm_features_count={len(req_state.mm_features) if req_state.mm_features else 0}, " 
f"image_grid_thw={image_grid_thw}" ) - req_state.mrope_positions, req_state.mrope_position_delta = self.model.get_mrope_input_positions( + req_state.mrope_positions, req_state.mrope_position_delta = model.get_mrope_input_positions( req_state.prompt_token_ids, mm_features=req_state.mm_features, hf_config=self.model_config.hf_config, From e93c18b355bfde73594717986e33e1c2c96e4a94 Mon Sep 17 00:00:00 2001 From: JaredforReal Date: Fri, 16 Jan 2026 18:37:23 +0800 Subject: [PATCH 51/53] cleanup: remove debug logging and simplify docstrings Signed-off-by: JaredforReal --- .../models/glm_image/glm_image_ar.py | 88 ++----------------- vllm_omni/worker/gpu_model_runner.py | 35 +------- 2 files changed, 10 insertions(+), 113 deletions(-) diff --git a/vllm_omni/model_executor/models/glm_image/glm_image_ar.py b/vllm_omni/model_executor/models/glm_image/glm_image_ar.py index 8e359a9eb2e..8cddd80ceec 100644 --- a/vllm_omni/model_executor/models/glm_image/glm_image_ar.py +++ b/vllm_omni/model_executor/models/glm_image/glm_image_ar.py @@ -343,10 +343,6 @@ def _call_hf_processor( # Build messages format expected by processor messages = [{"role": "user", "content": [{"type": "text", "text": prompt}]}] - logger.info("[GLM-Image T2I] Using GlmImageProcessor.apply_chat_template") - logger.info(f"[GLM-Image T2I] target_h={target_h}, target_w={target_w}") - logger.info(f"[GLM-Image T2I] prompt: {prompt[:200]}...") - # Use apply_chat_template which handles target dimensions hf_inputs = processor.apply_chat_template( messages, @@ -357,25 +353,6 @@ def _call_hf_processor( return_tensors="pt", ) - # Debug: log the tokenized input - if "input_ids" in hf_inputs: - input_ids = hf_inputs["input_ids"] - if hasattr(input_ids, "shape"): - logger.info(f"[GLM-Image T2I] input_ids shape: {input_ids.shape}") - # Use processor's tokenizer (not ByT5Tokenizer from tokenizer/ dir) - # GlmImageProcessor has its own tokenizer with a different vocabulary - if hasattr(processor, "tokenizer") and 
processor.tokenizer is not None: - ids_list = input_ids[0].tolist() if hasattr(input_ids[0], "tolist") else list(input_ids[0]) - try: - decoded = processor.tokenizer.decode(ids_list) - logger.info(f"[GLM-Image T2I] decoded input: {decoded}") - except Exception as e: - logger.warning(f"[GLM-Image T2I] could not decode: {e}") - logger.info(f"[GLM-Image T2I] first 50 token ids: {ids_list[:50]}") - - if "image_grid_thw" in hf_inputs: - logger.info(f"[GLM-Image T2I] image_grid_thw: {hf_inputs['image_grid_thw']}") - return hf_inputs else: # Fallback: just tokenize (this won't work properly for generation) @@ -1759,10 +1736,7 @@ def _parse_grid_from_tokens( # Use heuristics: grid dimensions are typically between 8 and 128 # represented as single tokens that decode to numbers - # For now, return None and let caller use defaults - logger.warning( - "[GLM-Image M-RoPE] Cannot find grid_bos_token_id/grid_eos_token_id, will use default grids" - ) + # Cannot find grid tokens, let caller use defaults return None # Find all ... regions @@ -1800,8 +1774,7 @@ def _parse_grid_from_tokens( return None - except Exception as e: - logger.warning(f"[GLM-Image M-RoPE] Error parsing grids from tokens: {e}") + except Exception: return None def get_mrope_input_positions( @@ -1811,42 +1784,20 @@ def get_mrope_input_positions( image_grid_thw: list[list[int]] | None = None, **kwargs, ) -> tuple[torch.Tensor, int]: - """ - Compute M-RoPE position IDs for GLM-Image generation. - - GLM-Image uses 3D position encoding: - - For text tokens: all 3 dimensions (temporal, height, width) are the same - - For image tokens: - - temporal: constant (marks image region) - - height: row position in image grid - - width: column position in image grid + """Compute M-RoPE position IDs for GLM-Image. - For text-to-image generation, we also pre-compute positions for the tokens - that will be generated (small image + large image + EOS), similar to how - transformers GLM-Image caches decode positions. 
+ GLM-Image uses 3D positional encoding where text tokens have identical + values across all dimensions, while image tokens use 2D grid positions. - Args: - input_tokens: List of input token IDs - mm_features: Multimodal feature specifications (optional) - image_grid_thw: Pre-extracted image grid dimensions (optional) - **kwargs: Additional arguments (hf_config, video_grid_thw, etc.) + For text-to-image, also pre-computes decode positions for generated tokens. Returns: - Tuple of (position_ids [3, seq_len + decode_len], mrope_position_delta) + Tuple of (position_ids [3, total_len], mrope_position_delta) """ hf_config = self.config image_start_token_id = hf_config.image_start_token_id image_end_token_id = hf_config.image_end_token_id - logger.warning( - f"[GLM-Image M-RoPE] get_mrope_input_positions called: " - f"input_tokens_len={len(input_tokens)}, mm_features={mm_features is not None}, " - f"image_grid_thw={image_grid_thw}, kwargs_keys={list(kwargs.keys())}, " - f"last_token={input_tokens[-1] if input_tokens else None}, " - f"image_start_token_id={image_start_token_id}, " - f"image_end_token_id={image_end_token_id}" - ) - # Get image_grid_thw from either the direct arg or mm_features if image_grid_thw is None and mm_features is not None: # Gather image grid info from multimodal features @@ -1879,16 +1830,10 @@ def get_mrope_input_positions( # Text-to-image: ends with start token but no end token if prompt_ends_with_start and not has_end_token: # Parse grid dimensions from prompt tokens - # Format: ... 
H W h w - # We need to find the grid_bos_token () and grid_eos_token () - # and extract the dimensions between them image_grid_thw = self._parse_grid_from_tokens(input_tokens, hf_config) - if image_grid_thw: - logger.warning(f"[GLM-Image M-RoPE] Text-to-image detected, parsed grids: {image_grid_thw}") - else: + if not image_grid_thw: # Fallback to default 1024x1024 grids if parsing fails image_grid_thw = [[1, 32, 32], [1, 16, 16]] - logger.warning(f"[GLM-Image M-RoPE] Text-to-image, using default grids: {image_grid_thw}") seq_len = len(input_tokens) llm_pos_ids_list: list[torch.Tensor] = [] @@ -1978,19 +1923,11 @@ def get_mrope_input_positions( # Concatenate prefill and decode positions llm_positions = torch.cat([prefill_positions, decode_positions], dim=1) - - # Log for debugging - logger.warning( - f"[GLM-Image M-RoPE] prefill_len={prefill_positions.shape[1]}, " - f"decode_len={decode_positions.shape[1]}, total_len={llm_positions.shape[1]}" - ) else: llm_positions = prefill_positions - logger.warning(f"[GLM-Image M-RoPE] No decode grids, prefill_len={prefill_positions.shape[1]}") else: # Pure text - all dimensions same llm_positions = torch.arange(seq_len).view(1, -1).expand(3, -1) - logger.warning(f"[GLM-Image M-RoPE] Pure text mode, positions_len={seq_len}") mrope_position_delta = (llm_positions.max() + 1 - seq_len).item() return llm_positions, mrope_position_delta @@ -2019,15 +1956,6 @@ def forward( Returns: Hidden states or intermediate tensors """ - # Debug logging (first call only) - if not hasattr(self, "_logged_forward"): - self._logged_forward = True - logger.info(f"[GLM-Image Forward] input_ids shape: {input_ids.shape if input_ids is not None else None}") - logger.info(f"[GLM-Image Forward] positions shape: {positions.shape if positions is not None else None}") - logger.info(f"[GLM-Image Forward] pixel_values: {pixel_values is not None}") - logger.info(f"[GLM-Image Forward] image_grid_thw: {image_grid_thw}") - logger.info(f"[GLM-Image Forward] kwargs 
keys: {list(kwargs.keys())}") - if intermediate_tensors is not None: inputs_embeds = None diff --git a/vllm_omni/worker/gpu_model_runner.py b/vllm_omni/worker/gpu_model_runner.py index 4c42054b24d..c80c3202626 100644 --- a/vllm_omni/worker/gpu_model_runner.py +++ b/vllm_omni/worker/gpu_model_runner.py @@ -112,12 +112,6 @@ def _init_mrope_positions(self, req_state: CachedRequestState): # after load_model(), which would break the isinstance check in supports_mrope() model = self.get_model() if supports_mrope(model): - logger.warning( - f"[M-RoPE Init] Calling get_mrope_input_positions: " - f"prompt_len={len(req_state.prompt_token_ids)}, " - f"mm_features_count={len(req_state.mm_features) if req_state.mm_features else 0}, " - f"image_grid_thw={image_grid_thw}" - ) req_state.mrope_positions, req_state.mrope_position_delta = model.get_mrope_input_positions( req_state.prompt_token_ids, mm_features=req_state.mm_features, @@ -128,13 +122,7 @@ def _init_mrope_positions(self, req_state: CachedRequestState): audio_feature_lengths=audio_feature_lengths, use_audio_in_video=use_audio_in_video, ) - logger.warning( - f"[M-RoPE Init] Result: " - f"mrope_positions_shape={req_state.mrope_positions.shape}, " - f"mrope_position_delta={req_state.mrope_position_delta}" - ) else: - logger.warning("[M-RoPE Init] Model does not support M-RoPE, using default") req_state.mrope_positions, req_state.mrope_position_delta = MRotaryEmbedding.get_input_positions_tensor( req_state.prompt_token_ids, hf_config=self.model_config.hf_config, @@ -148,17 +136,8 @@ def _init_mrope_positions(self, req_state: CachedRequestState): def _calc_mrope_positions(self, scheduler_output: "SchedulerOutput"): """Calculate M-RoPE positions for scheduled tokens. - This method overrides the base vLLM implementation to support models - like GLM-Image that pre-compute decode positions with 2D spatial encoding. 
- - For GLM-Image text-to-image generation: - - Prefill positions: Use pre-computed positions from get_mrope_input_positions - - Decode positions: Also use pre-computed 2D spatial positions instead of - the default linear positions from get_next_input_positions_tensor - - The key difference from vLLM's default behavior: - - Default vLLM: decode positions use linear [N, N+1, N+2, ...] for all 3 dims - - GLM-Image needs: temporal=constant, height/width=2D grid pattern + Overrides base vLLM to use pre-computed 2D spatial positions for decode + phase (for models like GLM-Image) instead of linear positions. """ from vllm.utils import length_from_prompt_token_ids_or_embeds @@ -205,18 +184,8 @@ def _calc_mrope_positions(self, scheduler_output: "SchedulerOutput"): self.mrope_positions.np[:, dst_start : dst_start + completion_part_len] = req.mrope_positions[ :, decode_start:decode_end ] - logger.debug( - f"[M-RoPE] Using pre-computed decode positions: " - f"decode_start={decode_start}, decode_end={decode_end}, " - f"total_precomputed={total_precomputed}" - ) else: # Fallback to default linear positions for text-only generation - logger.warning( - f"[M-RoPE] Falling back to linear positions! 
" - f"decode_end={decode_end} > total_precomputed={total_precomputed}, " - f"num_prompt_tokens={num_prompt_tokens}, completion_part_len={completion_part_len}" - ) assert req.mrope_position_delta is not None MRotaryEmbedding.get_next_input_positions_tensor( out=self.mrope_positions.np, From ad849f09757ff7602d6627ff81e8f650ce2db3ad Mon Sep 17 00:00:00 2001 From: JaredforReal Date: Fri, 16 Jan 2026 19:08:32 +0800 Subject: [PATCH 52/53] feat: add profiling points for stage timing analysis Signed-off-by: JaredforReal --- .../models/glm_image/pipeline_glm_image.py | 58 +++++---- vllm_omni/entrypoints/omni.py | 13 ++ .../stage_input_processors/glm_image.py | 112 +++--------------- 3 files changed, 57 insertions(+), 126 deletions(-) diff --git a/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py b/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py index 9a03a934983..35d0e107f4d 100644 --- a/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py +++ b/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py @@ -16,6 +16,7 @@ import logging import os import re +import time from collections.abc import Iterable import numpy as np @@ -812,77 +813,59 @@ def _prepare_condition_image_kv_cache( @torch.inference_mode() def forward(self, req: OmniDiffusionRequest) -> DiffusionOutput: - """ - Main generation forward pass. 
- - Args: - req: OmniDiffusionRequest with generation parameters + """Main generation forward pass.""" + t_forward_start = time.perf_counter() - Returns: - DiffusionOutput containing generated image - """ prompt = req.prompt or "" if isinstance(prompt, list): prompt = prompt[0] if prompt else "" - # Get pre-computed prompt embeddings if provided prompt_embeds = req.prompt_embeds if isinstance(req.prompt_embeds, torch.Tensor) else None - - # Get condition images for Image Edit mode - # Use pre-processed images from pre_process_func preprocessed_images = req.preprocessed_image condition_images = getattr(req, "prompt_image", None) img_height = req.height img_width = req.width - is_image_edit = preprocessed_images is not None - # Use image dimensions as default if available height = req.height or img_height or self.default_sample_size * self.vae_scale_factor width = req.width or img_width or self.default_sample_size * self.vae_scale_factor num_inference_steps = req.num_inference_steps or 50 guidance_scale = req.guidance_scale or 1.5 - # 0. Validate inputs self.check_inputs(prompt=prompt, height=height, width=width, prompt_embeds=prompt_embeds) batch_size = 1 do_classifier_free_guidance = guidance_scale > 1.0 - # Set seed if provided generator = None if req.seed is not None: generator = torch.Generator(device=self.device).manual_seed(req.seed) - # 1. Get prior tokens - either from external source (multistage) or generate internally - # Check if prior_token_ids are provided externally (from AR stage in multistage mode) + # 1. 
Get prior tokens + t_prior_start = time.perf_counter() external_prior_tokens = req.extra.get("prior_token_ids") if req.extra else None external_prior_image_ids = req.extra.get("prior_token_image_ids") if req.extra else None if external_prior_tokens is not None: - # Multistage mode: use externally provided prior tokens from vLLM AR stage - logger.info("Using externally provided prior tokens from AR stage...") prior_token_id = external_prior_tokens if isinstance(prior_token_id, list): prior_token_id = torch.tensor(prior_token_id, dtype=torch.long, device=self.device) elif isinstance(prior_token_id, torch.Tensor): prior_token_id = prior_token_id.to(device=self.device, dtype=torch.long) - # Ensure shape is [1, num_tokens] for batch processing if prior_token_id.dim() == 1: prior_token_id = prior_token_id.unsqueeze(0) prior_token_image_ids = external_prior_image_ids else: - # Single-stage mode: generate prior tokens with internal AR model - logger.info("Generating prior tokens with AR model...") prior_token_id, prior_token_image_ids = self.generate_prior_tokens( prompt=prompt, image=condition_images, height=height, width=width, ) + t_prior_end = time.perf_counter() # 2. Encode prompt for glyph embeddings - logger.info("Encoding prompt...") + t_encode_start = time.perf_counter() prompt_embeds, negative_prompt_embeds = self.encode_prompt( prompt, do_classifier_free_guidance=do_classifier_free_guidance, @@ -891,19 +874,20 @@ def forward(self, req: OmniDiffusionRequest) -> DiffusionOutput: device=self.device, dtype=self.transformer.dtype, ) + t_encode_end = time.perf_counter() # 3. 
Prepare KV cache for Image Edit mode + t_kvcache_start = time.perf_counter() kv_caches = None if is_image_edit and prior_token_image_ids is not None: - logger.info("Preparing KV cache for Image Edit mode...") kv_caches = self._prepare_condition_image_kv_cache( condition_images=preprocessed_images, prior_token_image_ids=prior_token_image_ids, prompt_embeds=prompt_embeds, generator=generator, ) - # Switch to read mode for denoising kv_caches.set_mode("read") + t_kvcache_end = time.perf_counter() # 4. Prepare latents latent_channels = self.transformer.in_channels @@ -937,8 +921,8 @@ def forward(self, req: OmniDiffusionRequest) -> DiffusionOutput: target_size = torch.tensor([[height, width]], dtype=prompt_embeds.dtype, device=self.device) crop_coords = torch.zeros((1, 2), dtype=prompt_embeds.dtype, device=self.device) - # 7. Denoising loop with CFG-parallel support - logger.info(f"Starting denoising loop with {num_inference_steps} steps...") + # 7. Denoising loop + t_denoise_start = time.perf_counter() latents = self.diffuse( latents=latents, prior_token_id=prior_token_id, @@ -951,9 +935,10 @@ def forward(self, req: OmniDiffusionRequest) -> DiffusionOutput: do_classifier_free_guidance=do_classifier_free_guidance, kv_caches=kv_caches, ) + t_denoise_end = time.perf_counter() # 8. VAE decode - logger.info("Decoding latents with VAE...") + t_vae_start = time.perf_counter() latents = latents.to(self.vae.dtype) latents_mean = ( torch.tensor(self.vae.config.latents_mean) @@ -967,8 +952,19 @@ def forward(self, req: OmniDiffusionRequest) -> DiffusionOutput: ) latents = latents * latents_std + latents_mean image = self.vae.decode(latents, return_dict=False, generator=generator)[0] - - # 9. 
Leave post-process to vllm-omni pipeline + t_vae_end = time.perf_counter() + + t_forward_end = time.perf_counter() + + # Profile logging + logger.info( + f"[Profile] Diffusion forward: total={t_forward_end - t_forward_start:.3f}s | " + f"prior_tokens={t_prior_end - t_prior_start:.3f}s, " + f"prompt_encode={t_encode_end - t_encode_start:.3f}s, " + f"kv_cache={t_kvcache_end - t_kvcache_start:.3f}s, " + f"denoise({num_inference_steps} steps)={t_denoise_end - t_denoise_start:.3f}s, " + f"vae_decode={t_vae_end - t_vae_start:.3f}s" + ) return DiffusionOutput(output=image) diff --git a/vllm_omni/entrypoints/omni.py b/vllm_omni/entrypoints/omni.py index 109d5f85473..3ea80ae47ec 100644 --- a/vllm_omni/entrypoints/omni.py +++ b/vllm_omni/entrypoints/omni.py @@ -608,6 +608,7 @@ def _run_generation( # Mark first input time for stage-0 metrics.stage_first_ts[0] = metrics.stage_first_ts[0] or time.time() + _req_start_perf_ts: dict[str, float] = {} # perf_counter for profiling for req_id, prompt in request_id_to_prompt.items(): sp0 = sampling_params_list[0] # type: ignore[index] task = { @@ -617,6 +618,7 @@ def _run_generation( } self.stage_list[0].submit(task) _req_start_ts[req_id] = time.time() + _req_start_perf_ts[req_id] = time.perf_counter() logger.debug(f"[{self._name}] Enqueued request {req_id} to stage-0") pbar = None @@ -659,6 +661,11 @@ def _run_generation( continue engine_outputs = _load(result, obj_key="engine_outputs", shm_key="engine_outputs_shm") + t_stage_completed = time.perf_counter() + stage_elapsed = t_stage_completed - _req_start_perf_ts.get(req_id, t_stage_completed) + logger.info( + f"[Profile] Stage {stage_id} completed: req_id={req_id}, elapsed_from_start={stage_elapsed:.3f}s" + ) # Mark last output time for this stage whenever we receive outputs metrics.stage_last_ts[stage_id] = max(metrics.stage_last_ts[stage_id] or 0.0, time.time()) try: @@ -723,6 +730,7 @@ def _run_generation( next_stage_id = stage_id + 1 if next_stage_id <= 
final_stage_id_to_prompt[req_id]: next_stage: OmniStage = self.stage_list[next_stage_id] + t_transition_start = time.perf_counter() try: next_inputs = next_stage.process_engine_inputs(self.stage_list, [request_id_to_prompt[req_id]]) except Exception as e: @@ -731,6 +739,11 @@ def _run_generation( f" at stage {next_stage_id}: {e}", ) continue + t_transition_end = time.perf_counter() + logger.info( + f"[Profile] Stage {stage_id}→{next_stage_id} transition: " + f"process_inputs={t_transition_end - t_transition_start:.4f}s, req_id={req_id}" + ) sp_next = sampling_params_list[next_stage_id] # type: ignore[index] # Check if we have a connector for this edge diff --git a/vllm_omni/model_executor/stage_input_processors/glm_image.py b/vllm_omni/model_executor/stage_input_processors/glm_image.py index 79f5d91baf9..f8e7aba302c 100644 --- a/vllm_omni/model_executor/stage_input_processors/glm_image.py +++ b/vllm_omni/model_executor/stage_input_processors/glm_image.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Stage input processor for GLM-Image: AR → Diffusion transition.""" +import time from typing import Any import torch @@ -39,60 +40,23 @@ def _parse_generated_tokens( width: int, factor: int = 32, ) -> tuple[torch.Tensor, int, int]: - """Parse AR-generated tokens to extract prior_token_ids. - - The AR model generates tokens in a specific format: - - For text-to-image: small_image_tokens + large_image_tokens + EOS - - For image-to-image: large_image_tokens + EOS - - We need to extract the large_image_tokens and upsample them. 
- - Args: - token_ids: Generated token IDs from AR model - height: Target image height - width: Target image width - factor: Downsampling factor (default 32 for AR output) - - Returns: - Tuple of (upsampled_prior_token_ids, pixel_height, pixel_width) - """ + """Parse AR-generated tokens to extract prior_token_ids.""" # Calculate token dimensions for target image token_h = height // factor token_w = width // factor large_image_tokens = token_h * token_w # Calculate small preview image dimensions (used in text-to-image) - # GLM-Image generates a small preview at 1/4 resolution before the full image - # The preview grid is computed as target_grid / 2 in each dimension small_token_h = token_h // 2 small_token_w = token_w // 2 small_image_tokens = small_token_h * small_token_w token_tensor = torch.tensor(token_ids, dtype=torch.long) - # Log actual values for debugging - logger.info( - f"_parse_generated_tokens: total_tokens={len(token_ids)}, " - f"large_image_tokens={large_image_tokens} ({token_h}x{token_w}), " - f"small_image_tokens={small_image_tokens} ({small_token_h}x{small_token_w})" - ) - - # Analyze token distribution to find image tokens - logger.info( - f"Full sequence stats: min={token_tensor.min().item()}, " - f"max={token_tensor.max().item()}, " - f"unique={token_tensor.unique().numel()}" - ) - - # Print first 20 and last 20 tokens to understand the structure - logger.info(f"First 20 tokens: {token_tensor[:20].tolist()}") - logger.info(f"Last 20 tokens: {token_tensor[-20:].tolist()}") - # Remove EOS token (16385) from the end if present eos_token_id = 16385 if len(token_ids) > 0 and token_ids[-1] == eos_token_id: token_tensor = token_tensor[:-1] - logger.info(f"Removed EOS token, remaining: {len(token_tensor)} tokens") actual_tokens = len(token_tensor) @@ -102,19 +66,14 @@ def _parse_generated_tokens( large_end = large_start + large_image_tokens prior_token_ids_d32 = token_tensor[large_start:large_end] actual_h, actual_w = token_h, token_w - 
logger.info(f"Text-to-image mode: extracting tokens [{large_start}:{large_end}]") elif actual_tokens >= large_image_tokens: # Image-to-image: large image tokens are at the beginning prior_token_ids_d32 = token_tensor[:large_image_tokens] actual_h, actual_w = token_h, token_w - logger.info(f"Image-to-image mode: extracting tokens [0:{large_image_tokens}]") else: # Insufficient tokens - try to infer the actual grid size - # The model might have generated for a different resolution import math - # Try to find a square grid that fits the available tokens - # First check if it matches any of the small+large patterns for scale in [1, 2, 4]: test_h = token_h // scale test_w = token_w // scale @@ -124,43 +83,27 @@ def _parse_generated_tokens( test_small = test_small_h * test_small_w if actual_tokens >= test_small + test_large: - # Found matching grid for t2i prior_token_ids_d32 = token_tensor[test_small : test_small + test_large] actual_h, actual_w = test_h, test_w - # Adjust output dimensions height = test_h * factor width = test_w * factor - logger.warning(f"Adjusted grid to {test_h}x{test_w} (scale={scale}), output will be {height}x{width}") + logger.warning(f"Adjusted grid to {test_h}x{test_w}, output will be {height}x{width}") break elif actual_tokens >= test_large: - # Found matching grid for i2i prior_token_ids_d32 = token_tensor[:test_large] actual_h, actual_w = test_h, test_w height = test_h * factor width = test_w * factor - logger.warning(f"Adjusted grid to {test_h}x{test_w} (scale={scale}), output will be {height}x{width}") + logger.warning(f"Adjusted grid to {test_h}x{test_w}, output will be {height}x{width}") break else: - # Last resort: find closest square grid sqrt_tokens = int(math.sqrt(actual_tokens)) actual_h = actual_w = sqrt_tokens usable_tokens = sqrt_tokens * sqrt_tokens prior_token_ids_d32 = token_tensor[:usable_tokens] height = sqrt_tokens * factor width = sqrt_tokens * factor - logger.error( - f"Could not match grid pattern. 
Using {sqrt_tokens}x{sqrt_tokens} grid " - f"({usable_tokens} tokens), output will be {height}x{width}. " - f"This likely indicates a prompt format issue." - ) - - # Log token value statistics for debugging - logger.info( - f"prior_token_ids_d32: shape={prior_token_ids_d32.shape}, " - f"min={prior_token_ids_d32.min().item()}, " - f"max={prior_token_ids_d32.max().item()}, " - f"unique_count={prior_token_ids_d32.unique().numel()}" - ) + logger.error(f"Grid pattern mismatch. Using {sqrt_tokens}x{sqrt_tokens}, output: {height}x{width}") # Upsample from 32x to 16x prior_token_ids = _upsample_token_ids(prior_token_ids_d32, actual_h, actual_w) @@ -174,26 +117,9 @@ def ar2diffusion( prompt: OmniTokensPrompt | TextPrompt | list | None = None, requires_multimodal_data: bool = False, ) -> list[dict[str, Any]]: - """ - Process AR stage outputs to create Diffusion stage inputs. - - This function bridges the AR model (which generates prior_token_ids) and - the Diffusion pipeline (which uses them for conditioned denoising). - - Workflow: - 1. Extract generated token_ids from AR stage output - 2. Parse and upsample prior_token_ids (32x → 16x) - 3. 
Package into diffusion request format with original prompt info + """Process AR stage outputs to create Diffusion stage inputs.""" + t_start = time.perf_counter() - Args: - stage_list: List of stage objects containing outputs - engine_input_source: Source stage IDs (typically [0] for AR stage) - prompt: Original prompt data (contains height, width, prompt text, images) - requires_multimodal_data: Whether to pass multimodal data (condition images) - - Returns: - List of dicts containing diffusion request parameters - """ if not engine_input_source: raise ValueError("engine_input_source cannot be empty") @@ -217,59 +143,55 @@ def ar2diffusion( # Get original prompt info original_prompt = prompt[i] if i < len(prompt) else {} - # Handle various prompt types - convert to dict for uniform access - # Note: TypedDict (TextPrompt, OmniTokensPrompt) doesn't support isinstance if isinstance(original_prompt, dict): - pass # Already a dict + pass elif hasattr(original_prompt, "_asdict"): - # NamedTuple original_prompt = original_prompt._asdict() elif hasattr(original_prompt, "__dict__"): original_prompt = vars(original_prompt) else: original_prompt = {} - # Extract dimensions from original prompt or use defaults height = original_prompt.get("height", 1024) width = original_prompt.get("width", 1024) text_prompt = original_prompt.get("prompt", "") # Parse and upsample prior tokens + t_parse_start = time.perf_counter() prior_token_ids, pixel_h, pixel_w = _parse_generated_tokens(generated_token_ids, height, width) + t_parse_end = time.perf_counter() - # Build diffusion input - # The diffusion stage expects these in OmniDiffusionRequest format diffusion_input = { "prompt": text_prompt, "height": pixel_h, "width": pixel_w, "extra": { "prior_token_ids": prior_token_ids, - # Pass condition image info for image-to-image mode "prior_token_image_ids": output.multimodal_output.get("prior_token_image_ids") if hasattr(output, "multimodal_output") and output.multimodal_output else None, }, } 
- # Include multimodal data (condition images) if required if requires_multimodal_data: mm_data = original_prompt.get("multi_modal_data") if mm_data: diffusion_input["pil_image"] = mm_data.get("image") - # Copy other relevant parameters from original prompt for key in ["seed", "num_inference_steps", "guidance_scale", "negative_prompt"]: if key in original_prompt: diffusion_input[key] = original_prompt[key] diffusion_inputs.append(diffusion_input) logger.info( - f"ar2diffusion: request {i}: prompt='{text_prompt[:50]}...', " - f"prior_token_ids shape={prior_token_ids.shape}, " - f"height={pixel_h}, width={pixel_w}" + f"[Profile] ar2diffusion request {i}: parse_tokens={t_parse_end - t_parse_start:.4f}s, " + f"num_ar_tokens={len(generated_token_ids)}, prior_shape={prior_token_ids.shape}" ) - logger.info(f"ar2diffusion: processed {len(ar_outputs)} AR outputs → {len(diffusion_inputs)} diffusion inputs") + t_end = time.perf_counter() + logger.info( + f"[Profile] ar2diffusion total: {t_end - t_start:.4f}s, " + f"processed {len(ar_outputs)} AR outputs → {len(diffusion_inputs)} diffusion inputs" + ) return diffusion_inputs From 44f2d30c59339aa74adeabd3fdba1cd696322b89 Mon Sep 17 00:00:00 2001 From: JaredforReal Date: Tue, 20 Jan 2026 10:28:09 +0800 Subject: [PATCH 53/53] try implement i2i mode Signed-off-by: JaredforReal --- .../models/glm_image/glm_image_ar.py | 662 +++++++++++++++--- .../stage_configs/glm_image.yaml | 1 + .../glm_image_muilticonnector.yaml | 1 + .../stage_input_processors/glm_image.py | 16 +- 4 files changed, 587 insertions(+), 93 deletions(-) diff --git a/vllm_omni/model_executor/models/glm_image/glm_image_ar.py b/vllm_omni/model_executor/models/glm_image/glm_image_ar.py index 8cddd80ceec..f1bae87be46 100644 --- a/vllm_omni/model_executor/models/glm_image/glm_image_ar.py +++ b/vllm_omni/model_executor/models/glm_image/glm_image_ar.py @@ -77,19 +77,20 @@ MultiModalFieldConfig, MultiModalKwargsItems, ) -from vllm.multimodal.parse import MultiModalDataItems 
+from vllm.multimodal.parse import ImageProcessorItems, MultiModalDataItems from vllm.multimodal.processing import ( BaseDummyInputsBuilder, BaseMultiModalProcessor, BaseProcessingInfo, PromptReplacement, PromptUpdate, - PromptUpdateDetails, ) from vllm.sequence import IntermediateTensors from vllm.utils.tensor_schema import TensorSchema, TensorShape from vllm.v1.attention.backends.registry import AttentionBackendEnum +from vllm_omni.model_executor.models.output_templates import OmniOutput + logger = init_logger(__name__) @@ -172,15 +173,13 @@ def get_hf_processor(self, **kwargs: object): return None def get_supported_mm_limits(self) -> Mapping[str, int | None]: - # GLM-Image is an image GENERATION model, not an image understanding model. - # For text-to-image (t2i) mode: no multimodal input is needed - # For image-to-image (i2i) mode: source images are provided as input + # GLM-Image is an image GENERATION model that supports: + # - Text-to-image (t2i): no multimodal input needed + # - Image-to-image (i2i): source images provided as input # - # Return empty dict to indicate no multimodal inputs are required for - # profiling. Image-to-image mode will be handled dynamically at runtime. - # This prevents vLLM from trying to create dummy image inputs during - # model initialization. - return {} + # For i2i mode, we support up to 1 image as condition. + # The model architecture supports multiple images but typical usage is 1. + return {"image": 1} def get_num_image_tokens( self, @@ -238,36 +237,34 @@ class GlmImageDummyInputsBuilder(BaseDummyInputsBuilder[GlmImageProcessingInfo]) """ Builds dummy inputs for GLM-Image model profiling. - GLM-Image is an image GENERATION model. For text-to-image mode, - no multimodal inputs are needed - just a text prompt. 
+ GLM-Image is an image GENERATION model that supports: + - Text-to-image (t2i): no multimodal input needed + - Image-to-image (i2i): source images provided as input + + For profiling purposes, we need to provide dummy multimodal data when + mm_counts["image"] > 0, which happens because get_supported_mm_limits + declares image support. """ def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: """ Generate dummy text for profiling. - For text-to-image mode (no images), returns a simple text prompt. - For image-to-image mode, includes image placeholders. + When images are requested (i2i mode profiling), include image placeholders + so that _get_prompt_updates can find and replace them. Each <|image|> token + will be expanded to grid_h * grid_w tokens by the replacement function. """ num_images = mm_counts.get("image", 0) - # Text-to-image mode: return a simple text prompt for profiling - if num_images == 0: + if num_images > 0: + # i2i mode: include image placeholders that will be expanded + # The <|image|> placeholder will be tokenized to image_token_id (167855) + # and then replaced by _get_prompt_updates with actual grid tokens + return "<|image|>" * num_images + "A beautiful image." + else: + # t2i mode: simple text prompt, no image placeholders needed return "A beautiful image." - hf_config = self.info.get_hf_config() - # Get image token from config or use default - image_token_id = getattr(hf_config, "image_token_id", 167855) - - tokenizer = self.info.get_tokenizer() - # Try to get the image token string - try: - image_token = tokenizer.convert_ids_to_tokens(image_token_id) - except Exception: - image_token = "<|image|>" - - return image_token * num_images - def get_dummy_mm_data( self, seq_len: int, @@ -277,18 +274,20 @@ def get_dummy_mm_data( """ Generate dummy multimodal data for profiling. - Returns empty dict if no images (text-to-image mode). + When images are requested, provide actual dummy images so the vision + encoder can be profiled. 
The image size is set to maximize features + for accurate memory profiling. """ num_images = mm_counts.get("image", 0) - # Text-to-image mode: no multimodal data needed + # No images requested: t2i mode, no multimodal data needed if num_images == 0: return {} hf_config = self.info.get_hf_config() vision_config = hf_config.vision_config - # Default image size from config + # Use image size from config for maximum features profiling image_size = getattr(vision_config, "image_size", 2048) width = height = image_size @@ -328,18 +327,29 @@ def _call_hf_processor( 1. Build the prompt with target grid dimensions 2. Build the image_grid_thw tensor for M-RoPE position encoding - For image-to-image mode, we use the full processor. + For image-to-image mode: + 1. Process source images through the image processor + 2. Build prompt with image placeholders expanded + 3. Build image_grid_thw including source and target grids """ - if not mm_data or not mm_data.get("image"): - # Text-to-image mode: use GlmImageProcessor with target dimensions - # This is critical - the processor adds grid tokens that tell the model - # what resolution to generate - processor = self.info.get_hf_processor() - if processor is not None: - # Get target dimensions from mm_kwargs or use defaults - target_h = mm_kwargs.get("target_h", 1024) if mm_kwargs else 1024 - target_w = mm_kwargs.get("target_w", 1024) if mm_kwargs else 1024 + processor = self.info.get_hf_processor() + + # Debug: log mm_data contents + # NOTE: vLLM's ImageProcessorItems.get_processor_data() returns {"images": [...]} (plural) + # because ProcessorBatchItems adds 's' suffix: {f"{self.modality}s": self.get_all()} + logger.debug( + f"_call_hf_processor: mm_data keys={list(mm_data.keys()) if mm_data else None}, " + f"has_images={bool(mm_data and mm_data.get('images'))}" + ) + + # Get target dimensions from mm_kwargs or use defaults + target_h = mm_kwargs.get("target_h", 1024) if mm_kwargs else 1024 + target_w = mm_kwargs.get("target_w", 
1024) if mm_kwargs else 1024 + if not mm_data or not mm_data.get("images"): + # Text-to-image mode + logger.debug("_call_hf_processor: entering t2i mode (no images)") + if processor is not None: # Build messages format expected by processor messages = [{"role": "user", "content": [{"type": "text", "text": prompt}]}] @@ -360,14 +370,253 @@ def _call_hf_processor( prompt_ids = tokenizer.encode(prompt) return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt") - # Image-to-image mode: use full processor - return super()._call_hf_processor( - prompt=prompt, - mm_data=mm_data, - mm_kwargs=mm_kwargs, - tok_kwargs=tok_kwargs, + # Image-to-image mode + # NOTE: Use "images" (plural) - this is what vLLM's ImageProcessorItems.get_processor_data() returns + images = mm_data.get("images") + if not isinstance(images, list): + images = [images] + + logger.debug( + f"_call_hf_processor i2i: num_images={len(images)}, image_types={[type(img).__name__ for img in images]}" ) + if processor is not None: + # Build messages with image objects directly in content + # This is how GlmImageProcessor expects images - embedded in the content dict + # NOT as a separate images= parameter + content = [] + for img in images: + content.append({"type": "image", "image": img}) + content.append({"type": "text", "text": prompt}) + messages = [{"role": "user", "content": content}] + + logger.debug(f"_call_hf_processor: calling apply_chat_template with {len(images)} images in content") + + # Use apply_chat_template - processor will process images when they're in content + hf_inputs = processor.apply_chat_template( + messages, + tokenize=True, + target_h=target_h, + target_w=target_w, + return_dict=True, + return_tensors="pt", + ) + + logger.debug(f"_call_hf_processor: apply_chat_template returned keys: {list(hf_inputs.keys())}") + + # CRITICAL: Slice image_grid_thw to only include source image grids + # GLM-Image's image_grid_thw has [num_source_images + 1, 3] shape: + # - First N entries are 
for source images (these need visual encoding) + # - Last entry is for the target image (for generation, no visual encoding) + # We need to slice it so batching works correctly with num_images + image_grid_thw = hf_inputs.get("image_grid_thw") + if image_grid_thw is not None and len(image_grid_thw) > 1: + num_source_images = len(image_grid_thw) - 1 + # Keep only source image grids for multimodal processing + source_grids = image_grid_thw[:num_source_images] + hf_inputs["image_grid_thw"] = source_grids + logger.debug( + f"_call_hf_processor: sliced image_grid_thw from {len(image_grid_thw)} \ + to {len(source_grids)} entries" + ) + logger.debug(f"_call_hf_processor: source_grids={source_grids.tolist()}") + + # Debug: Analyze input_ids for image tokens + input_ids = hf_inputs.get("input_ids") + if input_ids is not None: + if hasattr(input_ids, "tolist"): + ids_list = input_ids.tolist() + if isinstance(ids_list[0], list): + ids_list = ids_list[0] # Unbatch + else: + ids_list = list(input_ids) + + # Get image token ID from config + hf_config = self.info.get_hf_config() + image_token_id = getattr(hf_config, "image_token_id", 167855) + + # Count image tokens + image_token_count = ids_list.count(image_token_id) + logger.debug( + f"_call_hf_processor: input_ids length={len(ids_list)}, " + f"image_token_id={image_token_id}, " + f"image_token_count={image_token_count}" + ) + + # Log first/last few tokens to understand structure + logger.debug(f"_call_hf_processor: first 20 tokens: {ids_list[:20]}") + logger.debug(f"_call_hf_processor: last 20 tokens: {ids_list[-20:]}") + + # Find positions of image tokens + image_positions = [i for i, t in enumerate(ids_list) if t == image_token_id] + if image_positions: + logger.debug(f"_call_hf_processor: image token positions (first 10): {image_positions[:10]}") + + return hf_inputs + else: + # Fallback without processor - this is not ideal but prevents crashes + logger.warning("GlmImageProcessor not available, using fallback for i2i") + 
tokenizer = self.info.get_tokenizer() + hf_config = self.info.get_hf_config() + + # Get image token + image_token_id = getattr(hf_config, "image_token_id", 167855) + try: + image_token = tokenizer.convert_ids_to_tokens(image_token_id) + except Exception: + image_token = "<|image|>" + + # Build prompt with image placeholders + image_placeholders = image_token * len(images) + full_prompt = f"{image_placeholders}{prompt}" + prompt_ids = tokenizer.encode(full_prompt) + return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt") + + def _apply_hf_processor_mm_only( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + tokenization_kwargs: Mapping[str, object], + ) -> BatchFeature: + """ + Apply the HF processor on the multi-modal data only. + + GLM-Image requires special handling because apply_chat_template always + adds a target <|image|> placeholder in addition to source image placeholders. + This causes an IndexError when the HF processor tries to find grid info + for the target placeholder (which doesn't exist for source-only processing). + + Solution: Call the image processor directly to get pixel_values and + image_grid_thw, bypassing apply_chat_template's target handling. 
+ """ + mm_counts = mm_items.get_all_counts() + num_images = mm_counts.get("image", 0) + + if num_images == 0: + # No images - call parent implementation + return super()._apply_hf_processor_mm_only( + mm_items=mm_items, + hf_processor_mm_kwargs=hf_processor_mm_kwargs, + tokenization_kwargs=tokenization_kwargs, + ) + + # For i2i mode, we need to process images directly with the image processor + # to avoid the apply_chat_template target placeholder issue + processor = self.info.get_hf_processor() + image_processor = processor.image_processor + + # Get images from mm_items + images = mm_items.get_items("image", ImageProcessorItems) + image_list = [images.get(i) for i in range(images.get_count())] + + logger.debug(f"_apply_hf_processor_mm_only: processing {len(image_list)} images directly") + + # Process images directly with image processor + image_inputs = image_processor( + images=image_list, + return_tensors="pt", + ) + + # Get grid info for source images only (no target) + pixel_values = image_inputs.get("pixel_values") + image_grid_thw = image_inputs.get("image_grid_thw") + + logger.debug( + f"_apply_hf_processor_mm_only: pixel_values shape=\ + {pixel_values.shape if pixel_values is not None else None}, " + f"image_grid_thw shape={image_grid_thw.shape if image_grid_thw is not None else None}" + ) + + # Build input_ids with image token placeholders + # The _get_prompt_updates returns PromptReplacement(target=[image_token_id], ...) + # which needs to find image tokens in input_ids to replace them. + # We need to include one image_token_id per image so the replacement can work. 
+ tokenizer = self.info.get_tokenizer() + image_token_id = tokenizer.convert_tokens_to_ids("<|image|>") + + # Build input_ids: [image_token] * num_images + tokenized text + # This way _apply_prompt_updates can find the image tokens and replace them + dummy_text = self.dummy_inputs.get_dummy_text(mm_counts) + text_ids = tokenizer.encode(dummy_text, add_special_tokens=False) + input_ids = [image_token_id] * num_images + text_ids + + logger.debug( + f"_apply_hf_processor_mm_only: built input_ids with {num_images} image tokens + {len(text_ids)} text tokens" + ) + + return BatchFeature( + dict( + input_ids=[input_ids], + pixel_values=pixel_values, + image_grid_thw=image_grid_thw, + ), + tensor_type="pt", + ) + + def _apply_hf_processor_main( + self, + prompt: str | list[int], + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + tokenization_kwargs: Mapping[str, object], + *, + enable_hf_prompt_update: bool, + ) -> tuple[list[int], BatchFeature, bool]: + """ + Override to handle GLM-Image i2i mode correctly. + + Problem: When vLLM processes cached mm items (enable_hf_prompt_update=False), + the base implementation: + 1. Gets prompt_ids from _apply_hf_processor_text_only (no image tokens) + 2. Gets mm_data from _apply_hf_processor_mm_only + 3. Returns is_update_applied=False + + This causes _apply_prompt_updates to fail because prompt_ids has no image tokens. + + Solution: For i2i mode, we build prompt_ids that include image placeholders, + and return is_update_applied=False so _apply_prompt_updates can expand them. 
+ """ + num_images = mm_items.get_all_counts().get("image", 0) + + if num_images == 0 or enable_hf_prompt_update: + # t2i mode or normal flow - use parent implementation + return super()._apply_hf_processor_main( + prompt=prompt, + mm_items=mm_items, + hf_processor_mm_kwargs=hf_processor_mm_kwargs, + tokenization_kwargs=tokenization_kwargs, + enable_hf_prompt_update=enable_hf_prompt_update, + ) + + # i2i mode with enable_hf_prompt_update=False (cache miss scenario) + # We need to build prompt_ids with image placeholders + logger.debug(f"_apply_hf_processor_main: i2i mode with enable_hf_prompt_update=False, num_images={num_images}") + + # Get mm data from our overridden _apply_hf_processor_mm_only + mm_processed_data = self._apply_hf_processor_mm_only( + mm_items=mm_items, + hf_processor_mm_kwargs=hf_processor_mm_kwargs, + tokenization_kwargs=tokenization_kwargs, + ) + + # Build prompt_ids with image placeholders + # _apply_prompt_updates will replace each [image_token_id] with expanded tokens + tokenizer = self.info.get_tokenizer() + image_token_id = tokenizer.convert_tokens_to_ids("<|image|>") + + if isinstance(prompt, str): + text_ids = tokenizer.encode(prompt, add_special_tokens=False) + else: + text_ids = list(prompt) + + # Prepend image placeholders - one per image + prompt_ids = [image_token_id] * num_images + text_ids + + logger.debug(f"_apply_hf_processor_main: built prompt_ids with {num_images} image placeholders") + + # Return is_update_applied=False so _apply_prompt_updates will expand the placeholders + return prompt_ids, mm_processed_data, False + def _get_mm_fields_config( self, hf_inputs: BatchFeature, @@ -376,24 +625,74 @@ def _get_mm_fields_config( """ Get the multimodal field configuration. 
- For text-to-image: only image_grid_thw is needed (no pixel_values) - For image-to-image: both pixel_values and image_grid_thw are needed + For GLM-Image i2i mode: + - image_grid_thw has been sliced in _call_hf_processor to only include source images + - pixel_values has shape [total_patches, C, H, W] - only for source images + + For t2i mode: + - No pixel_values, no source images - return empty config """ result = {} - # image_grid_thw is needed for both t2i and i2i (for M-RoPE position encoding) - # For text-to-image, we don't have pixel_values but still need image_grid_thw - # Always use batched("image") - the fallback in get_mrope_input_positions will handle - # the case where image_grid_thw is not passed through mm_features - if "image_grid_thw" in hf_inputs: - result["image_grid_thw"] = MultiModalFieldConfig.batched("image") + # Debug: log hf_inputs keys + logger.debug(f"_get_mm_fields_config: hf_inputs keys: {list(hf_inputs.keys())}") - # pixel_values only present in image-to-image mode - if "pixel_values" in hf_inputs: - result["pixel_values"] = MultiModalFieldConfig.batched("image") + # Get image_grid_thw if present (already sliced in _call_hf_processor) + image_grid_thw = hf_inputs.get("image_grid_thw") + + if "pixel_values" in hf_inputs and image_grid_thw is not None: + # i2i mode: pixel_values contains patches for source images + # image_grid_thw has already been sliced to only include source grids + num_source_images = len(image_grid_thw) + logger.debug( + f"_get_mm_fields_config: num_source_images={num_source_images}, image_grid_thw={image_grid_thw.shape}" + ) + + if num_source_images > 0: + # Calculate grid sizes for source images + image_grid_sizes = image_grid_thw.prod(-1) + + result["pixel_values"] = MultiModalFieldConfig.flat_from_sizes("image", image_grid_sizes) + + # Register image_grid_thw - it's been sliced in _call_hf_processor + # to only include source image grids, so batching will work correctly + result["image_grid_thw"] = 
MultiModalFieldConfig.batched("image") + + logger.debug(f"_get_mm_fields_config: result keys: {list(result.keys())}") return result + def _hf_processor_applies_updates( + self, + prompt_text: str, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + tokenization_kwargs: Mapping[str, object], + ) -> bool: + """ + Return whether the HF processor applies prompt updates. + + For GLM-Image i2i mode, the HF processor's apply_chat_template already + expands <|image|> to N tokens (e.g., 4096 for 64x64 grid). + + By returning True, we tell vLLM that HF processor DID apply prompt updates, + so vLLM will use _find_mm_placeholders to locate the expanded tokens + instead of trying to apply replacements. + + For t2i mode (no images), there are no image placeholders to expand. + """ + # Check if we have images (i2i mode) + num_images = mm_items.get_all_counts().get("image", 0) + if num_images > 0: + logger.debug( + f"_hf_processor_applies_updates: returning True for i2i mode " + f"(num_images={num_images}) - HF processor already expanded tokens" + ) + return True + + # For t2i mode (no images), use default behavior + return True + def _get_prompt_updates( self, mm_items: MultiModalDataItems, @@ -403,41 +702,93 @@ def _get_prompt_updates( """ Get prompt updates for image tokens. - GLM-Image replaces each image placeholder with: - <|image_start|> + image_tokens + <|image_end|> + For GLM-Image image-to-image mode, the HF processor's apply_chat_template + already expands each <|image|> placeholder to the correct number of + image tokens (grid_h * grid_w tokens per source image). - Returns empty list if no images (text-to-image mode). - """ - # Check if we have any images - if not mm_items.get_count("image", strict=False): - return [] + The HF processor does: + 1. Replace each <|image|> with num_image_tokens copies of <|placeholder|> + 2. Replace all <|placeholder|> back to <|image|> + So the tokenized input already has the expanded tokens. 
We use + target=[image_token_id] to match each occurrence of the image token, + similar to how Qwen2VL handles this pattern. + + We use image_grid_thw from out_mm_kwargs to get the actual processed grid + size, following the Qwen2VL pattern. This is critical because the HF processor + resizes images, so the original image size doesn't match the processed size. + + For t2i mode (no images), we return an empty list since there are no + image placeholders to replace. + """ hf_config = self.info.get_hf_config() - # Get special token IDs from config + # Get image token ID - this is the token that appears multiple times + # in the tokenized input after HF processor expansion image_token_id = getattr(hf_config, "image_token_id", 167855) - image_start_id = getattr(hf_config, "image_start_token_id", 16384) - image_end_id = getattr(hf_config, "image_end_token_id", 16385) - # Get image grid info to determine number of tokens per image - # For now, use a simple approach based on config - vision_config = hf_config.vision_config - image_size = getattr(vision_config, "image_size", 2048) - patch_size = getattr(vision_config, "patch_size", 16) + # Debug: log mm_items info + logger.debug(f"_get_prompt_updates: image_token_id={image_token_id}") + logger.debug(f"_get_prompt_updates: mm_items modalities={list(mm_items.get_all_counts().keys())}") + logger.debug(f"_get_prompt_updates: mm_items counts={mm_items.get_all_counts()}") + logger.debug( + f"_get_prompt_updates: out_mm_kwargs key={list(out_mm_kwargs.get_data().keys()) if out_mm_kwargs else None}" + ) - # Default number of image tokens - num_image_tokens = (image_size // patch_size) ** 2 - image_tokens = [image_token_id] * num_image_tokens + # Check if there are any images to process + num_images = mm_items.get_count("image", strict=False) + if num_images == 0: + # t2i mode: no images, no prompt updates needed + logger.debug("_get_prompt_updates: no images, returning empty list (t2i mode)") + return [] + + def 
get_replacement_glm_image(item_idx: int) -> list[int]: + """ + Return replacement token IDs for an image placeholder. + + For GLM-Image, each source image is represented by grid_h * grid_w tokens. + These are placeholder tokens that will be replaced by actual VQ-VAE + tokens during model forward pass. + + IMPORTANT: We use image_grid_thw from out_mm_kwargs to get the actual + processed grid size. The HF processor resizes images, so the original + image size (from mm_items) doesn't match the actual token count. + """ + # Get grid info from out_mm_kwargs (set by _get_mm_fields_config) + out_item = out_mm_kwargs["image"][item_idx] + grid_thw = out_item.get("image_grid_thw") + + if grid_thw is not None: + grid_data = grid_thw.data if hasattr(grid_thw, "data") else grid_thw + if isinstance(grid_data, torch.Tensor): + # grid is [t, h, w] - for images, t=1, so num_tokens = h * w + num_tokens = int(grid_data.prod().item()) + else: + num_tokens = int(grid_data[0] * grid_data[1] * grid_data[2]) + logger.debug( + f"get_replacement_glm_image: item_idx={item_idx}, \ + grid={grid_data.tolist() if isinstance(grid_data, torch.Tensor) else grid_data},\ + num_tokens={num_tokens}" + ) + else: + # Fallback: use default 1024x1024 grid size + # (1024/16) * (1024/16) = 64 * 64 = 4096 tokens + num_tokens = 64 * 64 + logger.warning( + f"get_replacement_glm_image: item_idx={item_idx}, \ + no grid_thw found, using default num_tokens={num_tokens}" + ) + + return [image_token_id] * num_tokens return [ PromptReplacement( modality="image", + # Use [token_id] to match each occurrence of image token + # The HF processor has already expanded <|image|> to multiple tokens target=[image_token_id], - replacement=PromptUpdateDetails.select_token_id( - [image_start_id] + image_tokens + [image_end_id], - embed_token_id=image_token_id, - ), - ) + replacement=get_replacement_glm_image, + ), ] @@ -1539,7 +1890,7 @@ def forward( inputs_embeds: torch.Tensor | None = None, pixel_values: torch.Tensor | None = 
None, image_grid_thw: torch.Tensor | None = None, - ) -> torch.Tensor | IntermediateTensors: + ) -> tuple[torch.Tensor | IntermediateTensors, dict | None]: """ Forward pass through the GLM-Image model. @@ -1558,16 +1909,20 @@ def forward( image_grid_thw: Grid dimensions for source images Returns: - Hidden states or intermediate tensors for PP + Tuple of (hidden_states, prior_token_image_ids_info) + prior_token_image_ids_info is a dict with VQ-VAE tokens for i2i mode """ + prior_token_image_ids_info = None + # Handle intermediate tensors for pipeline parallelism if intermediate_tensors is not None: - return self.language_model( + hidden_states = self.language_model( input_ids=None, positions=positions, intermediate_tensors=intermediate_tensors, inputs_embeds=None, ) + return hidden_states, None # Process source images if provided (image-to-image generation) if pixel_values is not None and image_grid_thw is not None: @@ -1577,6 +1932,27 @@ def forward( image_tokens = self.get_image_tokens(image_features, image_grid_thw) image_tokens = image_tokens.to(input_ids.device) + # Store prior_token_image_ids for diffusion stage (i2i mode) + # The tokens need to be upsampled from d32 to d16 (2x) for the DiT + # We store the raw tokens here; upsampling happens in ar2diffusion + split_sizes = (image_grid_thw.prod(dim=-1)).tolist() + image_tokens_list = torch.split(image_tokens, split_sizes, dim=0) + + # Upsample each image's tokens for DiT (from d32 to d16) + upsampled_token_ids = [] + for i, tokens in enumerate(image_tokens_list): + grid_t, grid_h, grid_w = image_grid_thw[i].tolist() + # Reshape to 2D grid + tokens_2d = tokens.view(1, 1, grid_h, grid_w) + # Upsample by 2x (nearest neighbor) + tokens_upsampled = F.interpolate(tokens_2d.float(), scale_factor=2, mode="nearest").to(dtype=torch.long) + upsampled_token_ids.append(tokens_upsampled.view(-1)) + + prior_token_image_ids_info = { + "prior_token_image_ids": upsampled_token_ids, + "image_grid_thw": image_grid_thw.tolist(), + 
} + # Replace placeholder tokens with actual image tokens special_image_mask = input_ids == self.image_token_id if special_image_mask.sum() > 0: @@ -1596,7 +1972,7 @@ def forward( inputs_embeds=inputs_embeds, ) - return hidden_states + return hidden_states, prior_token_image_ids_info @MULTIMODAL_REGISTRY.register_processor( @@ -1692,6 +2068,97 @@ def get_image_tokens( """Tokenize image features with VQ-VAE.""" return self.model.get_image_tokens(hidden_states, image_grid_thw) + def _parse_and_validate_image_input( + self, + pixel_values: torch.Tensor | None = None, + image_grid_thw: torch.Tensor | None = None, + **kwargs: object, + ) -> dict | None: + """Parse and validate image inputs.""" + if pixel_values is None: + return None + return { + "pixel_values": pixel_values, + "image_grid_thw": image_grid_thw, + } + + def _process_image_input( + self, + image_input: dict, + ) -> list[torch.Tensor]: + """ + Process image input through vision encoder to get embeddings. + + For GLM-Image, we extract features using the vision encoder. + These are used for multimodal profiling. The actual VQ-VAE tokenization + happens during the forward pass. + """ + pixel_values = image_input["pixel_values"] + image_grid_thw = image_input["image_grid_thw"] + + # Get image features from vision encoder + image_features = self.model.get_image_features(pixel_values, image_grid_thw) + + # Split by image grid sizes + split_sizes = (image_grid_thw.prod(dim=-1)).tolist() + image_features_list = torch.split(image_features, split_sizes, dim=0) + + return list(image_features_list) + + def embed_multimodal( + self, + **kwargs: object, + ) -> tuple[torch.Tensor, ...] | None: + """ + Embed multimodal inputs (images) for vLLM's multimodal processing. + + For GLM-Image, this extracts image features using the vision encoder. + These embeddings are used by vLLM for multimodal budget profiling. + The actual token replacement (via VQ-VAE) happens in the forward pass. 
+ + Returns: + Tuple of image embedding tensors, one per image + """ + # Debug: log kwargs keys + logger.debug(f"embed_multimodal called with kwargs keys: {list(kwargs.keys())}") + + # Parse image inputs - check for multiple possible keys + pixel_values = kwargs.get("pixel_values") + image_embeds = kwargs.get("image_embeds") # Alternative key + image_grid_thw = kwargs.get("image_grid_thw") + + # Debug: log what we found + logger.debug(f"pixel_values type: {type(pixel_values)}, image_grid_thw type: {type(image_grid_thw)}") + + if pixel_values is None and image_embeds is None: + # No image inputs + logger.debug("No pixel_values or image_embeds found in kwargs") + return () + + # Use pixel_values if available, otherwise use image_embeds + if pixel_values is not None: + image_input = self._parse_and_validate_image_input( + pixel_values=pixel_values, + image_grid_thw=image_grid_thw, + ) + else: + # Handle image_embeds case - these are pre-computed embeddings + if isinstance(image_embeds, torch.Tensor): + # Split by image grid sizes if available + if image_grid_thw is not None: + split_sizes = (image_grid_thw.prod(dim=-1)).tolist() + return tuple(torch.split(image_embeds, split_sizes, dim=0)) + else: + return (image_embeds,) + return () + + if image_input is None: + return () + + # Process images through vision encoder + image_embeddings = self._process_image_input(image_input) + return tuple(image_embeddings) + def _parse_grid_from_tokens( self, input_tokens: list[int], @@ -1932,6 +2399,9 @@ def get_mrope_input_positions( mrope_position_delta = (llm_positions.max() + 1 - seq_len).item() return llm_positions, mrope_position_delta + # Flag to indicate this model can output multimodal data (prior_token_image_ids for i2i) + have_multimodal_outputs = True + def forward( self, input_ids: torch.Tensor, @@ -1941,7 +2411,7 @@ def forward( pixel_values: torch.Tensor | None = None, image_grid_thw: torch.Tensor | None = None, **kwargs: object, - ) -> torch.Tensor | 
IntermediateTensors: + ) -> OmniOutput | IntermediateTensors: """ Forward pass through GLM-Image. @@ -1954,12 +2424,12 @@ def forward( image_grid_thw: Grid dimensions for images Returns: - Hidden states or intermediate tensors + OmniOutput with hidden states and optional prior_token_image_ids for i2i """ if intermediate_tensors is not None: inputs_embeds = None - hidden_states = self.model( + hidden_states, prior_token_image_ids_info = self.model( input_ids=input_ids, positions=positions, intermediate_tensors=intermediate_tensors, @@ -1968,7 +2438,19 @@ def forward( image_grid_thw=image_grid_thw, ) - return hidden_states + # For intermediate tensors (PP), just return hidden states + if isinstance(hidden_states, IntermediateTensors): + return hidden_states + + # Build multimodal outputs for i2i mode + multimodal_outputs = None + if prior_token_image_ids_info is not None: + multimodal_outputs = prior_token_image_ids_info + + return OmniOutput( + text_hidden_states=hidden_states, + multimodal_outputs=multimodal_outputs, + ) def compute_logits( self, diff --git a/vllm_omni/model_executor/stage_configs/glm_image.yaml b/vllm_omni/model_executor/stage_configs/glm_image.yaml index 7deca12c9ba..20eea93a8ae 100644 --- a/vllm_omni/model_executor/stage_configs/glm_image.yaml +++ b/vllm_omni/model_executor/stage_configs/glm_image.yaml @@ -45,6 +45,7 @@ stage_args: process: true devices: "1" # Can use different GPU, or same GPU if memory allows max_batch_size: 1 + requires_multimodal_data: true # Required for i2i mode to pass condition images engine_args: model_stage: dit model_arch: GlmImagePipeline # Required for diffusion model class resolution diff --git a/vllm_omni/model_executor/stage_configs/glm_image_muilticonnector.yaml b/vllm_omni/model_executor/stage_configs/glm_image_muilticonnector.yaml index d1e10cb4065..9f481463043 100644 --- a/vllm_omni/model_executor/stage_configs/glm_image_muilticonnector.yaml +++ 
b/vllm_omni/model_executor/stage_configs/glm_image_muilticonnector.yaml @@ -47,6 +47,7 @@ stage_args: process: true devices: "1" # Use separate GPU for diffusion max_batch_size: 1 + requires_multimodal_data: true # Required for i2i mode to pass condition images engine_args: model_stage: dit # Diffusion-specific parameters diff --git a/vllm_omni/model_executor/stage_input_processors/glm_image.py b/vllm_omni/model_executor/stage_input_processors/glm_image.py index f8e7aba302c..6d3d1efae2a 100644 --- a/vllm_omni/model_executor/stage_input_processors/glm_image.py +++ b/vllm_omni/model_executor/stage_input_processors/glm_image.py @@ -161,15 +161,25 @@ def ar2diffusion( prior_token_ids, pixel_h, pixel_w = _parse_generated_tokens(generated_token_ids, height, width) t_parse_end = time.perf_counter() + # Get prior_token_image_ids from AR model output (for i2i mode) + # This contains VQ-VAE tokens from input image, used for KV cache conditioning + prior_token_image_ids = None + if hasattr(output, "multimodal_output") and output.multimodal_output: + raw_prior_image_ids = output.multimodal_output.get("prior_token_image_ids") + if raw_prior_image_ids is not None: + # Wrap in list if it's a single tensor (expected by diffusion pipeline) + if isinstance(raw_prior_image_ids, torch.Tensor): + prior_token_image_ids = [raw_prior_image_ids] + elif isinstance(raw_prior_image_ids, list): + prior_token_image_ids = raw_prior_image_ids + diffusion_input = { "prompt": text_prompt, "height": pixel_h, "width": pixel_w, "extra": { "prior_token_ids": prior_token_ids, - "prior_token_image_ids": output.multimodal_output.get("prior_token_image_ids") - if hasattr(output, "multimodal_output") and output.multimodal_output - else None, + "prior_token_image_ids": prior_token_image_ids, }, }