From fc54ff98c17b3905094d90d9dba61278d7886062 Mon Sep 17 00:00:00 2001
From: JaredforReal <w13431838023@gmail.com>
Date: Thu, 15 Jan 2026 10:57:35 +0800
Subject: [PATCH 01/53] refine glm-image implementation

Signed-off-by: JaredforReal <w13431838023@gmail.com>
---
 .../models/glm_image/glm_image_transformer.py | 21 ++++++-------------
 .../models/glm_image/pipeline_glm_image.py    |  5 ++---
 2 files changed, 8 insertions(+), 18 deletions(-)

diff --git a/vllm_omni/diffusion/models/glm_image/glm_image_transformer.py b/vllm_omni/diffusion/models/glm_image/glm_image_transformer.py
index 09f7b17e133..d783a11b319 100644
--- a/vllm_omni/diffusion/models/glm_image/glm_image_transformer.py
+++ b/vllm_omni/diffusion/models/glm_image/glm_image_transformer.py
@@ -8,8 +8,8 @@
 import torch
 import torch.nn as nn
 from diffusers.models.attention import FeedForward
-from diffusers.models.transformers.transformer_glm_image import GlmImageCombinedTimestepSizeEmbeddings
 from diffusers.models.modeling_outputs import Transformer2DModelOutput
+from diffusers.models.transformers.transformer_glm_image import GlmImageCombinedTimestepSizeEmbeddings
 from vllm.logger import init_logger
 from vllm.model_executor.layers.linear import QKVParallelLinear
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
@@ -17,7 +17,6 @@
 from vllm_omni.diffusion.attention.layer import Attention
 from vllm_omni.diffusion.cache.base import CachedTransformer
 from vllm_omni.diffusion.data import OmniDiffusionConfig
-from vllm_omni.diffusion.layers.rope import RotaryEmbedding
 
 logger = init_logger(__name__)
 
@@ -354,8 +353,7 @@ def __init__(
             nn.Dropout(0.0),
         )
 
-        # RoPE and attention
-        self.rope = RotaryEmbedding(is_neox_style=False)
+        # Attention
         self.attn = Attention(
             num_heads=num_heads,
             head_size=head_dim,
@@ -368,7 +366,6 @@ def forward(
         hidden_states: torch.Tensor,
         encoder_hidden_states: torch.Tensor,
         image_rotary_emb: tuple[torch.Tensor, torch.Tensor] | None = None,
-        attention_mask: torch.Tensor | None = None,
         kv_cache: GlmImageLayerKVCache | None = None,
         kv_cache_mode: KVCacheMode | None = None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
@@ -379,7 +376,6 @@ def forward(
             hidden_states: Image hidden states [B, img_seq_len, D]
             encoder_hidden_states: Text hidden states [B, text_seq_len, D]
             image_rotary_emb: Tuple of (cos, sin) for RoPE
-            attention_mask: Optional attention mask for text tokens
             kv_cache: Optional layer KV cache for image editing
             kv_cache_mode: Cache mode (WRITE, READ, SKIP)
 
@@ -407,16 +403,13 @@ def forward(
 
         # Apply RoPE only to image tokens (not text tokens)
         if image_rotary_emb is not None:
-            cos, sin = image_rotary_emb
-            cos = cos.to(query.dtype)
-            sin = sin.to(query.dtype)
             # Only apply RoPE to image part (after text_seq_length)
             query_img = query[:, text_seq_length:, :, :]
             key_img = key[:, text_seq_length:, :, :]
             from diffusers.models.embeddings import apply_rotary_emb
-            query_img = apply_rotary_emb(query_img,image_rotary_emb, sequence_dim=1, use_real_unbind_dim=-2)
-            # key_img = self.rope(key_img, cos, sin)
-            key_img = apply_rotary_emb(key_img,image_rotary_emb, sequence_dim=1, use_real_unbind_dim=-2)
+
+            query_img = apply_rotary_emb(query_img, image_rotary_emb, sequence_dim=1, use_real_unbind_dim=-2)
+            key_img = apply_rotary_emb(key_img, image_rotary_emb, sequence_dim=1, use_real_unbind_dim=-2)
             query = torch.cat([query[:, :text_seq_length, :, :], query_img], dim=1)
             key = torch.cat([key[:, :text_seq_length, :, :], key_img], dim=1)
 
@@ -555,7 +548,7 @@ def __init__(
         od_config: OmniDiffusionConfig,
     ):
         super().__init__()
-        
+
         patch_size = od_config.tf_model_config.patch_size
         in_channels = od_config.tf_model_config.in_channels
         out_channels = od_config.tf_model_config.out_channels
@@ -565,8 +558,6 @@ def __init__(
         condition_dim = od_config.tf_model_config.condition_dim
         prior_vq_quantizer_codebook_size = od_config.tf_model_config.prior_vq_quantizer_codebook_size
         text_embed_dim = od_config.tf_model_config.text_embed_dim
-        
-        
 
         # Get num_layers from config if available
         model_config = od_config.tf_model_config
diff --git a/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py b/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py
index f582c3b9b69..74a1ecac334 100644
--- a/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py
+++ b/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py
@@ -73,7 +73,7 @@ def get_glm_image_post_process_func(od_config: OmniDiffusionConfig):
     image_processor = VaeImageProcessor(vae_scale_factor=vae_scale_factor)
 
     def post_process_func(images: PIL.Image.Image):
-        return images
+        return image_processor.postprocess(images, output_type="pil")
 
     return post_process_func
 
@@ -951,8 +951,7 @@ def forward(self, req: OmniDiffusionRequest) -> DiffusionOutput:
         latents = latents * latents_std + latents_mean
         image = self.vae.decode(latents, return_dict=False, generator=generator)[0]
 
-        # 9. Post-process
-        image = self.image_processor.postprocess(image, output_type="pil")[0]
+        # 9. Leave post-process to vllm-omni pipeline
 
         return DiffusionOutput(output=image)
 

From 91bff0639366816426208949584edb102a1d4d86 Mon Sep 17 00:00:00 2001
From: JaredforReal <w13431838023@gmail.com>
Date: Thu, 15 Jan 2026 12:59:54 +0800
Subject: [PATCH 02/53] implement GLM Image vllm AR

Signed-off-by: JaredforReal <w13431838023@gmail.com>
---
 .../models/glm_image/glm_image_ar.py          | 1775 +++++++++++++++++
 1 file changed, 1775 insertions(+)
 create mode 100644 vllm_omni/diffusion/models/glm_image/glm_image_ar.py

diff --git a/vllm_omni/diffusion/models/glm_image/glm_image_ar.py b/vllm_omni/diffusion/models/glm_image/glm_image_ar.py
new file mode 100644
index 00000000000..98fd1dd19db
--- /dev/null
+++ b/vllm_omni/diffusion/models/glm_image/glm_image_ar.py
@@ -0,0 +1,1775 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# Adapted from
+# https://github.com/huggingface/transformers/blob/main/src/transformers/models/glm_image/modeling_glm_image.py
+# Copyright 2025 The vLLM team.
+# Copyright 2025 The ZhipuAI Team.
+# Copyright 2025 The HuggingFace Inc. team.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Inference-only GLM-Image model compatible with HuggingFace weights."""
+
+from collections.abc import Iterable, Mapping, Sequence
+from typing import Annotated, Literal
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers import BatchFeature
+from transformers.models.glm_image.configuration_glm_image import (
+    GlmImageConfig,
+    GlmImageTextConfig,
+    GlmImageVisionConfig,
+    GlmImageVQVAEConfig,
+)
+from vllm.attention.layer import Attention
+from vllm.config import CacheConfig, MultiModalConfig, VllmConfig
+from vllm.config.multimodal import BaseDummyOptions
+from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
+from vllm.distributed import utils as dist_utils
+from vllm.logger import init_logger
+from vllm.model_executor.layers.attention.mm_encoder_attention import (
+    MMEncoderAttention,
+)
+from vllm.model_executor.layers.conv import Conv2dLayer
+from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.model_executor.layers.rotary_embedding import get_rope
+from vllm.model_executor.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.model_executor.models.interfaces import (
+    SupportsMRoPE,
+    SupportsMultiModal,
+    SupportsPP,
+)
+from vllm.model_executor.models.utils import (
+    WeightsMapper,
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+)
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.inputs import (
+    MultiModalDataDict,
+    MultiModalFeatureSpec,
+    MultiModalFieldConfig,
+    MultiModalKwargsItems,
+)
+from vllm.multimodal.parse import MultiModalDataItems
+from vllm.multimodal.processing import (
+    BaseMultiModalProcessor,
+    BaseProcessingInfo,
+    PromptReplacement,
+    PromptUpdate,
+    PromptUpdateDetails,
+)
+from vllm.multimodal.profiling import BaseDummyInputsBuilder
+from vllm.sequence import IntermediateTensors
+from vllm.utils.tensor_schema import TensorSchema, TensorShape
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
+
+from .vision import get_vit_attn_backend
+
+logger = init_logger(__name__)
+
+
+# === Multimodal Processing ===
+
+
+class GlmImagePixelInputs(TensorSchema):
+    """
+    Schema for GLM-Image pixel inputs.
+
+    Dimensions:
+        - np: Number of patches (total across all images)
+        - cpp: channels * patch_size * patch_size
+        - ni: Number of images
+        - g: Grid dimensions (3 for temporal, height, width)
+    """
+
+    type: Literal["pixel_values"] = "pixel_values"
+
+    pixel_values: Annotated[torch.Tensor, TensorShape("np", "cpp")]
+    image_grid_thw: Annotated[torch.Tensor, TensorShape("ni", 3)]
+
+
+class GlmImageProcessingInfo(BaseProcessingInfo):
+    """
+    Processing information for GLM-Image model.
+
+    GLM-Image is an image generation model that uses:
+    - Vision encoder for encoding source images (image-to-image)
+    - VQ-VAE for tokenizing image features
+    - Text model for generating image tokens
+    """
+
+    def get_hf_config(self) -> GlmImageConfig:
+        return self.ctx.get_hf_config(GlmImageConfig)
+
+    def get_hf_processor(self, **kwargs: object):
+        # GLM-Image uses a processor similar to Qwen2-VL
+        # Try to get GlmImageProcessor if available
+        try:
+            from transformers import GlmImageProcessor
+
+            return self.ctx.get_hf_processor(GlmImageProcessor, **kwargs)
+        except ImportError:
+            # Fallback: return None and handle in processor
+            return None
+
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
+        # GLM-Image supports multiple source images for image-to-image generation
+        # or no image for text-to-image generation
+        # None means no limit on the number of images
+        return {"image": None}
+
+    def get_num_image_tokens(
+        self,
+        *,
+        image_width: int,
+        image_height: int,
+    ) -> int:
+        """
+        Calculate the number of image tokens for a given image size.
+
+        GLM-Image processes images through a patch embedding (patch size read
+        from the vision config), then quantizes through VQ-VAE. The number of tokens is:
+        (image_height // patch_size) * (image_width // patch_size)
+        """
+        hf_config = self.get_hf_config()
+        vision_config = hf_config.vision_config
+        patch_size = vision_config.patch_size
+
+        # Number of patches in each dimension
+        num_patches_h = image_height // patch_size
+        num_patches_w = image_width // patch_size
+
+        return num_patches_h * num_patches_w
+
+    def get_max_image_tokens(self) -> int:
+        """
+        Get the maximum number of image tokens.
+
+        Based on the configured image size and patch size (fallbacks: 2048 and 16).
+        """
+        hf_config = self.get_hf_config()
+        vision_config = hf_config.vision_config
+
+        # Default max size
+        image_size = getattr(vision_config, "image_size", 2048)
+        patch_size = getattr(vision_config, "patch_size", 16)
+
+        max_patches = (image_size // patch_size) ** 2
+        return max_patches
+
+    def get_image_size_with_most_features(self) -> tuple[int, int]:
+        """
+        Get the image size that produces the most features.
+
+        Returns:
+            Tuple of (width, height)
+        """
+        hf_config = self.get_hf_config()
+        vision_config = hf_config.vision_config
+        image_size = getattr(vision_config, "image_size", 2048)
+        return (image_size, image_size)
+
+
+class GlmImageDummyInputsBuilder(BaseDummyInputsBuilder[GlmImageProcessingInfo]):
+    """
+    Builds dummy inputs for GLM-Image model profiling.
+    """
+
+    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
+        """
+        Generate dummy text with image placeholders.
+
+        The placeholder is looked up from ``image_token_id``; ``<|image|>`` is the fallback.
+        """
+        num_images = mm_counts.get("image", 0)
+
+        hf_config = self.info.get_hf_config()
+        # Get image token from config or use default
+        image_token_id = getattr(hf_config, "image_token_id", 167855)
+
+        tokenizer = self.info.get_tokenizer()
+        # Try to get the image token string
+        try:
+            image_token = tokenizer.convert_ids_to_tokens(image_token_id)
+        except Exception:
+            image_token = "<|image|>"
+
+        return image_token * num_images
+
+    def get_dummy_mm_data(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
+    ) -> MultiModalDataDict:
+        """
+        Generate dummy multimodal data for profiling.
+        """
+        hf_config = self.info.get_hf_config()
+        vision_config = hf_config.vision_config
+
+        # Default image size from config
+        image_size = getattr(vision_config, "image_size", 2048)
+        width = height = image_size
+
+        num_images = mm_counts.get("image", 0)
+
+        image_overrides = mm_options.get("image") if mm_options else None
+
+        return {
+            "image": self._get_dummy_images(
+                width=width,
+                height=height,
+                num_images=num_images,
+                overrides=image_overrides,
+            )
+        }
+
+
+class GlmImageMultiModalProcessor(BaseMultiModalProcessor[GlmImageProcessingInfo]):
+    """
+    Multimodal processor for GLM-Image.
+
+    Handles:
+    - Image preprocessing and tokenization
+    - Prompt construction with image placeholders
+    - Grid dimension calculation for M-RoPE position encoding
+    """
+
+    def _call_hf_processor(
+        self,
+        prompt: str,
+        mm_data: Mapping[str, object],
+        mm_kwargs: Mapping[str, object],
+        tok_kwargs: Mapping[str, object],
+    ) -> BatchFeature:
+        """
+        Call the HuggingFace processor.
+
+        If no multimodal data is provided (text-to-image mode),
+        we only tokenize the text.
+        """
+        if not mm_data or not mm_data.get("image"):
+            # Text-to-image mode: just tokenize the prompt
+            tokenizer = self.info.get_tokenizer()
+            prompt_ids = tokenizer.encode(prompt)
+            return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt")
+
+        # Image-to-image mode: use full processor
+        return super()._call_hf_processor(
+            prompt=prompt,
+            mm_data=mm_data,
+            mm_kwargs=mm_kwargs,
+            tok_kwargs=tok_kwargs,
+        )
+
+    def _get_mm_fields_config(
+        self,
+        hf_inputs: BatchFeature,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> Mapping[str, MultiModalFieldConfig]:
+        """
+        Get the multimodal field configuration.
+        """
+        return dict(
+            pixel_values=MultiModalFieldConfig.batched("image"),
+            image_grid_thw=MultiModalFieldConfig.batched("image"),
+        )
+
+    def _get_prompt_updates(
+        self,
+        mm_items: MultiModalDataItems,
+        hf_processor_mm_kwargs: Mapping[str, object],
+        out_mm_kwargs: MultiModalKwargsItems,
+    ) -> Sequence[PromptUpdate]:
+        """
+        Get prompt updates for image tokens.
+
+        GLM-Image replaces each image placeholder with:
+        <|image_start|> + image_tokens + <|image_end|>
+        """
+        hf_config = self.info.get_hf_config()
+
+        # Get special token IDs from config
+        image_token_id = getattr(hf_config, "image_token_id", 167855)
+        image_start_id = getattr(hf_config, "image_start_token_id", 16384)
+        image_end_id = getattr(hf_config, "image_end_token_id", 16385)
+
+        # NOTE: the per-image grid (image_grid_thw) is not consulted here yet;
+        # every image is assumed to span the full image_size x image_size from config
+        vision_config = hf_config.vision_config
+        image_size = getattr(vision_config, "image_size", 2048)
+        patch_size = getattr(vision_config, "patch_size", 16)
+
+        # Default number of image tokens
+        num_image_tokens = (image_size // patch_size) ** 2
+        image_tokens = [image_token_id] * num_image_tokens
+
+        return [
+            PromptReplacement(
+                modality="image",
+                target=[image_token_id],
+                replacement=PromptUpdateDetails.select_token_id(
+                    [image_start_id] + image_tokens + [image_end_id],
+                    embed_token_id=image_token_id,
+                ),
+            )
+        ]
+
+
+# === VQ-VAE Components ===
+
+
+class GlmImageVQVAEVectorQuantizer(nn.Module):
+    """
+    Vector Quantizer module for GLM-Image VQ-VAE (Inference-optimized).
+
+    This module quantizes continuous latent vectors into discrete codebook vectors
+    using L2-normalized distance computation for better stability.
+
+    Key differences from Chameleon's VQ-VAE:
+    - GLM-Image uses L2 normalization on both input and codebook embeddings
+    - Distance is computed as cosine similarity in normalized space
+
+    Optimizations for inference (compared to transformers implementation):
+    1. Uses matmul + argmax(similarity) instead of einsum + argmin(distance)
+       - Mathematically equivalent: argmin(2 - 2*sim) = argmax(sim)
+       - More efficient and clearer for L2-normalized vectors
+    2. Removes redundant normalization (transformers normalizes twice)
+    3. Removes training-only components (loss, straight-through estimator, beta)
+    4. Directly returns quantized vectors without gradient preservation
+
+    Args:
+        config: GlmImageVQVAEConfig containing:
+            - num_embeddings: Number of codebook vectors (typically 16384)
+            - embed_dim: Dimension of each embedding vector (typically 2048)
+
+    Mathematical Verification:
+        For L2-normalized vectors where ||z|| = ||e|| = 1:
+        - distance = ||z - e||^2 = 2 - 2*(z·e) = 2(1 - cosine_similarity)
+        - Therefore: argmin(distance) ≡ argmax(cosine_similarity)
+        This equivalence has been verified numerically (see verify_vqvae_correctness.py)
+    """
+
+    def __init__(self, config: GlmImageVQVAEConfig):
+        super().__init__()
+        self.num_embeddings = config.num_embeddings
+        self.embedding_dim = config.embed_dim
+
+        self.embedding = nn.Embedding(self.num_embeddings, self.embedding_dim)
+
+    def forward(self, hidden_state: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+        """
+        Quantize the input hidden states.
+
+        Args:
+            hidden_state: Input tensor of shape (batch, channels, height, width)
+
+        Returns:
+            Tuple of:
+                - hidden_state_quant: Quantized tensor, same shape as input
+                - min_encoding_indices: Codebook indices of shape
+                  (batch * height * width,)
+        """
+        batch_size, channels, height, width = hidden_state.shape
+
+        # Permute to (batch, height, width, channels) and flatten for processing
+        hidden_state_flat = hidden_state.permute(0, 2, 3, 1).reshape(-1, self.embedding_dim)
+
+        # L2 normalize both hidden states and embeddings
+        # This is the key difference from Chameleon's implementation
+        hidden_state_normalized = F.normalize(hidden_state_flat, p=2, dim=-1)
+        embedding_normalized = F.normalize(self.embedding.weight, p=2, dim=-1)
+
+        # Compute cosine similarity (since both are L2 normalized)
+        # Higher similarity = closer match, so argmax picks the nearest code
+        # Using matmul for efficiency: (N, D) @ (D, K) -> (N, K)
+        similarity = torch.matmul(hidden_state_normalized, embedding_normalized.t())
+
+        # Find nearest codebook entry (highest similarity)
+        min_encoding_indices = torch.argmax(similarity, dim=1)
+
+        # Get quantized vectors using normalized embeddings
+        # For inference, we directly return the quantized vectors without
+        # straight-through estimator (no gradients needed)
+        hidden_state_quant = embedding_normalized[min_encoding_indices]
+
+        # Reshape back to (batch, height, width, channels)
+        # then (batch, channels, height, width)
+        hidden_state_quant = (
+            hidden_state_quant.view(batch_size, height, width, self.embedding_dim).permute(0, 3, 1, 2).contiguous()
+        )
+
+        return hidden_state_quant, min_encoding_indices
+
+
+class GlmImageVQVAE(nn.Module):
+    """
+    VQ-VAE module for GLM-Image.
+
+    Unlike Chameleon's VQ-VAE which includes a full encoder, GLM-Image's VQ-VAE
+    only contains:
+    - quant_conv: Projects from latent_channels to embed_dim
+    - quantize: Vector quantizer
+    - post_quant_conv: Projects from embed_dim back to latent_channels
+
+    The encoder functionality is handled by GlmImageVisionModel instead.
+
+    The constructor puts this module in eval mode, as the VQ-VAE is frozen during inference.
+
+    Args:
+        config: GlmImageVQVAEConfig
+    """
+
+    def __init__(self, config: GlmImageVQVAEConfig):
+        super().__init__()
+        self.config = config
+
+        # Vector quantizer
+        self.quantize = GlmImageVQVAEVectorQuantizer(config)
+
+        # Convolutions for projecting to/from embedding space
+        # Using vLLM's optimized Conv2dLayer
+        self.quant_conv = Conv2dLayer(
+            in_channels=config.latent_channels,
+            out_channels=config.embed_dim,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            bias=True,
+        )
+        self.post_quant_conv = Conv2dLayer(
+            in_channels=config.embed_dim,
+            out_channels=config.latent_channels,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            bias=True,
+        )
+
+        # VQ-VAE is always frozen in GLM-Image
+        self.eval()
+
+    def encode(self, hidden_states: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+        """
+        Encode input features into quantized latent codes.
+
+        Args:
+            hidden_states: Input tensor of shape (batch, latent_channels, height, width)
+                          This is typically the output from GlmImageVisionModel reshaped
+                          into spatial format.
+
+        Returns:
+            Tuple of:
+                - quant: Quantized tensor of shape (batch, embed_dim, height, width)
+                - indices: Codebook indices of shape (batch * height * width,)
+        """
+        # Project to embedding dimension
+        hidden_states = self.quant_conv(hidden_states)
+
+        # Quantize
+        quant, indices = self.quantize(hidden_states)
+
+        return quant, indices
+
+    @property
+    def dtype(self) -> torch.dtype:
+        """Get the dtype of the model."""
+        return self.quant_conv.weight.dtype
+
+    @property
+    def device(self) -> torch.device:
+        """Get the device of the model."""
+        return self.quant_conv.weight.device
+
+
+# === Vision Model Components ===
+
+
+class GlmImageVisionMLP(nn.Module):
+    """
+    MLP module for GLM-Image vision model.
+
+    Uses GELU activation with standard fc1 -> fc2 structure.
+    Key difference from Glm4vVisionMLP: uses GELU instead of SwiGLU.
+    """
+
+    def __init__(
+        self,
+        config: GlmImageVisionConfig,
+        quant_config: QuantizationConfig | None = None,
+        multimodal_config: MultiModalConfig | None = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data" if multimodal_config else False
+        self.fc1 = ColumnParallelLinear(
+            config.hidden_size,
+            config.intermediate_size,
+            bias=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.fc1",
+            disable_tp=use_data_parallel,
+        )
+        self.fc2 = RowParallelLinear(
+            config.intermediate_size,
+            config.hidden_size,
+            bias=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.fc2",
+            disable_tp=use_data_parallel,
+        )
+        self.act_fn = nn.GELU()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x, _ = self.fc1(x)
+        x = self.act_fn(x)
+        x, _ = self.fc2(x)
+        return x
+
+
+class GlmImageVisionAttention(nn.Module):
+    """
+    Multi-headed attention for GLM-Image vision model.
+
+    Key differences from Glm4vVisionAttention:
+    - No RoPE - uses learned position embeddings instead
+    - Uses standard qkv projection (not separate q, k, v)
+    - attention_bias from config controls bias in linear layers
+    """
+
+    def __init__(
+        self,
+        config: GlmImageVisionConfig,
+        quant_config: QuantizationConfig | None = None,
+        multimodal_config: MultiModalConfig | None = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data" if multimodal_config else False
+        self.tp_size = 1 if use_data_parallel else get_tensor_model_parallel_world_size()
+
+        self.embed_dim = config.hidden_size
+        self.num_heads = config.num_heads
+        self.head_dim = self.embed_dim // self.num_heads
+        attention_bias = getattr(config, "attention_bias", True)
+
+        self.num_heads_per_partition = dist_utils.divide(self.num_heads, self.tp_size)
+
+        # QKV projection - uses bias based on config
+        self.qkv = QKVParallelLinear(
+            hidden_size=self.embed_dim,
+            head_size=self.head_dim,
+            total_num_heads=self.num_heads,
+            total_num_kv_heads=self.num_heads,  # No GQA in vision model
+            bias=attention_bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.qkv",
+            disable_tp=use_data_parallel,
+        )
+        self.proj = RowParallelLinear(
+            input_size=self.embed_dim,
+            output_size=self.embed_dim,
+            bias=attention_bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.proj",
+            disable_tp=use_data_parallel,
+        )
+
+        # MMEncoderAttention for efficient vision attention
+        self.attn = MMEncoderAttention(
+            num_heads=self.num_heads_per_partition,
+            head_size=self.head_dim,
+            scale=self.head_dim**-0.5,
+            multimodal_config=multimodal_config,
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        max_seqlen: int | None = None,
+    ) -> torch.Tensor:
+        # hidden_states: [seq_len, hidden_size] (no batch dim)
+        seq_len = hidden_states.shape[0]
+
+        # QKV projection
+        qkv, _ = self.qkv(hidden_states)
+        q, k, v = qkv.chunk(3, dim=-1)
+
+        # Reshape for attention: [seq, hidden] -> [1, seq, heads, head_dim]
+        q = q.view(seq_len, self.num_heads_per_partition, self.head_dim).unsqueeze(0)
+        k = k.view(seq_len, self.num_heads_per_partition, self.head_dim).unsqueeze(0)
+        v = v.view(seq_len, self.num_heads_per_partition, self.head_dim).unsqueeze(0)
+
+        # No RoPE in GLM-Image vision model - position info comes from embeddings
+
+        # Apply attention
+        attn_output = self.attn(
+            query=q,
+            key=k,
+            value=v,
+            cu_seqlens=cu_seqlens,
+            max_seqlen=max_seqlen,
+        )
+
+        # Reshape back: [1, seq, heads, head_dim] -> [seq, hidden]
+        attn_output = attn_output.view(seq_len, -1)
+
+        # Output projection
+        output, _ = self.proj(attn_output)
+        return output
+
+
+class GlmImageVisionPatchEmbed(nn.Module):
+    """
+    Patch embedding for GLM-Image vision model.
+
+    Key difference from Glm4vVisionPatchEmbed:
+    - Uses 2D convolution (no temporal dimension)
+    - GLM-Image processes single images, not videos
+    """
+
+    def __init__(self, config: GlmImageVisionConfig) -> None:
+        super().__init__()
+        self.patch_size = config.patch_size
+        self.in_channels = config.in_channels
+        self.embed_dim = config.hidden_size
+
+        # 2D convolution for patch embedding
+        self.proj = Conv2dLayer(
+            in_channels=self.in_channels,
+            out_channels=self.embed_dim,
+            kernel_size=self.patch_size,
+            stride=self.patch_size,
+            padding=0,
+            bias=True,
+        )
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """
+        Args:
+            hidden_states: Packed pixel values of shape
+                [total_patches, in_channels * patch_size * patch_size]
+
+        Returns:
+            Patch embeddings of shape [total_patches, embed_dim]
+        """
+        target_dtype = self.proj.weight.dtype
+        # Reshape from [N, C*P*P] to [N, C, P, P]
+        hidden_states = hidden_states.view(-1, self.in_channels, self.patch_size, self.patch_size)
+        # Conv2d and flatten: [N, C, P, P] -> [N, embed_dim, 1, 1] -> [N, embed_dim]
+        hidden_states = self.proj(hidden_states.to(dtype=target_dtype)).view(-1, self.embed_dim)
+        return hidden_states
+
+
+class GlmImageVisionEmbeddings(nn.Module):
+    """
+    Vision embeddings for GLM-Image.
+
+    Holds a learned, square table of 2D position embeddings and resamples it
+    with bilinear interpolation so images of any patch-grid size can be embedded.
+
+    Key difference from Glm4vVisionEmbeddings (per the original author's note):
+    - Uses bilinear interpolation (not bicubic) for position embedding adaptation
+    """
+
+    def __init__(self, config: GlmImageVisionConfig) -> None:
+        super().__init__()
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.image_size = config.image_size
+        self.patch_size = config.patch_size
+
+        self.num_patches = (self.image_size // self.patch_size) ** 2  # patches on the square base grid
+        self.num_positions = self.num_patches  # one learned position vector per base-grid patch
+        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
+
+        # Interpolation mode handed to F.grid_sample in forward() (GLM-Image: bilinear; Glm4v reportedly bicubic)
+        self.interpolation_mode = "bilinear"
+
+    def forward(
+        self,
+        embeddings: torch.Tensor,
+        lengths: list[int] | torch.Tensor,
+        image_shapes: torch.Tensor,
+        h_coords: torch.Tensor,
+        w_coords: torch.Tensor,
+    ) -> torch.Tensor:
+        """
+        Add adapted position embeddings to patch embeddings.
+
+        Args:
+            embeddings: Patch embeddings [total_seq, embed_dim]
+            lengths: Number of patches per image; entry i is paired with row i of image_shapes
+            image_shapes: [num_images, 3] with (t, h, w) for each image; only h and w are read
+            h_coords: Height coordinates for each patch [total_seq]
+            w_coords: Width coordinates for each patch [total_seq]
+
+        Returns:
+            Embeddings with position encoding added [total_seq, embed_dim]
+        """
+        pos_embed_weight = self.position_embedding.weight
+        hidden_size = pos_embed_weight.shape[1]
+        total_seq = h_coords.shape[0]
+        device = pos_embed_weight.device
+
+        # Handle empty sequence case: nothing to sample, so an empty [0, hidden_size] tensor is added below
+        if total_seq == 0:
+            adapted_pos_embed = torch.empty(0, hidden_size, device=device, dtype=pos_embed_weight.dtype)
+        else:
+            # Convert to tensors if needed
+            if isinstance(lengths, list):
+                lengths = torch.tensor(lengths, device=device, dtype=torch.long)
+            if not isinstance(image_shapes, torch.Tensor):
+                image_shapes = torch.tensor(image_shapes, device=device, dtype=torch.long)
+
+            # Prepare 2D position embedding for interpolation
+            orig_size_sq = pos_embed_weight.shape[0]
+            orig_size = int(orig_size_sq**0.5)  # side of the table; the view below needs a perfect square
+            pos_embed_2d = (
+                pos_embed_weight.view(orig_size, orig_size, hidden_size)
+                .permute(2, 0, 1)  # [H, W, C] -> [C, H, W]
+                .unsqueeze(0)  # [1, C, H, W]
+                .to(device=device, dtype=torch.float32)  # sample in float32; cast back to the weight dtype below
+            )
+
+            # Per-patch grid size: image i's h (column 1) and w (column 2), each repeated lengths[i] times
+            target_h = torch.cat([image_shapes[i, 1].repeat(lengths[i]) for i in range(len(lengths))]).to(
+                device=device, dtype=torch.float32
+            )
+            target_w = torch.cat([image_shapes[i, 2].repeat(lengths[i]) for i in range(len(lengths))]).to(
+                device=device, dtype=torch.float32
+            )
+
+            # Map patch centers (coord + 0.5) into grid_sample's normalized [-1, 1] range
+            h_coords = h_coords.to(device=device, dtype=torch.float32)
+            w_coords = w_coords.to(device=device, dtype=torch.float32)
+            norm_w = ((w_coords + 0.5) / target_w) * 2 - 1
+            norm_h = ((h_coords + 0.5) / target_h) * 2 - 1
+
+            # Create sampling grid [1, total_seq, 1, 2]; last dim is (x, y) = (width, height)
+            grid = torch.stack((norm_w, norm_h), dim=-1).unsqueeze(0).unsqueeze(2)
+
+            # Bilinear interpolation (GLM-Image uses bilinear, not bicubic)
+            interpolated_embed = F.grid_sample(
+                pos_embed_2d,
+                grid,
+                mode=self.interpolation_mode,
+                align_corners=False,
+                padding_mode="border",
+            )
+
+            # Reshape: [1, C, total_seq, 1] -> [total_seq, C]
+            adapted_pos_embed = (interpolated_embed.squeeze(0).squeeze(-1).permute(1, 0)).to(pos_embed_weight.dtype)
+
+        # Add position embedding to patch embeddings (moved to the embeddings' device first)
+        embeddings = embeddings + adapted_pos_embed.to(embeddings.device)
+        return embeddings
+
+
+class GlmImageVisionBlock(nn.Module):
+    """
+    Transformer block for GLM-Image vision model.
+
+    Key differences from Glm4vVisionBlock (author's notes; the attention and MLP classes are defined elsewhere):
+    - Uses LayerNorm instead of RMSNorm
+    - No RoPE position embeddings (handled in GlmImageVisionEmbeddings)
+    - Uses GELU MLP instead of SwiGLU
+    """
+
+    def __init__(
+        self,
+        config: GlmImageVisionConfig,
+        quant_config: QuantizationConfig | None = None,
+        multimodal_config: MultiModalConfig | None = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)  # applied before attention
+        self.norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)  # applied before the MLP
+        self.attn = GlmImageVisionAttention(
+            config,
+            quant_config=quant_config,
+            multimodal_config=multimodal_config,
+            prefix=f"{prefix}.attn",
+        )
+        self.mlp = GlmImageVisionMLP(
+            config,
+            quant_config=quant_config,
+            multimodal_config=multimodal_config,
+            prefix=f"{prefix}.mlp",
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        max_seqlen: int | None = None,
+    ) -> torch.Tensor:
+        # Pre-norm attention with a residual add; cu_seqlens / max_seqlen are passed through to self.attn unchanged
+        residual = hidden_states
+        hidden_states = self.norm1(hidden_states)
+        hidden_states = self.attn(
+            hidden_states,
+            cu_seqlens=cu_seqlens,
+            max_seqlen=max_seqlen,
+        )
+        hidden_states = residual + hidden_states
+
+        # Pre-norm MLP with a residual add
+        residual = hidden_states
+        hidden_states = self.norm2(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+
+        return hidden_states
+
+
+class GlmImageVisionModel(nn.Module):
+    """
+    Vision encoder for GLM-Image: patch embedding, interpolated position embeddings, then config.depth blocks.
+
+    Key differences from Glm4vVisionTransformer (author's notes):
+    - No RoPE - uses learned position embeddings with bilinear interpolation
+    - No merger, downsample, or post-processing layers
+    - Uses LayerNorm instead of RMSNorm in blocks
+    - Intended for images only (no video); t from grid_thw is still used to split attention segments
+    """
+
+    def __init__(
+        self,
+        config: GlmImageVisionConfig,
+        quant_config: QuantizationConfig | None = None,
+        multimodal_config: MultiModalConfig | None = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.num_heads = config.num_heads
+        self.head_dim = self.hidden_size // self.num_heads
+        self.patch_size = config.patch_size
+        self.spatial_merge_size = config.spatial_merge_size  # only read by compute_position_ids for patch ordering
+
+        # Patch embedding
+        self.patch_embed = GlmImageVisionPatchEmbed(config)
+
+        # Position embeddings
+        self.embeddings = GlmImageVisionEmbeddings(config)
+
+        # Transformer blocks
+        self.blocks = nn.ModuleList(
+            [
+                GlmImageVisionBlock(
+                    config,
+                    quant_config=quant_config,
+                    multimodal_config=multimodal_config,
+                    prefix=f"{prefix}.blocks.{i}",
+                )
+                for i in range(config.depth)
+            ]
+        )
+
+        # Attention backend selection (compute_attn_mask_seqlen branches on the result)
+        self.attn_backend = get_vit_attn_backend(
+            head_size=self.head_dim,
+            dtype=torch.get_default_dtype(),
+            attn_backend_override=(multimodal_config.mm_encoder_attn_backend if multimodal_config else None),
+        )
+
+    @property
+    def dtype(self) -> torch.dtype:
+        return self.patch_embed.proj.weight.dtype  # follows the patch projection weights
+
+    @property
+    def device(self) -> torch.device:
+        return self.patch_embed.proj.weight.device  # follows the patch projection weights
+
+    def compute_position_ids(self, grid_thw: torch.Tensor) -> torch.Tensor:
+        """
+        Compute (row, column) position IDs per patch, ordered by spatial_merge_size x spatial_merge_size window.
+
+        Args:
+            grid_thw: [num_images, 3] with (t, h, w) for each image
+
+        Returns:
+            Position IDs [total_patches, 2] with (h_pos, w_pos) for each patch
+        """
+        pos_ids = []
+        for t, h, w in grid_thw:
+            # Row-index and column-index grids, both [h, w]; no device is given, so the default device is used
+            hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w)
+            wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1)
+
+            # Group into merge windows: [h/m, m, w/m, m] -> [h/m, w/m, m, m]; h and w must be divisible by m
+            hpos_ids = (
+                hpos_ids.reshape(
+                    h // self.spatial_merge_size,
+                    self.spatial_merge_size,
+                    w // self.spatial_merge_size,
+                    self.spatial_merge_size,
+                )
+                .permute(0, 2, 1, 3)
+                .flatten()
+            )
+
+            wpos_ids = (
+                wpos_ids.reshape(
+                    h // self.spatial_merge_size,
+                    self.spatial_merge_size,
+                    w // self.spatial_merge_size,
+                    self.spatial_merge_size,
+                )
+                .permute(0, 2, 1, 3)
+                .flatten()
+            )
+
+            # Stack into [h * w, 2] and repeat the same spatial IDs for each of the t frames
+            pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1))
+
+        return torch.cat(pos_ids, dim=0)
+
+    def compute_attn_mask_seqlen(
+        self,
+        cu_seqlens: torch.Tensor,
+    ) -> int | None:
+        """Return the longest segment length for the flash-attention backends, or None for any other backend."""
+        if (
+            self.attn_backend == AttentionBackendEnum.FLASH_ATTN
+            or self.attn_backend == AttentionBackendEnum.ROCM_AITER_FA
+        ):
+            return (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()  # .item() yields a Python number
+        return None
+
+    def forward(
+        self,
+        pixel_values: torch.Tensor,
+        grid_thw: torch.Tensor,
+    ) -> torch.Tensor:
+        """
+        Forward pass through vision encoder.
+
+        Args:
+            pixel_values: Packed pixel values
+                [total_patches, num_channels * patch_size * patch_size]
+            grid_thw: [num_images, 3] with (t, h, w) for each image
+
+        Returns:
+            Hidden states [total_patches, hidden_size]
+        """
+        # Patch embedding (input is first moved to the patch projection's device and dtype)
+        hidden_states = self.patch_embed(pixel_values.to(self.device, self.dtype))
+
+        # Compute position IDs
+        position_ids = self.compute_position_ids(grid_thw)
+
+        # Cumulative segment boundaries for attention: one segment of h * w patches per frame (t per image)
+        cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]).cumsum(
+            dim=0, dtype=torch.int32
+        )
+        cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0)  # prepend 0 so consecutive pairs delimit segments
+        cu_seqlens = cu_seqlens.to(self.device)
+
+        # Segment lengths. NOTE(review): embeddings pair entry i with grid_thw row i, so this assumes every t == 1
+        seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
+
+        # Add position embeddings (column 0 of position_ids is the row index, column 1 the column index)
+        hidden_states = self.embeddings(
+            hidden_states,
+            seqlens,
+            grid_thw,
+            position_ids[:, 0].to(hidden_states.device),
+            position_ids[:, 1].to(hidden_states.device),
+        )
+
+        # Compute max seqlen for flash attention (None for other backends)
+        max_seqlen = self.compute_attn_mask_seqlen(cu_seqlens)
+
+        # Transformer blocks
+        for blk in self.blocks:
+            hidden_states = blk(
+                hidden_states,
+                cu_seqlens=cu_seqlens,
+                max_seqlen=max_seqlen,
+            )
+
+        return hidden_states
+
+
+# === Text Model Components ===
+
+
+class GlmImageTextMLP(nn.Module):
+    """
+    Gated MLP for the GLM-Image text model: merged gate/up projection, SiluAndMul, then down projection.
+
+    Only ``hidden_act == "silu"`` is accepted; any other value raises ValueError in ``__init__``.
+    """
+
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str,
+        quant_config: QuantizationConfig | None = None,
+        bias: bool = False,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.gate_up_proj = MergedColumnParallelLinear(
+            input_size=hidden_size,
+            output_sizes=[intermediate_size] * 2,  # gate and up halves share one merged projection
+            bias=bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.gate_up_proj",
+        )
+        self.down_proj = RowParallelLinear(
+            input_size=intermediate_size,
+            output_size=hidden_size,
+            bias=bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.down_proj",
+        )
+        if hidden_act != "silu":  # checked only after both projections have been constructed
+            raise ValueError(f"Unsupported activation: {hidden_act}. Only silu is supported for GLM-Image.")
+        # Import here to avoid circular dependency
+        from vllm.model_executor.layers.activation import SiluAndMul
+
+        self.act_fn = SiluAndMul()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        gate_up, _ = self.gate_up_proj(x)  # the second value returned by each projection is discarded
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(x)
+        return x
+
+
+class GlmImageTextAttention(nn.Module):
+    """
+    Multi-headed attention for GLM-Image text model.
+
+    Uses Grouped Query Attention (GQA) with M-RoPE position embeddings; heads are divided across tensor-parallel ranks.
+    """
+
+    def __init__(
+        self,
+        config: GlmImageTextConfig,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        max_position_embeddings: int = 32768,
+        quant_config: QuantizationConfig | None = None,
+        bias: bool = True,
+        cache_config: CacheConfig | None = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size  # query heads held by this rank
+        self.total_num_kv_heads = num_kv_heads
+        if self.total_num_kv_heads >= tp_size:  # enough KV heads to split evenly across ranks
+            assert self.total_num_kv_heads % tp_size == 0
+        else:  # fewer KV heads than ranks: tp_size must be a multiple of the KV head count
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)  # at least one KV head per rank
+        self.head_dim = hidden_size // self.total_num_heads
+        self.q_size = self.num_heads * self.head_dim  # per-rank width of the query slice of qkv
+        self.kv_size = self.num_kv_heads * self.head_dim  # per-rank width of each of the key / value slices
+        self.scaling = self.head_dim**-0.5
+        self.max_position_embeddings = max_position_embeddings
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size=hidden_size,
+            head_size=self.head_dim,
+            total_num_heads=self.total_num_heads,
+            total_num_kv_heads=self.total_num_kv_heads,
+            bias=bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
+        )
+        self.o_proj = RowParallelLinear(
+            input_size=self.total_num_heads * self.head_dim,
+            output_size=hidden_size,
+            bias=False,  # the output projection never has a bias, independent of the `bias` argument
+            quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
+        )
+
+        # Rotary embedding built from config.rope_parameters (None if absent); M-RoPE / 3D positions per the author
+        rope_parameters = getattr(config, "rope_parameters", None)
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            max_position=max_position_embeddings,
+            rope_parameters=rope_parameters,
+        )
+
+        self.attn = Attention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.attn",
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)  # slice the fused projection
+        q, k = self.rotary_emb(positions, q, k)  # rotary embedding is applied to queries and keys only
+        attn_output = self.attn(q, k, v)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class GlmImageTextDecoderLayer(nn.Module):
+    """
+    Decoder layer for GLM-Image text model.
+
+    Key difference from standard LLaMA-style decoder:
+    - Uses 4 RMSNorm layers instead of 2:
+      - input_layernorm: before self-attention
+      - post_self_attn_layernorm: after self-attention, before residual add
+      - post_attention_layernorm: before MLP
+      - post_mlp_layernorm: after MLP, before residual add
+    """
+
+    def __init__(
+        self,
+        config: GlmImageTextConfig,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        max_position_embeddings = getattr(config, "max_position_embeddings", 32768)  # fallback if the config lacks it
+        attention_bias = getattr(config, "attention_bias", True)  # QKV bias defaults to enabled
+
+        self.self_attn = GlmImageTextAttention(
+            config=config,
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            num_kv_heads=getattr(config, "num_key_value_heads", config.num_attention_heads),
+            max_position_embeddings=max_position_embeddings,
+            quant_config=quant_config,
+            bias=attention_bias,
+            cache_config=cache_config,
+            prefix=f"{prefix}.self_attn",
+        )
+        self.mlp = GlmImageTextMLP(
+            hidden_size=self.hidden_size,
+            intermediate_size=config.intermediate_size,
+            hidden_act=config.hidden_act,
+            quant_config=quant_config,
+            bias=False,
+            prefix=f"{prefix}.mlp",
+        )
+
+        # GLM-Image uses 4 RMSNorm layers per decoder layer
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_self_attn_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_mlp_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        residual: torch.Tensor | None,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        # With residual=None the input itself is the residual; otherwise RMSNorm's two-argument form is used
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.input_layernorm(hidden_states)
+        else:
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)  # presumably add + norm; confirm
+
+        # Self Attention
+        hidden_states = self.self_attn(positions=positions, hidden_states=hidden_states)
+
+        # Post self-attention norm and residual add
+        hidden_states = self.post_self_attn_layernorm(hidden_states)
+        hidden_states = residual + hidden_states
+
+        # MLP (pre-norm, MLP, post-norm, then an explicit residual add)
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = self.post_mlp_layernorm(hidden_states)
+        hidden_states = residual + hidden_states
+
+        # The residual is already folded in, so the second element is always None despite the annotation
+        return hidden_states, None
+
+
+class GlmImageTextModel(nn.Module):
+    """
+    Text model (language backbone) for GLM-Image.
+
+    This is the decoder-only transformer that generates discrete image tokens.
+    Uses M-RoPE (3D position encoding) for multimodal position awareness.
+    """
+
+    def __init__(
+        self,
+        *,
+        vllm_config: VllmConfig,
+        config: GlmImageTextConfig,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        self.config = config
+        self.quant_config = vllm_config.quant_config
+        cache_config = vllm_config.cache_config
+
+        self.vocab_size = config.vocab_size
+        self.hidden_size = config.hidden_size
+
+        # Embedding layer (created only on the first pipeline-parallel rank; None elsewhere)
+        if get_pp_group().is_first_rank:
+            self.embed_tokens = VocabParallelEmbedding(
+                self.vocab_size,
+                config.hidden_size,
+            )
+        else:
+            self.embed_tokens = None
+
+        # Decoder layers; make_layers also returns the [start_layer, end_layer) range this rank iterates in forward()
+        self.start_layer, self.end_layer, self.layers = make_layers(
+            config.num_hidden_layers,
+            lambda prefix: GlmImageTextDecoderLayer(
+                config=config,
+                cache_config=cache_config,
+                quant_config=vllm_config.quant_config,
+                prefix=prefix,
+            ),
+            prefix=f"{prefix}.layers",
+        )
+
+        # Final norm (created only on the last pipeline-parallel rank; None elsewhere)
+        if get_pp_group().is_last_rank:
+            self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        else:
+            self.norm = None
+
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states", "residual"], config.hidden_size
+        )
+
+    def get_input_embeddings(self) -> VocabParallelEmbedding:
+        return self.embed_tokens  # None on non-first pipeline-parallel ranks (see __init__)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor | None,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        if get_pp_group().is_first_rank:
+            if inputs_embeds is not None:  # precomputed embeddings take precedence over input_ids
+                hidden_states = inputs_embeds
+            else:
+                hidden_states = self.embed_tokens(input_ids)
+            residual = None
+        else:
+            assert intermediate_tensors is not None  # later ranks resume from the previous rank's tensors
+            hidden_states = intermediate_tensors["hidden_states"]
+            residual = intermediate_tensors["residual"]
+
+        for layer in self.layers[self.start_layer : self.end_layer]:
+            hidden_states, residual = layer(positions, hidden_states, residual)
+
+        if not get_pp_group().is_last_rank:  # NOTE(review): residual is None here, as the layers return None
+            return IntermediateTensors({"hidden_states": hidden_states, "residual": residual})
+
+        hidden_states = self.norm(hidden_states)  # final norm, last rank only
+        return hidden_states
+
+
+class GlmImageModel(nn.Module):
+    """
+    GLM-Image model that combines Vision Encoder, VQ-VAE, and Text Model.
+
+    This model is used for image generation tasks:
+    - Image-to-Image: Source image → Vision Encoder → VQ-VAE tokens → Text Model
+    - Text-to-Image: Text tokens → Text Model → Generate image tokens
+
+    Key components:
+    - visual: GlmImageVisionModel for encoding input images
+    - vqmodel: GlmImageVQVAE for tokenizing image features
+    - language_model: GlmImageTextModel for text/token generation
+
+    The model uses M-RoPE (3D position encoding) for multimodal position awareness:
+    - temporal: constant for image tokens, incremental for text
+    - height: row position for image tokens
+    - width: column position for image tokens
+    """
+
+    def __init__(
+        self,
+        *,
+        vllm_config: VllmConfig,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        config = vllm_config.model_config.hf_config
+        quant_config = vllm_config.quant_config
+        multimodal_config = vllm_config.model_config.multimodal_config
+
+        self.config = config
+        self.multimodal_config = multimodal_config
+
+        # Vision encoder
+        self.visual = GlmImageVisionModel(
+            config.vision_config,
+            quant_config=quant_config,
+            multimodal_config=multimodal_config,
+            prefix=f"{prefix}.visual" if prefix else "visual",
+        )
+
+        # VQ-VAE for image tokenization (frozen per the author; nothing in this class freezes it)
+        self.vqmodel = GlmImageVQVAE(config.vq_config)
+
+        # Text/Language model
+        self.language_model = GlmImageTextModel(
+            vllm_config=vllm_config,
+            config=config.text_config,
+            prefix=f"{prefix}.language_model" if prefix else "language_model",
+        )
+
+        # Store special token IDs (only image_token_id is read inside this class, in forward())
+        self.image_token_id = config.image_token_id
+        self.image_start_token_id = config.image_start_token_id
+        self.image_end_token_id = config.image_end_token_id
+
+        self.make_empty_intermediate_tensors = self.language_model.make_empty_intermediate_tensors
+
+    def get_input_embeddings(self) -> VocabParallelEmbedding:
+        return self.language_model.get_input_embeddings()  # delegates to the text model's embedding layer
+
+    def get_image_features(
+        self,
+        pixel_values: torch.Tensor,
+        image_grid_thw: torch.Tensor,
+    ) -> torch.Tensor:
+        """
+        Extract image features using the vision encoder.
+
+        Args:
+            pixel_values: Packed pixel values
+                [total_patches, num_channels * patch_size * patch_size]
+            image_grid_thw: [num_images, 3] with (t, h, w) for each image
+
+        Returns:
+            Image features [total_patches, hidden_size]
+        """
+        return self.visual(pixel_values, image_grid_thw)
+
+    def get_image_tokens(
+        self,
+        hidden_states: torch.Tensor,
+        image_grid_thw: torch.Tensor,
+    ) -> torch.Tensor:
+        """
+        Tokenize image features into discrete tokens using VQ-VAE.
+
+        Args:
+            hidden_states: Image features [total_patches, hidden_size]
+            image_grid_thw: [num_images, 3] with (t, h, w) for each image
+
+        Returns:
+            Discrete token indices [total_patches]
+        """
+        hidden_size = hidden_states.shape[-1]
+        split_sizes = (image_grid_thw.prod(dim=-1)).tolist()  # t * h * w patches per image
+        hidden_states_list = torch.split(hidden_states, split_sizes, dim=0)
+
+        all_image_tokens = []
+        for i, hs in enumerate(hidden_states_list):
+            grid_t, grid_h, grid_w = image_grid_thw[i].tolist()
+            # Reshape to spatial format: [t, h, w, c] -> [t, c, h, w]
+            hs = hs.view(grid_t, grid_h, grid_w, hidden_size)
+            hs = hs.permute(0, 3, 1, 2).contiguous()
+            # Encode with VQ-VAE; only the second returned value (the indices) is kept
+            _, indices = self.vqmodel.encode(hs)
+            all_image_tokens.append(indices)
+
+        return torch.cat(all_image_tokens, dim=0)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor | None,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+        pixel_values: torch.Tensor | None = None,
+        image_grid_thw: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        """
+        Forward pass through the GLM-Image model.
+
+        For image-to-image generation:
+        1. Encode source images with vision encoder
+        2. Tokenize features with VQ-VAE
+        3. Replace placeholder tokens with actual image tokens
+        4. Run through language model
+
+        Args:
+            input_ids: Input token IDs [batch_size, seq_len]
+            positions: Position IDs, shape (3, seq_len) for M-RoPE
+            intermediate_tensors: For pipeline parallelism
+            inputs_embeds: Pre-computed embeddings (optional); when given, the source-image path is skipped
+            pixel_values: Source image pixels (for image-to-image)
+            image_grid_thw: Grid dimensions for source images
+
+        Returns:
+            Hidden states or intermediate tensors for PP
+        """
+        # Handle intermediate tensors for pipeline parallelism (later ranks skip embedding and image handling)
+        if intermediate_tensors is not None:
+            return self.language_model(
+                input_ids=None,
+                positions=positions,
+                intermediate_tensors=intermediate_tensors,
+                inputs_embeds=None,
+            )
+
+        # Encode source images (image-to-image) only if input_ids will be embedded below; inputs_embeds bypass them
+        if inputs_embeds is None and pixel_values is not None and image_grid_thw is not None:
+            # Encode images
+            image_features = self.get_image_features(pixel_values, image_grid_thw)
+            # Tokenize with VQ-VAE
+            image_tokens = self.get_image_tokens(image_features, image_grid_thw)
+            image_tokens = image_tokens.to(input_ids.device)
+
+            # Replace placeholder tokens with actual image tokens (counts must match or the assignment raises)
+            special_image_mask = input_ids == self.image_token_id
+            if special_image_mask.any():
+                input_ids = input_ids.clone()  # avoid mutating the caller's tensor
+                input_ids[special_image_mask] = image_tokens
+
+        # Get embeddings
+        if inputs_embeds is None:
+            inputs_embeds = self.get_input_embeddings()(input_ids)
+            input_ids = None
+
+        # Forward through language model
+        hidden_states = self.language_model(
+            input_ids=input_ids,
+            positions=positions,
+            intermediate_tensors=intermediate_tensors,
+            inputs_embeds=inputs_embeds,
+        )
+
+        return hidden_states
+
+
+@MULTIMODAL_REGISTRY.register_processor(
+    GlmImageMultiModalProcessor,
+    info=GlmImageProcessingInfo,
+    dummy_inputs=GlmImageDummyInputsBuilder,
+)
+class GlmImageForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP, SupportsMRoPE):
+    """
+    GLM-Image model for conditional image generation.
+
+    This is the main entry point for GLM-Image in vLLM. It wraps:
+    - GlmImageModel (Vision Encoder + VQ-VAE + Text Model)
+    - LM Head for token prediction
+
+    Supports:
+    - Multimodal inputs (images for image-to-image generation)
+    - M-RoPE (3D position encoding) for multimodal generation
+    - Pipeline Parallelism
+    - Image-to-Image and Text-to-Image generation
+    """
+
+    packed_modules_mapping = {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        "gate_up_proj": ["gate_up_proj"],
+    }
+
+    # Weight mapping from HuggingFace to vLLM format
+    hf_to_vllm_mapper = WeightsMapper(
+        orig_to_new_prefix={
+            "lm_head.": "lm_head.",
+            "model.language_model.": "model.language_model.",
+            "model.visual.": "model.visual.",
+            "model.vqmodel.": "model.vqmodel.",
+        }
+    )
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
+        super().__init__()
+        config: GlmImageConfig = vllm_config.model_config.hf_config
+        quant_config = vllm_config.quant_config
+
+        self.config = config
+        self.vllm_config = vllm_config
+
+        # Main model (Vision + VQ-VAE + Text)
+        self.model = GlmImageModel(
+            vllm_config=vllm_config,
+            prefix=f"{prefix}.model" if prefix else "model",
+        )
+
+        # LM head for token prediction
+        # Sized by text_config.vision_vocab_size, not the full vocab (16512 per the original note; TODO confirm)
+        self.lm_head = ParallelLMHead(
+            config.text_config.vision_vocab_size,
+            config.text_config.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.lm_head" if prefix else "lm_head",
+        )
+
+        # Logits processor over the same vision_vocab_size, with no soft cap
+        self.logits_processor = LogitsProcessor(
+            config.text_config.vision_vocab_size,
+            soft_cap=None,
+        )
+
+        self.make_empty_intermediate_tensors = self.model.make_empty_intermediate_tensors  # reuse the inner factory
+
+    def get_input_embeddings(self) -> VocabParallelEmbedding:
+        return self.model.get_input_embeddings()  # delegates to the wrapped GlmImageModel
+
+    def get_image_features(
+        self,
+        pixel_values: torch.Tensor,
+        image_grid_thw: torch.Tensor,
+    ) -> torch.Tensor:
+        """Extract image features by delegating to ``self.model.get_image_features`` (the vision encoder)."""
+        return self.model.get_image_features(pixel_values, image_grid_thw)
+
+    def get_image_tokens(
+        self,
+        hidden_states: torch.Tensor,
+        image_grid_thw: torch.Tensor,
+    ) -> torch.Tensor:
+        """Tokenize image features by delegating to ``self.model.get_image_tokens`` (the VQ-VAE encoder)."""
+        return self.model.get_image_tokens(hidden_states, image_grid_thw)
+
+    def get_mrope_input_positions(
+        self,
+        input_tokens: list[int],
+        mm_features: list[MultiModalFeatureSpec],
+    ) -> tuple[torch.Tensor, int]:
+        """
+        Build the three-row M-RoPE position tensor for one prompt.
+
+        The rows are (temporal, height, width). A running position ``p``
+        starts at 0 and is consumed as follows:
+        - A token outside an image block gets ``p`` on all three rows and
+          ``p`` grows by one. This includes the ``image_start_token_id`` marker
+          itself.
+        - After an ``image_start_token_id`` (while unused grids remain) the
+          next ``h * w`` tokens form an image block:
+          - temporal: ``p`` for the whole block
+          - height: ``p + row`` in row-major order over the ``h x w`` grid
+          - width: ``p + column``
+          and ``p`` then grows by ``max(h, w)``.
+
+        Args:
+            input_tokens: List of input token IDs
+            mm_features: Multimodal feature specifications; only their
+                ``image_grid_thw`` entries are read
+
+        Returns:
+            Tuple of (position_ids with 3 rows, mrope_position_delta), the
+            delta being ``position_ids.max() + 1 - len(input_tokens)``
+        """
+        # Gather image grid info from multimodal features
+        grid_kwargs = MultiModalFeatureSpec.gather_kwargs(
+            mm_features,
+            {"image_grid_thw"},
+        )
+        grids = [grid.tolist() for grid in grid_kwargs.get("image_grid_thw", [])]
+
+        image_start_token_id = self.config.image_start_token_id
+        num_tokens = len(input_tokens)
+
+        if not grids:
+            # Pure text - all dimensions same
+            llm_positions = torch.arange(num_tokens).view(1, -1).expand(3, -1)
+        else:
+            pieces: list[torch.Tensor] = []
+            cursor = 0  # running position ``p``
+            grid_no = 0  # index of the next unused image grid
+            index = 0  # index of the token being examined
+
+            while index < num_tokens:
+                token = input_tokens[index]
+
+                # Whatever the token is, it first receives one column with
+                # the running position on all three rows.
+                pieces.append(torch.tensor([[cursor]] * 3))
+                cursor += 1
+                index += 1
+
+                if token != image_start_token_id or grid_no >= len(grids):
+                    # Any other token (or a start marker with no grid left)
+                    # needs nothing more.
+                    continue
+
+                # Image block: lay out the h x w grid anchored at the cursor.
+                _, rows, cols = grids[grid_no]
+                block_len = rows * cols
+                row_major = torch.arange(rows).unsqueeze(1).expand(rows, cols).flatten()
+                col_major = torch.arange(cols).unsqueeze(0).expand(rows, cols).flatten()
+                block = torch.stack(
+                    [
+                        torch.full((block_len,), cursor),
+                        row_major + cursor,
+                        col_major + cursor,
+                    ],
+                    dim=0,
+                )
+                pieces.append(block)
+
+                # The block's tokens are skipped in the input without being inspected.
+                index += block_len
+                cursor += max(rows, cols)
+                grid_no += 1
+
+            llm_positions = torch.cat(pieces, dim=1)
+
+        # Offset between the largest position used and the prompt length.
+        mrope_position_delta = (llm_positions.max() + 1 - num_tokens).item()
+        return llm_positions, mrope_position_delta
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+        pixel_values: torch.Tensor | None = None,
+        image_grid_thw: torch.Tensor | None = None,
+        **kwargs: object,
+    ) -> torch.Tensor | IntermediateTensors:
+        """
+        Run the wrapped ``self.model`` and return its result unchanged.
+
+        Args:
+            input_ids: Input token IDs.
+            positions: Position IDs.
+            intermediate_tensors: When not ``None``, ``inputs_embeds`` is
+                discarded and ``None`` is forwarded in its place.
+            inputs_embeds: Pre-computed embeddings (ignored as described above).
+            pixel_values: Forwarded to ``self.model`` as-is.
+            image_grid_thw: Forwarded to ``self.model`` as-is.
+            **kwargs: Accepted but not forwarded.
+
+        Returns:
+            Whatever ``self.model`` returns.
+        """
+        # Caller-supplied embeddings are only honoured when no intermediate
+        # tensors were passed in.
+        embeds = inputs_embeds if intermediate_tensors is None else None
+        return self.model(
+            input_ids=input_ids,
+            positions=positions,
+            intermediate_tensors=intermediate_tensors,
+            inputs_embeds=embeds,
+            pixel_values=pixel_values,
+            image_grid_thw=image_grid_thw,
+        )
+
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor | None:
+        """Compute logits by calling ``self.logits_processor`` with ``self.lm_head``."""
+        # The LM head module is passed as the processor's first argument and
+        # the hidden states as the second; the result is returned untouched.
+        processor = self.logits_processor
+        head = self.lm_head
+        return processor(head, hidden_states)
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        """
+        Load checkpoint weights into this module's parameters.
+
+        Names containing ``q_proj``/``k_proj``/``v_proj`` are routed to the
+        fused ``qkv_proj`` parameter and ``gate_proj``/``up_proj`` to
+        ``gate_up_proj``, each with its shard id. When the fused target does
+        not exist the original name is tried as a regular parameter; weights
+        matching no parameter are skipped. Returns the names actually loaded.
+        """
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+
+        params_dict = dict(self.named_parameters(remove_duplicate=False))
+        loaded_params: set[str] = set()
+
+        for name, loaded_weight in weights:
+            # Resolve the destination: the first stacked mapping that matches
+            # wins, but only if its fused parameter really exists.
+            target, shard_args = name, ()
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                fused_name = name.replace(weight_name, param_name)
+                if fused_name in params_dict:
+                    target, shard_args = fused_name, (shard_id,)
+                break
+
+            if target not in params_dict:
+                # No parameter receives this weight, so it must not be
+                # reported as loaded.
+                continue
+
+            param = params_dict[target]
+            weight_loader = getattr(param, "weight_loader", default_weight_loader)
+            weight_loader(param, loaded_weight, *shard_args)
+            loaded_params.add(target)
+
+        return loaded_params

From 21df56b1e2b610f268e63c8189cc233c91f25f5c Mon Sep 17 00:00:00 2001
From: JaredforReal <w13431838023@gmail.com>
Date: Thu, 15 Jan 2026 13:10:59 +0800
Subject: [PATCH 03/53] init multistage

Signed-off-by: JaredforReal <w13431838023@gmail.com>
---
 .../models/glm_image/glm_image_ar.py                             | 0
 vllm_omni/model_executor/stage_configs/glm_image.yaml            | 1 +
 .../model_executor/stage_configs/glm_image_muilticonnector.yaml  | 1 +
 vllm_omni/model_executor/stage_input_processors/glm_image.py     | 1 +
 4 files changed, 3 insertions(+)
 rename vllm_omni/{diffusion => model_executor}/models/glm_image/glm_image_ar.py (100%)
 create mode 100644 vllm_omni/model_executor/stage_configs/glm_image.yaml
 create mode 100644 vllm_omni/model_executor/stage_configs/glm_image_muilticonnector.yaml
 create mode 100644 vllm_omni/model_executor/stage_input_processors/glm_image.py

diff --git a/vllm_omni/diffusion/models/glm_image/glm_image_ar.py b/vllm_omni/model_executor/models/glm_image/glm_image_ar.py
similarity index 100%
rename from vllm_omni/diffusion/models/glm_image/glm_image_ar.py
rename to vllm_omni/model_executor/models/glm_image/glm_image_ar.py
diff --git a/vllm_omni/model_executor/stage_configs/glm_image.yaml b/vllm_omni/model_executor/stage_configs/glm_image.yaml
new file mode 100644
index 00000000000..e1ad7ddae76
--- /dev/null
+++ b/vllm_omni/model_executor/stage_configs/glm_image.yaml
@@ -0,0 +1 @@
+# init placeholder
diff --git a/vllm_omni/model_executor/stage_configs/glm_image_muilticonnector.yaml b/vllm_omni/model_executor/stage_configs/glm_image_muilticonnector.yaml
new file mode 100644
index 00000000000..e1ad7ddae76
--- /dev/null
+++ b/vllm_omni/model_executor/stage_configs/glm_image_muilticonnector.yaml
@@ -0,0 +1 @@
+# init placeholder
diff --git a/vllm_omni/model_executor/stage_input_processors/glm_image.py b/vllm_omni/model_executor/stage_input_processors/glm_image.py
new file mode 100644
index 00000000000..e1ad7ddae76
--- /dev/null
+++ b/vllm_omni/model_executor/stage_input_processors/glm_image.py
@@ -0,0 +1 @@
+# init placeholder

From b02a12a3d80f8ed44223d4935de64ee585dada59 Mon Sep 17 00:00:00 2001
From: JaredforReal <w13431838023@gmail.com>
Date: Thu, 15 Jan 2026 06:30:47 +0000
Subject: [PATCH 04/53] revert attention_mask in GlmImageAttention forward()

Signed-off-by: JaredforReal <w13431838023@gmail.com>
---
 vllm_omni/diffusion/models/glm_image/glm_image_transformer.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/vllm_omni/diffusion/models/glm_image/glm_image_transformer.py b/vllm_omni/diffusion/models/glm_image/glm_image_transformer.py
index d783a11b319..f3f8f98ff17 100644
--- a/vllm_omni/diffusion/models/glm_image/glm_image_transformer.py
+++ b/vllm_omni/diffusion/models/glm_image/glm_image_transformer.py
@@ -366,6 +366,7 @@ def forward(
         hidden_states: torch.Tensor,
         encoder_hidden_states: torch.Tensor,
         image_rotary_emb: tuple[torch.Tensor, torch.Tensor] | None = None,
+        attention_mask: torch.Tensor | None = None,
         kv_cache: GlmImageLayerKVCache | None = None,
         kv_cache_mode: KVCacheMode | None = None,
     ) -> tuple[torch.Tensor, torch.Tensor]:

From d5697906333726c70b619943ff970df086a4af30 Mon Sep 17 00:00:00 2001
From: JaredforReal <w13431838023@gmail.com>
Date: Thu, 15 Jan 2026 06:40:18 +0000
Subject: [PATCH 05/53] init and registry

Signed-off-by: JaredforReal <w13431838023@gmail.com>
---
 vllm_omni/model_executor/models/glm_image/__init__.py | 3 +++
 vllm_omni/model_executor/models/registry.py           | 5 +++++
 2 files changed, 8 insertions(+)
 create mode 100644 vllm_omni/model_executor/models/glm_image/__init__.py

diff --git a/vllm_omni/model_executor/models/glm_image/__init__.py b/vllm_omni/model_executor/models/glm_image/__init__.py
new file mode 100644
index 00000000000..d37044c09f1
--- /dev/null
+++ b/vllm_omni/model_executor/models/glm_image/__init__.py
@@ -0,0 +1,3 @@
+from .glm_image_ar import GlmImageForConditionalGeneration
+
+__all__ = ["GlmImageForConditionalGeneration"]
diff --git a/vllm_omni/model_executor/models/registry.py b/vllm_omni/model_executor/models/registry.py
index 56bceae41ab..72a371cec4f 100644
--- a/vllm_omni/model_executor/models/registry.py
+++ b/vllm_omni/model_executor/models/registry.py
@@ -48,6 +48,11 @@
         "qwen3_omni_code2wav",
         "Qwen3OmniMoeCode2Wav",
     ),
+    "GlmImageForConditionalGeneration": (
+        "glm_image",
+        "glm_image_ar",
+        "GlmImageForConditionalGeneration",
+    ),
 }
 
 _VLLM_OMNI_MODELS = {

From 234e49ff1bc106becafef5cd5cd4348f097fc7de Mon Sep 17 00:00:00 2001
From: JaredforReal <w13431838023@gmail.com>
Date: Thu, 15 Jan 2026 15:20:21 +0800
Subject: [PATCH 06/53] implement stage config and stage input processor

Signed-off-by: JaredforReal <w13431838023@gmail.com>
---
 .../models/glm_image/pipeline_glm_image.py    |  34 +++-
 .../stage_configs/glm_image.yaml              |  71 ++++++-
 .../glm_image_muilticonnector.yaml            |  87 +++++++-
 .../stage_input_processors/glm_image.py       | 189 +++++++++++++++++-
 4 files changed, 370 insertions(+), 11 deletions(-)

diff --git a/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py b/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py
index 74a1ecac334..20c06d30b88 100644
--- a/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py
+++ b/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py
@@ -855,14 +855,32 @@ def forward(self, req: OmniDiffusionRequest) -> DiffusionOutput:
         if req.seed is not None:
             generator = torch.Generator(device=self.device).manual_seed(req.seed)
 
-        # 1. Generate prior tokens with AR model
-        logger.info("Generating prior tokens with AR model...")
-        prior_token_id, prior_token_image_ids = self.generate_prior_tokens(
-            prompt=prompt,
-            image=condition_images,
-            height=height,
-            width=width,
-        )
+        # 1. Get prior tokens - either from external source (multistage) or generate internally
+        # Check if prior_token_ids are provided externally (from AR stage in multistage mode)
+        external_prior_tokens = req.extra.get("prior_token_ids") if req.extra else None
+        external_prior_image_ids = req.extra.get("prior_token_image_ids") if req.extra else None
+
+        if external_prior_tokens is not None:
+            # Multistage mode: use externally provided prior tokens from vLLM AR stage
+            logger.info("Using externally provided prior tokens from AR stage...")
+            prior_token_id = external_prior_tokens
+            if isinstance(prior_token_id, list):
+                prior_token_id = torch.tensor(prior_token_id, dtype=torch.long, device=self.device)
+            elif isinstance(prior_token_id, torch.Tensor):
+                prior_token_id = prior_token_id.to(device=self.device, dtype=torch.long)
+            # Ensure shape is [1, num_tokens] for batch processing
+            if prior_token_id.dim() == 1:
+                prior_token_id = prior_token_id.unsqueeze(0)
+            prior_token_image_ids = external_prior_image_ids
+        else:
+            # Single-stage mode: generate prior tokens with internal AR model
+            logger.info("Generating prior tokens with AR model...")
+            prior_token_id, prior_token_image_ids = self.generate_prior_tokens(
+                prompt=prompt,
+                image=condition_images,
+                height=height,
+                width=width,
+            )
 
         # 2. Encode prompt for glyph embeddings
         logger.info("Encoding prompt...")
diff --git a/vllm_omni/model_executor/stage_configs/glm_image.yaml b/vllm_omni/model_executor/stage_configs/glm_image.yaml
index e1ad7ddae76..21ee05f3a4f 100644
--- a/vllm_omni/model_executor/stage_configs/glm_image.yaml
+++ b/vllm_omni/model_executor/stage_configs/glm_image.yaml
@@ -1 +1,70 @@
-# init placeholder
+# Stage config for running GLM-Image with 2-stage architecture
+# Stage 0: AR Model (vLLM implementation) - generates prior_token_ids
+# Stage 1: Diffusion (DiT + VAE) - denoising and image decoding
+
+# The following config is designed for H100-80G GPUs.
+stage_args:
+  # Stage 0: AR Model (GlmImageForConditionalGeneration)
+  # This stage uses the vLLM-optimized AR model to generate prior tokens
+  # for conditioning the diffusion process.
+  - stage_id: 0
+    stage_type: llm
+    runtime:
+      process: true
+      devices: "0"
+      max_batch_size: 1
+    engine_args:
+      model_stage: ar
+      model_arch: GlmImageForConditionalGeneration
+      worker_cls: vllm_omni.worker.gpu_ar_worker.GPUARWorker
+      scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
+      gpu_memory_utilization: 0.6
+      enforce_eager: false
+      trust_remote_code: true
+      engine_output_type: token_ids # Output prior_token_ids for diffusion stage
+      enable_prefix_caching: false
+      max_num_batched_tokens: 32768
+      hf_config_name: vision_language_encoder # Subfolder in model path
+    final_output: false # AR is not the final output
+    default_sampling_params:
+      temperature: 0.0
+      top_p: 1.0
+      top_k: -1
+      max_tokens: 16384 # Support up to 2048x2048 images (64x64 tokens * 4 = 16384)
+      seed: 42
+      detokenize: false
+
+  # Stage 1: Diffusion (DiT + VAE)
+  # This stage receives prior_token_ids from AR and performs denoising + VAE decode
+  - stage_id: 1
+    stage_type: diffusion
+    runtime:
+      process: true
+      devices: "1" # Can use different GPU, or same GPU if memory allows
+      max_batch_size: 1
+    engine_args:
+      model_arch: GlmImagePipeline
+      # Diffusion-specific parameters
+      num_gpus: 1
+      cfg_parallel_size: 1 # Set to 2 for CFG parallelism on 2 GPUs
+    engine_input_source: [0] # Input from AR stage
+    custom_process_input_func: vllm_omni.model_executor.stage_input_processors.glm_image.ar2diffusion
+    final_output: true
+    final_output_type: image
+    default_sampling_params:
+      num_inference_steps: 50
+      guidance_scale: 1.5
+      height: 1024
+      width: 1024
+
+# Top-level runtime config
+runtime:
+  enabled: true
+  defaults:
+    window_size: -1 # Trigger downstream only after full upstream completion
+    max_inflight: 1 # Process serially within each stage
+
+  edges:
+    - from: 0 # AR → Diffusion: trigger after AR completes
+      to: 1
+      window_size: -1
diff --git a/vllm_omni/model_executor/stage_configs/glm_image_muilticonnector.yaml b/vllm_omni/model_executor/stage_configs/glm_image_muilticonnector.yaml
index e1ad7ddae76..e5ee76e6a54 100644
--- a/vllm_omni/model_executor/stage_configs/glm_image_muilticonnector.yaml
+++ b/vllm_omni/model_executor/stage_configs/glm_image_muilticonnector.yaml
@@ -1 +1,86 @@
-# init placeholder
+# Stage config for running GLM-Image with 2-stage architecture (MultiConnector version)
+# Stage 0: AR Model (vLLM implementation) - generates prior_token_ids
+# Stage 1: Diffusion (DiT + VAE) - denoising and image decoding
+#
+# This config uses OmniConnectors for inter-stage communication,
+# enabling efficient tensor transfer between stages on different processes/nodes.
+
+# The following config is designed for multi-GPU setups (e.g., 2x H100-80G).
+stage_args:
+  # Stage 0: AR Model (GlmImageForConditionalGeneration)
+  # This stage uses the vLLM-optimized AR model to generate prior tokens
+  # for conditioning the diffusion process.
+  - stage_id: 0
+    stage_type: llm
+    runtime:
+      process: true
+      devices: "0"
+      max_batch_size: 1
+    engine_args:
+      model_stage: ar
+      model_arch: GlmImageForConditionalGeneration
+      worker_cls: vllm_omni.worker.gpu_ar_worker.GPUARWorker
+      scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
+      gpu_memory_utilization: 0.6
+      enforce_eager: false
+      trust_remote_code: true
+      engine_output_type: token_ids # Output prior_token_ids for diffusion stage
+      enable_prefix_caching: false
+      max_num_batched_tokens: 32768
+      hf_config_name: vision_language_encoder # Subfolder in model path
+    final_output: false # AR is not the final output
+    default_sampling_params:
+      temperature: 0.0
+      top_p: 1.0
+      top_k: -1
+      max_tokens: 16384 # Support up to 2048x2048 images
+      seed: 42
+      detokenize: false
+
+  # Stage 1: Diffusion (DiT + VAE)
+  # This stage receives prior_token_ids from AR and performs denoising + VAE decode
+  - stage_id: 1
+    stage_type: diffusion
+    runtime:
+      process: true
+      devices: "1" # Use separate GPU for diffusion
+      max_batch_size: 1
+    engine_args:
+      model_arch: GlmImagePipeline
+      # Diffusion-specific parameters
+      num_gpus: 1
+      cfg_parallel_size: 1 # Set to 2 for CFG parallelism
+    engine_input_source: [0] # Input from AR stage
+    custom_process_input_func: vllm_omni.model_executor.stage_input_processors.glm_image.ar2diffusion
+    final_output: true
+    final_output_type: image
+    default_sampling_params:
+      num_inference_steps: 50
+      guidance_scale: 1.5
+      height: 1024
+      width: 1024
+
+# Top-level runtime config with MultiConnector support
+runtime:
+  enabled: true
+  defaults:
+    window_size: -1 # Trigger downstream only after full upstream completion
+    max_inflight: 1 # Process serially within each stage
+
+  edges:
+    - from: 0 # AR → Diffusion
+      to: 1
+      window_size: -1
+
+# OmniConnector configuration for efficient inter-stage tensor transfer
+connectors:
+  - type: tensor_transfer
+    source_stage: 0
+    target_stage: 1
+    # Transfer prior_token_ids efficiently between stages
+    fields:
+      - name: prior_token_ids
+        dtype: int64
+      - name: prior_token_image_ids
+        dtype: int64
+        optional: true
diff --git a/vllm_omni/model_executor/stage_input_processors/glm_image.py b/vllm_omni/model_executor/stage_input_processors/glm_image.py
index e1ad7ddae76..4ea7e66c83c 100644
--- a/vllm_omni/model_executor/stage_input_processors/glm_image.py
+++ b/vllm_omni/model_executor/stage_input_processors/glm_image.py
@@ -1 +1,188 @@
-# init placeholder
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Stage input processor for GLM-Image: AR → Diffusion transition."""
+
+from math import sqrt
+from typing import Any
+
+import torch
+from vllm.inputs import TextPrompt
+from vllm.logger import init_logger
+
+from vllm_omni.inputs.data import OmniTokensPrompt
+
+logger = init_logger(__name__)
+
+
+def _upsample_token_ids(token_ids: torch.Tensor, token_h: int, token_w: int) -> torch.Tensor:
+    """Upsample a flattened token grid by 2x with nearest-neighbour repetition.
+
+    GLM-Image AR model generates tokens at 32x downsampling, but DiT expects
+    16x downsampling, so every token is repeated into a 2x2 patch. Repetition
+    is done on the integer IDs directly, so no float round trip can alter them.
+
+    Args:
+        token_ids: Prior token IDs of shape [token_h * token_w], row-major
+        token_h: Height in token space (at 32x downsampling)
+        token_w: Width in token space (at 32x downsampling)
+
+    Returns:
+        Upsampled int64 token IDs of shape [num_tokens * 4]
+    """
+    grid = token_ids.to(dtype=torch.long).reshape(token_h, token_w)
+    grid = grid.repeat_interleave(2, dim=0).repeat_interleave(2, dim=1)
+    return grid.reshape(-1)
+
+
+def _parse_generated_tokens(
+    token_ids: list[int],
+    height: int,
+    width: int,
+    factor: int = 32,
+) -> tuple[torch.Tensor, int, int]:
+    """Parse AR-generated tokens to extract prior_token_ids.
+
+    The AR model generates tokens in a specific format:
+    - For text-to-image: small_image_tokens + large_image_tokens + EOS
+    - For image-to-image: large_image_tokens + EOS
+
+    The large_image_tokens are extracted and upsampled by 2x.
+
+    Args:
+        token_ids: Generated token IDs from AR model
+        height: Target image height
+        width: Target image width
+        factor: Downsampling factor (default 32 for AR output)
+
+    Returns:
+        Tuple of (upsampled_prior_token_ids, pixel_height, pixel_width)
+
+    Raises:
+        ValueError: If height or width is below factor, or fewer than
+            (height // factor) * (width // factor) tokens were generated
+    """
+    # Calculate token dimensions; a zero dimension would divide by zero below
+    token_h = height // factor
+    token_w = width // factor
+    if token_h < 1 or token_w < 1:
+        raise ValueError(f"height and width must be at least {factor}, got {height}x{width}")
+    large_image_tokens = token_h * token_w
+
+    # Calculate small image dimensions (used in text-to-image)
+    ratio = token_h / token_w
+    prev_token_h = int(sqrt(ratio) * (factor // 2))
+    prev_token_w = int(sqrt(1 / ratio) * (factor // 2))
+    small_image_tokens = prev_token_h * prev_token_w
+
+    # Determine if this is text-to-image (has small + large) or image-to-image (large only)
+    total_expected_t2i = small_image_tokens + large_image_tokens + 1  # +1 for EOS
+    total_expected_i2i = large_image_tokens + 1
+    num_generated = len(token_ids)
+
+    if num_generated < large_image_tokens:
+        # The 2x upsample needs a complete token_h x token_w grid
+        raise ValueError(f"Expected at least {large_image_tokens} image tokens, got {num_generated}")
+    if num_generated < total_expected_i2i:
+        logger.warning(
+            f"Unexpected token count: {num_generated}, expected at least {total_expected_i2i}. Using available tokens."
+        )
+    # Text-to-image: large image tokens follow the small ones; otherwise they come first
+    large_start = small_image_tokens if num_generated >= total_expected_t2i else 0
+    token_tensor = torch.tensor(token_ids, dtype=torch.long)
+    prior_token_ids_d32 = token_tensor[large_start : large_start + large_image_tokens]
+
+    # Upsample from 32x to 16x
+    return _upsample_token_ids(prior_token_ids_d32, token_h, token_w), height, width
+
+
+def ar2diffusion(
+    stage_list: list[Any],
+    engine_input_source: list[int],
+    prompt: OmniTokensPrompt | TextPrompt | list | None = None,
+    requires_multimodal_data: bool = False,
+) -> list[dict[str, Any]]:
+    """
+    Process AR stage outputs to create Diffusion stage inputs.
+
+    Each AR output's token IDs are parsed and upsampled (32x → 16x), then
+    packaged with selected fields of the original prompt.
+
+    Args:
+        stage_list: List of stage objects containing outputs
+        engine_input_source: Source stage IDs; only the first entry is used
+        prompt: Original prompt data (read for height, width, prompt text, images)
+        requires_multimodal_data: Whether to pass multimodal data (condition images)
+
+    Returns:
+        List of dicts containing diffusion request parameters
+
+    Raises:
+        ValueError: If engine_input_source is empty
+        IndexError: If the source stage id is out of range
+        RuntimeError: If the source stage has no outputs yet
+    """
+    if not engine_input_source:
+        raise ValueError("engine_input_source cannot be empty")
+
+    source_stage_id = engine_input_source[0]
+    if source_stage_id >= len(stage_list):
+        raise IndexError(f"Invalid stage_id: {source_stage_id}")
+
+    if stage_list[source_stage_id].engine_outputs is None:
+        raise RuntimeError(f"Stage {source_stage_id} has no outputs yet")
+
+    ar_outputs = stage_list[source_stage_id].engine_outputs
+    diffusion_inputs = []
+
+    # Normalize prompt to list
+    if not isinstance(prompt, list):
+        prompt = [prompt] if prompt is not None else [{}]
+
+    for i, ar_output in enumerate(ar_outputs):
+        output = ar_output.outputs[0]
+        generated_token_ids = output.token_ids
+
+        # Get original prompt info. The vLLM prompt classes are TypedDicts, which raise
+        # TypeError under isinstance(); accept any dict and use {} for everything else.
+        original_prompt = prompt[i] if i < len(prompt) else {}
+        original_prompt = dict(original_prompt) if isinstance(original_prompt, dict) else {}
+
+        # Extract dimensions from original prompt or use defaults
+        height = original_prompt.get("height", 1024)
+        width = original_prompt.get("width", 1024)
+        text_prompt = original_prompt.get("prompt", "")
+
+        # Parse and upsample prior tokens
+        prior_token_ids, pixel_h, pixel_w = _parse_generated_tokens(generated_token_ids, height, width)
+
+        # Build diffusion input
+        # The diffusion stage expects these in OmniDiffusionRequest format
+        diffusion_input = {
+            "prompt": text_prompt,
+            "height": pixel_h,
+            "width": pixel_w,
+            "extra": {
+                "prior_token_ids": prior_token_ids,
+                # Pass condition image info for image-to-image mode
+                "prior_token_image_ids": output.multimodal_output.get("prior_token_image_ids")
+                if hasattr(output, "multimodal_output") and output.multimodal_output
+                else None,
+            },
+        }
+
+        # Include multimodal data (condition images) if required
+        if requires_multimodal_data:
+            mm_data = original_prompt.get("multi_modal_data")
+            if mm_data:
+                diffusion_input["pil_image"] = mm_data.get("image")
+
+        # Copy other relevant parameters from original prompt
+        for key in ["seed", "num_inference_steps", "guidance_scale", "negative_prompt"]:
+            if key in original_prompt:
+                diffusion_input[key] = original_prompt[key]
+
+        diffusion_inputs.append(diffusion_input)
+
+    logger.debug(f"ar2diffusion: processed {len(ar_outputs)} AR outputs → {len(diffusion_inputs)} diffusion inputs")
+
+    return diffusion_inputs

From 2320cf811dcb4ccdd17bd1993bf0e41de5fbed66 Mon Sep 17 00:00:00 2001
From: JaredforReal <w13431838023@gmail.com>
Date: Thu, 15 Jan 2026 16:22:31 +0800
Subject: [PATCH 07/53] fix image2image error

Signed-off-by: JaredforReal <w13431838023@gmail.com>
---
 .../diffusion/models/glm_image/__init__.py    |  4 +--
 .../models/glm_image/pipeline_glm_image.py    | 29 +++++++++++++++----
 vllm_omni/diffusion/registry.py               |  1 +
 3 files changed, 27 insertions(+), 7 deletions(-)

diff --git a/vllm_omni/diffusion/models/glm_image/__init__.py b/vllm_omni/diffusion/models/glm_image/__init__.py
index ac7a98fa743..fc8256d8de6 100644
--- a/vllm_omni/diffusion/models/glm_image/__init__.py
+++ b/vllm_omni/diffusion/models/glm_image/__init__.py
@@ -9,7 +9,7 @@
 from vllm_omni.diffusion.models.glm_image.pipeline_glm_image import (
     GlmImagePipeline,
     get_glm_image_post_process_func,
-    # get_glm_image_pre_process_func,
+    get_glm_image_pre_process_func,
 )
 
 __all__ = [
@@ -17,5 +17,5 @@
     "GlmImagePipeline",
     "GlmImageTransformer2DModel",
     "get_glm_image_post_process_func",
-    # "get_glm_image_pre_process_func",
+    "get_glm_image_pre_process_func",
 ]
diff --git a/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py b/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py
index 20c06d30b88..2eac7345c53 100644
--- a/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py
+++ b/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py
@@ -441,15 +441,34 @@ def generate_prior_tokens(
         input_length = inputs["input_ids"].shape[-1]
 
         # Process condition images if provided
+        # prior_token_image_ids should be a LIST of tensors, one per condition image
         prior_token_image_ids = None
         if image is not None and existing_grid is not None:
             prior_token_image_embed = self.vision_language_encoder.get_image_features(
                 inputs["pixel_values"], existing_grid
             )
             prior_token_image_embed = torch.cat(prior_token_image_embed, dim=0)
-            prior_token_image_ids = self.vision_language_encoder.get_image_tokens(
+            # get_image_tokens returns a flat tensor, we need to split it per image
+            flat_prior_token_image_ids = self.vision_language_encoder.get_image_tokens(
                 prior_token_image_embed, existing_grid
             )
+            # Split by image grid sizes and convert to list
+            # Each image has t*h*w tokens, we need to split and reshape
+            split_sizes = (existing_grid.prod(dim=-1)).tolist()
+            prior_token_image_ids_list = torch.split(flat_prior_token_image_ids, split_sizes, dim=0)
+            # Convert to list and add batch dimension for each, then upsample
+            prior_token_image_ids = []
+            for i, token_ids in enumerate(prior_token_image_ids_list):
+                grid_t, grid_h, grid_w = existing_grid[i].tolist()
+                # Reshape to [1, t*h*w] then upsample like the main prior_token_ids
+                token_ids = token_ids.view(1, -1)
+                # Upsample 2x (from d32 to d16)
+                token_ids_2d = token_ids.view(1, 1, grid_h, grid_w)
+                token_ids_upsampled = torch.nn.functional.interpolate(
+                    token_ids_2d.float(), scale_factor=2, mode="nearest"
+                ).to(dtype=torch.long)
+                token_ids_upsampled = token_ids_upsampled.view(1, -1)
+                prior_token_image_ids.append(token_ids_upsampled)
 
         # Generate with AR model
         outputs = self.vision_language_encoder.generate(
@@ -634,7 +653,7 @@ def diffuse(
                         timestep=timestep,
                         target_size=target_size,
                         crop_coords=crop_coords,
-                        kv_caches=kv_caches,
+                        kv_cache=kv_caches,
                         return_dict=False,
                     )[0].float()
                 else:
@@ -647,7 +666,7 @@ def diffuse(
                         timestep=timestep,
                         target_size=target_size,
                         crop_coords=crop_coords,
-                        kv_caches=kv_caches,
+                        kv_cache=kv_caches,
                         return_dict=False,
                     )[0].float()
 
@@ -690,7 +709,7 @@ def diffuse(
                         timestep=timestep,
                         target_size=target_size,
                         crop_coords=crop_coords,
-                        kv_caches=kv_caches,
+                        kv_cache=kv_caches,
                         return_dict=False,
                     )[0].float()
 
@@ -763,7 +782,7 @@ def _prepare_condition_image_kv_cache(
                 timestep=torch.zeros((1,), device=self.device),
                 target_size=torch.tensor([condition_image.shape[-2:]], device=self.device, dtype=prompt_embeds.dtype),
                 crop_coords=torch.zeros((1, 2), device=self.device, dtype=prompt_embeds.dtype),
-                kv_caches=kv_caches,
+                kv_cache=kv_caches,
                 return_dict=False,
             )
 
diff --git a/vllm_omni/diffusion/registry.py b/vllm_omni/diffusion/registry.py
index e566ca66cfa..9a507122024 100644
--- a/vllm_omni/diffusion/registry.py
+++ b/vllm_omni/diffusion/registry.py
@@ -137,6 +137,7 @@ def initialize_model(
     "QwenImageEditPlusPipeline": "get_qwen_image_edit_plus_pre_process_func",
     "LongCatImageEditPipeline": "get_longcat_image_edit_pre_process_func",
     "QwenImageLayeredPipeline": "get_qwen_image_layered_pre_process_func",
+    "GlmImagePipeline": "get_glm_image_pre_process_func",
     "WanPipeline": "get_wan22_pre_process_func",
     "WanImageToVideoPipeline": "get_wan22_i2v_pre_process_func",
 }

From 6b9b4869aad1fa0a68b38287fcffcfddd05b1ecf Mon Sep 17 00:00:00 2001
From: JaredforReal <w13431838023@gmail.com>
Date: Thu, 15 Jan 2026 16:23:01 +0800
Subject: [PATCH 08/53] implement a pre processor func

Signed-off-by: JaredforReal <w13431838023@gmail.com>
---
 .../models/glm_image/pipeline_glm_image.py    | 85 +++++++++++++++++--
 1 file changed, 80 insertions(+), 5 deletions(-)

diff --git a/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py b/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py
index 2eac7345c53..9ab67985616 100644
--- a/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py
+++ b/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py
@@ -56,6 +56,73 @@
 logger = logging.getLogger(__name__)
 
 
+def get_glm_image_pre_process_func(od_config: OmniDiffusionConfig):
+    """Get pre-processing function for GLM-Image pipeline.
+
+    Pre-processes condition images before they are sent to the pipeline.
+    This is called by DiffusionEngine before batching requests.
+    """
+    model_name = od_config.model
+    if os.path.exists(model_name):
+        model_path = model_name
+    else:
+        model_path = download_weights_from_hf_specific(model_name, None, ["*"])
+
+    vae_config_path = os.path.join(model_path, "vae/config.json")
+    with open(vae_config_path) as f:
+        vae_config = json.load(f)
+        block_out_channels = vae_config.get("block_out_channels", [128, 256, 512, 512])
+        vae_scale_factor = 2 ** (len(block_out_channels) - 1)
+
+    image_processor = VaeImageProcessor(vae_scale_factor=vae_scale_factor)
+    # GLM-Image uses patch_size=2 for transformer
+    patch_size = 2
+
+    def pre_process_func(requests: list[OmniDiffusionRequest]):
+        """Pre-process condition images for Image Edit mode."""
+        for req in requests:
+            images = req.pil_image
+            if images is None:
+                # Text-to-image mode, no preprocessing needed
+                continue
+
+            if not isinstance(images, list):
+                images = [images]
+
+            preprocessed = []
+            height, width = None, None
+
+            for img in images:
+                if isinstance(img, PIL.Image.Image):
+                    img_h, img_w = img.size[::-1]  # PIL is (width, height)
+                else:
+                    img_h, img_w = img.shape[:2]
+
+                # Align to multiple of vae_scale_factor * patch_size
+                multiple_of = vae_scale_factor * patch_size
+                img_h = (img_h // multiple_of) * multiple_of
+                img_w = (img_w // multiple_of) * multiple_of
+
+                processed = image_processor.preprocess(img, height=img_h, width=img_w)
+                preprocessed.append(processed)
+
+                # Use first image dimensions as default
+                if height is None:
+                    height, width = img_h, img_w
+
+            # Store in request
+            req.preprocessed_image = preprocessed
+            req.prompt_image = images  # Keep original PIL images
+            if req.height is None:
+                req.height = height
+            if req.width is None:
+                req.width = width
+
+        return requests
+
+    return pre_process_func
+
+
 def get_glm_image_post_process_func(od_config: OmniDiffusionConfig):
     """Get post-processing function for GLM-Image pipeline."""
     model_name = od_config.model
@@ -849,12 +916,20 @@ def forward(self, req: OmniDiffusionRequest) -> DiffusionOutput:
         prompt_embeds = req.prompt_embeds if isinstance(req.prompt_embeds, torch.Tensor) else None
 
         # Get condition images for Image Edit mode
-        condition_images = req.pil_image
-        if condition_images is not None and not isinstance(condition_images, list):
-            condition_images = [condition_images]
+        # Check if pre-processing was already done by DiffusionEngine
+        if hasattr(req, "preprocessed_image") and req.preprocessed_image is not None:
+            # Use pre-processed images from pre_process_func
+            preprocessed_images = req.preprocessed_image
+            condition_images = req.prompt_image if hasattr(req, "prompt_image") else req.pil_image
+            img_height = req.height
+            img_width = req.width
+        else:
+            # Fallback: preprocess in pipeline (for backward compatibility / debug)
+            condition_images = req.pil_image
+            if condition_images is not None and not isinstance(condition_images, list):
+                condition_images = [condition_images]
+            preprocessed_images, img_height, img_width = self._preprocess_condition_images(condition_images)
 
-        # Preprocess condition images and get dimensions
-        preprocessed_images, img_height, img_width = self._preprocess_condition_images(condition_images)
         is_image_edit = preprocessed_images is not None
 
         # Use image dimensions as default if available

From 2d92e22b20c74415b7e4f3902cfa3ecf47f63095 Mon Sep 17 00:00:00 2001
From: JaredforReal <w13431838023@gmail.com>
Date: Thu, 15 Jan 2026 16:33:59 +0800
Subject: [PATCH 09/53] fix image2image error

Signed-off-by: JaredforReal <w13431838023@gmail.com>
---
 .../models/glm_image/pipeline_glm_image.py    | 209 +++++-------------
 1 file changed, 59 insertions(+), 150 deletions(-)

diff --git a/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py b/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py
index 9ab67985616..53fea3d79e4 100644
--- a/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py
+++ b/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py
@@ -17,7 +17,6 @@
 import os
 import re
 from collections.abc import Iterable
-from math import sqrt
 
 import numpy as np
 import PIL.Image
@@ -358,45 +357,41 @@ def check_inputs(
     # ==================== AR Stage Methods ====================
 
     @staticmethod
-    def _build_image_grid_thw(
-        token_h: int,
-        token_w: int,
-        prev_token_h: int,
-        prev_token_w: int,
-        existing_grid: torch.Tensor | None = None,
-        device: torch.device | None = None,
-    ) -> torch.Tensor:
-        """Build image grid tensor for AR model."""
-        if existing_grid is None or existing_grid.numel() == 0:
-            return torch.tensor(
-                [
-                    [1, token_h, token_w],
-                    [1, prev_token_h, prev_token_w],
-                ],
-                device=device,
-            )
-        else:
-            return torch.cat(
-                [existing_grid.to(device), torch.tensor([[1, token_h, token_w]], device=device)],
-                dim=0,
-            )
+    def _compute_generation_params(
+        image_grid_thw: torch.Tensor,
+        is_text_to_image: bool,
+    ) -> tuple[int, int, int, int]:
+        """
+        Compute AR generation parameters from image grid.
 
-    @staticmethod
-    def _calculate_ar_generation_params(
-        token_h: int, token_w: int, prev_token_h: int, prev_token_w: int, is_text_to_image: bool
-    ) -> tuple[int, int]:
-        """Calculate AR generation parameters."""
-        large_image_tokens = token_h * token_w
-        small_image_tokens = prev_token_h * prev_token_w
+        Args:
+            image_grid_thw: Image grid tensor of shape [N, 3] where each row is [t, h, w]
+            is_text_to_image: Whether this is text-to-image (vs image-to-image)
 
-        if is_text_to_image:
-            max_new_tokens = small_image_tokens + large_image_tokens + 1
-            large_image_start_offset = small_image_tokens
-        else:
-            max_new_tokens = large_image_tokens + 1
+        Returns:
+            Tuple of (max_new_tokens, large_image_start_offset, target_grid_h, target_grid_w)
+        """
+        grid_sizes = []
+        grid_hw = []
+
+        for i in range(image_grid_thw.shape[0]):
+            t, h, w = image_grid_thw[i].tolist()
+            grid_sizes.append(int(h * w))
+            grid_hw.append((int(h), int(w)))
+
+        if not is_text_to_image:
+            # Image-to-image: only generate target image tokens
+            max_new_tokens = grid_sizes[-1] + 1
             large_image_start_offset = 0
+            target_grid_h, target_grid_w = grid_hw[-1]
+        else:
+            # Text-to-image: generate both small preview and large target
+            total_tokens = sum(grid_sizes)
+            max_new_tokens = total_tokens + 1
+            large_image_start_offset = sum(grid_sizes[1:])
+            target_grid_h, target_grid_w = grid_hw[0]
 
-        return max_new_tokens, large_image_start_offset
+        return max_new_tokens, large_image_start_offset, target_grid_h, target_grid_w
 
     @staticmethod
     def _extract_large_image_tokens(
@@ -418,28 +413,6 @@ def _upsample_token_ids(token_ids: torch.Tensor, token_h: int, token_w: int) ->
         token_ids = token_ids.view(1, -1)
         return token_ids
 
-    @staticmethod
-    def _build_prompt_with_shape(
-        prompt: str,
-        height: int,
-        width: int,
-        is_text_to_image: bool,
-        factor: int = 32,
-    ) -> tuple[str, int, int, int, int]:
-        """Build prompt with shape information for AR model."""
-        token_h = height // factor
-        token_w = width // factor
-        ratio = token_h / token_w
-        prev_token_h = int(sqrt(ratio) * (factor // 2))
-        prev_token_w = int(sqrt(1 / ratio) * (factor // 2))
-
-        if is_text_to_image:
-            expanded_prompt = f"{prompt}<sop>{token_h} {token_w}<eop><sop>{prev_token_h} {prev_token_w}<eop>"
-        else:
-            expanded_prompt = f"{prompt}<sop>{token_h} {token_w}<eop>"
-
-        return expanded_prompt, token_h, token_w, prev_token_h, prev_token_w
-
     @torch.inference_mode()
     def generate_prior_tokens(
         self,
@@ -448,7 +421,7 @@ def generate_prior_tokens(
         width: int,
         image: list[PIL.Image.Image] | None = None,
         factor: int = 32,
-    ) -> tuple[torch.Tensor, torch.Tensor | None, int, int]:
+    ) -> tuple[torch.Tensor, list[torch.Tensor] | None]:
         """
         Generate prior tokens using the AR model.
 
@@ -460,74 +433,59 @@ def generate_prior_tokens(
             factor: Token factor (default 32)
 
         Returns:
-            Tuple of (prior_token_ids, prior_token_image_ids, pixel_height, pixel_width)
+            Tuple of (prior_token_ids, prior_token_image_ids)
+            prior_token_image_ids is a list of tensors, one per condition image
         """
         device = self.vision_language_encoder.device
         height = (height // factor) * factor
         width = (width // factor) * factor
         is_text_to_image = image is None or len(image) == 0
 
-        expanded_prompt, token_h, token_w, prev_h, prev_w = self._build_prompt_with_shape(
-            prompt, height, width, is_text_to_image
-        )
-
         # Build message content
         content = []
         if image is not None:
             for img in image:
                 content.append({"type": "image", "image": img})
-        content.append({"type": "text", "text": expanded_prompt})
+        content.append({"type": "text", "text": prompt})
         messages = [{"role": "user", "content": content}]
 
-        # Apply chat template
+        # Apply chat template - processor will handle target dimensions and build grid
         inputs = self.processor.apply_chat_template(
             messages,
-            add_generation_prompt=True,
             tokenize=True,
+            target_h=height,
+            target_w=width,
             return_dict=True,
             return_tensors="pt",
-        )
+        ).to(device)
 
-        # Build image grid
-        existing_grid = inputs.get("image_grid_thw")
-        inputs["image_grid_thw"] = self._build_image_grid_thw(
-            token_h,
-            token_w,
-            prev_h,
-            prev_w,
-            existing_grid=existing_grid if not is_text_to_image else None,
-            device=device,
-        )
+        image_grid_thw = inputs.get("image_grid_thw")
 
-        max_new_tokens, large_image_offset = self._calculate_ar_generation_params(
-            token_h, token_w, prev_h, prev_w, is_text_to_image
+        # Compute generation parameters from the full grid
+        max_new_tokens, large_image_offset, token_h, token_w = self._compute_generation_params(
+            image_grid_thw=image_grid_thw, is_text_to_image=is_text_to_image
         )
-        large_image_tokens = token_h * token_w
-
-        inputs = inputs.to(device)
-        input_length = inputs["input_ids"].shape[-1]
 
         # Process condition images if provided
-        # prior_token_image_ids should be a LIST of tensors, one per condition image
+        # Use image_grid_thw[:-1] to exclude the target image grid (last entry)
         prior_token_image_ids = None
-        if image is not None and existing_grid is not None:
+        if image is not None and image_grid_thw is not None and len(image_grid_thw) > 1:
+            # Get features only for condition images (exclude target image grid)
+            condition_grid = image_grid_thw[:-1]
             prior_token_image_embed = self.vision_language_encoder.get_image_features(
-                inputs["pixel_values"], existing_grid
+                inputs["pixel_values"], condition_grid
             )
             prior_token_image_embed = torch.cat(prior_token_image_embed, dim=0)
-            # get_image_tokens returns a flat tensor, we need to split it per image
             flat_prior_token_image_ids = self.vision_language_encoder.get_image_tokens(
-                prior_token_image_embed, existing_grid
+                prior_token_image_embed, condition_grid
             )
             # Split by image grid sizes and convert to list
-            # Each image has t*h*w tokens, we need to split and reshape
-            split_sizes = (existing_grid.prod(dim=-1)).tolist()
+            split_sizes = (condition_grid.prod(dim=-1)).tolist()
             prior_token_image_ids_list = torch.split(flat_prior_token_image_ids, split_sizes, dim=0)
-            # Convert to list and add batch dimension for each, then upsample
+            # Convert to list with upsampling
             prior_token_image_ids = []
             for i, token_ids in enumerate(prior_token_image_ids_list):
-                grid_t, grid_h, grid_w = existing_grid[i].tolist()
-                # Reshape to [1, t*h*w] then upsample like the main prior_token_ids
+                grid_t, grid_h, grid_w = condition_grid[i].tolist()
                 token_ids = token_ids.view(1, -1)
                 # Upsample 2x (from d32 to d64)
                 token_ids_2d = token_ids.view(1, 1, grid_h, grid_w)
@@ -545,8 +503,9 @@ def generate_prior_tokens(
         )
 
         # Extract and upsample tokens
+        large_image_tokens = token_h * token_w
         prior_token_ids_d32 = self._extract_large_image_tokens(
-            outputs, input_length, large_image_offset, large_image_tokens
+            outputs, inputs["input_ids"].shape[-1], large_image_offset, large_image_tokens
         )
         prior_token_ids = self._upsample_token_ids(prior_token_ids_d32, token_h, token_w)
 
@@ -855,48 +814,6 @@ def _prepare_condition_image_kv_cache(
 
         return kv_caches
 
-    def _preprocess_condition_images(
-        self,
-        images: list[PIL.Image.Image] | PIL.Image.Image | None,
-    ) -> tuple[list[torch.Tensor] | None, int | None, int | None]:
-        """
-        Preprocess condition images for Image Edit mode.
-
-        Args:
-            images: Input images (PIL or list of PIL)
-
-        Returns:
-            Tuple of (preprocessed_images, height, width)
-        """
-        if images is None:
-            return None, None, None
-
-        if not isinstance(images, list):
-            images = [images]
-
-        preprocessed = []
-        height, width = None, None
-
-        for img in images:
-            if isinstance(img, PIL.Image.Image):
-                img_h, img_w = img.size[::-1]
-            else:
-                img_h, img_w = img.shape[:2]
-
-            # Align to multiple of vae_scale_factor * patch_size
-            multiple_of = self.vae_scale_factor * self._patch_size
-            img_h = (img_h // multiple_of) * multiple_of
-            img_w = (img_w // multiple_of) * multiple_of
-
-            processed = self.image_processor.preprocess(img, height=img_h, width=img_w)
-            preprocessed.append(processed)
-
-            # Use first image dimensions as default
-            if height is None:
-                height, width = img_h, img_w
-
-        return preprocessed, height, width
-
     @torch.inference_mode()
     def forward(self, req: OmniDiffusionRequest) -> DiffusionOutput:
         """
@@ -916,19 +833,11 @@ def forward(self, req: OmniDiffusionRequest) -> DiffusionOutput:
         prompt_embeds = req.prompt_embeds if isinstance(req.prompt_embeds, torch.Tensor) else None
 
         # Get condition images for Image Edit mode
-        # Check if pre-processing was already done by DiffusionEngine
-        if hasattr(req, "preprocessed_image") and req.preprocessed_image is not None:
-            # Use pre-processed images from pre_process_func
-            preprocessed_images = req.preprocessed_image
-            condition_images = req.prompt_image if hasattr(req, "prompt_image") else req.pil_image
-            img_height = req.height
-            img_width = req.width
-        else:
-            # Fallback: preprocess in pipeline (for backward compatibility / debug)
-            condition_images = req.pil_image
-            if condition_images is not None and not isinstance(condition_images, list):
-                condition_images = [condition_images]
-            preprocessed_images, img_height, img_width = self._preprocess_condition_images(condition_images)
+        # Use pre-processed images from pre_process_func
+        preprocessed_images = req.preprocessed_image
+        condition_images = req.prompt_image if hasattr(req, "prompt_image") else req.pil_image
+        img_height = req.height
+        img_width = req.width
 
         is_image_edit = preprocessed_images is not None
 

From fbb2ac8c7c8306d9efd4b9d0938e84840d4837e3 Mon Sep 17 00:00:00 2001
From: JaredforReal <w13431838023@gmail.com>
Date: Thu, 15 Jan 2026 18:10:36 +0800
Subject: [PATCH 10/53] update stage config

Signed-off-by: JaredforReal <w13431838023@gmail.com>
---
 .../model_executor/stage_configs/glm_image.yaml   | 15 ++++++++++++++-
 .../stage_configs/glm_image_muilticonnector.yaml  | 15 ++++++++++++++-
 2 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/vllm_omni/model_executor/stage_configs/glm_image.yaml b/vllm_omni/model_executor/stage_configs/glm_image.yaml
index 21ee05f3a4f..8ba307a8253 100644
--- a/vllm_omni/model_executor/stage_configs/glm_image.yaml
+++ b/vllm_omni/model_executor/stage_configs/glm_image.yaml
@@ -2,7 +2,6 @@
 # Stage 0: AR Model (vLLM implementation) - generates prior_token_ids
 # Stage 1: Diffusion (DiT + VAE) - denoising and image decoding
 
-# The following config is designed for H100-80G GPUs.
 stage_args:
   # Stage 0: AR Model (GlmImageForConditionalGeneration)
   # This stage uses the vLLM-optimized AR model to generate prior tokens
@@ -22,10 +21,12 @@ stage_args:
       enforce_eager: false
       trust_remote_code: true
       engine_output_type: token_ids # Output prior_token_ids for diffusion stage
+      distributed_executor_backend: "mp"
       enable_prefix_caching: false
       max_num_batched_tokens: 32768
       hf_config_name: vision_language_encoder # Subfolder in model path
     final_output: false # AR is not the final output
+    is_comprehension: true
     default_sampling_params:
       temperature: 0.0
       top_p: 1.0
@@ -44,14 +45,26 @@ stage_args:
       max_batch_size: 1
     engine_args:
       model_arch: GlmImagePipeline
+      model_stage: dit
       # Diffusion-specific parameters
       num_gpus: 1
       cfg_parallel_size: 1 # Set to 2 for CFG parallelism on 2 GPUs
+      enforce_eager: true
+      trust_remote_code: true
+      engine_output_type: image # Final output is image
+      distributed_executor_backend: "mp"
+      enable_prefix_caching: false
     engine_input_source: [0] # Input from AR stage
     custom_process_input_func: vllm_omni.model_executor.stage_input_processors.glm_image.ar2diffusion
     final_output: true
     final_output_type: image
+    is_comprehension: false
     default_sampling_params:
+      temperature: 0.0
+      top_p: 1.0
+      top_k: -1
+      seed: 42
+      detokenize: true
       num_inference_steps: 50
       guidance_scale: 1.5
       height: 1024
diff --git a/vllm_omni/model_executor/stage_configs/glm_image_muilticonnector.yaml b/vllm_omni/model_executor/stage_configs/glm_image_muilticonnector.yaml
index e5ee76e6a54..7d209d6e3c6 100644
--- a/vllm_omni/model_executor/stage_configs/glm_image_muilticonnector.yaml
+++ b/vllm_omni/model_executor/stage_configs/glm_image_muilticonnector.yaml
@@ -5,7 +5,6 @@
 # This config uses OmniConnectors for inter-stage communication,
 # enabling efficient tensor transfer between stages on different processes/nodes.
 
-# The following config is designed for multi-GPU setups (e.g., 2x H100-80G).
 stage_args:
   # Stage 0: AR Model (GlmImageForConditionalGeneration)
   # This stage uses the vLLM-optimized AR model to generate prior tokens
@@ -25,10 +24,12 @@ stage_args:
       enforce_eager: false
       trust_remote_code: true
       engine_output_type: token_ids # Output prior_token_ids for diffusion stage
+      distributed_executor_backend: "mp"
       enable_prefix_caching: false
       max_num_batched_tokens: 32768
       hf_config_name: vision_language_encoder # Subfolder in model path
     final_output: false # AR is not the final output
+    is_comprehension: true
     default_sampling_params:
       temperature: 0.0
       top_p: 1.0
@@ -47,14 +48,26 @@ stage_args:
       max_batch_size: 1
     engine_args:
       model_arch: GlmImagePipeline
+      model_stage: dit
       # Diffusion-specific parameters
       num_gpus: 1
       cfg_parallel_size: 1 # Set to 2 for CFG parallelism
+      enforce_eager: true
+      trust_remote_code: true
+      engine_output_type: image # Final output is image
+      distributed_executor_backend: "mp"
+      enable_prefix_caching: false
     engine_input_source: [0] # Input from AR stage
     custom_process_input_func: vllm_omni.model_executor.stage_input_processors.glm_image.ar2diffusion
     final_output: true
     final_output_type: image
+    is_comprehension: false
     default_sampling_params:
+      temperature: 0.0
+      top_p: 1.0
+      top_k: -1
+      seed: 42
+      detokenize: true
       num_inference_steps: 50
       guidance_scale: 1.5
       height: 1024

From 0e5366f0a4d0a8233c670a76af266ff6355735d7 Mon Sep 17 00:00:00 2001
From: JaredforReal <w13431838023@gmail.com>
Date: Thu, 15 Jan 2026 18:51:43 +0800
Subject: [PATCH 11/53] implement example offline end2end files

Signed-off-by: JaredforReal <w13431838023@gmail.com>
---
 .../offline_inference/glm_image/README.md     | 138 ++++++
 .../offline_inference/glm_image/end2end.py    | 402 ++++++++++++++++++
 .../offline_inference/glm_image/run_i2i.sh    |  93 ++++
 .../offline_inference/glm_image/run_t2i.sh    |  87 ++++
 4 files changed, 720 insertions(+)
 create mode 100644 examples/offline_inference/glm_image/README.md
 create mode 100644 examples/offline_inference/glm_image/end2end.py
 create mode 100755 examples/offline_inference/glm_image/run_i2i.sh
 create mode 100755 examples/offline_inference/glm_image/run_t2i.sh

diff --git a/examples/offline_inference/glm_image/README.md b/examples/offline_inference/glm_image/README.md
new file mode 100644
index 00000000000..73ae0e046dd
--- /dev/null
+++ b/examples/offline_inference/glm_image/README.md
@@ -0,0 +1,138 @@
+# GLM-Image Multistage End-to-End Inference
+
+This example demonstrates how to run GLM-Image with the vLLM-Omni multistage architecture.
+
+## Architecture
+
+GLM-Image uses a 2-stage pipeline:
+
+```
+┌─────────────────────────────────────────────────────────────┐
+│                     GLM-Image Pipeline                       │
+├─────────────────────────────────────────────────────────────┤
+│                                                             │
+│  Stage 0 (AR Model)              Stage 1 (Diffusion)        │
+│  ┌─────────────────┐            ┌─────────────────────┐     │
+│  │ vLLM-optimized  │            │  GlmImagePipeline   │     │
+│  │ GlmImageFor     │  prior     │  ┌───────────────┐  │     │
+│  │ Conditional     │──tokens───►│  │ DiT Denoiser  │  │     │
+│  │ Generation      │            │  └───────────────┘  │     │
+│  │ (9B AR model)   │            │          │          │     │
+│  └─────────────────┘            │          ▼          │     │
+│         ▲                       │  ┌───────────────┐  │     │
+│         │                       │  │  VAE Decode   │──┼──► Image
+│    Text/Image                   │  └───────────────┘  │     │
+│      Input                      └─────────────────────┘     │
+│                                                             │
+└─────────────────────────────────────────────────────────────┘
+```
+
+## Features
+
+- **vLLM-optimized AR**: Uses PagedAttention and tensor parallelism for faster prior token generation
+- **Flexible deployment**: AR and Diffusion stages can run on different GPUs
+- **Text-to-Image**: Generate images from text descriptions
+- **Image-to-Image**: Edit existing images with text prompts
+
+## Usage
+
+### Text-to-Image
+
+```bash
+python end2end.py \
+    --model-path /path/to/glm-image \
+    --config-path ../../../vllm_omni/model_executor/stage_configs/glm_image.yaml \
+    --prompt "A beautiful sunset over the ocean with sailing boats" \
+    --height 1024 \
+    --width 1024 \
+    --output output_t2i.png
+```
+
+### Image-to-Image (Image Editing)
+
+```bash
+python end2end.py \
+    --model-path /path/to/glm-image \
+    --config-path ../../../vllm_omni/model_executor/stage_configs/glm_image.yaml \
+    --prompt "Transform this scene into a winter wonderland" \
+    --image input.png \
+    --output output_i2i.png
+```
+
+### With Custom Parameters
+
+```bash
+python end2end.py \
+    --model-path /path/to/glm-image \
+    --config-path ../../../vllm_omni/model_executor/stage_configs/glm_image.yaml \
+    --prompt "A photorealistic cat sitting on a window sill" \
+    --height 1024 \
+    --width 1024 \
+    --num-inference-steps 50 \
+    --guidance-scale 1.5 \
+    --seed 42 \
+    --output output.png
+```
+
+## Shell Scripts
+
+### Run Text-to-Image
+
+```bash
+./run_t2i.sh
+```
+
+### Run Image-to-Image
+
+```bash
+./run_i2i.sh --image /path/to/input.png
+```
+
+## Stage Configuration
+
+The stage config (`glm_image.yaml`) defines:
+
+- **Stage 0 (AR)**: Uses `GPUARWorker` with vLLM engine
+
+  - Model: `GlmImageForConditionalGeneration`
+  - Output: `token_ids` (prior tokens)
+
+- **Stage 1 (Diffusion)**: Uses diffusion engine
+  - Model: `GlmImagePipeline`
+  - Output: Generated image
+
+See `vllm_omni/model_executor/stage_configs/glm_image.yaml` for full configuration.
+
+## Comparison with Single-Stage
+
+| Aspect      | Single-Stage (transformers) | Multistage (vLLM)   |
+| ----------- | --------------------------- | ------------------- |
+| AR Model    | transformers native         | vLLM PagedAttention |
+| Memory      | Higher (no KV cache opt)    | Lower (optimized)   |
+| Throughput  | Lower                       | Higher              |
+| Flexibility | Single GPU                  | Multi-GPU support   |
+
+## Troubleshooting
+
+### OOM Error
+
+Try reducing memory usage:
+
+```yaml
+# In glm_image.yaml, adjust:
+gpu_memory_utilization: 0.5  # Reduce from 0.6
+```
+
+### Slow Initialization
+
+The first run loads model weights and can be slow (subsequent runs are faster). If a stage times out while initializing, raise the timeout:
+
+```bash
+--stage-init-timeout 900  # Increase timeout for slow storage
+```
+
+## Requirements
+
+- vLLM-Omni with GLM-Image support
+- CUDA-capable GPU (recommended: H100/A100 with 80GB)
+- GLM-Image model weights
diff --git a/examples/offline_inference/glm_image/end2end.py b/examples/offline_inference/glm_image/end2end.py
new file mode 100644
index 00000000000..bc0ecb68f7e
--- /dev/null
+++ b/examples/offline_inference/glm_image/end2end.py
@@ -0,0 +1,402 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+End-to-end offline inference example for GLM-Image with multistage architecture.
+
+This script tests the multistage pipeline where:
+- Stage 0 (AR): vLLM-optimized GlmImageForConditionalGeneration generates prior_token_ids
+- Stage 1 (Diffusion): GlmImagePipeline performs DiT denoising + VAE decode
+
+Usage (text-to-image):
+    python end2end.py \
+        --model-path /path/to/glm-image \
+        --config-path /path/to/glm_image.yaml \
+        --prompt "A beautiful sunset over the ocean" \
+        --output output_t2i.png
+
+Usage (image-to-image / image edit):
+    python end2end.py \
+        --model-path /path/to/glm-image \
+        --config-path /path/to/glm_image.yaml \
+        --prompt "Make it look like winter" \
+        --image input.png \
+        --output output_i2i.png
+
+Usage (with custom parameters):
+    python end2end.py \
+        --model-path /path/to/glm-image \
+        --config-path /path/to/glm_image.yaml \
+        --prompt "A cat sitting on a window sill" \
+        --height 1024 \
+        --width 1024 \
+        --num-inference-steps 50 \
+        --guidance-scale 1.5 \
+        --seed 42
+
+For more options, run:
+    python end2end.py --help
+"""
+
+import argparse
+import os
+import time
+from pathlib import Path
+
+from PIL import Image
+
+from vllm_omni.entrypoints.omni import Omni
+
+# Default stage config path (relative to the repository root; checked against the current working directory)
+DEFAULT_CONFIG_PATH = "vllm_omni/model_executor/stage_configs/glm_image.yaml"
+
+SEED = 42
+
+
+def load_image(image_path: str) -> Image.Image:
+    """Load an image from file path."""
+    if not os.path.exists(image_path):
+        raise FileNotFoundError(f"Image file not found: {image_path}")
+    return Image.open(image_path).convert("RGB")
+
+
+def save_image(image: Image.Image, output_path: str) -> None:
+    """Save an image to file path."""
+    output_dir = os.path.dirname(output_path)
+    if output_dir:
+        os.makedirs(output_dir, exist_ok=True)
+    image.save(output_path)
+    print(f"Image saved to: {output_path}")
+
+
+def build_prompt_for_t2i(
+    prompt: str,
+    height: int = 1024,
+    width: int = 1024,
+) -> dict:
+    """
+    Build prompt dict for text-to-image generation.
+
+    Args:
+        prompt: Text description for image generation
+        height: Target image height
+        width: Target image width
+
+    Returns:
+        Dict containing prompt and generation parameters
+    """
+    return {
+        "prompt": prompt,
+        "height": height,
+        "width": width,
+    }
+
+
+def build_prompt_for_i2i(
+    prompt: str,
+    image: Image.Image,
+    height: int | None = None,
+    width: int | None = None,
+) -> dict:
+    """
+    Build prompt dict for image-to-image generation.
+
+    Args:
+        prompt: Text description for image editing
+        image: Source image for editing
+        height: Target image height (default: use source image size)
+        width: Target image width (default: use source image size)
+
+    Returns:
+        Dict containing prompt, image, and generation parameters
+    """
+    # Use source image dimensions if not specified
+    if height is None:
+        height = image.height
+    if width is None:
+        width = image.width
+
+    return {
+        "prompt": prompt,
+        "multi_modal_data": {
+            "image": image,
+        },
+        "height": height,
+        "width": width,
+    }
+
+
+def main(args: argparse.Namespace) -> None:
+    """Main entry point for GLM-Image end-to-end inference."""
+    print("=" * 60)
+    print("GLM-Image Multistage End-to-End Inference")
+    print("=" * 60)
+
+    # Validate arguments
+    if not args.model_path:
+        raise ValueError("--model-path is required")
+
+    if not args.prompt:
+        raise ValueError("--prompt is required")
+
+    # Determine config path
+    config_path = args.config_path
+    if config_path is None:
+        # Try to find default config
+        if os.path.exists(DEFAULT_CONFIG_PATH):
+            config_path = DEFAULT_CONFIG_PATH
+        else:
+            # Try relative to script location
+            script_dir = Path(__file__).parent.parent.parent.parent
+            config_path = script_dir / "vllm_omni/model_executor/stage_configs/glm_image.yaml"
+            if not config_path.exists():
+                raise FileNotFoundError(
+                    f"Stage config not found. Please specify --config-path. Tried: {DEFAULT_CONFIG_PATH}"
+                )
+            config_path = str(config_path)
+
+    print(f"Model path: {args.model_path}")
+    print(f"Config path: {config_path}")
+    print(f"Prompt: {args.prompt}")
+
+    # Load source image for image-to-image mode
+    source_image = None
+    if args.image:
+        print(f"Source image: {args.image}")
+        source_image = load_image(args.image)
+        print(f"  Image size: {source_image.width}x{source_image.height}")
+
+    # Build prompt based on mode
+    if source_image is not None:
+        # Image-to-image mode
+        prompt_dict = build_prompt_for_i2i(
+            prompt=args.prompt,
+            image=source_image,
+            height=args.height,
+            width=args.width,
+        )
+        mode = "image-to-image"
+    else:
+        # Text-to-image mode
+        prompt_dict = build_prompt_for_t2i(
+            prompt=args.prompt,
+            height=args.height or 1024,
+            width=args.width or 1024,
+        )
+        mode = "text-to-image"
+
+    print(f"Mode: {mode}")
+    print(f"Target size: {prompt_dict.get('height', 1024)}x{prompt_dict.get('width', 1024)}")
+
+    # Add generation parameters to prompt
+    prompt_dict["seed"] = args.seed
+    prompt_dict["num_inference_steps"] = args.num_inference_steps
+    prompt_dict["guidance_scale"] = args.guidance_scale
+
+    if args.negative_prompt:
+        prompt_dict["negative_prompt"] = args.negative_prompt
+
+    # Initialize Omni with multistage config
+    print("\nInitializing Omni with multistage pipeline...")
+    start_time = time.time()
+
+    omni = Omni(
+        model=args.model_path,
+        stage_configs_path=config_path,
+        log_stats=args.enable_stats,
+        stage_init_timeout=args.stage_init_timeout,
+    )
+
+    init_time = time.time() - start_time
+    print(f"Initialization completed in {init_time:.2f}s")
+
+    # Prepare prompts (support batch generation)
+    prompts = [prompt_dict for _ in range(args.num_prompts)]
+
+    # No explicit sampling_params for diffusion - parameters are in prompt_dict
+    # For multistage, the AR stage may need sampling params
+    from vllm import SamplingParams
+
+    ar_sampling_params = SamplingParams(
+        temperature=0.0,
+        top_p=1.0,
+        top_k=-1,
+        max_tokens=args.max_tokens,
+        seed=args.seed,
+        detokenize=False,
+    )
+
+    # For multistage, we may need multiple sampling_params (one per LLM stage)
+    # For GLM-Image: Stage 0 (AR) is LLM, Stage 1 (Diffusion) uses diffusion_kwargs
+    sampling_params_list = [ar_sampling_params]
+
+    # Run generation
+    print(f"\nGenerating {args.num_prompts} image(s)...")
+    gen_start_time = time.time()
+
+    output_dir = os.path.dirname(args.output) if args.output else "outputs"
+    if output_dir:
+        os.makedirs(output_dir, exist_ok=True)
+
+    output_count = 0
+    for stage_outputs in omni.generate(prompts, sampling_params_list, py_generator=True):
+        if stage_outputs.final_output_type == "image":
+            for output in stage_outputs.request_output:
+                request_id = output.request_id
+
+                # Get generated images
+                images = output.images if hasattr(output, "images") else []
+                if not images and hasattr(output, "multimodal_output"):
+                    images = output.multimodal_output.get("images", [])
+
+                # Save each generated image
+                for idx, img in enumerate(images):
+                    if args.num_prompts == 1 and len(images) == 1:
+                        output_path = args.output
+                    else:
+                        base, ext = os.path.splitext(args.output)
+                        output_path = f"{base}_{request_id}_{idx}{ext}"
+
+                    if isinstance(img, Image.Image):
+                        save_image(img, output_path)
+                    else:
+                        print(f"Warning: Unexpected image type for request {request_id}: {type(img)}")
+
+                    output_count += 1
+
+        elif stage_outputs.final_output_type == "text":
+            # AR stage output (intermediate, for debugging)
+            if args.verbose:
+                for output in stage_outputs.request_output:
+                    print(f"AR output for request {output.request_id}:")
+                    print(f"  Token count: {len(output.outputs[0].token_ids)}")
+
+    gen_time = time.time() - gen_start_time
+    print(f"\nGeneration completed in {gen_time:.2f}s")
+    print(f"Generated {output_count} image(s)")
+
+    # Cleanup
+    omni.close()
+    print("\nDone!")
+
+
+def parse_args() -> argparse.Namespace:
+    """Parse command line arguments."""
+    parser = argparse.ArgumentParser(
+        description="GLM-Image Multistage End-to-End Inference",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog=__doc__,
+    )
+
+    # Required arguments
+    parser.add_argument(
+        "--model-path",
+        type=str,
+        required=True,
+        help="Path to GLM-Image model directory or HuggingFace model ID",
+    )
+    parser.add_argument(
+        "--prompt",
+        type=str,
+        required=True,
+        help="Text prompt for image generation",
+    )
+
+    # Optional arguments
+    parser.add_argument(
+        "--config-path",
+        type=str,
+        default=None,
+        help="Path to stage config YAML file (default: auto-detect)",
+    )
+    parser.add_argument(
+        "--image",
+        type=str,
+        default=None,
+        help="Path to source image for image-to-image mode",
+    )
+    parser.add_argument(
+        "--output",
+        type=str,
+        default="output_glm_image.png",
+        help="Output image path (default: output_glm_image.png)",
+    )
+    parser.add_argument(
+        "--negative-prompt",
+        type=str,
+        default=None,
+        help="Negative prompt for classifier-free guidance",
+    )
+
+    # Generation parameters
+    parser.add_argument(
+        "--height",
+        type=int,
+        default=None,
+        help="Output image height (default: 1024 for t2i, source size for i2i)",
+    )
+    parser.add_argument(
+        "--width",
+        type=int,
+        default=None,
+        help="Output image width (default: 1024 for t2i, source size for i2i)",
+    )
+    parser.add_argument(
+        "--num-inference-steps",
+        type=int,
+        default=50,
+        help="Number of diffusion denoising steps (default: 50)",
+    )
+    parser.add_argument(
+        "--guidance-scale",
+        type=float,
+        default=1.5,
+        help="Classifier-free guidance scale (default: 1.5)",
+    )
+    parser.add_argument(
+        "--seed",
+        type=int,
+        default=SEED,
+        help=f"Random seed for reproducibility (default: {SEED})",
+    )
+    parser.add_argument(
+        "--max-tokens",
+        type=int,
+        default=16384,
+        help="Maximum tokens for AR generation (default: 16384)",
+    )
+
+    # Batch processing
+    parser.add_argument(
+        "--num-prompts",
+        type=int,
+        default=1,
+        help="Number of images to generate (default: 1)",
+    )
+
+    # Runtime options
+    parser.add_argument(
+        "--enable-stats",
+        action="store_true",
+        default=False,
+        help="Enable statistics logging",
+    )
+    parser.add_argument(
+        "--stage-init-timeout",
+        type=int,
+        default=600,
+        help="Timeout for stage initialization in seconds (default: 600)",
+    )
+    parser.add_argument(
+        "--verbose",
+        "-v",
+        action="store_true",
+        default=False,
+        help="Enable verbose output",
+    )
+
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
diff --git a/examples/offline_inference/glm_image/run_i2i.sh b/examples/offline_inference/glm_image/run_i2i.sh
new file mode 100755
index 00000000000..f81b157b0c8
--- /dev/null
+++ b/examples/offline_inference/glm_image/run_i2i.sh
@@ -0,0 +1,93 @@
+#!/bin/bash
+# SPDX-License-Identifier: Apache-2.0
+# Run GLM-Image image-to-image (editing) with multistage pipeline
+
+set -e
+
+# Defaults (${VAR:-...} entries honor the environment); config default is anchored to this script, not the CWD
+MODEL_PATH="${MODEL_PATH:-/path/to/glm-image}"
+CONFIG_PATH="${CONFIG_PATH:-$(dirname "${BASH_SOURCE[0]}")/../../../vllm_omni/model_executor/stage_configs/glm_image.yaml}"
+PROMPT="${PROMPT:-Transform this image into an oil painting style}"
+INPUT_IMAGE=""
+OUTPUT="${OUTPUT:-output_i2i.png}"
+NUM_STEPS="${NUM_STEPS:-50}"
+GUIDANCE="${GUIDANCE:-1.5}"
+SEED="${SEED:-42}"
+
+# Parse command line arguments
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --model-path)
+            MODEL_PATH="$2"
+            shift 2
+            ;;
+        --config-path)
+            CONFIG_PATH="$2"
+            shift 2
+            ;;
+        --prompt)
+            PROMPT="$2"
+            shift 2
+            ;;
+        --image)
+            INPUT_IMAGE="$2"
+            shift 2
+            ;;
+        --output)
+            OUTPUT="$2"
+            shift 2
+            ;;
+        --num-steps)
+            NUM_STEPS="$2"
+            shift 2
+            ;;
+        --guidance)
+            GUIDANCE="$2"
+            shift 2
+            ;;
+        --seed)
+            SEED="$2"
+            shift 2
+            ;;
+        *)
+            echo "Unknown option: $1"
+            exit 1
+            ;;
+    esac
+done
+
+# Check if input image is provided
+if [ -z "${INPUT_IMAGE}" ]; then
+    echo "Error: --image is required for image-to-image mode"
+    echo "Usage: ./run_i2i.sh --image /path/to/input.png [--prompt \"edit instruction\"]"
+    exit 1
+fi
+
+if [ ! -f "${INPUT_IMAGE}" ]; then
+    echo "Error: Input image not found: ${INPUT_IMAGE}"
+    exit 1
+fi
+
+echo "=============================================="
+echo "GLM-Image Image-to-Image Generation"
+echo "=============================================="
+echo "Model: ${MODEL_PATH}"
+echo "Config: ${CONFIG_PATH}"
+echo "Input: ${INPUT_IMAGE}"
+echo "Prompt: ${PROMPT}"
+echo "Output: ${OUTPUT}"
+echo "Steps: ${NUM_STEPS}"
+echo "Guidance: ${GUIDANCE}"
+echo "Seed: ${SEED}"
+echo "=============================================="
+# Resolve end2end.py next to this script so the wrapper can be launched from any directory
+python "$(dirname "${BASH_SOURCE[0]}")/end2end.py" \
+    --model-path "${MODEL_PATH}" \
+    --config-path "${CONFIG_PATH}" \
+    --prompt "${PROMPT}" \
+    --image "${INPUT_IMAGE}" \
+    --output "${OUTPUT}" \
+    --num-inference-steps "${NUM_STEPS}" \
+    --guidance-scale "${GUIDANCE}" \
+    --seed "${SEED}" \
+    --verbose
diff --git a/examples/offline_inference/glm_image/run_t2i.sh b/examples/offline_inference/glm_image/run_t2i.sh
new file mode 100755
index 00000000000..5d249960b8f
--- /dev/null
+++ b/examples/offline_inference/glm_image/run_t2i.sh
@@ -0,0 +1,87 @@
+#!/bin/bash
+# SPDX-License-Identifier: Apache-2.0
+# Run GLM-Image text-to-image generation with multistage pipeline
+
+set -e
+
+# Defaults (each honors an environment override); config default is anchored to this script, not the CWD
+MODEL_PATH="${MODEL_PATH:-/path/to/glm-image}"
+CONFIG_PATH="${CONFIG_PATH:-$(dirname "${BASH_SOURCE[0]}")/../../../vllm_omni/model_executor/stage_configs/glm_image.yaml}"
+PROMPT="${PROMPT:-A beautiful mountain landscape with snow-capped peaks and a clear blue lake}"
+OUTPUT="${OUTPUT:-output_t2i.png}"
+HEIGHT="${HEIGHT:-1024}"
+WIDTH="${WIDTH:-1024}"
+NUM_STEPS="${NUM_STEPS:-50}"
+GUIDANCE="${GUIDANCE:-1.5}"
+SEED="${SEED:-42}"
+
+# Parse command line arguments
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --model-path)
+            MODEL_PATH="$2"
+            shift 2
+            ;;
+        --config-path)
+            CONFIG_PATH="$2"
+            shift 2
+            ;;
+        --prompt)
+            PROMPT="$2"
+            shift 2
+            ;;
+        --output)
+            OUTPUT="$2"
+            shift 2
+            ;;
+        --height)
+            HEIGHT="$2"
+            shift 2
+            ;;
+        --width)
+            WIDTH="$2"
+            shift 2
+            ;;
+        --num-steps)
+            NUM_STEPS="$2"
+            shift 2
+            ;;
+        --guidance)
+            GUIDANCE="$2"
+            shift 2
+            ;;
+        --seed)
+            SEED="$2"
+            shift 2
+            ;;
+        *)
+            echo "Unknown option: $1"
+            exit 1
+            ;;
+    esac
+done
+
+echo "=============================================="
+echo "GLM-Image Text-to-Image Generation"
+echo "=============================================="
+echo "Model: ${MODEL_PATH}"
+echo "Config: ${CONFIG_PATH}"
+echo "Prompt: ${PROMPT}"
+echo "Output: ${OUTPUT}"
+echo "Size: ${WIDTH}x${HEIGHT}"
+echo "Steps: ${NUM_STEPS}"
+echo "Guidance: ${GUIDANCE}"
+echo "Seed: ${SEED}"
+echo "=============================================="
+# Resolve end2end.py next to this script so the wrapper can be launched from any directory
+python "$(dirname "${BASH_SOURCE[0]}")/end2end.py" \
+    --model-path "${MODEL_PATH}" \
+    --config-path "${CONFIG_PATH}" \
+    --prompt "${PROMPT}" \
+    --output "${OUTPUT}" \
+    --height "${HEIGHT}" \
+    --width "${WIDTH}" \
+    --num-inference-steps "${NUM_STEPS}" \
+    --guidance-scale "${GUIDANCE}" \
+    --seed "${SEED}" \
+    --verbose

From dc8f2c2b9591ae69acbfe38885f1bdf8de5e3a57 Mon Sep 17 00:00:00 2001
From: JaredforReal <w13431838023@gmail.com>
Date: Thu, 15 Jan 2026 11:29:47 +0000
Subject: [PATCH 12/53] modify dit configs

Signed-off-by: JaredforReal <w13431838023@gmail.com>
---
 vllm_omni/model_executor/stage_configs/glm_image.yaml        | 5 -----
 .../stage_configs/glm_image_muilticonnector.yaml             | 5 -----
 2 files changed, 10 deletions(-)

diff --git a/vllm_omni/model_executor/stage_configs/glm_image.yaml b/vllm_omni/model_executor/stage_configs/glm_image.yaml
index 8ba307a8253..59c6a5252d4 100644
--- a/vllm_omni/model_executor/stage_configs/glm_image.yaml
+++ b/vllm_omni/model_executor/stage_configs/glm_image.yaml
@@ -44,21 +44,16 @@ stage_args:
       devices: "1" # Can use different GPU, or same GPU if memory allows
       max_batch_size: 1
     engine_args:
-      model_arch: GlmImagePipeline
       model_stage: dit
       # Diffusion-specific parameters
       num_gpus: 1
-      cfg_parallel_size: 1 # Set to 2 for CFG parallelism on 2 GPUs
       enforce_eager: true
       trust_remote_code: true
-      engine_output_type: image # Final output is image
       distributed_executor_backend: "mp"
-      enable_prefix_caching: false
     engine_input_source: [0] # Input from AR stage
     custom_process_input_func: vllm_omni.model_executor.stage_input_processors.glm_image.ar2diffusion
     final_output: true
     final_output_type: image
-    is_comprehension: false
     default_sampling_params:
       temperature: 0.0
       top_p: 1.0
diff --git a/vllm_omni/model_executor/stage_configs/glm_image_muilticonnector.yaml b/vllm_omni/model_executor/stage_configs/glm_image_muilticonnector.yaml
index 7d209d6e3c6..c32b1cd3d07 100644
--- a/vllm_omni/model_executor/stage_configs/glm_image_muilticonnector.yaml
+++ b/vllm_omni/model_executor/stage_configs/glm_image_muilticonnector.yaml
@@ -47,21 +47,16 @@ stage_args:
       devices: "1" # Use separate GPU for diffusion
       max_batch_size: 1
     engine_args:
-      model_arch: GlmImagePipeline
       model_stage: dit
       # Diffusion-specific parameters
       num_gpus: 1
-      cfg_parallel_size: 1 # Set to 2 for CFG parallelism
       enforce_eager: true
       trust_remote_code: true
-      engine_output_type: image # Final output is image
       distributed_executor_backend: "mp"
-      enable_prefix_caching: false
     engine_input_source: [0] # Input from AR stage
     custom_process_input_func: vllm_omni.model_executor.stage_input_processors.glm_image.ar2diffusion
     final_output: true
     final_output_type: image
-    is_comprehension: false
     default_sampling_params:
       temperature: 0.0
       top_p: 1.0

From ad0da646870810a5a73743df22fe1e40a983de93 Mon Sep 17 00:00:00 2001
From: JaredforReal <w13431838023@gmail.com>
Date: Thu, 15 Jan 2026 19:47:55 +0800
Subject: [PATCH 13/53] fix end2end offline examples

Signed-off-by: JaredforReal <w13431838023@gmail.com>
---
 .../offline_inference/glm_image/end2end.py    | 17 ++++-
 vllm_omni/entrypoints/omni_diffusion.py       | 66 +++++++++++--------
 vllm_omni/entrypoints/omni_stage.py           |  5 +-
 .../stage_configs/glm_image.yaml              |  4 +-
 4 files changed, 61 insertions(+), 31 deletions(-)

diff --git a/examples/offline_inference/glm_image/end2end.py b/examples/offline_inference/glm_image/end2end.py
index bc0ecb68f7e..1134f00b424 100644
--- a/examples/offline_inference/glm_image/end2end.py
+++ b/examples/offline_inference/glm_image/end2end.py
@@ -225,9 +225,20 @@ def main(args: argparse.Namespace) -> None:
         detokenize=False,
     )
 
-    # For multistage, we may need multiple sampling_params (one per LLM stage)
-    # For GLM-Image: Stage 0 (AR) is LLM, Stage 1 (Diffusion) uses diffusion_kwargs
-    sampling_params_list = [ar_sampling_params]
+    # For diffusion stage, sampling_params contains diffusion-specific parameters
+    # These are passed as kwargs to the diffusion engine
+    diffusion_sampling_params = {
+        "num_inference_steps": args.num_inference_steps,
+        "guidance_scale": args.guidance_scale,
+        "height": prompt_dict.get("height", 1024),
+        "width": prompt_dict.get("width", 1024),
+        "seed": args.seed,
+    }
+
+    # For multistage, we need sampling_params for each stage
+    # Stage 0 (AR): SamplingParams for vLLM
+    # Stage 1 (Diffusion): dict with diffusion kwargs
+    sampling_params_list = [ar_sampling_params, diffusion_sampling_params]
 
     # Run generation
     print(f"\nGenerating {args.num_prompts} image(s)...")
diff --git a/vllm_omni/entrypoints/omni_diffusion.py b/vllm_omni/entrypoints/omni_diffusion.py
index 43c68dc5cdd..f3546cc1b66 100644
--- a/vllm_omni/entrypoints/omni_diffusion.py
+++ b/vllm_omni/entrypoints/omni_diffusion.py
@@ -51,34 +51,48 @@ def __init__(self, od_config: OmniDiffusionConfig | None = None, **kwargs):
 
         self.od_config = od_config
 
-        # Diffusers-style models expose `model_index.json` with `_class_name`.
-        # Bagel models (and other non-diffusers) typically expose `config.json`.
-        try:
-            config_dict = get_hf_file_to_dict(
-                "model_index.json",
-                od_config.model,
-            )
-            od_config.model_class_name = config_dict.get("_class_name", None)
+        # Allow direct specification of model_class_name via model_arch parameter
+        # This is useful for multistage pipelines where we know the exact pipeline class
+        model_arch = kwargs.get("model_arch")
+        if model_arch and od_config.model_class_name is None:
+            od_config.model_class_name = model_arch
+            od_config.tf_model_config = TransformerConfig()
             od_config.update_multimodal_support()
-
-            tf_config_dict = get_hf_file_to_dict(
-                "transformer/config.json",
-                od_config.model,
-            )
-            od_config.tf_model_config = TransformerConfig.from_dict(tf_config_dict)
-        except (AttributeError, OSError, ValueError):
-            cfg = get_hf_file_to_dict("config.json", od_config.model)
-            if cfg is None:
-                raise ValueError(f"Could not find config.json or model_index.json for model {od_config.model}")
-
-            model_type = cfg.get("model_type")
-            architectures = cfg.get("architectures") or []
-            if model_type == "bagel" or "BagelForConditionalGeneration" in architectures:
-                od_config.model_class_name = "BagelPipeline"
-                od_config.tf_model_config = TransformerConfig()
+            logger.info(f"Using model_arch '{model_arch}' as model_class_name")
+        elif od_config.model_class_name is None:
+            # Diffusers-style models expose `model_index.json` with `_class_name`.
+            # Bagel models (and other non-diffusers) typically expose `config.json`.
+            try:
+                config_dict = get_hf_file_to_dict(
+                    "model_index.json",
+                    od_config.model,
+                )
+                od_config.model_class_name = config_dict.get("_class_name", None)
                 od_config.update_multimodal_support()
-            else:
-                raise
+
+                tf_config_dict = get_hf_file_to_dict(
+                    "transformer/config.json",
+                    od_config.model,
+                )
+                od_config.tf_model_config = TransformerConfig.from_dict(tf_config_dict)
+            except (AttributeError, OSError, ValueError):
+                cfg = get_hf_file_to_dict("config.json", od_config.model)
+                if cfg is None:
+                    raise ValueError(f"Could not find config.json or model_index.json for model {od_config.model}")
+
+                model_type = cfg.get("model_type")
+                architectures = cfg.get("architectures") or []
+                if model_type == "bagel" or "BagelForConditionalGeneration" in architectures:
+                    od_config.model_class_name = "BagelPipeline"
+                    od_config.tf_model_config = TransformerConfig()
+                    od_config.update_multimodal_support()
+                elif model_type == "glm-image" or "GlmImageForConditionalGeneration" in architectures:
+                    # GLM-Image model detected
+                    od_config.model_class_name = "GlmImagePipeline"
+                    od_config.tf_model_config = TransformerConfig()
+                    od_config.update_multimodal_support()
+                else:
+                    raise
 
         self.engine: DiffusionEngine = DiffusionEngine.make_engine(od_config)
 
diff --git a/vllm_omni/entrypoints/omni_stage.py b/vllm_omni/entrypoints/omni_stage.py
index af6f60f0420..84df3397bbc 100644
--- a/vllm_omni/entrypoints/omni_stage.py
+++ b/vllm_omni/entrypoints/omni_stage.py
@@ -599,7 +599,10 @@ def _stage_worker(
     )
     try:
         if stage_type == "diffusion":
-            engine_args.pop("model_stage")
+            engine_args.pop("model_stage", None)
+            # Pass model path to OmniDiffusion if not already in engine_args
+            if "model" not in engine_args:
+                engine_args["model"] = model
             stage_engine = OmniDiffusion(**engine_args)
         else:
             # Default to LLM engine
diff --git a/vllm_omni/model_executor/stage_configs/glm_image.yaml b/vllm_omni/model_executor/stage_configs/glm_image.yaml
index 59c6a5252d4..288657aa01f 100644
--- a/vllm_omni/model_executor/stage_configs/glm_image.yaml
+++ b/vllm_omni/model_executor/stage_configs/glm_image.yaml
@@ -24,7 +24,8 @@ stage_args:
       distributed_executor_backend: "mp"
       enable_prefix_caching: false
       max_num_batched_tokens: 32768
-      hf_config_name: vision_language_encoder # Subfolder in model path
+      # Model path points to the main GLM-Image directory
+      # vLLM will load GlmImageForConditionalGeneration from it
     final_output: false # AR is not the final output
     is_comprehension: true
     default_sampling_params:
@@ -45,6 +46,7 @@ stage_args:
       max_batch_size: 1
     engine_args:
       model_stage: dit
+      model_arch: GlmImagePipeline # Required for diffusion model class resolution
       # Diffusion-specific parameters
       num_gpus: 1
       enforce_eager: true

From 974ed2201ab764dbba68c90f4bbe08ddbaa4a7cb Mon Sep 17 00:00:00 2001
From: JaredforReal <w13431838023@gmail.com>
Date: Thu, 15 Jan 2026 20:02:46 +0800
Subject: [PATCH 14/53] support sub folder and model arch

Signed-off-by: JaredforReal <w13431838023@gmail.com>
---
 vllm_omni/entrypoints/omni_diffusion.py               | 4 +++-
 vllm_omni/entrypoints/omni_stage.py                   | 6 ++++++
 vllm_omni/model_executor/stage_configs/glm_image.yaml | 3 +--
 3 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/vllm_omni/entrypoints/omni_diffusion.py b/vllm_omni/entrypoints/omni_diffusion.py
index f3546cc1b66..e5f74beb92b 100644
--- a/vllm_omni/entrypoints/omni_diffusion.py
+++ b/vllm_omni/entrypoints/omni_diffusion.py
@@ -44,6 +44,9 @@ class OmniDiffusion:
     """
 
     def __init__(self, od_config: OmniDiffusionConfig | None = None, **kwargs):
+        # Extract model_arch before passing to OmniDiffusionConfig (not a valid config field)
+        model_arch = kwargs.pop("model_arch", None)
+
         if od_config is None:
             od_config = OmniDiffusionConfig.from_kwargs(**kwargs)
         elif isinstance(od_config, dict):
@@ -53,7 +56,6 @@ def __init__(self, od_config: OmniDiffusionConfig | None = None, **kwargs):
 
         # Allow direct specification of model_class_name via model_arch parameter
         # This is useful for multistage pipelines where we know the exact pipeline class
-        model_arch = kwargs.get("model_arch")
         if model_arch and od_config.model_class_name is None:
             od_config.model_class_name = model_arch
             od_config.tf_model_config = TransformerConfig()
diff --git a/vllm_omni/entrypoints/omni_stage.py b/vllm_omni/entrypoints/omni_stage.py
index 84df3397bbc..af947448d1a 100644
--- a/vllm_omni/entrypoints/omni_stage.py
+++ b/vllm_omni/entrypoints/omni_stage.py
@@ -443,6 +443,12 @@ def _stage_worker(
     connectors_config = stage_payload.get("connectors_config", {})
     stage_type = stage_payload.get("stage_type", "llm")
 
+    # Handle model_subdir for models with config in subdirectory (e.g., GLM-Image AR model)
+    model_subdir = engine_args.pop("model_subdir", None)
+    if model_subdir:
+        model = _os.path.join(model, model_subdir)
+        logger.info(f"Using model subdirectory: {model}")
+
     # Aggregates for running average
     _agg_total_tokens = 0
     _agg_total_gen_time_ms = 0.0
diff --git a/vllm_omni/model_executor/stage_configs/glm_image.yaml b/vllm_omni/model_executor/stage_configs/glm_image.yaml
index 288657aa01f..0e543ff7b8b 100644
--- a/vllm_omni/model_executor/stage_configs/glm_image.yaml
+++ b/vllm_omni/model_executor/stage_configs/glm_image.yaml
@@ -15,6 +15,7 @@ stage_args:
     engine_args:
       model_stage: ar
       model_arch: GlmImageForConditionalGeneration
+      model_subdir: vision_language_encoder # AR model config.json is in this subdirectory
       worker_cls: vllm_omni.worker.gpu_ar_worker.GPUARWorker
       scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
       gpu_memory_utilization: 0.6
@@ -24,8 +25,6 @@ stage_args:
       distributed_executor_backend: "mp"
       enable_prefix_caching: false
       max_num_batched_tokens: 32768
-      # Model path points to the main GLM-Image directory
-      # vLLM will load GlmImageForConditionalGeneration from it
     final_output: false # AR is not the final output
     is_comprehension: true
     default_sampling_params:

From f5551f2c2c5a156b497f0485d7e4a4953f33e6a6 Mon Sep 17 00:00:00 2001
From: JaredforReal <w13431838023@gmail.com>
Date: Fri, 16 Jan 2026 10:00:39 +0800
Subject: [PATCH 15/53] fix import error

Signed-off-by: JaredforReal <w13431838023@gmail.com>
---
 vllm_omni/model_executor/models/glm_image/glm_image_ar.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/vllm_omni/model_executor/models/glm_image/glm_image_ar.py b/vllm_omni/model_executor/models/glm_image/glm_image_ar.py
index 98fd1dd19db..6dce7072976 100644
--- a/vllm_omni/model_executor/models/glm_image/glm_image_ar.py
+++ b/vllm_omni/model_executor/models/glm_image/glm_image_ar.py
@@ -69,6 +69,7 @@
     make_empty_intermediate_tensors_factory,
     make_layers,
 )
+from vllm.model_executor.models.vision import get_vit_attn_backend
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (
     MultiModalDataDict,
@@ -89,8 +90,6 @@
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
 from vllm.v1.attention.backends.registry import AttentionBackendEnum
 
-from .vision import get_vit_attn_backend
-
 logger = init_logger(__name__)
 
 

From 88439b675445e136eeb4803580bd70971285a273 Mon Sep 17 00:00:00 2001
From: JaredforReal <w13431838023@gmail.com>
Date: Fri, 16 Jan 2026 10:04:25 +0800
Subject: [PATCH 16/53] tokenizer

Signed-off-by: JaredforReal <w13431838023@gmail.com>
---
 vllm_omni/entrypoints/omni_stage.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/vllm_omni/entrypoints/omni_stage.py b/vllm_omni/entrypoints/omni_stage.py
index af947448d1a..6f62dff7fbb 100644
--- a/vllm_omni/entrypoints/omni_stage.py
+++ b/vllm_omni/entrypoints/omni_stage.py
@@ -444,11 +444,27 @@ def _stage_worker(
     stage_type = stage_payload.get("stage_type", "llm")
 
     # Handle model_subdir for models with config in subdirectory (e.g., GLM-Image AR model)
+    # Also handle tokenizer_subdir for when tokenizer is in a different location than model
     model_subdir = engine_args.pop("model_subdir", None)
+    tokenizer_subdir = engine_args.pop("tokenizer_subdir", None)
+    base_model_path = model  # Keep original model path for tokenizer
+
     if model_subdir:
         model = _os.path.join(model, model_subdir)
         logger.info(f"Using model subdirectory: {model}")
 
+    # Set tokenizer path if different from model path
+    if tokenizer_subdir is not None:
+        # tokenizer_subdir can be empty string "" to use base_model_path directly
+        tokenizer_path = _os.path.join(base_model_path, tokenizer_subdir) if tokenizer_subdir else base_model_path
+        engine_args["tokenizer"] = tokenizer_path
+        logger.info(f"Using tokenizer from: {tokenizer_path}")
+    elif model_subdir and "tokenizer" not in engine_args:
+        # If model is in subdirectory but tokenizer not specified, use base path
+        # This is common for models like GLM-Image where tokenizer is in root
+        engine_args["tokenizer"] = base_model_path
+        logger.info(f"Using tokenizer from base model path: {base_model_path}")
+
     # Aggregates for running average
     _agg_total_tokens = 0
     _agg_total_gen_time_ms = 0.0

From 6676c96f2883391f200be0fe5936dc5f9dea8e09 Mon Sep 17 00:00:00 2001
From: JaredforReal <w13431838023@gmail.com>
Date: Fri, 16 Jan 2026 10:10:24 +0800
Subject: [PATCH 17/53] fix BaseDummyInputsBuilders

Signed-off-by: JaredforReal <w13431838023@gmail.com>
---
 vllm_omni/model_executor/models/glm_image/glm_image_ar.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm_omni/model_executor/models/glm_image/glm_image_ar.py b/vllm_omni/model_executor/models/glm_image/glm_image_ar.py
index 6dce7072976..6efeb57a4d6 100644
--- a/vllm_omni/model_executor/models/glm_image/glm_image_ar.py
+++ b/vllm_omni/model_executor/models/glm_image/glm_image_ar.py
@@ -79,13 +79,13 @@
 )
 from vllm.multimodal.parse import MultiModalDataItems
 from vllm.multimodal.processing import (
+    BaseDummyInputsBuilder,
     BaseMultiModalProcessor,
     BaseProcessingInfo,
     PromptReplacement,
     PromptUpdate,
     PromptUpdateDetails,
 )
-from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
 from vllm.v1.attention.backends.registry import AttentionBackendEnum

From 324348cbe3bdfee551335142f1ad02abed9cf61d Mon Sep 17 00:00:00 2001
From: JaredforReal <w13431838023@gmail.com>
Date: Fri, 16 Jan 2026 10:12:45 +0800
Subject: [PATCH 18/53] tokenizer sub dir

Signed-off-by: JaredforReal <w13431838023@gmail.com>
---
 vllm_omni/model_executor/stage_configs/glm_image.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/vllm_omni/model_executor/stage_configs/glm_image.yaml b/vllm_omni/model_executor/stage_configs/glm_image.yaml
index 0e543ff7b8b..d9186769362 100644
--- a/vllm_omni/model_executor/stage_configs/glm_image.yaml
+++ b/vllm_omni/model_executor/stage_configs/glm_image.yaml
@@ -16,6 +16,7 @@ stage_args:
       model_stage: ar
       model_arch: GlmImageForConditionalGeneration
       model_subdir: vision_language_encoder # AR model config.json is in this subdirectory
+      tokenizer_subdir: tokenizer # Tokenizer files are in tokenizer/ subdirectory
       worker_cls: vllm_omni.worker.gpu_ar_worker.GPUARWorker
       scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
       gpu_memory_utilization: 0.6

From c0877a31fd09cfa07119429cc22d31cfb6a95830 Mon Sep 17 00:00:00 2001
From: JaredforReal <w13431838023@gmail.com>
Date: Fri, 16 Jan 2026 10:22:39 +0800
Subject: [PATCH 19/53] fix text2image

Signed-off-by: JaredforReal <w13431838023@gmail.com>
---
 .../models/glm_image/glm_image_ar.py          | 22 +++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/vllm_omni/model_executor/models/glm_image/glm_image_ar.py b/vllm_omni/model_executor/models/glm_image/glm_image_ar.py
index 6efeb57a4d6..a81c1b53ec6 100644
--- a/vllm_omni/model_executor/models/glm_image/glm_image_ar.py
+++ b/vllm_omni/model_executor/models/glm_image/glm_image_ar.py
@@ -229,7 +229,15 @@ def get_dummy_mm_data(
     ) -> MultiModalDataDict:
         """
         Generate dummy multimodal data for profiling.
+
+        Returns empty dict if no images (text-to-image mode).
         """
+        num_images = mm_counts.get("image", 0)
+
+        # Text-to-image mode: no multimodal data needed
+        if num_images == 0:
+            return {}
+
         hf_config = self.info.get_hf_config()
         vision_config = hf_config.vision_config
 
@@ -237,8 +245,6 @@ def get_dummy_mm_data(
         image_size = getattr(vision_config, "image_size", 2048)
         width = height = image_size
 
-        num_images = mm_counts.get("image", 0)
-
         image_overrides = mm_options.get("image") if mm_options else None
 
         return {
@@ -295,7 +301,13 @@ def _get_mm_fields_config(
     ) -> Mapping[str, MultiModalFieldConfig]:
         """
         Get the multimodal field configuration.
+
+        Returns empty dict if no image data (text-to-image mode).
         """
+        # Check if we have image data
+        if "pixel_values" not in hf_inputs:
+            return {}
+
         return dict(
             pixel_values=MultiModalFieldConfig.batched("image"),
             image_grid_thw=MultiModalFieldConfig.batched("image"),
@@ -312,7 +324,13 @@ def _get_prompt_updates(
 
         GLM-Image replaces each image placeholder with:
         <|image_start|> + image_tokens + <|image_end|>
+
+        Returns empty list if no images (text-to-image mode).
         """
+        # Check if we have any images
+        if not mm_items.get_count("image", strict=False):
+            return []
+
         hf_config = self.info.get_hf_config()
 
         # Get special token IDs from config

From 277fd0d1ca9f2a4b232e20c09ee5502706a2197b Mon Sep 17 00:00:00 2001
From: JaredforReal <w13431838023@gmail.com>
Date: Fri, 16 Jan 2026 10:41:09 +0800
Subject: [PATCH 20/53] fix i2i dummy inputs

Signed-off-by: JaredforReal <w13431838023@gmail.com>
---
 .../models/glm_image/glm_image_ar.py          | 25 ++++++++++++++-----
 1 file changed, 19 insertions(+), 6 deletions(-)

diff --git a/vllm_omni/model_executor/models/glm_image/glm_image_ar.py b/vllm_omni/model_executor/models/glm_image/glm_image_ar.py
index a81c1b53ec6..b31c9072af2 100644
--- a/vllm_omni/model_executor/models/glm_image/glm_image_ar.py
+++ b/vllm_omni/model_executor/models/glm_image/glm_image_ar.py
@@ -138,10 +138,15 @@ def get_hf_processor(self, **kwargs: object):
             return None
 
     def get_supported_mm_limits(self) -> Mapping[str, int | None]:
-        # GLM-Image supports multiple source images for image-to-image generation
-        # or no image for text-to-image generation
-        # None means no limit on the number of images
-        return {"image": None}
+        # GLM-Image is an image GENERATION model, not an image understanding model.
+        # For text-to-image (t2i) mode: no multimodal input is needed
+        # For image-to-image (i2i) mode: source images are provided as input
+        #
+        # Return empty dict to indicate no multimodal inputs are required for
+        # profiling. Image-to-image mode will be handled dynamically at runtime.
+        # This prevents vLLM from trying to create dummy image inputs during
+        # model initialization.
+        return {}
 
     def get_num_image_tokens(
         self,
@@ -198,16 +203,24 @@ def get_image_size_with_most_features(self) -> tuple[int, int]:
 class GlmImageDummyInputsBuilder(BaseDummyInputsBuilder[GlmImageProcessingInfo]):
     """
     Builds dummy inputs for GLM-Image model profiling.
+
+    GLM-Image is an image GENERATION model. For text-to-image mode,
+    no multimodal inputs are needed - just a text prompt.
     """
 
     def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
         """
-        Generate dummy text with image placeholders.
+        Generate dummy text for profiling.
 
-        GLM-Image uses <|image|> as the image placeholder token.
+        For text-to-image mode (no images), returns a simple text prompt.
+        For image-to-image mode, includes image placeholders.
         """
         num_images = mm_counts.get("image", 0)
 
+        # Text-to-image mode: return a simple text prompt for profiling
+        if num_images == 0:
+            return "A beautiful image."
+
         hf_config = self.info.get_hf_config()
         # Get image token from config or use default
         image_token_id = getattr(hf_config, "image_token_id", 167855)

From 4883184523b1ca578905b4cf97ea430b09044126 Mon Sep 17 00:00:00 2001
From: JaredforReal <w13431838023@gmail.com>
Date: Fri, 16 Jan 2026 10:45:14 +0800
Subject: [PATCH 21/53] fix gate up load weight

Signed-off-by: JaredforReal <w13431838023@gmail.com>
---
 vllm_omni/model_executor/models/glm_image/glm_image_ar.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/vllm_omni/model_executor/models/glm_image/glm_image_ar.py b/vllm_omni/model_executor/models/glm_image/glm_image_ar.py
index b31c9072af2..ab840ad4428 100644
--- a/vllm_omni/model_executor/models/glm_image/glm_image_ar.py
+++ b/vllm_omni/model_executor/models/glm_image/glm_image_ar.py
@@ -1557,7 +1557,10 @@ class GlmImageForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP
             "k_proj",
             "v_proj",
         ],
-        "gate_up_proj": ["gate_up_proj"],
+        "gate_up_proj": [
+            "gate_proj",
+            "up_proj",
+        ],
     }
 
     # Weight mapping from HuggingFace to vLLM format

From 393dfd2fe8d63d4a086a2214b1a32a454a57ba9f Mon Sep 17 00:00:00 2001
From: JaredforReal <w13431838023@gmail.com>
Date: Fri, 16 Jan 2026 10:48:57 +0800
Subject: [PATCH 22/53] fix gate up load weight

Signed-off-by: JaredforReal <w13431838023@gmail.com>
---
 .../models/glm_image/glm_image_ar.py              | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/vllm_omni/model_executor/models/glm_image/glm_image_ar.py b/vllm_omni/model_executor/models/glm_image/glm_image_ar.py
index ab840ad4428..56d9e57def9 100644
--- a/vllm_omni/model_executor/models/glm_image/glm_image_ar.py
+++ b/vllm_omni/model_executor/models/glm_image/glm_image_ar.py
@@ -1785,24 +1785,27 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
 
         for name, loaded_weight in weights:
             # Handle stacked parameters (QKV, gate_up)
+            is_stacked = False
             for param_name, weight_name, shard_id in stacked_params_mapping:
                 if weight_name not in name:
                     continue
-                name = name.replace(weight_name, param_name)
-                if name not in params_dict:
+                stacked_name = name.replace(weight_name, param_name)
+                if stacked_name not in params_dict:
                     break
-                param = params_dict[name]
+                param = params_dict[stacked_name]
                 weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight, shard_id)
+                loaded_params.add(stacked_name)
+                is_stacked = True
                 break
-            else:
+
+            if not is_stacked:
                 # Regular weight loading
                 if name not in params_dict:
                     continue
                 param = params_dict[name]
                 weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
-
-            loaded_params.add(name)
+                loaded_params.add(name)
 
         return loaded_params

From aa4b586c2f29e3a7296f8cff48818e4e253380d1 Mon Sep 17 00:00:00 2001
From: JaredforReal <w13431838023@gmail.com>
Date: Fri, 16 Jan 2026 10:55:02 +0800
Subject: [PATCH 23/53] get transformer/config.json

Signed-off-by: JaredforReal <w13431838023@gmail.com>
---
 vllm_omni/entrypoints/omni_diffusion.py | 22 +++++++++++++++++++---
 1 file changed, 19 insertions(+), 3 deletions(-)

diff --git a/vllm_omni/entrypoints/omni_diffusion.py b/vllm_omni/entrypoints/omni_diffusion.py
index e5f74beb92b..0b1a1763316 100644
--- a/vllm_omni/entrypoints/omni_diffusion.py
+++ b/vllm_omni/entrypoints/omni_diffusion.py
@@ -58,7 +58,15 @@ def __init__(self, od_config: OmniDiffusionConfig | None = None, **kwargs):
         # This is useful for multistage pipelines where we know the exact pipeline class
         if model_arch and od_config.model_class_name is None:
             od_config.model_class_name = model_arch
-            od_config.tf_model_config = TransformerConfig()
+            # Try to load transformer config from transformer/config.json
+            try:
+                tf_config_dict = get_hf_file_to_dict(
+                    "transformer/config.json",
+                    od_config.model,
+                )
+                od_config.tf_model_config = TransformerConfig.from_dict(tf_config_dict)
+            except (AttributeError, OSError, ValueError):
+                od_config.tf_model_config = TransformerConfig()
             od_config.update_multimodal_support()
             logger.info(f"Using model_arch '{model_arch}' as model_class_name")
         elif od_config.model_class_name is None:
@@ -89,9 +97,17 @@ def __init__(self, od_config: OmniDiffusionConfig | None = None, **kwargs):
                     od_config.tf_model_config = TransformerConfig()
                     od_config.update_multimodal_support()
                 elif model_type == "glm-image" or "GlmImageForConditionalGeneration" in architectures:
-                    # GLM-Image model detected
+                    # GLM-Image model detected - load transformer config
                     od_config.model_class_name = "GlmImagePipeline"
-                    od_config.tf_model_config = TransformerConfig()
+                    # Try to load transformer config from transformer/config.json
+                    try:
+                        tf_config_dict = get_hf_file_to_dict(
+                            "transformer/config.json",
+                            od_config.model,
+                        )
+                        od_config.tf_model_config = TransformerConfig.from_dict(tf_config_dict)
+                    except (AttributeError, OSError, ValueError):
+                        od_config.tf_model_config = TransformerConfig()
                     od_config.update_multimodal_support()
                 else:
                     raise

From daa57c5878fce2967e0ef18668c8b5293d831f3e Mon Sep 17 00:00:00 2001
From: JaredforReal <w13431838023@gmail.com>
Date: Fri, 16 Jan 2026 11:01:31 +0800
Subject: [PATCH 24/53] add glm image mrope

Signed-off-by: JaredforReal <w13431838023@gmail.com>
---
 vllm_omni/model_executor/layers/mrope.py | 86 ++++++++++++++++++++++++
 1 file changed, 86 insertions(+)

diff --git a/vllm_omni/model_executor/layers/mrope.py b/vllm_omni/model_executor/layers/mrope.py
index 9ca6a36e233..b3517f9492c 100644
--- a/vllm_omni/model_executor/layers/mrope.py
+++ b/vllm_omni/model_executor/layers/mrope.py
@@ -201,6 +201,17 @@ def get_input_positions_tensor(
                 context_len=context_len,
                 seq_len=seq_len,
             )
+        elif hf_config.model_type == "glm-image":
+            # GLM-Image is an image generation model.
+            # For text-to-image mode (no input images), use simple text-only positions.
+            # For image-to-image mode, use GLM4V-style position encoding.
+            return cls._glm_image_get_input_positions_tensor(
+                input_tokens=input_tokens,
+                hf_config=hf_config,
+                image_grid_thw=image_grid_thw,
+                context_len=context_len,
+                seq_len=seq_len,
+            )
         else:
             return cls._vl_get_input_positions_tensor(
                 input_tokens=input_tokens,
@@ -313,6 +324,81 @@ def _glm4v_get_input_positions_tensor(
         mrope_position_delta = (llm_positions.max() + 1 - len(input_tokens)).item()
         return llm_positions, mrope_position_delta
 
+    @classmethod
+    def _glm_image_get_input_positions_tensor(
+        cls,
+        input_tokens: list[int],
+        hf_config: PretrainedConfig,
+        image_grid_thw: list[list[int]] | torch.Tensor | None,
+        context_len: int = 0,
+        seq_len: int | None = None,
+    ) -> tuple[torch.Tensor, int]:
+        """Get mrope input positions for GLM-Image model.
+
+        Text tokens get one sequential index on all three axes. In
+        image-to-image mode each run of ``image_token_id`` placeholders gets
+        (t, h, w) grid positions from the matching ``image_grid_thw`` entry.
+        Unlike Qwen2-VL, GLM-Image doesn't have video support.
+
+        Args:
+            input_tokens: Prompt token ids (image placeholders assumed already expanded).
+            hf_config: Provides ``image_token_id`` and ``vision_config``.
+            image_grid_thw: Per-image (t, h, w) grid; None or empty means text-to-image.
+            context_len: Start of the returned position slice.
+            seq_len: End (exclusive) of the returned slice; None means the end.
+
+        Returns:
+            A (3, n) position tensor and ``positions.max() + 1 - len(input_tokens)``.
+        """
+        llm_pos_ids_list: list = []
+
+        if image_grid_thw is not None and len(image_grid_thw) > 0:
+            # Image-to-image mode: handle source image positions
+            image_token_id = getattr(hf_config, "image_token_id", None)
+            spatial_merge_size = getattr(hf_config.vision_config, "spatial_merge_size", 1)
+
+            if isinstance(image_grid_thw, torch.Tensor):
+                image_grid_thw = image_grid_thw.tolist()
+
+            input_tokens_tensor = torch.tensor(input_tokens)
+            image_indices = torch.argwhere(input_tokens_tensor == image_token_id).squeeze(1).tolist()
+
+            st, image_idx = 0, 0
+            for token_pos in image_indices:
+                # Text before this image
+                if token_pos > st:
+                    text_len = token_pos - st
+                    st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
+                    llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)
+                    st = token_pos
+
+                # Only the first placeholder of each run starts an image; later ones in
+                # the same run fall behind ``st`` and must not consume the next grid.
+                if token_pos == st and image_idx < len(image_grid_thw):
+                    llm_grid_t, h, w = image_grid_thw[image_idx]
+                    llm_grid_h, llm_grid_w = h // spatial_merge_size, w // spatial_merge_size
+                    st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
+                    t_index = torch.arange(llm_grid_t).view(-1, 1).expand(-1, llm_grid_h * llm_grid_w).flatten()
+                    h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(llm_grid_t, -1, llm_grid_w).flatten()
+                    w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(llm_grid_t, llm_grid_h, -1).flatten()
+                    llm_pos_ids_list.append(torch.stack([t_index, h_index, w_index]) + st_idx)
+                    st += llm_grid_t * llm_grid_h * llm_grid_w
+                    image_idx += 1
+
+            # Remaining text after last image
+            if st < len(input_tokens):
+                text_len = len(input_tokens) - st
+                st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
+                llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)
+        else:
+            # Text-to-image mode: simple sequential positions
+            llm_pos_ids_list.append(torch.arange(len(input_tokens)).view(1, -1).expand(3, -1))
+
+        llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
+        llm_positions = llm_positions[:, context_len:seq_len]
+        mrope_position_delta = (llm_positions.max() + 1 - len(input_tokens)).item()
+        return llm_positions, mrope_position_delta
+
     @classmethod
     def _vl_get_input_positions_tensor(
         cls,

From 82460fae7efa9c9caf01f1c085e4ba1608ffe7a3 Mon Sep 17 00:00:00 2001
From: JaredforReal <w13431838023@gmail.com>
Date: Fri, 16 Jan 2026 11:21:55 +0800
Subject: [PATCH 25/53] fix glm_image spelling

Signed-off-by: JaredforReal <w13431838023@gmail.com>
---
 vllm_omni/model_executor/layers/mrope.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm_omni/model_executor/layers/mrope.py b/vllm_omni/model_executor/layers/mrope.py
index b3517f9492c..49bd56a65db 100644
--- a/vllm_omni/model_executor/layers/mrope.py
+++ b/vllm_omni/model_executor/layers/mrope.py
@@ -201,7 +201,7 @@ def get_input_positions_tensor(
                 context_len=context_len,
                 seq_len=seq_len,
             )
-        elif hf_config.model_type == "glm-image":
+        elif hf_config.model_type == "glm_image":
             # GLM-Image is an image generation model.
             # For text-to-image mode (no input images), use simple text-only positions.
             # For image-to-image mode, use GLM4V-style position encoding.

From e1d946f38caf694d39bdb082c1519587163b386e Mon Sep 17 00:00:00 2001
From: JaredforReal <w13431838023@gmail.com>
Date: Fri, 16 Jan 2026 11:30:34 +0800
Subject: [PATCH 26/53] fix

Signed-off-by: JaredforReal <w13431838023@gmail.com>
---
 vllm_omni/worker/gpu_ar_model_runner.py | 101 ++++++++++--------------
 1 file changed, 40 insertions(+), 61 deletions(-)

diff --git a/vllm_omni/worker/gpu_ar_model_runner.py b/vllm_omni/worker/gpu_ar_model_runner.py
index d4e7e195fe8..78f14d8f1e0 100644
--- a/vllm_omni/worker/gpu_ar_model_runner.py
+++ b/vllm_omni/worker/gpu_ar_model_runner.py
@@ -12,10 +12,15 @@
 import numpy as np
 import torch
 from vllm.config import CUDAGraphMode
+from vllm.distributed.ec_transfer import get_ec_transfer, has_ec_transfer
+from vllm.distributed.kv_transfer import get_kv_transfer_group, has_kv_transfer_group
 from vllm.forward_context import set_forward_context
 from vllm.logger import init_logger
+from vllm.model_executor.layers.fused_moe.routed_experts_capturer import (
+    RoutedExpertsCapturer,
+)
 from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput
-from vllm.v1.outputs import AsyncModelRunnerOutput
+from vllm.v1.outputs import AsyncModelRunnerOutput, make_empty_encoder_model_runner_output
 from vllm.v1.spec_decode.eagle import EagleProposer
 from vllm.v1.structured_output.utils import apply_grammar_bitmask
 from vllm.v1.utils import record_function_or_nullcontext
@@ -25,19 +30,12 @@
     IntermediateTensors,
     get_pp_group,
     get_tp_group,
-    has_kv_transfer_group,
-    
 )
+from vllm.v1.worker.ubatch_utils import maybe_create_ubatch_slices
 from vllm.v1.worker.utils import is_residual_scattered_for_sp
-from vllm.model_executor.layers.fused_moe.routed_experts_capturer import (
-    RoutedExpertsCapturer,
-)
-from vllm.distributed.ec_transfer import get_ec_transfer, has_ec_transfer
-from vllm.distributed.kv_transfer import get_kv_transfer_group, has_kv_transfer_group
+
 from vllm_omni.outputs import OmniModelRunnerOutput
 from vllm_omni.worker.gpu_model_runner import OmniGPUModelRunner
-from vllm.v1.outputs import make_empty_encoder_model_runner_output
-from vllm.v1.worker.ubatch_utils import maybe_create_ubatch_slices
 
 logger = init_logger(__name__)
 
@@ -91,10 +89,7 @@ def execute_model(
         intermediate_tensors: IntermediateTensors | None = None,
     ) -> OmniModelRunnerOutput | AsyncModelRunnerOutput | IntermediateTensors | None:
         if self.execute_model_state is not None:
-            raise RuntimeError(
-                "State error: sample_tokens() must be called "
-                "after execute_model() returns None."
-            )
+            raise RuntimeError("State error: sample_tokens() must be called after execute_model() returns None.")
 
         if self.vllm_config.model_config.enable_return_routed_experts:
             capturer = RoutedExpertsCapturer.get_instance()
@@ -104,9 +99,7 @@ def execute_model(
                 logger.error("RoutedExpertsCapturer not initialized.")
 
         if scheduler_output.preempted_req_ids and has_kv_transfer_group():
-            get_kv_transfer_group().handle_preemptions(
-                scheduler_output.preempted_req_ids
-            )
+            get_kv_transfer_group().handle_preemptions(scheduler_output.preempted_req_ids)
 
         num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
         with (
@@ -126,8 +119,7 @@ def execute_model(
 
             if not num_scheduled_tokens:
                 if (
-                    self.parallel_config.distributed_executor_backend
-                    == "external_launcher"
+                    self.parallel_config.distributed_executor_backend == "external_launcher"
                     and self.parallel_config.data_parallel_size > 1
                 ):
                     # this is a corner case when both external launcher
@@ -196,9 +188,7 @@ def execute_model(
             )
 
             num_tokens_padded = batch_desc.num_tokens
-            num_reqs_padded = (
-                batch_desc.num_reqs if batch_desc.num_reqs is not None else num_reqs
-            )
+            num_reqs_padded = batch_desc.num_reqs if batch_desc.num_reqs is not None else num_reqs
             ubatch_slices, ubatch_slices_padded = maybe_create_ubatch_slices(
                 should_ubatch,
                 num_scheduled_tokens_np,
@@ -218,19 +208,17 @@ def execute_model(
             use_spec_decode = len(scheduler_output.scheduled_spec_decode_tokens) > 0
             ubatch_slices_attn = ubatch_slices_padded if pad_attn else ubatch_slices
 
-            attn_metadata, spec_decode_common_attn_metadata = (
-                self._build_attention_metadata(
-                    num_tokens=num_tokens_unpadded,
-                    num_tokens_padded=num_tokens_padded if pad_attn else None,
-                    num_reqs=num_reqs,
-                    num_reqs_padded=num_reqs_padded if pad_attn else None,
-                    max_query_len=max_num_scheduled_tokens,
-                    ubatch_slices=ubatch_slices_attn,
-                    logits_indices=logits_indices,
-                    use_spec_decode=use_spec_decode,
-                    num_scheduled_tokens=scheduler_output.num_scheduled_tokens,
-                    cascade_attn_prefix_lens=cascade_attn_prefix_lens,
-                )
+            attn_metadata, spec_decode_common_attn_metadata = self._build_attention_metadata(
+                num_tokens=num_tokens_unpadded,
+                num_tokens_padded=num_tokens_padded if pad_attn else None,
+                num_reqs=num_reqs,
+                num_reqs_padded=num_reqs_padded if pad_attn else None,
+                max_query_len=max_num_scheduled_tokens,
+                ubatch_slices=ubatch_slices_attn,
+                logits_indices=logits_indices,
+                use_spec_decode=use_spec_decode,
+                num_scheduled_tokens=scheduler_output.num_scheduled_tokens,
+                cascade_attn_prefix_lens=cascade_attn_prefix_lens,
             )
 
             (
@@ -240,9 +228,7 @@ def execute_model(
                 intermediate_tensors,
                 model_kwargs,
                 ec_connector_output,
-            ) = self._preprocess(
-                scheduler_output, num_tokens_padded, intermediate_tensors
-            )
+            ) = self._preprocess(scheduler_output, num_tokens_padded, intermediate_tensors)
 
         # Set cudagraph mode to none if calc_kv_scales is true.
         # KV scales calculation involves dynamic operations that are incompatible
@@ -287,10 +273,11 @@ def execute_model(
                 hidden_states = model_output
                 aux_hidden_states = None
 
-            multimodal_outputs = model_output.multimodal_outputs
-            hidden_states = model_output.text_hidden_states
+            # Extract multimodal outputs if model supports it
+            # This handles both OmniOutput objects and plain tensors
+            hidden_states, multimodal_outputs = self.extract_multimodal_outputs(hidden_states)
 
-            if multimodal_outputs is not None:
+            if multimodal_outputs is not None and multimodal_outputs:
                 keys_or_type = (
                     list(multimodal_outputs.keys())
                     if isinstance(multimodal_outputs, dict)
@@ -329,9 +316,7 @@ def execute_model(
                 sample_hidden_states = hidden_states[logits_indices]
                 if not get_pp_group().is_last_rank:
                     all_gather_tensors = {
-                        "residual": not is_residual_scattered_for_sp(
-                            self.vllm_config, num_tokens_padded
-                        )
+                        "residual": not is_residual_scattered_for_sp(self.vllm_config, num_tokens_padded)
                     }
                     get_pp_group().send_tensor_dict(
                         hidden_states.tensors,
@@ -408,9 +393,7 @@ def sample_tokens(
 
         # Apply structured output bitmasks if present.
         if grammar_output is not None:
-            apply_grammar_bitmask(
-                scheduler_output, grammar_output, self.input_batch, logits
-            )
+            apply_grammar_bitmask(scheduler_output, grammar_output, self.input_batch, logits)
 
         with record_function_or_nullcontext("gpu_model_runner: sample"):
             sampler_output = self._sample(logits, spec_decode_metadata)
@@ -450,23 +433,19 @@ def propose_draft_token_ids(sampled_token_ids):
                     propose_draft_token_ids(sampled_token_ids)
                 elif self.valid_sampled_token_count_event is not None:
                     assert spec_decode_common_attn_metadata is not None
-                    next_token_ids, valid_sampled_tokens_count = (
-                        self.drafter.prepare_next_token_ids_padded(
-                            spec_decode_common_attn_metadata,
-                            sampled_token_ids,
-                            self.requests,
-                            self.input_batch,
-                            self.discard_request_mask.gpu,
-                        )
-                    )
-                    self._copy_valid_sampled_token_count(
-                        next_token_ids, valid_sampled_tokens_count
+                    next_token_ids, valid_sampled_tokens_count = self.drafter.prepare_next_token_ids_padded(
+                        spec_decode_common_attn_metadata,
+                        sampled_token_ids,
+                        self.requests,
+                        self.input_batch,
+                        self.discard_request_mask.gpu,
                     )
+                    self._copy_valid_sampled_token_count(next_token_ids, valid_sampled_tokens_count)
                     # Since we couldn't run the drafter,
                     # just use zeros for the draft tokens.
-                    self._draft_token_ids = torch.zeros(
-                        1, device=self.device, dtype=torch.int32
-                    ).expand(len(self.input_batch.req_ids), self.num_spec_tokens)
+                    self._draft_token_ids = torch.zeros(1, device=self.device, dtype=torch.int32).expand(
+                        len(self.input_batch.req_ids), self.num_spec_tokens
+                    )
                     self._copy_draft_token_ids_to_cpu(scheduler_output, zeros_only=True)
             else:
                 propose_drafts_after_bookkeeping = input_fits_in_drafter

From 468cd04da44615b46a0ad3e74be04502cd276282 Mon Sep 17 00:00:00 2001
From: JaredforReal <w13431838023@gmail.com>
Date: Fri, 16 Jan 2026 11:34:38 +0800
Subject: [PATCH 27/53] fix compute_logits

Signed-off-by: JaredforReal <w13431838023@gmail.com>
---
 vllm_omni/model_executor/models/glm_image/glm_image_ar.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/vllm_omni/model_executor/models/glm_image/glm_image_ar.py b/vllm_omni/model_executor/models/glm_image/glm_image_ar.py
index 56d9e57def9..6be85f2a861 100644
--- a/vllm_omni/model_executor/models/glm_image/glm_image_ar.py
+++ b/vllm_omni/model_executor/models/glm_image/glm_image_ar.py
@@ -1753,6 +1753,7 @@ def forward(
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
+        **kwargs: object,
     ) -> torch.Tensor | None:
         """Compute logits from hidden states."""
         logits = self.logits_processor(

From 59df49531d2049aadeb750c4d11fe75d8884fa7e Mon Sep 17 00:00:00 2001
From: JaredforReal <w13431838023@gmail.com>
Date: Fri, 16 Jan 2026 11:54:24 +0800
Subject: [PATCH 28/53] fix glm image stage input processors

Signed-off-by: JaredforReal <w13431838023@gmail.com>
---
 .../stage_input_processors/glm_image.py             | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/vllm_omni/model_executor/stage_input_processors/glm_image.py b/vllm_omni/model_executor/stage_input_processors/glm_image.py
index 4ea7e66c83c..a135176e5a1 100644
--- a/vllm_omni/model_executor/stage_input_processors/glm_image.py
+++ b/vllm_omni/model_executor/stage_input_processors/glm_image.py
@@ -144,8 +144,17 @@ def ar2diffusion(
 
         # Get original prompt info
         original_prompt = prompt[i] if i < len(prompt) else {}
-        if isinstance(original_prompt, (OmniTokensPrompt, TextPrompt)):
-            original_prompt = dict(original_prompt) if hasattr(original_prompt, "__iter__") else {}
+        # Handle various prompt types - convert to dict for uniform access
+        # Note: TypedDict (TextPrompt, OmniTokensPrompt) doesn't support isinstance
+        if isinstance(original_prompt, dict):
+            pass  # Already a dict
+        elif hasattr(original_prompt, "_asdict"):
+            # NamedTuple
+            original_prompt = original_prompt._asdict()
+        elif hasattr(original_prompt, "__dict__"):
+            original_prompt = vars(original_prompt)
+        else:
+            original_prompt = {}
 
         # Extract dimensions from original prompt or use defaults
         height = original_prompt.get("height", 1024)

From 396883cef35967f5a5c19d66c2c11d6c14177a89 Mon Sep 17 00:00:00 2001
From: JaredforReal <w13431838023@gmail.com>
Date: Fri, 16 Jan 2026 12:19:33 +0800
Subject: [PATCH 29/53] fix stage input

Signed-off-by: JaredforReal <w13431838023@gmail.com>
---
 vllm_omni/entrypoints/omni_stage.py           | 61 +++++++++++++++----
 .../stage_input_processors/glm_image.py       |  7 ++-
 2 files changed, 56 insertions(+), 12 deletions(-)

diff --git a/vllm_omni/entrypoints/omni_stage.py b/vllm_omni/entrypoints/omni_stage.py
index 6f62dff7fbb..cc8bbdea3ea 100644
--- a/vllm_omni/entrypoints/omni_stage.py
+++ b/vllm_omni/entrypoints/omni_stage.py
@@ -788,22 +788,51 @@ def handle_profiler_task(task_type: OmniStageTaskType) -> None:
             gen_outputs: list[Any] = []
             _gen_t0 = _time.time()
             if stage_type == "diffusion":
-                # For diffusion, batch_engine_inputs should be prompts (strings)
-                # Convert to list of strings if needed
+                # For diffusion, batch_engine_inputs can be:
+                # 1. Strings (direct prompts)
+                # 2. Dicts with "prompt" and other fields like "extra", "height", "width"
+                #    (from custom_process_input_func like ar2diffusion)
+                # We need to preserve all fields for proper multistage integration
                 prompts = []
+                per_request_kwargs = []
                 for ein in batch_engine_inputs:
                     if isinstance(ein, str):
                         prompts.append(ein)
-                    elif isinstance(ein, dict) and "prompt" in ein:
-                        prompts.append(ein["prompt"])
+                        per_request_kwargs.append({})
+                    elif isinstance(ein, dict):
+                        prompts.append(ein.get("prompt", ""))
+                        # Extract all non-prompt fields as kwargs for this request
+                        req_kwargs = {k: v for k, v in ein.items() if k != "prompt"}
+                        per_request_kwargs.append(req_kwargs)
                     elif hasattr(ein, "prompt"):
                         prompts.append(ein.prompt)
+                        per_request_kwargs.append({})
                     else:
                         prompts.append(str(ein))
+                        per_request_kwargs.append({})
                 # Prepare diffusion kwargs from sampling parameters
                 diffusion_kwargs = prepare_sampling_params(sampling_params, "diffusion")
-                # Diffusion generate returns results directly, not an iterator
-                diffusion_results = stage_engine.generate(prompts, **diffusion_kwargs)
+                # For multistage with extra params (like prior_token_ids), process each request
+                # with its specific kwargs merged with global diffusion_kwargs
+                diffusion_results = []
+                for i, (prompt, req_kwargs) in enumerate(zip(prompts, per_request_kwargs)):
+                    # Merge global diffusion_kwargs with per-request kwargs
+                    # Per-request kwargs take precedence (they may contain extra, height, width)
+                    merged_kwargs = {**diffusion_kwargs, **req_kwargs}
+                    # Log to verify extra params are being passed
+                    has_extra = "extra" in merged_kwargs
+                    has_prior_tokens = (
+                        merged_kwargs.get("extra", {}).get("prior_token_ids") is not None if has_extra else False
+                    )
+                    logger.info(
+                        f"[Diffusion] Request {i}: prompt='{prompt[:30] if prompt else ''}...', "
+                        f"has_extra={has_extra}, has_prior_token_ids={has_prior_tokens}"
+                    )
+                    result = stage_engine.generate(prompt, **merged_kwargs)
+                    if isinstance(result, list):
+                        diffusion_results.extend(result)
+                    else:
+                        diffusion_results.append(result)
                 # Convert to list format compatible with LLM outputs
                 # Ensure each result has a request_id for proper mapping
                 if isinstance(diffusion_results, list):
@@ -1248,12 +1277,19 @@ async def generation_single_request(task: dict[str, Any]):
                 ein = ein[0]
 
             if stage_type == "diffusion":
-                # For diffusion, ein should be prompts (strings)
-                # Convert to string if needed
+                # For diffusion, ein can be:
+                # 1. A string (direct prompt)
+                # 2. A dict with "prompt" and other fields like "extra", "height", "width"
+                #    (from custom_process_input_func like ar2diffusion)
+                # We need to preserve all fields for proper multistage integration
+                prompt = ""
+                per_request_kwargs = {}
                 if isinstance(ein, str):
                     prompt = ein
-                elif isinstance(ein, dict) and "prompt" in ein:
-                    prompt = ein["prompt"]
+                elif isinstance(ein, dict):
+                    prompt = ein.get("prompt", "")
+                    # Extract all non-prompt fields as kwargs for this request
+                    per_request_kwargs = {k: v for k, v in ein.items() if k != "prompt"}
                 elif hasattr(ein, "prompt"):
                     prompt = ein.prompt
                 else:
@@ -1261,8 +1297,11 @@ async def generation_single_request(task: dict[str, Any]):
 
                 # Prepare diffusion kwargs from sampling parameters
                 diffusion_kwargs = prepare_sampling_params(sampling_params, "diffusion")
+                # Merge global diffusion_kwargs with per-request kwargs
+                # Per-request kwargs take precedence (they may contain extra, height, width)
+                merged_kwargs = {**diffusion_kwargs, **per_request_kwargs}
                 # AsyncOmniDiffusion.generate returns a single result, not an async generator
-                gen_output = await stage_engine.generate(prompt=prompt, request_id=rid, **diffusion_kwargs)
+                gen_output = await stage_engine.generate(prompt=prompt, request_id=rid, **merged_kwargs)
                 _gen_t1 = _time.time()
                 _gen_ms = (_gen_t1 - _gen_t0) * 1000.0
                 await generation_out_q.put((rid, gen_output, _gen_ms))
diff --git a/vllm_omni/model_executor/stage_input_processors/glm_image.py b/vllm_omni/model_executor/stage_input_processors/glm_image.py
index a135176e5a1..ac17f3fcfaa 100644
--- a/vllm_omni/model_executor/stage_input_processors/glm_image.py
+++ b/vllm_omni/model_executor/stage_input_processors/glm_image.py
@@ -191,7 +191,12 @@ def ar2diffusion(
                 diffusion_input[key] = original_prompt[key]
 
         diffusion_inputs.append(diffusion_input)
+        logger.info(
+            f"ar2diffusion: request {i}: prompt='{text_prompt[:50]}...', "
+            f"prior_token_ids shape={prior_token_ids.shape}, "
+            f"height={pixel_h}, width={pixel_w}"
+        )
 
-    logger.debug(f"ar2diffusion: processed {len(ar_outputs)} AR outputs → {len(diffusion_inputs)} diffusion inputs")
+    logger.info(f"ar2diffusion: processed {len(ar_outputs)} AR outputs → {len(diffusion_inputs)} diffusion inputs")
 
     return diffusion_inputs

From 2a36a470fe2a07734411bfc512353fad6c1b09e7 Mon Sep 17 00:00:00 2001
From: JaredforReal <w13431838023@gmail.com>
Date: Fri, 16 Jan 2026 12:36:39 +0800
Subject: [PATCH 30/53] fix stage input

Signed-off-by: JaredforReal <w13431838023@gmail.com>
---
 .../stage_input_processors/glm_image.py       | 30 ++++++++++++++-----
 1 file changed, 23 insertions(+), 7 deletions(-)

diff --git a/vllm_omni/model_executor/stage_input_processors/glm_image.py b/vllm_omni/model_executor/stage_input_processors/glm_image.py
index ac17f3fcfaa..28732300334 100644
--- a/vllm_omni/model_executor/stage_input_processors/glm_image.py
+++ b/vllm_omni/model_executor/stage_input_processors/glm_image.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Stage input processor for GLM-Image: AR → Diffusion transition."""
 
-from math import sqrt
 from typing import Any
 
 import torch
@@ -57,16 +56,24 @@ def _parse_generated_tokens(
     Returns:
         Tuple of (upsampled_prior_token_ids, pixel_height, pixel_width)
     """
-    # Calculate token dimensions
+    # Calculate token dimensions for target image
     token_h = height // factor
     token_w = width // factor
     large_image_tokens = token_h * token_w
 
-    # Calculate small image dimensions (used in text-to-image)
-    ratio = token_h / token_w
-    prev_token_h = int(sqrt(ratio) * (factor // 2))
-    prev_token_w = int(sqrt(1 / ratio) * (factor // 2))
-    small_image_tokens = prev_token_h * prev_token_w
+    # Calculate small preview image dimensions (used in text-to-image)
+    # GLM-Image generates a small preview at 1/4 resolution before the full image
+    # The preview grid is computed as target_grid / 2 in each dimension
+    small_token_h = token_h // 2
+    small_token_w = token_w // 2
+    small_image_tokens = small_token_h * small_token_w
+
+    # Log actual values for debugging
+    logger.info(
+        f"_parse_generated_tokens: total_tokens={len(token_ids)}, "
+        f"large_image_tokens={large_image_tokens} ({token_h}x{token_w}), "
+        f"small_image_tokens={small_image_tokens} ({small_token_h}x{small_token_w})"
+    )
 
     # Determine if this is text-to-image (has small + large) or image-to-image (large only)
     total_expected_t2i = small_image_tokens + large_image_tokens + 1  # +1 for EOS
@@ -79,9 +86,11 @@ def _parse_generated_tokens(
         large_start = small_image_tokens
         large_end = large_start + large_image_tokens
         prior_token_ids_d32 = token_tensor[large_start:large_end]
+        logger.info(f"Text-to-image mode: extracting tokens [{large_start}:{large_end}]")
     elif len(token_ids) >= total_expected_i2i:
         # Image-to-image: large image tokens are at the beginning
         prior_token_ids_d32 = token_tensor[:large_image_tokens]
+        logger.info(f"Image-to-image mode: extracting tokens [0:{large_image_tokens}]")
     else:
         # Fallback: use whatever tokens we have
         logger.warning(
@@ -89,6 +98,13 @@ def _parse_generated_tokens(
         )
         prior_token_ids_d32 = token_tensor[:large_image_tokens]
 
+    # Log token value statistics for debugging
+    logger.info(
+        f"prior_token_ids_d32: min={prior_token_ids_d32.min().item()}, "
+        f"max={prior_token_ids_d32.max().item()}, "
+        f"unique_count={prior_token_ids_d32.unique().numel()}"
+    )
+
     # Upsample from 32x to 16x
     prior_token_ids = _upsample_token_ids(prior_token_ids_d32, token_h, token_w)
 

From 31fa960e43473c696f6f891d070b6fd51d4ed982 Mon Sep 17 00:00:00 2001
From: JaredforReal <w13431838023@gmail.com>
Date: Fri, 16 Jan 2026 12:48:48 +0800
Subject: [PATCH 31/53] debug

Signed-off-by: JaredforReal <w13431838023@gmail.com>
---
 .../stage_input_processors/glm_image.py       | 34 +++++++++++++++++--
 1 file changed, 32 insertions(+), 2 deletions(-)

diff --git a/vllm_omni/model_executor/stage_input_processors/glm_image.py b/vllm_omni/model_executor/stage_input_processors/glm_image.py
index 28732300334..eff1fb197e9 100644
--- a/vllm_omni/model_executor/stage_input_processors/glm_image.py
+++ b/vllm_omni/model_executor/stage_input_processors/glm_image.py
@@ -68,6 +68,8 @@ def _parse_generated_tokens(
     small_token_w = token_w // 2
     small_image_tokens = small_token_h * small_token_w
 
+    token_tensor = torch.tensor(token_ids, dtype=torch.long)
+
     # Log actual values for debugging
     logger.info(
         f"_parse_generated_tokens: total_tokens={len(token_ids)}, "
@@ -75,11 +77,39 @@ def _parse_generated_tokens(
         f"small_image_tokens={small_image_tokens} ({small_token_h}x{small_token_w})"
     )
 
-    # Determine if this is text-to-image (has small + large) or image-to-image (large only)
+    # Analyze token distribution to find image tokens
+    # Image tokens should be in range [0, 16384) for VQ codebook
+    # Text tokens are typically higher values
+    logger.info(
+        f"Full sequence stats: min={token_tensor.min().item()}, "
+        f"max={token_tensor.max().item()}, "
+        f"unique={token_tensor.unique().numel()}"
+    )
+
+    # Look for the actual image tokens - they should be consecutive and in VQ range
+    # Print first 20 and last 20 tokens to understand the structure
+    logger.info(f"First 20 tokens: {token_tensor[:20].tolist()}")
+    logger.info(f"Last 20 tokens: {token_tensor[-20:].tolist()}")
+
+    # The actual structure for text-to-image from vLLM AR should be:
+    # [small_image_tokens (256)] + [large_image_tokens (1024)] + [EOS]
+    # Total expected: 256 + 1024 + 1 = 1281 tokens
+    # But we got 16384 tokens - this suggests the output includes prompt tokens
+
+    # For GLM-Image, the expected structure is that the model generates ALL new tokens
+    # including both small preview and large image tokens
+    # Since we got 16384 tokens, and 1024*16 = 16384, this might be at 2x downsampling
+    # Let's try different interpretations
+
+    # Possibility 1: tokens are at 2x scale (64x64 = 4096 for large, 32x32 = 1024 for small)
+    # Possibility 2: the output is padded or has a different format
+    # Possibility 3: tokens include repeated EOS or padding
+
     total_expected_t2i = small_image_tokens + large_image_tokens + 1  # +1 for EOS
     total_expected_i2i = large_image_tokens + 1
 
-    token_tensor = torch.tensor(token_ids, dtype=torch.long)
+    # Try to detect the end of meaningful tokens by looking for EOS patterns
+    # EOS token is typically a high value or repeated value at the end
 
     if len(token_ids) >= total_expected_t2i:
         # Text-to-image: extract large image tokens after small image tokens

From 1b15a94fdee4f075997943339c31b344ac11d86e Mon Sep 17 00:00:00 2001
From: JaredforReal <w13431838023@gmail.com>
Date: Fri, 16 Jan 2026 12:58:49 +0800
Subject: [PATCH 32/53] AR stage temperature 1.0

Signed-off-by: JaredforReal <w13431838023@gmail.com>
---
 vllm_omni/model_executor/stage_configs/glm_image.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm_omni/model_executor/stage_configs/glm_image.yaml b/vllm_omni/model_executor/stage_configs/glm_image.yaml
index d9186769362..c8cddae9e7c 100644
--- a/vllm_omni/model_executor/stage_configs/glm_image.yaml
+++ b/vllm_omni/model_executor/stage_configs/glm_image.yaml
@@ -29,7 +29,7 @@ stage_args:
     final_output: false # AR is not the final output
     is_comprehension: true
     default_sampling_params:
-      temperature: 0.0
+      temperature: 1.0 # Must use sampling (not greedy) for image token generation
       top_p: 1.0
       top_k: -1
       max_tokens: 16384 # Support up to 2048x2048 images (64x64 tokens * 4 = 16384)

From 67ec0afb65616727f619061a9a3d89f6df33b124 Mon Sep 17 00:00:00 2001
From: JaredforReal <w13431838023@gmail.com>
Date: Fri, 16 Jan 2026 13:18:27 +0800
Subject: [PATCH 33/53] end2end params temp

Signed-off-by: JaredforReal <w13431838023@gmail.com>
---
 examples/offline_inference/glm_image/end2end.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/examples/offline_inference/glm_image/end2end.py b/examples/offline_inference/glm_image/end2end.py
index 1134f00b424..aa7955e44e7 100644
--- a/examples/offline_inference/glm_image/end2end.py
+++ b/examples/offline_inference/glm_image/end2end.py
@@ -216,8 +216,11 @@ def main(args: argparse.Namespace) -> None:
     # For multistage, the AR stage may need sampling params
     from vllm import SamplingParams
 
+    # IMPORTANT: GLM-Image AR model requires sampling (not greedy) for proper
+    # image token generation. Using temperature=0.0 causes degenerate repetitive
+    # tokens and black images. Must use temperature > 0 (default: 1.0).
     ar_sampling_params = SamplingParams(
-        temperature=0.0,
+        temperature=1.0,  # Must use sampling for image token diversity
         top_p=1.0,
         top_k=-1,
         max_tokens=args.max_tokens,

From 15c6a36bdce354706afc8043db71aebdf3cd590d Mon Sep 17 00:00:00 2001
From: JaredforReal <w13431838023@gmail.com>
Date: Fri, 16 Jan 2026 13:38:32 +0800
Subject: [PATCH 34/53] apply_chat_template, preprocessor text

Signed-off-by: JaredforReal <w13431838023@gmail.com>
---
 .../offline_inference/glm_image/end2end.py    | 10 ++++
 vllm_omni/inputs/preprocess.py                | 49 ++++++++++++++++
 .../models/glm_image/glm_image_ar.py          | 58 ++++++++++++++-----
 3 files changed, 103 insertions(+), 14 deletions(-)

diff --git a/examples/offline_inference/glm_image/end2end.py b/examples/offline_inference/glm_image/end2end.py
index aa7955e44e7..1a4a0273829 100644
--- a/examples/offline_inference/glm_image/end2end.py
+++ b/examples/offline_inference/glm_image/end2end.py
@@ -88,6 +88,11 @@ def build_prompt_for_t2i(
         "prompt": prompt,
         "height": height,
         "width": width,
+        # Pass target dimensions to AR processor for proper grid token generation
+        "mm_processor_kwargs": {
+            "target_h": height,
+            "target_w": width,
+        },
     }
 
 
@@ -122,6 +127,11 @@ def build_prompt_for_i2i(
         },
         "height": height,
         "width": width,
+        # Pass target dimensions to AR processor for proper grid token generation
+        "mm_processor_kwargs": {
+            "target_h": height,
+            "target_w": width,
+        },
     }
 
 
diff --git a/vllm_omni/inputs/preprocess.py b/vllm_omni/inputs/preprocess.py
index 4c1ee5388ec..659d8e9adfe 100644
--- a/vllm_omni/inputs/preprocess.py
+++ b/vllm_omni/inputs/preprocess.py
@@ -20,6 +20,55 @@ class OmniInputPreprocessor(InputPreprocessor):
     Supports processing tokens, embeddings, text, and multimodal inputs.
     """
 
+    def _process_text(
+        self,
+        parsed_content: TextPrompt,
+        tokenization_kwargs: dict[str, Any] | None = None,
+        *,
+        mm_uuids: MultiModalUUIDDict | None = None,
+    ) -> OmniTokenInputs | MultiModalInputs:
+        """Process text prompts with support for mm_processor_kwargs.
+
+        Override the base class to support passing mm_processor_kwargs even when
+        there's no multi_modal_data. This is needed for models like GLM-Image
+        where text-to-image generation requires processor kwargs (target_h, target_w)
+        to properly format the prompt with grid tokens.
+        """
+        prompt_text = parsed_content["prompt"]
+        mm_processor_kwargs = parsed_content.get("mm_processor_kwargs") or {}
+
+        inputs: OmniTokenInputs | MultiModalInputs
+        if multi_modal_data := parsed_content.get("multi_modal_data"):
+            inputs = self._process_multimodal(
+                prompt_text,
+                multi_modal_data,
+                mm_processor_kwargs,
+                tokenization_kwargs=tokenization_kwargs,
+                mm_uuids=mm_uuids,
+            )
+        elif mm_processor_kwargs:
+            # Handle case where mm_processor_kwargs is provided without multi_modal_data
+            # This is needed for GLM-Image text-to-image mode where the processor
+            # needs target_h/target_w to build the prompt with grid tokens
+            inputs = self._process_multimodal(
+                prompt_text,
+                {},  # Empty multi_modal_data
+                mm_processor_kwargs,
+                tokenization_kwargs=tokenization_kwargs,
+                mm_uuids=mm_uuids,
+            )
+        else:
+            prompt_token_ids = self._tokenize_prompt(
+                prompt_text,
+                tokenization_kwargs=tokenization_kwargs,
+            )
+            inputs = token_inputs_omni(prompt_token_ids=prompt_token_ids)
+
+        if cache_salt := parsed_content.get("cache_salt"):
+            inputs["cache_salt"] = cache_salt
+
+        return inputs
+
     def _process_tokens(
         self,
         parsed_content: TokensPrompt,
diff --git a/vllm_omni/model_executor/models/glm_image/glm_image_ar.py b/vllm_omni/model_executor/models/glm_image/glm_image_ar.py
index 6be85f2a861..458013522e5 100644
--- a/vllm_omni/model_executor/models/glm_image/glm_image_ar.py
+++ b/vllm_omni/model_executor/models/glm_image/glm_image_ar.py
@@ -290,14 +290,40 @@ def _call_hf_processor(
         """
         Call the HuggingFace processor.
 
-        If no multimodal data is provided (text-to-image mode),
-        we only tokenize the text.
+        For text-to-image mode (no images), we need to:
+        1. Build the prompt with target grid dimensions
+        2. Build the image_grid_thw tensor for M-RoPE position encoding
+
+        For image-to-image mode, we use the full processor.
         """
         if not mm_data or not mm_data.get("image"):
-            # Text-to-image mode: just tokenize the prompt
-            tokenizer = self.info.get_tokenizer()
-            prompt_ids = tokenizer.encode(prompt)
-            return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt")
+            # Text-to-image mode: use GlmImageProcessor with target dimensions
+            # This is critical - the processor adds grid tokens that tell the model
+            # what resolution to generate
+            processor = self.info.get_hf_processor()
+            if processor is not None:
+                # Get target dimensions from mm_kwargs or use defaults
+                target_h = mm_kwargs.get("target_h", 1024) if mm_kwargs else 1024
+                target_w = mm_kwargs.get("target_w", 1024) if mm_kwargs else 1024
+
+                # Build messages format expected by processor
+                messages = [{"role": "user", "content": [{"type": "text", "text": prompt}]}]
+
+                # Use apply_chat_template which handles target dimensions
+                hf_inputs = processor.apply_chat_template(
+                    messages,
+                    tokenize=True,
+                    target_h=target_h,
+                    target_w=target_w,
+                    return_dict=True,
+                    return_tensors="pt",
+                )
+                return hf_inputs
+            else:
+                # Fallback: just tokenize (this won't work properly for generation)
+                tokenizer = self.info.get_tokenizer()
+                prompt_ids = tokenizer.encode(prompt)
+                return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt")
 
         # Image-to-image mode: use full processor
         return super()._call_hf_processor(
@@ -315,16 +341,20 @@ def _get_mm_fields_config(
         """
         Get the multimodal field configuration.
 
-        Returns empty dict if no image data (text-to-image mode).
+        For text-to-image: only image_grid_thw is needed (no pixel_values)
+        For image-to-image: both pixel_values and image_grid_thw are needed
         """
-        # Check if we have image data
-        if "pixel_values" not in hf_inputs:
-            return {}
+        result = {}
 
-        return dict(
-            pixel_values=MultiModalFieldConfig.batched("image"),
-            image_grid_thw=MultiModalFieldConfig.batched("image"),
-        )
+        # image_grid_thw is needed for both t2i and i2i (for M-RoPE position encoding)
+        if "image_grid_thw" in hf_inputs:
+            result["image_grid_thw"] = MultiModalFieldConfig.batched("image")
+
+        # pixel_values only present in image-to-image mode
+        if "pixel_values" in hf_inputs:
+            result["pixel_values"] = MultiModalFieldConfig.batched("image")
+
+        return result
 
     def _get_prompt_updates(
         self,

From 36bd0f77acc9d2619632fe81c081d30483690258 Mon Sep 17 00:00:00 2001
From: JaredforReal <w13431838023@gmail.com>
Date: Fri, 16 Jan 2026 13:59:19 +0800
Subject: [PATCH 35/53] get processor config

Signed-off-by: JaredforReal <w13431838023@gmail.com>
---
 .../models/glm_image/glm_image_ar.py          | 42 +++++++++++++++++--
 1 file changed, 38 insertions(+), 4 deletions(-)

diff --git a/vllm_omni/model_executor/models/glm_image/glm_image_ar.py b/vllm_omni/model_executor/models/glm_image/glm_image_ar.py
index 458013522e5..0bfecb6a7b5 100644
--- a/vllm_omni/model_executor/models/glm_image/glm_image_ar.py
+++ b/vllm_omni/model_executor/models/glm_image/glm_image_ar.py
@@ -127,14 +127,48 @@ def get_hf_config(self) -> GlmImageConfig:
         return self.ctx.get_hf_config(GlmImageConfig)
 
     def get_hf_processor(self, **kwargs: object):
-        # GLM-Image uses a processor similar to Qwen2-VL
-        # Try to get GlmImageProcessor if available
+        """Get the GlmImageProcessor.
+
+        GLM-Image has a special directory structure where:
+        - Model (AR) is in: {base}/vision_language_encoder/
+        - Processor is in: {base}/processor/
+
+        Since model_subdir is used to load the AR model, the model_config.model
+        path points to vision_language_encoder/. We need to go up one level
+        and into processor/ to load the GlmImageProcessor.
+        """
+        import os
+
         try:
             from transformers import GlmImageProcessor
 
-            return self.ctx.get_hf_processor(GlmImageProcessor, **kwargs)
-        except ImportError:
+            # Get the model path from config
+            model_path = self.ctx.model_config.model
+
+            # Check if we're in a subdirectory (vision_language_encoder)
+            # and need to go to processor/ instead
+            if model_path.endswith("vision_language_encoder") or "/vision_language_encoder" in model_path:
+                # Go up one level and into processor/
+                base_path = os.path.dirname(model_path.rstrip("/"))
+                processor_path = os.path.join(base_path, "processor")
+            else:
+                # Try processor subdirectory of current path
+                processor_path = os.path.join(model_path, "processor")
+                if not os.path.exists(processor_path):
+                    processor_path = model_path
+
+            # Load processor directly from the correct path
+            return GlmImageProcessor.from_pretrained(
+                processor_path,
+                trust_remote_code=self.ctx.model_config.trust_remote_code,
+                **kwargs,
+            )
+        except (ImportError, OSError) as e:
             # Fallback: return None and handle in processor
+            from vllm.logger import init_logger
+
+            logger = init_logger(__name__)
+            logger.warning(f"Failed to load GlmImageProcessor: {e}")
             return None
 
     def get_supported_mm_limits(self) -> Mapping[str, int | None]:

From 186f14992ddaff563fb85915ca983c3ded551e32 Mon Sep 17 00:00:00 2001
From: JaredforReal <w13431838023@gmail.com>
Date: Fri, 16 Jan 2026 14:07:55 +0800
Subject: [PATCH 36/53] debug logging

Signed-off-by: JaredforReal <w13431838023@gmail.com>
---
 .../models/glm_image/glm_image_ar.py          | 20 +++++
 .../stage_input_processors/glm_image.py       | 87 ++++++++++++-------
 2 files changed, 77 insertions(+), 30 deletions(-)

diff --git a/vllm_omni/model_executor/models/glm_image/glm_image_ar.py b/vllm_omni/model_executor/models/glm_image/glm_image_ar.py
index 0bfecb6a7b5..e319359eb93 100644
--- a/vllm_omni/model_executor/models/glm_image/glm_image_ar.py
+++ b/vllm_omni/model_executor/models/glm_image/glm_image_ar.py
@@ -343,6 +343,10 @@ def _call_hf_processor(
                 # Build messages format expected by processor
                 messages = [{"role": "user", "content": [{"type": "text", "text": prompt}]}]
 
+                logger.info("[GLM-Image T2I] Using GlmImageProcessor.apply_chat_template")
+                logger.info(f"[GLM-Image T2I] target_h={target_h}, target_w={target_w}")
+                logger.info(f"[GLM-Image T2I] prompt: {prompt[:200]}...")
+
                 # Use apply_chat_template which handles target dimensions
                 hf_inputs = processor.apply_chat_template(
                     messages,
@@ -352,6 +356,22 @@ def _call_hf_processor(
                     return_dict=True,
                     return_tensors="pt",
                 )
+
+                # Debug: log the tokenized input
+                if "input_ids" in hf_inputs:
+                    input_ids = hf_inputs["input_ids"]
+                    if hasattr(input_ids, "shape"):
+                        logger.info(f"[GLM-Image T2I] input_ids shape: {input_ids.shape}")
+                    tokenizer = self.info.get_tokenizer()
+                    if tokenizer is not None and hasattr(input_ids, "__len__"):
+                        # Decode to check the format
+                        ids_list = input_ids[0].tolist() if hasattr(input_ids[0], "tolist") else list(input_ids[0])
+                        decoded = tokenizer.decode(ids_list)
+                        logger.info(f"[GLM-Image T2I] decoded input: {decoded}")
+
+                if "image_grid_thw" in hf_inputs:
+                    logger.info(f"[GLM-Image T2I] image_grid_thw: {hf_inputs['image_grid_thw']}")
+
                 return hf_inputs
             else:
                 # Fallback: just tokenize (this won't work properly for generation)
diff --git a/vllm_omni/model_executor/stage_input_processors/glm_image.py b/vllm_omni/model_executor/stage_input_processors/glm_image.py
index eff1fb197e9..79f5d91baf9 100644
--- a/vllm_omni/model_executor/stage_input_processors/glm_image.py
+++ b/vllm_omni/model_executor/stage_input_processors/glm_image.py
@@ -78,65 +78,92 @@ def _parse_generated_tokens(
     )
 
     # Analyze token distribution to find image tokens
-    # Image tokens should be in range [0, 16384) for VQ codebook
-    # Text tokens are typically higher values
     logger.info(
         f"Full sequence stats: min={token_tensor.min().item()}, "
         f"max={token_tensor.max().item()}, "
         f"unique={token_tensor.unique().numel()}"
     )
 
-    # Look for the actual image tokens - they should be consecutive and in VQ range
     # Print first 20 and last 20 tokens to understand the structure
     logger.info(f"First 20 tokens: {token_tensor[:20].tolist()}")
     logger.info(f"Last 20 tokens: {token_tensor[-20:].tolist()}")
 
-    # The actual structure for text-to-image from vLLM AR should be:
-    # [small_image_tokens (256)] + [large_image_tokens (1024)] + [EOS]
-    # Total expected: 256 + 1024 + 1 = 1281 tokens
-    # But we got 16384 tokens - this suggests the output includes prompt tokens
+    # Remove EOS token (16385) from the end if present
+    eos_token_id = 16385
+    if len(token_ids) > 0 and token_ids[-1] == eos_token_id:
+        token_tensor = token_tensor[:-1]
+        logger.info(f"Removed EOS token, remaining: {len(token_tensor)} tokens")
 
-    # For GLM-Image, the expected structure is that the model generates ALL new tokens
-    # including both small preview and large image tokens
-    # Since we got 16384 tokens, and 1024*16 = 16384, this might be at 2x downsampling
-    # Let's try different interpretations
+    actual_tokens = len(token_tensor)
 
-    # Possibility 1: tokens are at 2x scale (64x64 = 4096 for large, 32x32 = 1024 for small)
-    # Possibility 2: the output is padded or has a different format
-    # Possibility 3: tokens include repeated EOS or padding
-
-    total_expected_t2i = small_image_tokens + large_image_tokens + 1  # +1 for EOS
-    total_expected_i2i = large_image_tokens + 1
-
-    # Try to detect the end of meaningful tokens by looking for EOS patterns
-    # EOS token is typically a high value or repeated value at the end
-
-    if len(token_ids) >= total_expected_t2i:
+    if actual_tokens >= small_image_tokens + large_image_tokens:
         # Text-to-image: extract large image tokens after small image tokens
         large_start = small_image_tokens
         large_end = large_start + large_image_tokens
         prior_token_ids_d32 = token_tensor[large_start:large_end]
+        actual_h, actual_w = token_h, token_w
         logger.info(f"Text-to-image mode: extracting tokens [{large_start}:{large_end}]")
-    elif len(token_ids) >= total_expected_i2i:
+    elif actual_tokens >= large_image_tokens:
         # Image-to-image: large image tokens are at the beginning
         prior_token_ids_d32 = token_tensor[:large_image_tokens]
+        actual_h, actual_w = token_h, token_w
         logger.info(f"Image-to-image mode: extracting tokens [0:{large_image_tokens}]")
     else:
-        # Fallback: use whatever tokens we have
-        logger.warning(
-            f"Unexpected token count: {len(token_ids)}, expected at least {total_expected_i2i}. Using available tokens."
-        )
-        prior_token_ids_d32 = token_tensor[:large_image_tokens]
+        # Insufficient tokens - try to infer the actual grid size
+        # The model might have generated for a different resolution
+        import math
+
+        # Try to find a square grid that fits the available tokens
+        # First check if it matches any of the small+large patterns
+        for scale in [1, 2, 4]:
+            test_h = token_h // scale
+            test_w = token_w // scale
+            test_small_h = test_h // 2
+            test_small_w = test_w // 2
+            test_large = test_h * test_w
+            test_small = test_small_h * test_small_w
+
+            if actual_tokens >= test_small + test_large:
+                # Found matching grid for t2i
+                prior_token_ids_d32 = token_tensor[test_small : test_small + test_large]
+                actual_h, actual_w = test_h, test_w
+                # Adjust output dimensions
+                height = test_h * factor
+                width = test_w * factor
+                logger.warning(f"Adjusted grid to {test_h}x{test_w} (scale={scale}), output will be {height}x{width}")
+                break
+            elif actual_tokens >= test_large:
+                # Found matching grid for i2i
+                prior_token_ids_d32 = token_tensor[:test_large]
+                actual_h, actual_w = test_h, test_w
+                height = test_h * factor
+                width = test_w * factor
+                logger.warning(f"Adjusted grid to {test_h}x{test_w} (scale={scale}), output will be {height}x{width}")
+                break
+        else:
+            # Last resort: find closest square grid
+            sqrt_tokens = int(math.sqrt(actual_tokens))
+            actual_h = actual_w = sqrt_tokens
+            usable_tokens = sqrt_tokens * sqrt_tokens
+            prior_token_ids_d32 = token_tensor[:usable_tokens]
+            height = sqrt_tokens * factor
+            width = sqrt_tokens * factor
+            logger.error(
+                f"Could not match grid pattern. Using {sqrt_tokens}x{sqrt_tokens} grid "
+                f"({usable_tokens} tokens), output will be {height}x{width}. "
+                f"This likely indicates a prompt format issue."
+            )
 
     # Log token value statistics for debugging
     logger.info(
-        f"prior_token_ids_d32: min={prior_token_ids_d32.min().item()}, "
+        f"prior_token_ids_d32: shape={prior_token_ids_d32.shape}, "
+        f"min={prior_token_ids_d32.min().item()}, "
         f"max={prior_token_ids_d32.max().item()}, "
         f"unique_count={prior_token_ids_d32.unique().numel()}"
     )
 
     # Upsample from 32x to 16x
-    prior_token_ids = _upsample_token_ids(prior_token_ids_d32, token_h, token_w)
+    prior_token_ids = _upsample_token_ids(prior_token_ids_d32, actual_h, actual_w)
 
     return prior_token_ids, height, width
 

From 69197a453777d9957088da0a2eb0110f841f68a3 Mon Sep 17 00:00:00 2001
From: JaredforReal <w13431838023@gmail.com>
Date: Fri, 16 Jan 2026 14:20:56 +0800
Subject: [PATCH 37/53] use processor.tokenizer

Signed-off-by: JaredforReal <w13431838023@gmail.com>
---
 .../models/glm_image/glm_image_ar.py               | 14 +++++++++-----
 .../model_executor/stage_configs/glm_image.yaml    |  2 +-
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/vllm_omni/model_executor/models/glm_image/glm_image_ar.py b/vllm_omni/model_executor/models/glm_image/glm_image_ar.py
index e319359eb93..c4c7204aea6 100644
--- a/vllm_omni/model_executor/models/glm_image/glm_image_ar.py
+++ b/vllm_omni/model_executor/models/glm_image/glm_image_ar.py
@@ -362,12 +362,16 @@ def _call_hf_processor(
                     input_ids = hf_inputs["input_ids"]
                     if hasattr(input_ids, "shape"):
                         logger.info(f"[GLM-Image T2I] input_ids shape: {input_ids.shape}")
-                    tokenizer = self.info.get_tokenizer()
-                    if tokenizer is not None and hasattr(input_ids, "__len__"):
-                        # Decode to check the format
+                    # Use processor's tokenizer (not ByT5Tokenizer from tokenizer/ dir)
+                    # GlmImageProcessor has its own tokenizer with a different vocabulary
+                    if hasattr(processor, "tokenizer") and processor.tokenizer is not None:
                         ids_list = input_ids[0].tolist() if hasattr(input_ids[0], "tolist") else list(input_ids[0])
-                        decoded = tokenizer.decode(ids_list)
-                        logger.info(f"[GLM-Image T2I] decoded input: {decoded}")
+                        try:
+                            decoded = processor.tokenizer.decode(ids_list)
+                            logger.info(f"[GLM-Image T2I] decoded input: {decoded}")
+                        except Exception as e:
+                            logger.warning(f"[GLM-Image T2I] could not decode: {e}")
+                            logger.info(f"[GLM-Image T2I] first 50 token ids: {ids_list[:50]}")
 
                 if "image_grid_thw" in hf_inputs:
                     logger.info(f"[GLM-Image T2I] image_grid_thw: {hf_inputs['image_grid_thw']}")
diff --git a/vllm_omni/model_executor/stage_configs/glm_image.yaml b/vllm_omni/model_executor/stage_configs/glm_image.yaml
index c8cddae9e7c..d26fa2c73ee 100644
--- a/vllm_omni/model_executor/stage_configs/glm_image.yaml
+++ b/vllm_omni/model_executor/stage_configs/glm_image.yaml
@@ -16,7 +16,7 @@ stage_args:
       model_stage: ar
       model_arch: GlmImageForConditionalGeneration
       model_subdir: vision_language_encoder # AR model config.json is in this subdirectory
-      tokenizer_subdir: tokenizer # Tokenizer files are in tokenizer/ subdirectory
+      tokenizer_subdir: processor # Use processor's tokenizer (not ByT5 from tokenizer/)
       worker_cls: vllm_omni.worker.gpu_ar_worker.GPUARWorker
       scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
       gpu_memory_utilization: 0.6

From c22875b56d5ef5bf07dd73701d5ca6a14a59e3ab Mon Sep 17 00:00:00 2001
From: JaredforReal <w13431838023@gmail.com>
Date: Fri, 16 Jan 2026 14:30:39 +0800
Subject: [PATCH 38/53] use temperature 0.9 and 0.75 top_p

Signed-off-by: JaredforReal <w13431838023@gmail.com>
---
 .../model_executor/models/glm_image/glm_image_ar.py      | 9 +++++++++
 vllm_omni/model_executor/stage_configs/glm_image.yaml    | 4 ++--
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/vllm_omni/model_executor/models/glm_image/glm_image_ar.py b/vllm_omni/model_executor/models/glm_image/glm_image_ar.py
index c4c7204aea6..94c0abc069c 100644
--- a/vllm_omni/model_executor/models/glm_image/glm_image_ar.py
+++ b/vllm_omni/model_executor/models/glm_image/glm_image_ar.py
@@ -1824,6 +1824,15 @@ def forward(
         Returns:
             Hidden states or intermediate tensors
         """
+        # Debug logging (first call only)
+        if not hasattr(self, "_logged_forward"):
+            self._logged_forward = True
+            logger.info(f"[GLM-Image Forward] input_ids shape: {input_ids.shape if input_ids is not None else None}")
+            logger.info(f"[GLM-Image Forward] positions shape: {positions.shape if positions is not None else None}")
+            logger.info(f"[GLM-Image Forward] pixel_values: {pixel_values is not None}")
+            logger.info(f"[GLM-Image Forward] image_grid_thw: {image_grid_thw}")
+            logger.info(f"[GLM-Image Forward] kwargs keys: {list(kwargs.keys())}")
+
         if intermediate_tensors is not None:
             inputs_embeds = None
 
diff --git a/vllm_omni/model_executor/stage_configs/glm_image.yaml b/vllm_omni/model_executor/stage_configs/glm_image.yaml
index d26fa2c73ee..edf7f6219b7 100644
--- a/vllm_omni/model_executor/stage_configs/glm_image.yaml
+++ b/vllm_omni/model_executor/stage_configs/glm_image.yaml
@@ -29,8 +29,8 @@ stage_args:
     final_output: false # AR is not the final output
     is_comprehension: true
     default_sampling_params:
-      temperature: 1.0 # Must use sampling (not greedy) for image token generation
-      top_p: 1.0
+      temperature: 0.9 # From model's generation_config.json
+      top_p: 0.75 # From model's generation_config.json
       top_k: -1
       max_tokens: 16384 # Support up to 2048x2048 images (64x64 tokens * 4 = 16384)
       seed: 42

From e37bfcb1d3adf9b40836bed2bbfde68c10c7601d Mon Sep 17 00:00:00 2001
From: JaredforReal <w13431838023@gmail.com>
Date: Fri, 16 Jan 2026 14:45:49 +0800
Subject: [PATCH 39/53] align image_grid_thw with transformers

Signed-off-by: JaredforReal <w13431838023@gmail.com>
---
 .../models/glm_image/glm_image_ar.py          | 87 ++++++++++++++++---
 1 file changed, 74 insertions(+), 13 deletions(-)

diff --git a/vllm_omni/model_executor/models/glm_image/glm_image_ar.py b/vllm_omni/model_executor/models/glm_image/glm_image_ar.py
index 94c0abc069c..5ed879016f9 100644
--- a/vllm_omni/model_executor/models/glm_image/glm_image_ar.py
+++ b/vllm_omni/model_executor/models/glm_image/glm_image_ar.py
@@ -1715,7 +1715,9 @@ def get_image_tokens(
     def get_mrope_input_positions(
         self,
         input_tokens: list[int],
-        mm_features: list[MultiModalFeatureSpec],
+        mm_features: list[MultiModalFeatureSpec] | None = None,
+        image_grid_thw: list[list[int]] | None = None,
+        **kwargs,
     ) -> tuple[torch.Tensor, int]:
         """
         Compute M-RoPE position IDs for GLM-Image generation.
@@ -1727,19 +1729,30 @@ def get_mrope_input_positions(
           - height: row position in image grid
           - width: column position in image grid
 
+        For text-to-image generation, we also pre-compute positions for the tokens
+        that will be generated (small image + large image + EOS), similar to how
+        transformers GLM-Image caches decode positions.
+
         Args:
             input_tokens: List of input token IDs
-            mm_features: Multimodal feature specifications
+            mm_features: Multimodal feature specifications (optional)
+            image_grid_thw: Pre-extracted image grid dimensions (optional)
+            **kwargs: Additional arguments (hf_config, video_grid_thw, etc.)
 
         Returns:
-            Tuple of (position_ids [3, seq_len], mrope_position_delta)
+            Tuple of (position_ids [3, seq_len + decode_len], mrope_position_delta)
         """
-        # Gather image grid info from multimodal features
-        kwargs = MultiModalFeatureSpec.gather_kwargs(
-            mm_features,
-            {"image_grid_thw"},
-        )
-        image_grid_thw = [item.tolist() for item in kwargs.get("image_grid_thw", [])]
+        # Get image_grid_thw from either the direct arg or mm_features
+        if image_grid_thw is None and mm_features is not None:
+            # Gather image grid info from multimodal features
+            feature_kwargs = MultiModalFeatureSpec.gather_kwargs(
+                mm_features,
+                {"image_grid_thw"},
+            )
+            image_grid_thw = [item.tolist() for item in feature_kwargs.get("image_grid_thw", [])]
+
+        if image_grid_thw is None:
+            image_grid_thw = []
 
         hf_config = self.config
         image_start_token_id = hf_config.image_start_token_id
@@ -1748,6 +1761,9 @@ def get_mrope_input_positions(
         seq_len = len(input_tokens)
         llm_pos_ids_list: list[torch.Tensor] = []
 
+        # Count completed images (have end marker) vs images to generate
+        num_complete_images = sum(1 for t in input_tokens if t == image_end_token_id)
+
         if image_grid_thw:
             # Build position IDs considering image regions
             current_pos = 0
@@ -1757,8 +1773,8 @@ def get_mrope_input_positions(
             while i < seq_len:
                 token = input_tokens[i]
 
-                if token == image_start_token_id and image_idx < len(image_grid_thw):
-                    # Start of image region
+                if token == image_start_token_id and image_idx < num_complete_images:
+                    # This is a completed image (source image for i2i)
                     # Add position for the start marker
                     llm_pos_ids_list.append(torch.tensor([[current_pos], [current_pos], [current_pos]]))
                     current_pos += 1
@@ -1787,12 +1803,57 @@ def get_mrope_input_positions(
                     i += 1
 
                 else:
-                    # Regular text token
+                    # Regular text token (or trailing start marker for generation)
                     llm_pos_ids_list.append(torch.tensor([[current_pos], [current_pos], [current_pos]]))
                     current_pos += 1
                     i += 1
 
-            llm_positions = torch.cat(llm_pos_ids_list, dim=1)
+            prefill_positions = torch.cat(llm_pos_ids_list, dim=1)
+
+            # Pre-compute decode positions for images that will be generated
+            # This is critical for text-to-image where we need to generate image tokens
+            num_decode_grids = len(image_grid_thw) - num_complete_images
+
+            if num_decode_grids > 0:
+                decode_pos_lists: list[torch.Tensor] = []
+                decode_pos = current_pos
+
+                # Process grids in reverse order (last grid first for GLM-Image t2i)
+                # For t2i with grids [[1,32,32], [1,16,16]]:
+                # - First generate small image (16x16 = 256 tokens)
+                # - Then generate large image (32x32 = 1024 tokens)
+                # - Finally generate EOS
+                for i in range(1, num_decode_grids + 1):
+                    grid_idx = -i
+                    _, h, w = image_grid_thw[grid_idx]
+                    total_tokens = h * w
+
+                    # Build 2D positions for this generated image
+                    h_indices = torch.arange(h).unsqueeze(1).expand(h, w).flatten()
+                    w_indices = torch.arange(w).unsqueeze(0).expand(h, w).flatten()
+
+                    decode_t = torch.full((total_tokens,), decode_pos, dtype=torch.long)
+                    decode_h = decode_pos + h_indices
+                    decode_w = decode_pos + w_indices
+
+                    decode_pos_lists.append(torch.stack([decode_t, decode_h, decode_w], dim=0))
+                    decode_pos = decode_pos + max(h, w)
+
+                # Add position for EOS token
+                decode_pos_lists.append(torch.tensor([[decode_pos], [decode_pos], [decode_pos]]))
+
+                decode_positions = torch.cat(decode_pos_lists, dim=1)
+
+                # Concatenate prefill and decode positions
+                llm_positions = torch.cat([prefill_positions, decode_positions], dim=1)
+
+                # Log for debugging
+                logger.info(
+                    f"[GLM-Image M-RoPE] prefill_len={prefill_positions.shape[1]}, "
+                    f"decode_len={decode_positions.shape[1]}, total_len={llm_positions.shape[1]}"
+                )
+            else:
+                llm_positions = prefill_positions
         else:
             # Pure text - all dimensions same
             llm_positions = torch.arange(seq_len).view(1, -1).expand(3, -1)

From ac8a81b4db5d46780d45de4df7dd149767bdfa20 Mon Sep 17 00:00:00 2001
From: JaredforReal <w13431838023@gmail.com>
Date: Fri, 16 Jan 2026 15:20:23 +0800
Subject: [PATCH 40/53] fix params

Signed-off-by: JaredforReal <w13431838023@gmail.com>
---
 .../offline_inference/glm_image/end2end.py    | 63 ++++++++++++++++---
 .../stage_configs/glm_image.yaml              |  5 +-
 .../glm_image_muilticonnector.yaml            |  9 +--
 3 files changed, 64 insertions(+), 13 deletions(-)

diff --git a/examples/offline_inference/glm_image/end2end.py b/examples/offline_inference/glm_image/end2end.py
index 1a4a0273829..65c3653a3d2 100644
--- a/examples/offline_inference/glm_image/end2end.py
+++ b/examples/offline_inference/glm_image/end2end.py
@@ -51,6 +51,41 @@
 
 SEED = 42
 
+# GLM-Image special tokens
+GLM_IMAGE_EOS_TOKEN_ID = 16385  # eos_token_id from generation_config.json
+GLM_IMAGE_VISION_VOCAB_SIZE = 16512  # top_k should be vision_vocab_size
+
+
+def compute_max_tokens(height: int, width: int, factor: int = 32) -> int:
+    """
+    Compute max_new_tokens for GLM-Image AR generation.
+
+    GLM-Image generates tokens in this order for text-to-image:
+    1. Small preview image (half resolution in each dimension)
+    2. Large target image (full resolution)
+    3. EOS token
+
+    Args:
+        height: Target image height in pixels
+        width: Target image width in pixels
+        factor: Downsampling factor (32 for GLM-Image AR output)
+
+    Returns:
+        Total number of tokens to generate (small + large + EOS)
+    """
+    # Large image tokens (target resolution)
+    token_h = height // factor
+    token_w = width // factor
+    large_tokens = token_h * token_w
+
+    # Small preview tokens (half resolution in each dimension)
+    small_h = token_h // 2
+    small_w = token_w // 2
+    small_tokens = small_h * small_w
+
+    # Total: small + large + EOS
+    return small_tokens + large_tokens + 1
+
 
 def load_image(image_path: str) -> Image.Image:
     """Load an image from file path."""
@@ -226,14 +261,28 @@ def main(args: argparse.Namespace) -> None:
     # For multistage, the AR stage may need sampling params
     from vllm import SamplingParams
 
-    # IMPORTANT: GLM-Image AR model requires sampling (not greedy) for proper
-    # image token generation. Using temperature=0.0 causes degenerate repetitive
-    # tokens and black images. Must use temperature > 0 (default: 1.0).
+    # Compute max_tokens dynamically based on target image size
+    target_height = prompt_dict.get("height", 1024)
+    target_width = prompt_dict.get("width", 1024)
+    calculated_max_tokens = compute_max_tokens(target_height, target_width)
+
+    # Use calculated value unless user explicitly specified a different value
+    # Default args.max_tokens is 16384 (very large), so prefer calculated value
+    effective_max_tokens = calculated_max_tokens if args.max_tokens == 16384 else args.max_tokens
+
+    if args.verbose:
+        print(f"AR max_tokens: {effective_max_tokens} (calculated: {calculated_max_tokens}, arg: {args.max_tokens})")
+
+    # IMPORTANT: GLM-Image AR model requires these exact sampling parameters
+    # from generation_config.json for proper image token generation.
+    # - temperature=0.9, top_p=0.75, top_k=16512 (vision_vocab_size)
+    # - stop_token_ids=[16385] (eos_token_id) is CRITICAL to stop generation
     ar_sampling_params = SamplingParams(
-        temperature=1.0,  # Must use sampling for image token diversity
-        top_p=1.0,
-        top_k=-1,
-        max_tokens=args.max_tokens,
+        temperature=0.9,  # From generation_config.json
+        top_p=0.75,  # From generation_config.json
+        top_k=GLM_IMAGE_VISION_VOCAB_SIZE,  # 16512, vision vocabulary size
+        max_tokens=effective_max_tokens,
+        stop_token_ids=[GLM_IMAGE_EOS_TOKEN_ID],  # 16385, CRITICAL for stopping
         seed=args.seed,
         detokenize=False,
     )
diff --git a/vllm_omni/model_executor/stage_configs/glm_image.yaml b/vllm_omni/model_executor/stage_configs/glm_image.yaml
index edf7f6219b7..7deca12c9ba 100644
--- a/vllm_omni/model_executor/stage_configs/glm_image.yaml
+++ b/vllm_omni/model_executor/stage_configs/glm_image.yaml
@@ -31,8 +31,9 @@ stage_args:
     default_sampling_params:
       temperature: 0.9 # From model's generation_config.json
       top_p: 0.75 # From model's generation_config.json
-      top_k: -1
-      max_tokens: 16384 # Support up to 2048x2048 images (64x64 tokens * 4 = 16384)
+      top_k: 16512 # vision_vocab_size from generation_config.json
+      max_tokens: 1281 # For 1024x1024: small(16x16=256) + large(32x32=1024) + EOS(1)
+      stop_token_ids: [16385] # eos_token_id from generation_config.json
       seed: 42
       detokenize: false
 
diff --git a/vllm_omni/model_executor/stage_configs/glm_image_muilticonnector.yaml b/vllm_omni/model_executor/stage_configs/glm_image_muilticonnector.yaml
index c32b1cd3d07..d1e10cb4065 100644
--- a/vllm_omni/model_executor/stage_configs/glm_image_muilticonnector.yaml
+++ b/vllm_omni/model_executor/stage_configs/glm_image_muilticonnector.yaml
@@ -31,10 +31,11 @@ stage_args:
     final_output: false # AR is not the final output
     is_comprehension: true
     default_sampling_params:
-      temperature: 0.0
-      top_p: 1.0
-      top_k: -1
-      max_tokens: 16384 # Support up to 2048x2048 images
+      temperature: 0.9 # From model's generation_config.json
+      top_p: 0.75 # From model's generation_config.json
+      top_k: 16512 # vision_vocab_size from generation_config.json
+      max_tokens: 1281 # For 1024x1024: small(16x16=256) + large(32x32=1024) + EOS(1)
+      stop_token_ids: [16385] # eos_token_id from generation_config.json
       seed: 42
       detokenize: false
 

From a6ae872728cfd1e74065e9e3220d1b2599670295 Mon Sep 17 00:00:00 2001
From: JaredforReal <w13431838023@gmail.com>
Date: Fri, 16 Jan 2026 15:50:24 +0800
Subject: [PATCH 41/53] fix mrope calc

Signed-off-by: JaredforReal <w13431838023@gmail.com>
---
 vllm_omni/worker/gpu_model_runner.py | 143 ++++++++++++++++++---------
 1 file changed, 97 insertions(+), 46 deletions(-)

diff --git a/vllm_omni/worker/gpu_model_runner.py b/vllm_omni/worker/gpu_model_runner.py
index 24d4ffd028e..b59298b9a25 100644
--- a/vllm_omni/worker/gpu_model_runner.py
+++ b/vllm_omni/worker/gpu_model_runner.py
@@ -8,7 +8,7 @@
 from vllm.forward_context import set_forward_context
 from vllm.logger import init_logger
 from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding
-from vllm.model_executor.models.interfaces import supports_mrope, supports_mm_encoder_only
+from vllm.model_executor.models.interfaces import supports_mm_encoder_only, supports_mrope
 from vllm.model_executor.models.interfaces_base import VllmModelForPooling
 from vllm.sampling_params import SamplingType
 from vllm.utils.import_utils import LazyLoader
@@ -17,6 +17,7 @@
 from vllm.v1.worker.gpu_input_batch import CachedRequestState
 from vllm.v1.worker.gpu_model_runner import GPUModelRunner, IntermediateTensors, PerLayerAttnMetadata
 from vllm.v1.worker.ubatch_utils import maybe_create_ubatch_slices
+
 from vllm_omni.model_executor.models.output_templates import OmniOutput
 
 if TYPE_CHECKING:
@@ -116,6 +117,79 @@ def _init_mrope_positions(self, req_state: CachedRequestState):
                 use_audio_in_video=use_audio_in_video,
             )
 
+    def _calc_mrope_positions(self, scheduler_output: "SchedulerOutput"):
+        """Calculate M-RoPE positions for scheduled tokens.
+
+        This method overrides the base vLLM implementation to support models
+        like GLM-Image that pre-compute decode positions with 2D spatial encoding.
+
+        For GLM-Image text-to-image generation:
+        - Prefill positions: Use pre-computed positions from get_mrope_input_positions
+        - Decode positions: Also use pre-computed 2D spatial positions instead of
+          the default linear positions from get_next_input_positions_tensor
+
+        The key difference from vLLM's default behavior:
+        - Default vLLM: decode positions use linear [N, N+1, N+2, ...] for all 3 dims
+        - GLM-Image needs: temporal=constant, height/width=2D grid pattern
+        """
+        from vllm.multimodal.utils import length_from_prompt_token_ids_or_embeds
+
+        mrope_pos_ptr = 0
+        for index, req_id in enumerate(self.input_batch.req_ids):
+            req = self.requests[req_id]
+            assert req.mrope_positions is not None
+
+            num_computed_tokens = self.input_batch.num_computed_tokens_cpu[index]
+            num_scheduled_tokens = scheduler_output.num_scheduled_tokens[req_id]
+            num_prompt_tokens = length_from_prompt_token_ids_or_embeds(req.prompt_token_ids, req.prompt_embeds)
+
+            if num_computed_tokens + num_scheduled_tokens > num_prompt_tokens:
+                prompt_part_len = max(0, num_prompt_tokens - num_computed_tokens)
+                completion_part_len = max(0, num_scheduled_tokens - prompt_part_len)
+            else:
+                prompt_part_len = num_scheduled_tokens
+                completion_part_len = 0
+
+            assert num_scheduled_tokens == prompt_part_len + completion_part_len
+
+            if prompt_part_len > 0:
+                # prompt's mrope_positions are pre-computed
+                dst_start = mrope_pos_ptr
+                dst_end = mrope_pos_ptr + prompt_part_len
+                src_start = num_computed_tokens
+                src_end = num_computed_tokens + prompt_part_len
+
+                self.mrope_positions.np[:, dst_start:dst_end] = req.mrope_positions[:, src_start:src_end]
+                mrope_pos_ptr += prompt_part_len
+
+            if completion_part_len > 0:
+                dst_start = mrope_pos_ptr
+
+                # Check if pre-computed decode positions are available
+                # GLM-Image's get_mrope_input_positions returns positions for
+                # both prefill and decode phases with proper 2D spatial encoding
+                total_precomputed = req.mrope_positions.shape[1]
+                decode_start = num_computed_tokens + prompt_part_len
+                decode_end = decode_start + completion_part_len
+
+                if decode_end <= total_precomputed:
+                    # Use pre-computed decode positions (for GLM-Image 2D spatial)
+                    self.mrope_positions.np[:, dst_start : dst_start + completion_part_len] = req.mrope_positions[
+                        :, decode_start:decode_end
+                    ]
+                else:
+                    # Fallback to default linear positions for text-only generation
+                    assert req.mrope_position_delta is not None
+                    MRotaryEmbedding.get_next_input_positions_tensor(
+                        out=self.mrope_positions.np,
+                        out_offset=dst_start,
+                        mrope_position_delta=req.mrope_position_delta,
+                        context_len=num_computed_tokens + prompt_part_len,
+                        num_new_tokens=completion_part_len,
+                    )
+
+                mrope_pos_ptr += completion_part_len
+
     def _update_states(self, scheduler_output: "SchedulerOutput") -> None:
         """Update the cached states and the persistent batch with the scheduler
         output.
@@ -248,7 +322,7 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None:
             except Exception as e:
                 logger.error(f"Error decoding additional information: {e}")
                 pass
-            
+
             if sampling_params and sampling_params.prompt_logprobs is not None:
                 self.num_prompt_logprobs[req_id] = (
                     self.input_batch.vocab_size
@@ -258,11 +332,11 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None:
             # Only relevant for models using M-RoPE (e.g, Qwen2-VL)
             if self.uses_mrope:
                 self._init_mrope_positions(req_state)
-                
+
             # Only relevant for models using XD-RoPE (e.g, HunYuan-VL)
             if self.uses_xdrope_dim > 0:
                 self._init_xdrope_positions(req_state)
-                
+
             reqs_to_add.append(self.requests[req_id])
 
         # Update the states of the running/resumed requests.
@@ -281,14 +355,14 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None:
             resumed_from_preemption = req_id in req_data.resumed_req_ids
             num_output_tokens = req_data.num_output_tokens[i]
             req_index = self.input_batch.req_id_to_index.get(req_id)
-            
+
             if req_state.prev_num_draft_len and self.use_async_scheduling:
                 # prev_num_draft_len is used in async scheduling mode with
                 # spec decode. it indicates if need to update num_computed_tokens
                 # of the request. for example:
                 # fist step: num_computed_tokens = 0, spec_tokens = [],
                 # prev_num_draft_len = 0.
-                # second step: num_computed_tokens = 100(prompt lenth),
+                # second step: num_computed_tokens = 100(prompt length),
                 # spec_tokens = [a,b], prev_num_draft_len = 0.
                 # third step: num_computed_tokens = 100 + 2, spec_tokens = [c,d],
                 # prev_num_draft_len = 2.
@@ -305,7 +379,7 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None:
                     num_rejected = req_state.prev_num_draft_len - num_accepted
                     num_computed_tokens -= num_rejected
                     req_state.output_token_ids.extend([-1] * num_accepted)
-                    
+
             # Update the cached states.
             req_state.num_computed_tokens = num_computed_tokens
 
@@ -327,12 +401,9 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None:
                 # failure. Align the cached state.
                 del req_state.output_token_ids[num_output_tokens:]
                 if req_index is not None:
-                    end_idx = (
-                        self.input_batch.num_prompt_tokens[req_index]
-                        + num_output_tokens
-                    )
+                    end_idx = self.input_batch.num_prompt_tokens[req_index] + num_output_tokens
                     self.input_batch.num_tokens_no_spec[req_index] = end_idx
-                    
+
             # Update the block IDs.
             if not resumed_from_preemption:
                 if new_block_ids is not None:
@@ -372,15 +443,12 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None:
                 # Add new_token_ids to token_ids_cpu.
                 start_token_index = num_computed_tokens
                 end_token_index = num_computed_tokens + len(new_token_ids)
-                self.input_batch.token_ids_cpu[
-                    req_index, start_token_index:end_token_index
-                ] = new_token_ids
+                self.input_batch.token_ids_cpu[req_index, start_token_index:end_token_index] = new_token_ids
                 self.input_batch.num_tokens_no_spec[req_index] = end_token_index
 
             # Add spec_token_ids to token_ids_cpu.
             self.input_batch.update_req_spec_token_ids(req_state, scheduled_spec_tokens)
 
-
         # Add the new or resumed requests to the persistent batch.
         # The smaller empty indices are filled first.
         for request in reqs_to_add:
@@ -457,10 +525,7 @@ def _dummy_run(
             # mm encoder dummy run may need to add in the future.
             return torch.tensor([]), torch.tensor([])
 
-        assert (
-            cudagraph_runtime_mode is None
-            or cudagraph_runtime_mode.valid_runtime_modes()
-        )
+        assert cudagraph_runtime_mode is None or cudagraph_runtime_mode.valid_runtime_modes()
 
         # If cudagraph_mode.decode_mode() == FULL and
         # cudagraph_mode.separate_routine(). This means that we are using
@@ -521,8 +586,7 @@ def _dummy_run(
                 max_num_scheduled_tokens=max_query_len,
                 use_cascade_attn=False,
                 allow_microbatching=allow_microbatching,
-                force_eager=is_profile
-                or (cudagraph_runtime_mode == CUDAGraphMode.NONE),
+                force_eager=is_profile or (cudagraph_runtime_mode == CUDAGraphMode.NONE),
                 # `force_uniform_decode` is used for cudagraph capture; because for
                 # capturing mixed prefill-decode batches, we sometimes use
                 # num_tokens == num_reqs which looks like a uniform decode batch to the
@@ -544,9 +608,7 @@ def _dummy_run(
             )
 
         num_tokens_padded = batch_desc.num_tokens
-        num_reqs_padded = (
-            batch_desc.num_reqs if batch_desc.num_reqs is not None else num_reqs
-        )
+        num_reqs_padded = batch_desc.num_reqs if batch_desc.num_reqs is not None else num_reqs
         ubatch_slices, ubatch_slices_padded = maybe_create_ubatch_slices(
             should_ubatch,
             num_scheduled_tokens,
@@ -625,17 +687,13 @@ def _dummy_run(
                 intermediate_tensors = None
             else:
                 if self.intermediate_tensors is None:
-                    self.intermediate_tensors = (
-                        self.model.make_empty_intermediate_tensors(
-                            batch_size=self.max_num_tokens,
-                            dtype=self.model_config.dtype,
-                            device=self.device,
-                        )
+                    self.intermediate_tensors = self.model.make_empty_intermediate_tensors(
+                        batch_size=self.max_num_tokens,
+                        dtype=self.model_config.dtype,
+                        device=self.device,
                     )
 
-                intermediate_tensors = self.sync_and_slice_intermediate_tensors(
-                    num_tokens_padded, None, False
-                )
+                intermediate_tensors = self.sync_and_slice_intermediate_tensors(num_tokens_padded, None, False)
 
             if ubatch_slices_padded is not None:
                 # Adjust values to reflect a single ubatch.
@@ -676,14 +734,8 @@ def _dummy_run(
                 # Therefore only use cudagraphs if the main model uses PIECEWISE
                 # NOTE(lucas): this is a hack, need to clean up.
                 use_cudagraphs = (
-                    (
-                        is_graph_capturing
-                        and cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE
-                    )
-                    or (
-                        not is_graph_capturing
-                        and cudagraph_runtime_mode != CUDAGraphMode.NONE
-                    )
+                    (is_graph_capturing and cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE)
+                    or (not is_graph_capturing and cudagraph_runtime_mode != CUDAGraphMode.NONE)
                 ) and not self.speculative_config.enforce_eager
 
                 # Note(gnovack) - We need to disable cudagraphs for one of the two
@@ -721,9 +773,7 @@ def _dummy_run(
             self.eplb_step(is_dummy=True, is_profile=is_profile)
 
         logit_indices = np.cumsum(num_scheduled_tokens) - 1
-        logit_indices_device = torch.from_numpy(logit_indices).to(
-            self.device, non_blocking=True
-        )
+        logit_indices_device = torch.from_numpy(logit_indices).to(self.device, non_blocking=True)
         return hidden_states, hidden_states[logit_indices_device]
 
     def _decode_and_store_request_payloads(self, scheduler_output: "SchedulerOutput") -> None:
@@ -1005,9 +1055,10 @@ def _preprocess(
                 except Exception as e:
                     logger.error(f"Error in preprocess for request {req_id}: {e}")
                     import traceback
+
                     traceback.print_exc()
                     raise e
-                #TODO: This is Model Specific Code, need to be generalized in the future ZTC
+                # TODO: This is Model Specific Code, need to be generalized in the future ZTC
                 # run talker mtp decode
                 if hasattr(self.model, "talker_mtp"):
                     _cudagraph_mode, batch_desc, _, _, _ = self._determine_batch_execution_and_padding(

From 48881ed8276f4724046aa50d7d30a32ea22abcce Mon Sep 17 00:00:00 2001
From: JaredforReal <w13431838023@gmail.com>
Date: Fri, 16 Jan 2026 16:36:51 +0800
Subject: [PATCH 42/53] fix import

Signed-off-by: JaredforReal <w13431838023@gmail.com>
---
 vllm_omni/worker/gpu_model_runner.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm_omni/worker/gpu_model_runner.py b/vllm_omni/worker/gpu_model_runner.py
index b59298b9a25..93cb3771566 100644
--- a/vllm_omni/worker/gpu_model_runner.py
+++ b/vllm_omni/worker/gpu_model_runner.py
@@ -132,7 +132,7 @@ def _calc_mrope_positions(self, scheduler_output: "SchedulerOutput"):
         - Default vLLM: decode positions use linear [N, N+1, N+2, ...] for all 3 dims
         - GLM-Image needs: temporal=constant, height/width=2D grid pattern
         """
-        from vllm.multimodal.utils import length_from_prompt_token_ids_or_embeds
+        from vllm.utils import length_from_prompt_token_ids_or_embeds
 
         mrope_pos_ptr = 0
         for index, req_id in enumerate(self.input_batch.req_ids):

From 2dda88fd2ead135bfacc0691b88dad70957b8a4c Mon Sep 17 00:00:00 2001
From: JaredforReal <w13431838023@gmail.com>
Date: Fri, 16 Jan 2026 16:43:42 +0800
Subject: [PATCH 43/53] add debug logging

Signed-off-by: JaredforReal <w13431838023@gmail.com>
---
 vllm_omni/worker/gpu_model_runner.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/vllm_omni/worker/gpu_model_runner.py b/vllm_omni/worker/gpu_model_runner.py
index 93cb3771566..72f5051abf2 100644
--- a/vllm_omni/worker/gpu_model_runner.py
+++ b/vllm_omni/worker/gpu_model_runner.py
@@ -106,6 +106,12 @@ def _init_mrope_positions(self, req_state: CachedRequestState):
                 audio_feature_lengths=audio_feature_lengths,
                 use_audio_in_video=use_audio_in_video,
             )
+            logger.info(
+                f"[M-RoPE Init] prompt_len={len(req_state.prompt_token_ids)}, "
+                f"mrope_positions_shape={req_state.mrope_positions.shape}, "
+                f"mrope_position_delta={req_state.mrope_position_delta}, "
+                f"image_grid_thw={image_grid_thw}"
+            )
         else:
             req_state.mrope_positions, req_state.mrope_position_delta = MRotaryEmbedding.get_input_positions_tensor(
                 req_state.prompt_token_ids,
@@ -177,8 +183,18 @@ def _calc_mrope_positions(self, scheduler_output: "SchedulerOutput"):
                     self.mrope_positions.np[:, dst_start : dst_start + completion_part_len] = req.mrope_positions[
                         :, decode_start:decode_end
                     ]
+                    logger.debug(
+                        f"[M-RoPE] Using pre-computed decode positions: "
+                        f"decode_start={decode_start}, decode_end={decode_end}, "
+                        f"total_precomputed={total_precomputed}"
+                    )
                 else:
                     # Fallback to default linear positions for text-only generation
+                    logger.warning(
+                        f"[M-RoPE] Falling back to linear positions! "
+                        f"decode_end={decode_end} > total_precomputed={total_precomputed}, "
+                        f"num_prompt_tokens={num_prompt_tokens}, completion_part_len={completion_part_len}"
+                    )
                     assert req.mrope_position_delta is not None
                     MRotaryEmbedding.get_next_input_positions_tensor(
                         out=self.mrope_positions.np,

From e676c9392699cc8a80daa62fa0c724489cb73e2d Mon Sep 17 00:00:00 2001
From: JaredforReal <w13431838023@gmail.com>
Date: Fri, 16 Jan 2026 16:52:47 +0800
Subject: [PATCH 44/53] more logs

Signed-off-by: JaredforReal <w13431838023@gmail.com>
---
 .../models/glm_image/glm_image_ar.py          | 42 ++++++++++++++++++-
 1 file changed, 41 insertions(+), 1 deletion(-)

diff --git a/vllm_omni/model_executor/models/glm_image/glm_image_ar.py b/vllm_omni/model_executor/models/glm_image/glm_image_ar.py
index 5ed879016f9..75ea6fbd902 100644
--- a/vllm_omni/model_executor/models/glm_image/glm_image_ar.py
+++ b/vllm_omni/model_executor/models/glm_image/glm_image_ar.py
@@ -405,8 +405,17 @@ def _get_mm_fields_config(
         result = {}
 
         # image_grid_thw is needed for both t2i and i2i (for M-RoPE position encoding)
+        # For text-to-image, we don't have pixel_values but still need image_grid_thw
+        # Use "image" modality so it gets processed, or use flat for metadata-only fields
         if "image_grid_thw" in hf_inputs:
-            result["image_grid_thw"] = MultiModalFieldConfig.batched("image")
+            # Check if we have pixel_values (image-to-image) or not (text-to-image)
+            if "pixel_values" in hf_inputs:
+                # Image-to-image: batch with image modality
+                result["image_grid_thw"] = MultiModalFieldConfig.batched("image")
+            else:
+                # Text-to-image: use flat config to ensure it's passed through
+                # This is metadata that doesn't depend on actual image data
+                result["image_grid_thw"] = MultiModalFieldConfig.flat("image", allow_missing=True)
 
         # pixel_values only present in image-to-image mode
         if "pixel_values" in hf_inputs:
@@ -1742,6 +1751,12 @@ def get_mrope_input_positions(
         Returns:
             Tuple of (position_ids [3, seq_len + decode_len], mrope_position_delta)
         """
+        logger.info(
+            f"[GLM-Image M-RoPE] get_mrope_input_positions called: "
+            f"input_tokens_len={len(input_tokens)}, mm_features={mm_features is not None}, "
+            f"image_grid_thw={image_grid_thw}, kwargs_keys={list(kwargs.keys())}"
+        )
+
         # Get image_grid_thw from either the direct arg or mm_features
         if image_grid_thw is None and mm_features is not None:
             # Gather image grid info from multimodal features
@@ -1758,6 +1773,31 @@ def get_mrope_input_positions(
         image_start_token_id = hf_config.image_start_token_id
         image_end_token_id = hf_config.image_end_token_id
 
+        # For text-to-image: parse grid info from input tokens if not provided
+        # Input format: "text<sop>H W<eop><sop>h w<eop><|dit_token_16384|>"
+        # where H W is large image grid (e.g., 32 32) and h w is small image grid (e.g., 16 16)
+        if not image_grid_thw:
+            # Try to parse from kwargs (passed from processor)
+            hf_config_arg = kwargs.get("hf_config")
+            if hf_config_arg is not None and hasattr(hf_config_arg, "image_grid_thw"):
+                image_grid_thw = hf_config_arg.image_grid_thw
+
+            # If still empty, try to infer from input tokens
+            if not image_grid_thw:
+                # Check if this is a text-to-image request by looking for dit_token
+                # dit_token_id = image_start_token_id = 16384
+                has_dit_token = image_start_token_id in input_tokens
+                has_end_token = image_end_token_id in input_tokens
+
+                # Text-to-image: has dit_token but no end_token (nothing generated yet)
+                if has_dit_token and not has_end_token:
+                    # Default grids for text-to-image: large (32x32) and small (16x16)
+                    # These are the standard GLM-Image generation grids
+                    # The actual grid sizes should be parsed from the prompt, but for now use defaults
+                    # TODO: Parse grid sizes from prompt tokens like "<sop>32 32<eop>"
+                    image_grid_thw = [[1, 32, 32], [1, 16, 16]]
+                    logger.info(f"[GLM-Image M-RoPE] Text-to-image detected, using default grids: {image_grid_thw}")
+
         seq_len = len(input_tokens)
         llm_pos_ids_list: list[torch.Tensor] = []
 

From fc540f720f1d4dd76b7f93a24a5b4b9be30f2e99 Mon Sep 17 00:00:00 2001
From: JaredforReal <w13431838023@gmail.com>
Date: Fri, 16 Jan 2026 16:56:47 +0800
Subject: [PATCH 45/53] fix config

Signed-off-by: JaredforReal <w13431838023@gmail.com>
---
 .../model_executor/models/glm_image/glm_image_ar.py  | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/vllm_omni/model_executor/models/glm_image/glm_image_ar.py b/vllm_omni/model_executor/models/glm_image/glm_image_ar.py
index 75ea6fbd902..4c33611a959 100644
--- a/vllm_omni/model_executor/models/glm_image/glm_image_ar.py
+++ b/vllm_omni/model_executor/models/glm_image/glm_image_ar.py
@@ -406,16 +406,10 @@ def _get_mm_fields_config(
 
         # image_grid_thw is needed for both t2i and i2i (for M-RoPE position encoding)
         # For text-to-image, we don't have pixel_values but still need image_grid_thw
-        # Use "image" modality so it gets processed, or use flat for metadata-only fields
+        # Always use batched("image") - the fallback in get_mrope_input_positions will handle
+        # the case where image_grid_thw is not passed through mm_features
         if "image_grid_thw" in hf_inputs:
-            # Check if we have pixel_values (image-to-image) or not (text-to-image)
-            if "pixel_values" in hf_inputs:
-                # Image-to-image: batch with image modality
-                result["image_grid_thw"] = MultiModalFieldConfig.batched("image")
-            else:
-                # Text-to-image: use flat config to ensure it's passed through
-                # This is metadata that doesn't depend on actual image data
-                result["image_grid_thw"] = MultiModalFieldConfig.flat("image", allow_missing=True)
+            result["image_grid_thw"] = MultiModalFieldConfig.batched("image")
 
         # pixel_values only present in image-to-image mode
         if "pixel_values" in hf_inputs:

From 585cecdce99d31e59e816649899dfb90fa8b745c Mon Sep 17 00:00:00 2001
From: JaredforReal <w13431838023@gmail.com>
Date: Fri, 16 Jan 2026 17:03:22 +0800
Subject: [PATCH 46/53] more logs

Signed-off-by: JaredforReal <w13431838023@gmail.com>
---
 .../models/glm_image/glm_image_ar.py               |  8 +++++---
 vllm_omni/worker/gpu_model_runner.py               | 14 ++++++++++----
 2 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/vllm_omni/model_executor/models/glm_image/glm_image_ar.py b/vllm_omni/model_executor/models/glm_image/glm_image_ar.py
index 4c33611a959..9df38a2d0b8 100644
--- a/vllm_omni/model_executor/models/glm_image/glm_image_ar.py
+++ b/vllm_omni/model_executor/models/glm_image/glm_image_ar.py
@@ -1745,7 +1745,7 @@ def get_mrope_input_positions(
         Returns:
             Tuple of (position_ids [3, seq_len + decode_len], mrope_position_delta)
         """
-        logger.info(
+        logger.warning(
             f"[GLM-Image M-RoPE] get_mrope_input_positions called: "
             f"input_tokens_len={len(input_tokens)}, mm_features={mm_features is not None}, "
             f"image_grid_thw={image_grid_thw}, kwargs_keys={list(kwargs.keys())}"
@@ -1790,7 +1790,7 @@ def get_mrope_input_positions(
                     # The actual grid sizes should be parsed from the prompt, but for now use defaults
                     # TODO: Parse grid sizes from prompt tokens like "<sop>32 32<eop>"
                     image_grid_thw = [[1, 32, 32], [1, 16, 16]]
-                    logger.info(f"[GLM-Image M-RoPE] Text-to-image detected, using default grids: {image_grid_thw}")
+                    logger.warning(f"[GLM-Image M-RoPE] Text-to-image detected, using default grids: {image_grid_thw}")
 
         seq_len = len(input_tokens)
         llm_pos_ids_list: list[torch.Tensor] = []
@@ -1882,15 +1882,17 @@ def get_mrope_input_positions(
                 llm_positions = torch.cat([prefill_positions, decode_positions], dim=1)
 
                 # Log for debugging
-                logger.info(
+                logger.warning(
                     f"[GLM-Image M-RoPE] prefill_len={prefill_positions.shape[1]}, "
                     f"decode_len={decode_positions.shape[1]}, total_len={llm_positions.shape[1]}"
                 )
             else:
                 llm_positions = prefill_positions
+                logger.warning(f"[GLM-Image M-RoPE] No decode grids, prefill_len={prefill_positions.shape[1]}")
         else:
             # Pure text - all dimensions same
             llm_positions = torch.arange(seq_len).view(1, -1).expand(3, -1)
+            logger.warning(f"[GLM-Image M-RoPE] Pure text mode, positions_len={seq_len}")
 
         mrope_position_delta = (llm_positions.max() + 1 - seq_len).item()
         return llm_positions, mrope_position_delta
diff --git a/vllm_omni/worker/gpu_model_runner.py b/vllm_omni/worker/gpu_model_runner.py
index 72f5051abf2..55d7a5d2e32 100644
--- a/vllm_omni/worker/gpu_model_runner.py
+++ b/vllm_omni/worker/gpu_model_runner.py
@@ -96,6 +96,12 @@ def _init_mrope_positions(self, req_state: CachedRequestState):
                 use_audio_in_video = bool(use_audio_in_video_value.item())
 
         if supports_mrope(self.model):
+            logger.warning(
+                f"[M-RoPE Init] Calling get_mrope_input_positions: "
+                f"prompt_len={len(req_state.prompt_token_ids)}, "
+                f"mm_features_count={len(req_state.mm_features) if req_state.mm_features else 0}, "
+                f"image_grid_thw={image_grid_thw}"
+            )
             req_state.mrope_positions, req_state.mrope_position_delta = self.model.get_mrope_input_positions(
                 req_state.prompt_token_ids,
                 mm_features=req_state.mm_features,
@@ -106,13 +112,13 @@ def _init_mrope_positions(self, req_state: CachedRequestState):
                 audio_feature_lengths=audio_feature_lengths,
                 use_audio_in_video=use_audio_in_video,
             )
-            logger.info(
-                f"[M-RoPE Init] prompt_len={len(req_state.prompt_token_ids)}, "
+            logger.warning(
+                f"[M-RoPE Init] Result: "
                 f"mrope_positions_shape={req_state.mrope_positions.shape}, "
-                f"mrope_position_delta={req_state.mrope_position_delta}, "
-                f"image_grid_thw={image_grid_thw}"
+                f"mrope_position_delta={req_state.mrope_position_delta}"
             )
         else:
+            logger.warning("[M-RoPE Init] Model does not support M-RoPE, using default")
             req_state.mrope_positions, req_state.mrope_position_delta = MRotaryEmbedding.get_input_positions_tensor(
                 req_state.prompt_token_ids,
                 hf_config=self.model_config.hf_config,

From 96208755d15328615f2312042e0ef6a240056e34 Mon Sep 17 00:00:00 2001
From: JaredforReal <w13431838023@gmail.com>
Date: Fri, 16 Jan 2026 17:19:06 +0800
Subject: [PATCH 47/53] correct text-to-image detection for M-RoPE position
 computation

Signed-off-by: JaredforReal <w13431838023@gmail.com>
---
 .../models/glm_image/glm_image_ar.py          | 134 +++++++++++++++---
 1 file changed, 116 insertions(+), 18 deletions(-)

diff --git a/vllm_omni/model_executor/models/glm_image/glm_image_ar.py b/vllm_omni/model_executor/models/glm_image/glm_image_ar.py
index 9df38a2d0b8..8e359a9eb2e 100644
--- a/vllm_omni/model_executor/models/glm_image/glm_image_ar.py
+++ b/vllm_omni/model_executor/models/glm_image/glm_image_ar.py
@@ -1715,6 +1715,95 @@ def get_image_tokens(
         """Tokenize image features with VQ-VAE."""
         return self.model.get_image_tokens(hidden_states, image_grid_thw)
 
+    def _parse_grid_from_tokens(
+        self,
+        input_tokens: list[int],
+        hf_config,
+    ) -> list[list[int]] | None:
+        """
+        Parse image grid dimensions from prompt tokens.
+
+        For text-to-image, the prompt format is:
+        "text<sop>H W<eop><sop>h w<eop><bos>"
+
+        Where:
+        - <sop> is grid_bos_token_id (start of phrase, marks grid dimension start)
+        - <eop> is grid_eos_token_id (end of phrase, marks grid dimension end)
+        - H W is large image grid (e.g., "32 32" for 1024x1024)
+        - h w is small image grid (e.g., "16 16" for preview)
+        - <bos> is image_start_token_id (16384, marks start of image generation)
+
+        Returns:
+            List of grids [[1, H, W], [1, h, w]] or None if parsing fails
+        """
+        try:
+            # Get special token IDs from config or tokenizer
+            # We need grid_bos_token_id and grid_eos_token_id
+            # These are typically <sop> and <eop> tokens
+
+            # First try to get from hf_config
+            grid_bos_id = getattr(hf_config, "grid_bos_token_id", None)
+            grid_eos_id = getattr(hf_config, "grid_eos_token_id", None)
+
+            # If not in config, we need to infer from token patterns
+            # For GLM-Image, looking at the processor code:
+            # - grid_bos_token = tokenizer.grid_bos_token
+            # - grid_eos_token = tokenizer.grid_eos_token
+            # These are typically single-token markers
+
+            if grid_bos_id is None or grid_eos_id is None:
+                # Try to find pattern in tokens: look for repeated pattern of
+                # [marker] [number] [number] [marker]
+                # where numbers are small positive integers (grid dimensions like 16, 32)
+
+                # Use heuristics: grid dimensions are typically between 8 and 128
+                # represented as single tokens that decode to numbers
+
+                # For now, return None and let caller use defaults
+                logger.warning(
+                    "[GLM-Image M-RoPE] Cannot find grid_bos_token_id/grid_eos_token_id, will use default grids"
+                )
+                return None
+
+            # Find all <sop>...<eop> regions
+            grids = []
+            i = 0
+            while i < len(input_tokens):
+                if input_tokens[i] == grid_bos_id:
+                    # Found start of grid region, find end
+                    j = i + 1
+                    while j < len(input_tokens) and input_tokens[j] != grid_eos_id:
+                        j += 1
+
+                    if j < len(input_tokens):
+                        # Extract tokens between <sop> and <eop>
+                        grid_tokens = input_tokens[i + 1 : j]
+
+                        # These should decode to "H W" format
+                        # For now, we assume they're numeric token IDs that represent the dimensions
+                        # This is a simplification - actual implementation would need tokenizer
+
+                        if len(grid_tokens) >= 2:
+                            # Assume first two tokens are H and W values
+                            # This is a heuristic - actual values depend on tokenizer
+                            # For GLM-Image with ChatGLM tokenizer, numbers are tokenized specially
+                            h = grid_tokens[0] if grid_tokens[0] < 256 else 32  # fallback
+                            w = grid_tokens[1] if grid_tokens[1] < 256 else 32  # fallback
+                            grids.append([1, h, w])
+
+                    i = j + 1
+                else:
+                    i += 1
+
+            if len(grids) >= 2:
+                return grids
+
+            return None
+
+        except Exception as e:
+            logger.warning(f"[GLM-Image M-RoPE] Error parsing grids from tokens: {e}")
+            return None
+
     def get_mrope_input_positions(
         self,
         input_tokens: list[int],
@@ -1745,10 +1834,17 @@ def get_mrope_input_positions(
         Returns:
             Tuple of (position_ids [3, seq_len + decode_len], mrope_position_delta)
         """
+        hf_config = self.config
+        image_start_token_id = hf_config.image_start_token_id
+        image_end_token_id = hf_config.image_end_token_id
+
         logger.warning(
             f"[GLM-Image M-RoPE] get_mrope_input_positions called: "
             f"input_tokens_len={len(input_tokens)}, mm_features={mm_features is not None}, "
-            f"image_grid_thw={image_grid_thw}, kwargs_keys={list(kwargs.keys())}"
+            f"image_grid_thw={image_grid_thw}, kwargs_keys={list(kwargs.keys())}, "
+            f"last_token={input_tokens[-1] if input_tokens else None}, "
+            f"image_start_token_id={image_start_token_id}, "
+            f"image_end_token_id={image_end_token_id}"
         )
 
         # Get image_grid_thw from either the direct arg or mm_features
@@ -1763,13 +1859,9 @@ def get_mrope_input_positions(
         if image_grid_thw is None:
             image_grid_thw = []
 
-        hf_config = self.config
-        image_start_token_id = hf_config.image_start_token_id
-        image_end_token_id = hf_config.image_end_token_id
-
         # For text-to-image: parse grid info from input tokens if not provided
-        # Input format: "text<sop>H W<eop><sop>h w<eop><|dit_token_16384|>"
-        # where H W is large image grid (e.g., 32 32) and h w is small image grid (e.g., 16 16)
+        # Input format: "text<sop>H W<eop><sop>h w<eop><bos>" where <bos>=image_start_token_id=16384
+        # For 1024x1024: H=32, W=32 (large), h=16, w=16 (small preview)
         if not image_grid_thw:
             # Try to parse from kwargs (passed from processor)
             hf_config_arg = kwargs.get("hf_config")
@@ -1778,19 +1870,25 @@ def get_mrope_input_positions(
 
             # If still empty, try to infer from input tokens
             if not image_grid_thw:
-                # Check if this is a text-to-image request by looking for dit_token
-                # dit_token_id = image_start_token_id = 16384
-                has_dit_token = image_start_token_id in input_tokens
+                # Check if this is a text-to-image request:
+                # - Prompt ends with image_start_token_id (16384, the <bos> token for image generation)
+                # - No image_end_token_id (16385) in prompt (no completed images)
+                prompt_ends_with_start = len(input_tokens) > 0 and input_tokens[-1] == image_start_token_id
                 has_end_token = image_end_token_id in input_tokens
 
-                # Text-to-image: has dit_token but no end_token (nothing generated yet)
-                if has_dit_token and not has_end_token:
-                    # Default grids for text-to-image: large (32x32) and small (16x16)
-                    # These are the standard GLM-Image generation grids
-                    # The actual grid sizes should be parsed from the prompt, but for now use defaults
-                    # TODO: Parse grid sizes from prompt tokens like "<sop>32 32<eop>"
-                    image_grid_thw = [[1, 32, 32], [1, 16, 16]]
-                    logger.warning(f"[GLM-Image M-RoPE] Text-to-image detected, using default grids: {image_grid_thw}")
+                # Text-to-image: ends with start token but no end token
+                if prompt_ends_with_start and not has_end_token:
+                    # Parse grid dimensions from prompt tokens
+                    # Format: ... <sop> H W <eop> <sop> h w <eop> <bos>
+                    # We need to find the grid_bos_token (<sop>) and grid_eos_token (<eop>)
+                    # and extract the dimensions between them
+                    image_grid_thw = self._parse_grid_from_tokens(input_tokens, hf_config)
+                    if image_grid_thw:
+                        logger.warning(f"[GLM-Image M-RoPE] Text-to-image detected, parsed grids: {image_grid_thw}")
+                    else:
+                        # Fallback to default 1024x1024 grids if parsing fails
+                        image_grid_thw = [[1, 32, 32], [1, 16, 16]]
+                        logger.warning(f"[GLM-Image M-RoPE] Text-to-image, using default grids: {image_grid_thw}")
 
         seq_len = len(input_tokens)
         llm_pos_ids_list: list[torch.Tensor] = []

From 0dad1617729b529e14c3df5f21a67083a633003e Mon Sep 17 00:00:00 2001
From: JaredforReal <w13431838023@gmail.com>
Date: Fri, 16 Jan 2026 17:31:54 +0800
Subject: [PATCH 48/53] override uses_mrope when mrope_section is nested in text_config

Signed-off-by: JaredforReal <w13431838023@gmail.com>
---
 vllm_omni/worker/gpu_model_runner.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/vllm_omni/worker/gpu_model_runner.py b/vllm_omni/worker/gpu_model_runner.py
index 55d7a5d2e32..087a6f44e2d 100644
--- a/vllm_omni/worker/gpu_model_runner.py
+++ b/vllm_omni/worker/gpu_model_runner.py
@@ -40,6 +40,22 @@ def __init__(self, *args, **kwargs):
         self._omni_num_scheduled_tokens_np: np.ndarray | None = None
         self._omni_last_model_output: object | None = None
 
+        # Override uses_mrope for models that use M-RoPE but vLLM's default
+        # detection fails (e.g., GLM-Image has mrope_section in text_config,
+        # but vLLM only checks top-level hf_config.rope_parameters)
+        if not self.uses_mrope:
+            hf_config = self.model_config.hf_config
+            # Check text_config for nested configs (like GLM-Image)
+            text_config = getattr(hf_config, "text_config", None)
+            if text_config is not None:
+                rope_params = getattr(text_config, "rope_parameters", None)
+                if rope_params is not None and rope_params.get("mrope_section") is not None:
+                    self.uses_mrope = True
+                    logger.info(
+                        f"[OmniGPUModelRunner] Enabling M-RoPE for model_type={hf_config.model_type} "
+                        f"(detected mrope_section in text_config)"
+                    )
+
     def load_model(self, *args, **kwargs) -> None:
         super().load_model(*args, **kwargs)
         # TODO move this model specific logic to a separate class

From ed232c7d4b525531dbd3ab5dd80f6bf4a25a2b7e Mon Sep 17 00:00:00 2001
From: JaredforReal <w13431838023@gmail.com>
Date: Fri, 16 Jan 2026 17:37:08 +0800
Subject: [PATCH 49/53] enable M-RoPE by model_type instead of detecting mrope_section

Signed-off-by: JaredforReal <w13431838023@gmail.com>
---
 vllm_omni/worker/gpu_model_runner.py | 21 +++++++++------------
 1 file changed, 9 insertions(+), 12 deletions(-)

diff --git a/vllm_omni/worker/gpu_model_runner.py b/vllm_omni/worker/gpu_model_runner.py
index 087a6f44e2d..8a24b65a0d1 100644
--- a/vllm_omni/worker/gpu_model_runner.py
+++ b/vllm_omni/worker/gpu_model_runner.py
@@ -41,20 +41,17 @@ def __init__(self, *args, **kwargs):
         self._omni_last_model_output: object | None = None
 
         # Override uses_mrope for models that use M-RoPE but vLLM's default
-        # detection fails (e.g., GLM-Image has mrope_section in text_config,
-        # but vLLM only checks top-level hf_config.rope_parameters)
+        # detection fails. GLM-Image uses M-RoPE (mrope_section in config.json)
+        # but transformers ignores it with warning:
+        # "Unrecognized keys in `rope_parameters` for 'rope_type'='default': {'mrope_section'}"
+        # So we hardcode the detection based on model_type.
         if not self.uses_mrope:
             hf_config = self.model_config.hf_config
-            # Check text_config for nested configs (like GLM-Image)
-            text_config = getattr(hf_config, "text_config", None)
-            if text_config is not None:
-                rope_params = getattr(text_config, "rope_parameters", None)
-                if rope_params is not None and rope_params.get("mrope_section") is not None:
-                    self.uses_mrope = True
-                    logger.info(
-                        f"[OmniGPUModelRunner] Enabling M-RoPE for model_type={hf_config.model_type} "
-                        f"(detected mrope_section in text_config)"
-                    )
+            model_type = getattr(hf_config, "model_type", None)
+            # GLM-Image uses M-RoPE with mrope_section: [8, 12, 12]
+            if model_type in ("glm_image",):
+                self.uses_mrope = True
+                logger.info(f"[OmniGPUModelRunner] Enabling M-RoPE for model_type={model_type}")
 
     def load_model(self, *args, **kwargs) -> None:
         super().load_model(*args, **kwargs)

From eff86c1ffd48153497111a328d8ab6e3fadc9850 Mon Sep 17 00:00:00 2001
From: JaredforReal <w13431838023@gmail.com>
Date: Fri, 16 Jan 2026 18:07:09 +0800
Subject: [PATCH 50/53] use get_model

Signed-off-by: JaredforReal <w13431838023@gmail.com>
---
 vllm_omni/worker/gpu_model_runner.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/vllm_omni/worker/gpu_model_runner.py b/vllm_omni/worker/gpu_model_runner.py
index 8a24b65a0d1..4c42054b24d 100644
--- a/vllm_omni/worker/gpu_model_runner.py
+++ b/vllm_omni/worker/gpu_model_runner.py
@@ -108,14 +108,17 @@ def _init_mrope_positions(self, req_state: CachedRequestState):
             if use_audio_in_video_value is not None:
                 use_audio_in_video = bool(use_audio_in_video_value.item())
 
-        if supports_mrope(self.model):
+        # Get unwrapped model - self.model may be wrapped in CUDAGraphWrapper
+        # after load_model(), which would break the isinstance check in supports_mrope()
+        model = self.get_model()
+        if supports_mrope(model):
             logger.warning(
                 f"[M-RoPE Init] Calling get_mrope_input_positions: "
                 f"prompt_len={len(req_state.prompt_token_ids)}, "
                 f"mm_features_count={len(req_state.mm_features) if req_state.mm_features else 0}, "
                 f"image_grid_thw={image_grid_thw}"
             )
-            req_state.mrope_positions, req_state.mrope_position_delta = self.model.get_mrope_input_positions(
+            req_state.mrope_positions, req_state.mrope_position_delta = model.get_mrope_input_positions(
                 req_state.prompt_token_ids,
                 mm_features=req_state.mm_features,
                 hf_config=self.model_config.hf_config,

From e93c18b355bfde73594717986e33e1c2c96e4a94 Mon Sep 17 00:00:00 2001
From: JaredforReal <w13431838023@gmail.com>
Date: Fri, 16 Jan 2026 18:37:23 +0800
Subject: [PATCH 51/53] cleanup: remove debug logging and simplify docstrings

Signed-off-by: JaredforReal <w13431838023@gmail.com>
---
 .../models/glm_image/glm_image_ar.py          | 88 ++-----------------
 vllm_omni/worker/gpu_model_runner.py          | 35 +-------
 2 files changed, 10 insertions(+), 113 deletions(-)

diff --git a/vllm_omni/model_executor/models/glm_image/glm_image_ar.py b/vllm_omni/model_executor/models/glm_image/glm_image_ar.py
index 8e359a9eb2e..8cddd80ceec 100644
--- a/vllm_omni/model_executor/models/glm_image/glm_image_ar.py
+++ b/vllm_omni/model_executor/models/glm_image/glm_image_ar.py
@@ -343,10 +343,6 @@ def _call_hf_processor(
                 # Build messages format expected by processor
                 messages = [{"role": "user", "content": [{"type": "text", "text": prompt}]}]
 
-                logger.info("[GLM-Image T2I] Using GlmImageProcessor.apply_chat_template")
-                logger.info(f"[GLM-Image T2I] target_h={target_h}, target_w={target_w}")
-                logger.info(f"[GLM-Image T2I] prompt: {prompt[:200]}...")
-
                 # Use apply_chat_template which handles target dimensions
                 hf_inputs = processor.apply_chat_template(
                     messages,
@@ -357,25 +353,6 @@ def _call_hf_processor(
                     return_tensors="pt",
                 )
 
-                # Debug: log the tokenized input
-                if "input_ids" in hf_inputs:
-                    input_ids = hf_inputs["input_ids"]
-                    if hasattr(input_ids, "shape"):
-                        logger.info(f"[GLM-Image T2I] input_ids shape: {input_ids.shape}")
-                    # Use processor's tokenizer (not ByT5Tokenizer from tokenizer/ dir)
-                    # GlmImageProcessor has its own tokenizer with a different vocabulary
-                    if hasattr(processor, "tokenizer") and processor.tokenizer is not None:
-                        ids_list = input_ids[0].tolist() if hasattr(input_ids[0], "tolist") else list(input_ids[0])
-                        try:
-                            decoded = processor.tokenizer.decode(ids_list)
-                            logger.info(f"[GLM-Image T2I] decoded input: {decoded}")
-                        except Exception as e:
-                            logger.warning(f"[GLM-Image T2I] could not decode: {e}")
-                            logger.info(f"[GLM-Image T2I] first 50 token ids: {ids_list[:50]}")
-
-                if "image_grid_thw" in hf_inputs:
-                    logger.info(f"[GLM-Image T2I] image_grid_thw: {hf_inputs['image_grid_thw']}")
-
                 return hf_inputs
             else:
                 # Fallback: just tokenize (this won't work properly for generation)
@@ -1759,10 +1736,7 @@ def _parse_grid_from_tokens(
                 # Use heuristics: grid dimensions are typically between 8 and 128
                 # represented as single tokens that decode to numbers
 
-                # For now, return None and let caller use defaults
-                logger.warning(
-                    "[GLM-Image M-RoPE] Cannot find grid_bos_token_id/grid_eos_token_id, will use default grids"
-                )
+                # Cannot find grid tokens, let caller use defaults
                 return None
 
             # Find all <sop>...<eop> regions
@@ -1800,8 +1774,7 @@ def _parse_grid_from_tokens(
 
             return None
 
-        except Exception as e:
-            logger.warning(f"[GLM-Image M-RoPE] Error parsing grids from tokens: {e}")
+        except Exception:
             return None
 
     def get_mrope_input_positions(
@@ -1811,42 +1784,20 @@ def get_mrope_input_positions(
         image_grid_thw: list[list[int]] | None = None,
         **kwargs,
     ) -> tuple[torch.Tensor, int]:
-        """
-        Compute M-RoPE position IDs for GLM-Image generation.
-
-        GLM-Image uses 3D position encoding:
-        - For text tokens: all 3 dimensions (temporal, height, width) are the same
-        - For image tokens:
-          - temporal: constant (marks image region)
-          - height: row position in image grid
-          - width: column position in image grid
+        """Compute M-RoPE position IDs for GLM-Image.
 
-        For text-to-image generation, we also pre-compute positions for the tokens
-        that will be generated (small image + large image + EOS), similar to how
-        transformers GLM-Image caches decode positions.
+        GLM-Image uses 3D positional encoding where text tokens have identical
+        values across all dimensions, while image tokens use 2D grid positions.
 
-        Args:
-            input_tokens: List of input token IDs
-            mm_features: Multimodal feature specifications (optional)
-            image_grid_thw: Pre-extracted image grid dimensions (optional)
-            **kwargs: Additional arguments (hf_config, video_grid_thw, etc.)
+        For text-to-image, also pre-computes decode positions for generated tokens.
 
         Returns:
-            Tuple of (position_ids [3, seq_len + decode_len], mrope_position_delta)
+            Tuple of (position_ids [3, total_len], mrope_position_delta)
         """
         hf_config = self.config
         image_start_token_id = hf_config.image_start_token_id
         image_end_token_id = hf_config.image_end_token_id
 
-        logger.warning(
-            f"[GLM-Image M-RoPE] get_mrope_input_positions called: "
-            f"input_tokens_len={len(input_tokens)}, mm_features={mm_features is not None}, "
-            f"image_grid_thw={image_grid_thw}, kwargs_keys={list(kwargs.keys())}, "
-            f"last_token={input_tokens[-1] if input_tokens else None}, "
-            f"image_start_token_id={image_start_token_id}, "
-            f"image_end_token_id={image_end_token_id}"
-        )
-
         # Get image_grid_thw from either the direct arg or mm_features
         if image_grid_thw is None and mm_features is not None:
             # Gather image grid info from multimodal features
@@ -1879,16 +1830,10 @@ def get_mrope_input_positions(
                 # Text-to-image: ends with start token but no end token
                 if prompt_ends_with_start and not has_end_token:
                     # Parse grid dimensions from prompt tokens
-                    # Format: ... <sop> H W <eop> <sop> h w <eop> <bos>
-                    # We need to find the grid_bos_token (<sop>) and grid_eos_token (<eop>)
-                    # and extract the dimensions between them
                     image_grid_thw = self._parse_grid_from_tokens(input_tokens, hf_config)
-                    if image_grid_thw:
-                        logger.warning(f"[GLM-Image M-RoPE] Text-to-image detected, parsed grids: {image_grid_thw}")
-                    else:
+                    if not image_grid_thw:
                         # Fallback to default 1024x1024 grids if parsing fails
                         image_grid_thw = [[1, 32, 32], [1, 16, 16]]
-                        logger.warning(f"[GLM-Image M-RoPE] Text-to-image, using default grids: {image_grid_thw}")
 
         seq_len = len(input_tokens)
         llm_pos_ids_list: list[torch.Tensor] = []
@@ -1978,19 +1923,11 @@ def get_mrope_input_positions(
 
                 # Concatenate prefill and decode positions
                 llm_positions = torch.cat([prefill_positions, decode_positions], dim=1)
-
-                # Log for debugging
-                logger.warning(
-                    f"[GLM-Image M-RoPE] prefill_len={prefill_positions.shape[1]}, "
-                    f"decode_len={decode_positions.shape[1]}, total_len={llm_positions.shape[1]}"
-                )
             else:
                 llm_positions = prefill_positions
-                logger.warning(f"[GLM-Image M-RoPE] No decode grids, prefill_len={prefill_positions.shape[1]}")
         else:
             # Pure text - all dimensions same
             llm_positions = torch.arange(seq_len).view(1, -1).expand(3, -1)
-            logger.warning(f"[GLM-Image M-RoPE] Pure text mode, positions_len={seq_len}")
 
         mrope_position_delta = (llm_positions.max() + 1 - seq_len).item()
         return llm_positions, mrope_position_delta
@@ -2019,15 +1956,6 @@ def forward(
         Returns:
             Hidden states or intermediate tensors
         """
-        # Debug logging (first call only)
-        if not hasattr(self, "_logged_forward"):
-            self._logged_forward = True
-            logger.info(f"[GLM-Image Forward] input_ids shape: {input_ids.shape if input_ids is not None else None}")
-            logger.info(f"[GLM-Image Forward] positions shape: {positions.shape if positions is not None else None}")
-            logger.info(f"[GLM-Image Forward] pixel_values: {pixel_values is not None}")
-            logger.info(f"[GLM-Image Forward] image_grid_thw: {image_grid_thw}")
-            logger.info(f"[GLM-Image Forward] kwargs keys: {list(kwargs.keys())}")
-
         if intermediate_tensors is not None:
             inputs_embeds = None
 
diff --git a/vllm_omni/worker/gpu_model_runner.py b/vllm_omni/worker/gpu_model_runner.py
index 4c42054b24d..c80c3202626 100644
--- a/vllm_omni/worker/gpu_model_runner.py
+++ b/vllm_omni/worker/gpu_model_runner.py
@@ -112,12 +112,6 @@ def _init_mrope_positions(self, req_state: CachedRequestState):
         # after load_model(), which would break the isinstance check in supports_mrope()
         model = self.get_model()
         if supports_mrope(model):
-            logger.warning(
-                f"[M-RoPE Init] Calling get_mrope_input_positions: "
-                f"prompt_len={len(req_state.prompt_token_ids)}, "
-                f"mm_features_count={len(req_state.mm_features) if req_state.mm_features else 0}, "
-                f"image_grid_thw={image_grid_thw}"
-            )
             req_state.mrope_positions, req_state.mrope_position_delta = model.get_mrope_input_positions(
                 req_state.prompt_token_ids,
                 mm_features=req_state.mm_features,
@@ -128,13 +122,7 @@ def _init_mrope_positions(self, req_state: CachedRequestState):
                 audio_feature_lengths=audio_feature_lengths,
                 use_audio_in_video=use_audio_in_video,
             )
-            logger.warning(
-                f"[M-RoPE Init] Result: "
-                f"mrope_positions_shape={req_state.mrope_positions.shape}, "
-                f"mrope_position_delta={req_state.mrope_position_delta}"
-            )
         else:
-            logger.warning("[M-RoPE Init] Model does not support M-RoPE, using default")
             req_state.mrope_positions, req_state.mrope_position_delta = MRotaryEmbedding.get_input_positions_tensor(
                 req_state.prompt_token_ids,
                 hf_config=self.model_config.hf_config,
@@ -148,17 +136,8 @@ def _init_mrope_positions(self, req_state: CachedRequestState):
     def _calc_mrope_positions(self, scheduler_output: "SchedulerOutput"):
         """Calculate M-RoPE positions for scheduled tokens.
 
-        This method overrides the base vLLM implementation to support models
-        like GLM-Image that pre-compute decode positions with 2D spatial encoding.
-
-        For GLM-Image text-to-image generation:
-        - Prefill positions: Use pre-computed positions from get_mrope_input_positions
-        - Decode positions: Also use pre-computed 2D spatial positions instead of
-          the default linear positions from get_next_input_positions_tensor
-
-        The key difference from vLLM's default behavior:
-        - Default vLLM: decode positions use linear [N, N+1, N+2, ...] for all 3 dims
-        - GLM-Image needs: temporal=constant, height/width=2D grid pattern
+        Overrides base vLLM to use pre-computed 2D spatial positions for decode
+        phase (for models like GLM-Image) instead of linear positions.
         """
         from vllm.utils import length_from_prompt_token_ids_or_embeds
 
@@ -205,18 +184,8 @@ def _calc_mrope_positions(self, scheduler_output: "SchedulerOutput"):
                     self.mrope_positions.np[:, dst_start : dst_start + completion_part_len] = req.mrope_positions[
                         :, decode_start:decode_end
                     ]
-                    logger.debug(
-                        f"[M-RoPE] Using pre-computed decode positions: "
-                        f"decode_start={decode_start}, decode_end={decode_end}, "
-                        f"total_precomputed={total_precomputed}"
-                    )
                 else:
                     # Fallback to default linear positions for text-only generation
-                    logger.warning(
-                        f"[M-RoPE] Falling back to linear positions! "
-                        f"decode_end={decode_end} > total_precomputed={total_precomputed}, "
-                        f"num_prompt_tokens={num_prompt_tokens}, completion_part_len={completion_part_len}"
-                    )
                     assert req.mrope_position_delta is not None
                     MRotaryEmbedding.get_next_input_positions_tensor(
                         out=self.mrope_positions.np,

From ad849f09757ff7602d6627ff81e8f650ce2db3ad Mon Sep 17 00:00:00 2001
From: JaredforReal <w13431838023@gmail.com>
Date: Fri, 16 Jan 2026 19:08:32 +0800
Subject: [PATCH 52/53] feat: add profiling points for stage timing analysis

Signed-off-by: JaredforReal <w13431838023@gmail.com>
---
 .../models/glm_image/pipeline_glm_image.py    |  58 +++++----
 vllm_omni/entrypoints/omni.py                 |  13 ++
 .../stage_input_processors/glm_image.py       | 112 +++---------------
 3 files changed, 57 insertions(+), 126 deletions(-)

diff --git a/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py b/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py
index 9a03a934983..35d0e107f4d 100644
--- a/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py
+++ b/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py
@@ -16,6 +16,7 @@
 import logging
 import os
 import re
+import time
 from collections.abc import Iterable
 
 import numpy as np
@@ -812,77 +813,59 @@ def _prepare_condition_image_kv_cache(
 
     @torch.inference_mode()
     def forward(self, req: OmniDiffusionRequest) -> DiffusionOutput:
-        """
-        Main generation forward pass.
-
-        Args:
-            req: OmniDiffusionRequest with generation parameters
+        """Main generation forward pass."""
+        t_forward_start = time.perf_counter()
 
-        Returns:
-            DiffusionOutput containing generated image
-        """
         prompt = req.prompt or ""
         if isinstance(prompt, list):
             prompt = prompt[0] if prompt else ""
 
-        # Get pre-computed prompt embeddings if provided
         prompt_embeds = req.prompt_embeds if isinstance(req.prompt_embeds, torch.Tensor) else None
-
-        # Get condition images for Image Edit mode
-        # Use pre-processed images from pre_process_func
         preprocessed_images = req.preprocessed_image
         condition_images = getattr(req, "prompt_image", None)
         img_height = req.height
         img_width = req.width
-
         is_image_edit = preprocessed_images is not None
 
-        # Use image dimensions as default if available
         height = req.height or img_height or self.default_sample_size * self.vae_scale_factor
         width = req.width or img_width or self.default_sample_size * self.vae_scale_factor
         num_inference_steps = req.num_inference_steps or 50
         guidance_scale = req.guidance_scale or 1.5
 
-        # 0. Validate inputs
         self.check_inputs(prompt=prompt, height=height, width=width, prompt_embeds=prompt_embeds)
 
         batch_size = 1
         do_classifier_free_guidance = guidance_scale > 1.0
 
-        # Set seed if provided
         generator = None
         if req.seed is not None:
             generator = torch.Generator(device=self.device).manual_seed(req.seed)
 
-        # 1. Get prior tokens - either from external source (multistage) or generate internally
-        # Check if prior_token_ids are provided externally (from AR stage in multistage mode)
+        # 1. Get prior tokens
+        t_prior_start = time.perf_counter()
         external_prior_tokens = req.extra.get("prior_token_ids") if req.extra else None
         external_prior_image_ids = req.extra.get("prior_token_image_ids") if req.extra else None
 
         if external_prior_tokens is not None:
-            # Multistage mode: use externally provided prior tokens from vLLM AR stage
-            logger.info("Using externally provided prior tokens from AR stage...")
             prior_token_id = external_prior_tokens
             if isinstance(prior_token_id, list):
                 prior_token_id = torch.tensor(prior_token_id, dtype=torch.long, device=self.device)
             elif isinstance(prior_token_id, torch.Tensor):
                 prior_token_id = prior_token_id.to(device=self.device, dtype=torch.long)
-            # Ensure shape is [1, num_tokens] for batch processing
             if prior_token_id.dim() == 1:
                 prior_token_id = prior_token_id.unsqueeze(0)
             prior_token_image_ids = external_prior_image_ids
         else:
-            # Single-stage mode: generate prior tokens with internal AR model
-            logger.info("Generating prior tokens with AR model...")
             prior_token_id, prior_token_image_ids = self.generate_prior_tokens(
                 prompt=prompt,
                 image=condition_images,
                 height=height,
                 width=width,
             )
+        t_prior_end = time.perf_counter()
 
         # 2. Encode prompt for glyph embeddings
-        logger.info("Encoding prompt...")
+        t_encode_start = time.perf_counter()
         prompt_embeds, negative_prompt_embeds = self.encode_prompt(
             prompt,
             do_classifier_free_guidance=do_classifier_free_guidance,
@@ -891,19 +874,20 @@ def forward(self, req: OmniDiffusionRequest) -> DiffusionOutput:
             device=self.device,
             dtype=self.transformer.dtype,
         )
+        t_encode_end = time.perf_counter()
 
         # 3. Prepare KV cache for Image Edit mode
+        t_kvcache_start = time.perf_counter()
         kv_caches = None
         if is_image_edit and prior_token_image_ids is not None:
-            logger.info("Preparing KV cache for Image Edit mode...")
             kv_caches = self._prepare_condition_image_kv_cache(
                 condition_images=preprocessed_images,
                 prior_token_image_ids=prior_token_image_ids,
                 prompt_embeds=prompt_embeds,
                 generator=generator,
             )
-            # Switch to read mode for denoising
             kv_caches.set_mode("read")
+        t_kvcache_end = time.perf_counter()
 
         # 4. Prepare latents
         latent_channels = self.transformer.in_channels
@@ -937,8 +921,8 @@ def forward(self, req: OmniDiffusionRequest) -> DiffusionOutput:
         target_size = torch.tensor([[height, width]], dtype=prompt_embeds.dtype, device=self.device)
         crop_coords = torch.zeros((1, 2), dtype=prompt_embeds.dtype, device=self.device)
 
-        # 7. Denoising loop with CFG-parallel support
-        logger.info(f"Starting denoising loop with {num_inference_steps} steps...")
+        # 7. Denoising loop
+        t_denoise_start = time.perf_counter()
         latents = self.diffuse(
             latents=latents,
             prior_token_id=prior_token_id,
@@ -951,9 +935,10 @@ def forward(self, req: OmniDiffusionRequest) -> DiffusionOutput:
             do_classifier_free_guidance=do_classifier_free_guidance,
             kv_caches=kv_caches,
         )
+        t_denoise_end = time.perf_counter()
 
         # 8. VAE decode
-        logger.info("Decoding latents with VAE...")
+        t_vae_start = time.perf_counter()
         latents = latents.to(self.vae.dtype)
         latents_mean = (
             torch.tensor(self.vae.config.latents_mean)
@@ -967,8 +952,19 @@ def forward(self, req: OmniDiffusionRequest) -> DiffusionOutput:
         )
         latents = latents * latents_std + latents_mean
         image = self.vae.decode(latents, return_dict=False, generator=generator)[0]
-
-        # 9. Leave post-process to vllm-omni pipeline
+        t_vae_end = time.perf_counter()
+
+        t_forward_end = time.perf_counter()
+
+        # Profile logging
+        logger.info(
+            f"[Profile] Diffusion forward: total={t_forward_end - t_forward_start:.3f}s | "
+            f"prior_tokens={t_prior_end - t_prior_start:.3f}s, "
+            f"prompt_encode={t_encode_end - t_encode_start:.3f}s, "
+            f"kv_cache={t_kvcache_end - t_kvcache_start:.3f}s, "
+            f"denoise({num_inference_steps} steps)={t_denoise_end - t_denoise_start:.3f}s, "
+            f"vae_decode={t_vae_end - t_vae_start:.3f}s"
+        )
 
         return DiffusionOutput(output=image)
 
diff --git a/vllm_omni/entrypoints/omni.py b/vllm_omni/entrypoints/omni.py
index 109d5f85473..3ea80ae47ec 100644
--- a/vllm_omni/entrypoints/omni.py
+++ b/vllm_omni/entrypoints/omni.py
@@ -608,6 +608,7 @@ def _run_generation(
         # Mark first input time for stage-0
         metrics.stage_first_ts[0] = metrics.stage_first_ts[0] or time.time()
 
+        _req_start_perf_ts: dict[str, float] = {}  # perf_counter for profiling
         for req_id, prompt in request_id_to_prompt.items():
             sp0 = sampling_params_list[0]  # type: ignore[index]
             task = {
@@ -617,6 +618,7 @@ def _run_generation(
             }
             self.stage_list[0].submit(task)
             _req_start_ts[req_id] = time.time()
+            _req_start_perf_ts[req_id] = time.perf_counter()
             logger.debug(f"[{self._name}] Enqueued request {req_id} to stage-0")
 
         pbar = None
@@ -659,6 +661,11 @@ def _run_generation(
                     continue
 
                 engine_outputs = _load(result, obj_key="engine_outputs", shm_key="engine_outputs_shm")
+                t_stage_completed = time.perf_counter()
+                stage_elapsed = t_stage_completed - _req_start_perf_ts.get(req_id, t_stage_completed)
+                logger.info(
+                    f"[Profile] Stage {stage_id} completed: req_id={req_id}, elapsed_from_start={stage_elapsed:.3f}s"
+                )
                 # Mark last output time for this stage whenever we receive outputs
                 metrics.stage_last_ts[stage_id] = max(metrics.stage_last_ts[stage_id] or 0.0, time.time())
                 try:
@@ -723,6 +730,7 @@ def _run_generation(
                 next_stage_id = stage_id + 1
                 if next_stage_id <= final_stage_id_to_prompt[req_id]:
                     next_stage: OmniStage = self.stage_list[next_stage_id]
+                    t_transition_start = time.perf_counter()
                     try:
                         next_inputs = next_stage.process_engine_inputs(self.stage_list, [request_id_to_prompt[req_id]])
                     except Exception as e:
@@ -731,6 +739,11 @@ def _run_generation(
                             f" at stage {next_stage_id}: {e}",
                         )
                         continue
+                    t_transition_end = time.perf_counter()
+                    logger.info(
+                        f"[Profile] Stage {stage_id}→{next_stage_id} transition: "
+                        f"process_inputs={t_transition_end - t_transition_start:.4f}s, req_id={req_id}"
+                    )
                     sp_next = sampling_params_list[next_stage_id]  # type: ignore[index]
 
                     # Check if we have a connector for this edge
diff --git a/vllm_omni/model_executor/stage_input_processors/glm_image.py b/vllm_omni/model_executor/stage_input_processors/glm_image.py
index 79f5d91baf9..f8e7aba302c 100644
--- a/vllm_omni/model_executor/stage_input_processors/glm_image.py
+++ b/vllm_omni/model_executor/stage_input_processors/glm_image.py
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Stage input processor for GLM-Image: AR → Diffusion transition."""
 
+import time
 from typing import Any
 
 import torch
@@ -39,60 +40,23 @@ def _parse_generated_tokens(
     width: int,
     factor: int = 32,
 ) -> tuple[torch.Tensor, int, int]:
-    """Parse AR-generated tokens to extract prior_token_ids.
-
-    The AR model generates tokens in a specific format:
-    - For text-to-image: small_image_tokens + large_image_tokens + EOS
-    - For image-to-image: large_image_tokens + EOS
-
-    We need to extract the large_image_tokens and upsample them.
-
-    Args:
-        token_ids: Generated token IDs from AR model
-        height: Target image height
-        width: Target image width
-        factor: Downsampling factor (default 32 for AR output)
-
-    Returns:
-        Tuple of (upsampled_prior_token_ids, pixel_height, pixel_width)
-    """
+    """Parse AR-generated tokens to extract prior_token_ids."""
     # Calculate token dimensions for target image
     token_h = height // factor
     token_w = width // factor
     large_image_tokens = token_h * token_w
 
     # Calculate small preview image dimensions (used in text-to-image)
-    # GLM-Image generates a small preview at 1/4 resolution before the full image
-    # The preview grid is computed as target_grid / 2 in each dimension
     small_token_h = token_h // 2
     small_token_w = token_w // 2
     small_image_tokens = small_token_h * small_token_w
 
     token_tensor = torch.tensor(token_ids, dtype=torch.long)
 
-    # Log actual values for debugging
-    logger.info(
-        f"_parse_generated_tokens: total_tokens={len(token_ids)}, "
-        f"large_image_tokens={large_image_tokens} ({token_h}x{token_w}), "
-        f"small_image_tokens={small_image_tokens} ({small_token_h}x{small_token_w})"
-    )
-
-    # Analyze token distribution to find image tokens
-    logger.info(
-        f"Full sequence stats: min={token_tensor.min().item()}, "
-        f"max={token_tensor.max().item()}, "
-        f"unique={token_tensor.unique().numel()}"
-    )
-
-    # Print first 20 and last 20 tokens to understand the structure
-    logger.info(f"First 20 tokens: {token_tensor[:20].tolist()}")
-    logger.info(f"Last 20 tokens: {token_tensor[-20:].tolist()}")
-
     # Remove EOS token (16385) from the end if present
     eos_token_id = 16385
     if len(token_ids) > 0 and token_ids[-1] == eos_token_id:
         token_tensor = token_tensor[:-1]
-        logger.info(f"Removed EOS token, remaining: {len(token_tensor)} tokens")
 
     actual_tokens = len(token_tensor)
 
@@ -102,19 +66,14 @@ def _parse_generated_tokens(
         large_end = large_start + large_image_tokens
         prior_token_ids_d32 = token_tensor[large_start:large_end]
         actual_h, actual_w = token_h, token_w
-        logger.info(f"Text-to-image mode: extracting tokens [{large_start}:{large_end}]")
     elif actual_tokens >= large_image_tokens:
         # Image-to-image: large image tokens are at the beginning
         prior_token_ids_d32 = token_tensor[:large_image_tokens]
         actual_h, actual_w = token_h, token_w
-        logger.info(f"Image-to-image mode: extracting tokens [0:{large_image_tokens}]")
     else:
         # Insufficient tokens - try to infer the actual grid size
-        # The model might have generated for a different resolution
         import math
 
-        # Try to find a square grid that fits the available tokens
-        # First check if it matches any of the small+large patterns
         for scale in [1, 2, 4]:
             test_h = token_h // scale
             test_w = token_w // scale
@@ -124,43 +83,27 @@ def _parse_generated_tokens(
             test_small = test_small_h * test_small_w
 
             if actual_tokens >= test_small + test_large:
-                # Found matching grid for t2i
                 prior_token_ids_d32 = token_tensor[test_small : test_small + test_large]
                 actual_h, actual_w = test_h, test_w
-                # Adjust output dimensions
                 height = test_h * factor
                 width = test_w * factor
-                logger.warning(f"Adjusted grid to {test_h}x{test_w} (scale={scale}), output will be {height}x{width}")
+                logger.warning(f"Adjusted grid to {test_h}x{test_w}, output will be {height}x{width}")
                 break
             elif actual_tokens >= test_large:
-                # Found matching grid for i2i
                 prior_token_ids_d32 = token_tensor[:test_large]
                 actual_h, actual_w = test_h, test_w
                 height = test_h * factor
                 width = test_w * factor
-                logger.warning(f"Adjusted grid to {test_h}x{test_w} (scale={scale}), output will be {height}x{width}")
+                logger.warning(f"Adjusted grid to {test_h}x{test_w}, output will be {height}x{width}")
                 break
         else:
-            # Last resort: find closest square grid
             sqrt_tokens = int(math.sqrt(actual_tokens))
             actual_h = actual_w = sqrt_tokens
             usable_tokens = sqrt_tokens * sqrt_tokens
             prior_token_ids_d32 = token_tensor[:usable_tokens]
             height = sqrt_tokens * factor
             width = sqrt_tokens * factor
-            logger.error(
-                f"Could not match grid pattern. Using {sqrt_tokens}x{sqrt_tokens} grid "
-                f"({usable_tokens} tokens), output will be {height}x{width}. "
-                f"This likely indicates a prompt format issue."
-            )
-
-    # Log token value statistics for debugging
-    logger.info(
-        f"prior_token_ids_d32: shape={prior_token_ids_d32.shape}, "
-        f"min={prior_token_ids_d32.min().item()}, "
-        f"max={prior_token_ids_d32.max().item()}, "
-        f"unique_count={prior_token_ids_d32.unique().numel()}"
-    )
+            logger.error(f"Grid pattern mismatch. Using {sqrt_tokens}x{sqrt_tokens}, output: {height}x{width}")
 
     # Upsample from 32x to 16x
     prior_token_ids = _upsample_token_ids(prior_token_ids_d32, actual_h, actual_w)
@@ -174,26 +117,9 @@ def ar2diffusion(
     prompt: OmniTokensPrompt | TextPrompt | list | None = None,
     requires_multimodal_data: bool = False,
 ) -> list[dict[str, Any]]:
-    """
-    Process AR stage outputs to create Diffusion stage inputs.
-
-    This function bridges the AR model (which generates prior_token_ids) and
-    the Diffusion pipeline (which uses them for conditioned denoising).
-
-    Workflow:
-    1. Extract generated token_ids from AR stage output
-    2. Parse and upsample prior_token_ids (32x → 16x)
-    3. Package into diffusion request format with original prompt info
+    """Process AR stage outputs to create Diffusion stage inputs."""
+    t_start = time.perf_counter()
 
-    Args:
-        stage_list: List of stage objects containing outputs
-        engine_input_source: Source stage IDs (typically [0] for AR stage)
-        prompt: Original prompt data (contains height, width, prompt text, images)
-        requires_multimodal_data: Whether to pass multimodal data (condition images)
-
-    Returns:
-        List of dicts containing diffusion request parameters
-    """
     if not engine_input_source:
         raise ValueError("engine_input_source cannot be empty")
 
@@ -217,59 +143,55 @@ def ar2diffusion(
 
         # Get original prompt info
         original_prompt = prompt[i] if i < len(prompt) else {}
-        # Handle various prompt types - convert to dict for uniform access
-        # Note: TypedDict (TextPrompt, OmniTokensPrompt) doesn't support isinstance
         if isinstance(original_prompt, dict):
-            pass  # Already a dict
+            pass
         elif hasattr(original_prompt, "_asdict"):
-            # NamedTuple
             original_prompt = original_prompt._asdict()
         elif hasattr(original_prompt, "__dict__"):
             original_prompt = vars(original_prompt)
         else:
             original_prompt = {}
 
-        # Extract dimensions from original prompt or use defaults
         height = original_prompt.get("height", 1024)
         width = original_prompt.get("width", 1024)
         text_prompt = original_prompt.get("prompt", "")
 
         # Parse and upsample prior tokens
+        t_parse_start = time.perf_counter()
         prior_token_ids, pixel_h, pixel_w = _parse_generated_tokens(generated_token_ids, height, width)
+        t_parse_end = time.perf_counter()
 
-        # Build diffusion input
-        # The diffusion stage expects these in OmniDiffusionRequest format
         diffusion_input = {
             "prompt": text_prompt,
             "height": pixel_h,
             "width": pixel_w,
             "extra": {
                 "prior_token_ids": prior_token_ids,
-                # Pass condition image info for image-to-image mode
                 "prior_token_image_ids": output.multimodal_output.get("prior_token_image_ids")
                 if hasattr(output, "multimodal_output") and output.multimodal_output
                 else None,
             },
         }
 
-        # Include multimodal data (condition images) if required
         if requires_multimodal_data:
             mm_data = original_prompt.get("multi_modal_data")
             if mm_data:
                 diffusion_input["pil_image"] = mm_data.get("image")
 
-        # Copy other relevant parameters from original prompt
         for key in ["seed", "num_inference_steps", "guidance_scale", "negative_prompt"]:
             if key in original_prompt:
                 diffusion_input[key] = original_prompt[key]
 
         diffusion_inputs.append(diffusion_input)
         logger.info(
-            f"ar2diffusion: request {i}: prompt='{text_prompt[:50]}...', "
-            f"prior_token_ids shape={prior_token_ids.shape}, "
-            f"height={pixel_h}, width={pixel_w}"
+            f"[Profile] ar2diffusion request {i}: parse_tokens={t_parse_end - t_parse_start:.4f}s, "
+            f"num_ar_tokens={len(generated_token_ids)}, prior_shape={prior_token_ids.shape}"
         )
 
-    logger.info(f"ar2diffusion: processed {len(ar_outputs)} AR outputs → {len(diffusion_inputs)} diffusion inputs")
+    t_end = time.perf_counter()
+    logger.info(
+        f"[Profile] ar2diffusion total: {t_end - t_start:.4f}s, "
+        f"processed {len(ar_outputs)} AR outputs → {len(diffusion_inputs)} diffusion inputs"
+    )
 
     return diffusion_inputs

From 44f2d30c59339aa74adeabd3fdba1cd696322b89 Mon Sep 17 00:00:00 2001
From: JaredforReal <w13431838023@gmail.com>
Date: Tue, 20 Jan 2026 10:28:09 +0800
Subject: [PATCH 53/53] try implement i2i mode

Signed-off-by: JaredforReal <w13431838023@gmail.com>
---
 .../models/glm_image/glm_image_ar.py          | 662 +++++++++++++++---
 .../stage_configs/glm_image.yaml              |   1 +
 .../glm_image_muilticonnector.yaml            |   1 +
 .../stage_input_processors/glm_image.py       |  16 +-
 4 files changed, 587 insertions(+), 93 deletions(-)

diff --git a/vllm_omni/model_executor/models/glm_image/glm_image_ar.py b/vllm_omni/model_executor/models/glm_image/glm_image_ar.py
index 8cddd80ceec..f1bae87be46 100644
--- a/vllm_omni/model_executor/models/glm_image/glm_image_ar.py
+++ b/vllm_omni/model_executor/models/glm_image/glm_image_ar.py
@@ -77,19 +77,20 @@
     MultiModalFieldConfig,
     MultiModalKwargsItems,
 )
-from vllm.multimodal.parse import MultiModalDataItems
+from vllm.multimodal.parse import ImageProcessorItems, MultiModalDataItems
 from vllm.multimodal.processing import (
     BaseDummyInputsBuilder,
     BaseMultiModalProcessor,
     BaseProcessingInfo,
     PromptReplacement,
     PromptUpdate,
-    PromptUpdateDetails,
 )
 from vllm.sequence import IntermediateTensors
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
 from vllm.v1.attention.backends.registry import AttentionBackendEnum
 
+from vllm_omni.model_executor.models.output_templates import OmniOutput
+
 logger = init_logger(__name__)
 
 
@@ -172,15 +173,13 @@ def get_hf_processor(self, **kwargs: object):
             return None
 
     def get_supported_mm_limits(self) -> Mapping[str, int | None]:
-        # GLM-Image is an image GENERATION model, not an image understanding model.
-        # For text-to-image (t2i) mode: no multimodal input is needed
-        # For image-to-image (i2i) mode: source images are provided as input
+        # GLM-Image is an image GENERATION model that supports:
+        # - Text-to-image (t2i): no multimodal input needed
+        # - Image-to-image (i2i): source images provided as input
         #
-        # Return empty dict to indicate no multimodal inputs are required for
-        # profiling. Image-to-image mode will be handled dynamically at runtime.
-        # This prevents vLLM from trying to create dummy image inputs during
-        # model initialization.
-        return {}
+        # For i2i mode, we support up to 1 image as condition.
+        # The model architecture supports multiple images but typical usage is 1.
+        return {"image": 1}
 
     def get_num_image_tokens(
         self,
@@ -238,36 +237,34 @@ class GlmImageDummyInputsBuilder(BaseDummyInputsBuilder[GlmImageProcessingInfo])
     """
     Builds dummy inputs for GLM-Image model profiling.
 
-    GLM-Image is an image GENERATION model. For text-to-image mode,
-    no multimodal inputs are needed - just a text prompt.
+    GLM-Image is an image GENERATION model that supports:
+    - Text-to-image (t2i): no multimodal input needed
+    - Image-to-image (i2i): source images provided as input
+
+    For profiling purposes, we need to provide dummy multimodal data when
+    mm_counts["image"] > 0, which happens because get_supported_mm_limits
+    declares image support.
     """
 
     def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
         """
         Generate dummy text for profiling.
 
-        For text-to-image mode (no images), returns a simple text prompt.
-        For image-to-image mode, includes image placeholders.
+        When images are requested (i2i mode profiling), include image placeholders
+        so that _get_prompt_updates can find and replace them. Each <|image|> token
+        will be expanded to grid_h * grid_w tokens by the replacement function.
         """
         num_images = mm_counts.get("image", 0)
 
-        # Text-to-image mode: return a simple text prompt for profiling
-        if num_images == 0:
+        if num_images > 0:
+            # i2i mode: include image placeholders that will be expanded
+            # The <|image|> placeholder will be tokenized to image_token_id (167855)
+            # and then replaced by _get_prompt_updates with actual grid tokens
+            return "<|image|>" * num_images + "A beautiful image."
+        else:
+            # t2i mode: simple text prompt, no image placeholders needed
             return "A beautiful image."
 
-        hf_config = self.info.get_hf_config()
-        # Get image token from config or use default
-        image_token_id = getattr(hf_config, "image_token_id", 167855)
-
-        tokenizer = self.info.get_tokenizer()
-        # Try to get the image token string
-        try:
-            image_token = tokenizer.convert_ids_to_tokens(image_token_id)
-        except Exception:
-            image_token = "<|image|>"
-
-        return image_token * num_images
-
     def get_dummy_mm_data(
         self,
         seq_len: int,
@@ -277,18 +274,20 @@ def get_dummy_mm_data(
         """
         Generate dummy multimodal data for profiling.
 
-        Returns empty dict if no images (text-to-image mode).
+        When images are requested, provide actual dummy images so the vision
+        encoder can be profiled. The image size is set to maximize features
+        for accurate memory profiling.
         """
         num_images = mm_counts.get("image", 0)
 
-        # Text-to-image mode: no multimodal data needed
+        # No images requested: t2i mode, no multimodal data needed
         if num_images == 0:
             return {}
 
         hf_config = self.info.get_hf_config()
         vision_config = hf_config.vision_config
 
-        # Default image size from config
+        # Use image size from config for maximum features profiling
         image_size = getattr(vision_config, "image_size", 2048)
         width = height = image_size
 
@@ -328,18 +327,29 @@ def _call_hf_processor(
         1. Build the prompt with target grid dimensions
         2. Build the image_grid_thw tensor for M-RoPE position encoding
 
-        For image-to-image mode, we use the full processor.
+        For image-to-image mode:
+        1. Process source images through the image processor
+        2. Build prompt with image placeholders expanded
+        3. Build image_grid_thw including source and target grids
         """
-        if not mm_data or not mm_data.get("image"):
-            # Text-to-image mode: use GlmImageProcessor with target dimensions
-            # This is critical - the processor adds grid tokens that tell the model
-            # what resolution to generate
-            processor = self.info.get_hf_processor()
-            if processor is not None:
-                # Get target dimensions from mm_kwargs or use defaults
-                target_h = mm_kwargs.get("target_h", 1024) if mm_kwargs else 1024
-                target_w = mm_kwargs.get("target_w", 1024) if mm_kwargs else 1024
+        processor = self.info.get_hf_processor()
+
+        # Debug: log mm_data contents
+        # NOTE: vLLM's ImageProcessorItems.get_processor_data() returns {"images": [...]} (plural)
+        # because ProcessorBatchItems adds 's' suffix: {f"{self.modality}s": self.get_all()}
+        logger.debug(
+            f"_call_hf_processor: mm_data keys={list(mm_data.keys()) if mm_data else None}, "
+            f"has_images={bool(mm_data and mm_data.get('images'))}"
+        )
+
+        # Get target dimensions from mm_kwargs or use defaults
+        target_h = mm_kwargs.get("target_h", 1024) if mm_kwargs else 1024
+        target_w = mm_kwargs.get("target_w", 1024) if mm_kwargs else 1024
 
+        if not mm_data or not mm_data.get("images"):
+            # Text-to-image mode
+            logger.debug("_call_hf_processor: entering t2i mode (no images)")
+            if processor is not None:
                 # Build messages format expected by processor
                 messages = [{"role": "user", "content": [{"type": "text", "text": prompt}]}]
 
@@ -360,14 +370,253 @@ def _call_hf_processor(
                 prompt_ids = tokenizer.encode(prompt)
                 return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt")
 
-        # Image-to-image mode: use full processor
-        return super()._call_hf_processor(
-            prompt=prompt,
-            mm_data=mm_data,
-            mm_kwargs=mm_kwargs,
-            tok_kwargs=tok_kwargs,
+        # Image-to-image mode
+        # NOTE: Use "images" (plural) - this is what vLLM's ImageProcessorItems.get_processor_data() returns
+        images = mm_data.get("images")
+        if not isinstance(images, list):
+            images = [images]
+
+        logger.debug(
+            f"_call_hf_processor i2i: num_images={len(images)}, image_types={[type(img).__name__ for img in images]}"
         )
 
+        if processor is not None:
+            # Build messages with image objects directly in content
+            # This is how GlmImageProcessor expects images - embedded in the content dict
+            # NOT as a separate images= parameter
+            content = []
+            for img in images:
+                content.append({"type": "image", "image": img})
+            content.append({"type": "text", "text": prompt})
+            messages = [{"role": "user", "content": content}]
+
+            logger.debug(f"_call_hf_processor: calling apply_chat_template with {len(images)} images in content")
+
+            # Use apply_chat_template - processor will process images when they're in content
+            hf_inputs = processor.apply_chat_template(
+                messages,
+                tokenize=True,
+                target_h=target_h,
+                target_w=target_w,
+                return_dict=True,
+                return_tensors="pt",
+            )
+
+            logger.debug(f"_call_hf_processor: apply_chat_template returned keys: {list(hf_inputs.keys())}")
+
+            # CRITICAL: Slice image_grid_thw to only include source image grids
+            # GLM-Image's image_grid_thw has [num_source_images + 1, 3] shape:
+            # - First N entries are for source images (these need visual encoding)
+            # - Last entry is for the target image (for generation, no visual encoding)
+            # We need to slice it so batching works correctly with num_images
+            image_grid_thw = hf_inputs.get("image_grid_thw")
+            if image_grid_thw is not None and len(image_grid_thw) > 1:
+                num_source_images = len(image_grid_thw) - 1
+                # Keep only source image grids for multimodal processing
+                source_grids = image_grid_thw[:num_source_images]
+                hf_inputs["image_grid_thw"] = source_grids
+                logger.debug(
+                    f"_call_hf_processor: sliced image_grid_thw from {len(image_grid_thw)} "
+                    f"to {len(source_grids)} entries"
+                )
+                logger.debug(f"_call_hf_processor: source_grids={source_grids.tolist()}")
+
+            # Debug: Analyze input_ids for image tokens
+            input_ids = hf_inputs.get("input_ids")
+            if input_ids is not None:
+                if hasattr(input_ids, "tolist"):
+                    ids_list = input_ids.tolist()
+                    if isinstance(ids_list[0], list):
+                        ids_list = ids_list[0]  # Unbatch
+                else:
+                    ids_list = list(input_ids)
+
+                # Get image token ID from config
+                hf_config = self.info.get_hf_config()
+                image_token_id = getattr(hf_config, "image_token_id", 167855)
+
+                # Count image tokens
+                image_token_count = ids_list.count(image_token_id)
+                logger.debug(
+                    f"_call_hf_processor: input_ids length={len(ids_list)}, "
+                    f"image_token_id={image_token_id}, "
+                    f"image_token_count={image_token_count}"
+                )
+
+                # Log first/last few tokens to understand structure
+                logger.debug(f"_call_hf_processor: first 20 tokens: {ids_list[:20]}")
+                logger.debug(f"_call_hf_processor: last 20 tokens: {ids_list[-20:]}")
+
+                # Find positions of image tokens
+                image_positions = [i for i, t in enumerate(ids_list) if t == image_token_id]
+                if image_positions:
+                    logger.debug(f"_call_hf_processor: image token positions (first 10): {image_positions[:10]}")
+
+            return hf_inputs
+        else:
+            # Fallback without processor - this is not ideal but prevents crashes
+            logger.warning("GlmImageProcessor not available, using fallback for i2i")
+            tokenizer = self.info.get_tokenizer()
+            hf_config = self.info.get_hf_config()
+
+            # Get image token
+            image_token_id = getattr(hf_config, "image_token_id", 167855)
+            try:
+                image_token = tokenizer.convert_ids_to_tokens(image_token_id)
+            except Exception:
+                image_token = "<|image|>"
+
+            # Build prompt with image placeholders
+            image_placeholders = image_token * len(images)
+            full_prompt = f"{image_placeholders}{prompt}"
+            prompt_ids = tokenizer.encode(full_prompt)
+            return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt")
+
+    def _apply_hf_processor_mm_only(
+        self,
+        mm_items: MultiModalDataItems,
+        hf_processor_mm_kwargs: Mapping[str, object],
+        tokenization_kwargs: Mapping[str, object],
+    ) -> BatchFeature:
+        """
+        Apply the HF processor on the multi-modal data only.
+
+        GLM-Image requires special handling because apply_chat_template always
+        adds a target <|image|> placeholder in addition to source image placeholders.
+        This causes an IndexError when the HF processor tries to find grid info
+        for the target placeholder (which doesn't exist for source-only processing).
+
+        Solution: Call the image processor directly to get pixel_values and
+        image_grid_thw, bypassing apply_chat_template's target handling.
+        Raises RuntimeError when images are present but no HF processor is available.
+        """
+        mm_counts = mm_items.get_all_counts()
+        num_images = mm_counts.get("image", 0)
+
+        if num_images == 0:
+            # No images - call parent implementation
+            return super()._apply_hf_processor_mm_only(
+                mm_items=mm_items,
+                hf_processor_mm_kwargs=hf_processor_mm_kwargs,
+                tokenization_kwargs=tokenization_kwargs,
+            )
+
+        # i2i mode: run the image processor directly (see docstring). get_hf_processor()
+        # may return None, so fail with a clear error instead of an AttributeError.
+        processor = self.info.get_hf_processor()
+        if processor is None:
+            raise RuntimeError("GlmImageProcessor is unavailable; cannot preprocess source images for i2i")
+        image_processor = processor.image_processor
+
+        # Get images from mm_items
+        images = mm_items.get_items("image", ImageProcessorItems)
+        image_list = [images.get(i) for i in range(images.get_count())]
+
+        logger.debug(f"_apply_hf_processor_mm_only: processing {len(image_list)} images directly")
+
+        # Process images directly with image processor
+        image_inputs = image_processor(images=image_list, return_tensors="pt")
+
+        # Get grid info for source images only (no target)
+        pixel_values = image_inputs.get("pixel_values")
+        image_grid_thw = image_inputs.get("image_grid_thw")
+
+        logger.debug(
+            f"_apply_hf_processor_mm_only: pixel_values shape="
+            f"{pixel_values.shape if pixel_values is not None else None}, "
+            f"image_grid_thw shape={image_grid_thw.shape if image_grid_thw is not None else None}"
+        )
+
+        # Build input_ids with image token placeholders
+        # The _get_prompt_updates returns PromptReplacement(target=[image_token_id], ...)
+        # which needs to find image tokens in input_ids to replace them.
+        # We need to include one image_token_id per image so the replacement can work.
+        tokenizer = self.info.get_tokenizer()
+        image_token_id = tokenizer.convert_tokens_to_ids("<|image|>")
+
+        # Build input_ids: [image_token] * num_images + tokenized text
+        # This way _apply_prompt_updates can find the image tokens and replace them
+        dummy_text = self.dummy_inputs.get_dummy_text(mm_counts)
+        text_ids = tokenizer.encode(dummy_text, add_special_tokens=False)
+        input_ids = [image_token_id] * num_images + text_ids
+
+        logger.debug(
+            f"_apply_hf_processor_mm_only: built input_ids with {num_images} image tokens + {len(text_ids)} text tokens"
+        )
+
+        return BatchFeature(
+            dict(
+                input_ids=[input_ids],
+                pixel_values=pixel_values,
+                image_grid_thw=image_grid_thw,
+            ),
+            tensor_type="pt",
+        )
+
+    def _apply_hf_processor_main(
+        self,
+        prompt: str | list[int],
+        mm_items: MultiModalDataItems,
+        hf_processor_mm_kwargs: Mapping[str, object],
+        tokenization_kwargs: Mapping[str, object],
+        *,
+        enable_hf_prompt_update: bool,
+    ) -> tuple[list[int], BatchFeature, bool]:
+        """
+        Override to handle GLM-Image i2i mode correctly.
+
+        Delegates to the parent implementation when there are no images or when
+        enable_hf_prompt_update is True. Otherwise (images present and
+        enable_hf_prompt_update=False) the base flow reportedly yields prompt_ids
+        without image tokens, which makes _apply_prompt_updates fail (not
+        verifiable from this file). This override instead:
+        1. Gets mm data from the overridden _apply_hf_processor_mm_only
+        2. Tokenizes a str prompt without special tokens (or copies a token list)
+        3. Prepends one <|image|> token id per image
+
+        Returns (prompt_ids, mm_processed_data, False) so _apply_prompt_updates expands them.
+        """
+        num_images = mm_items.get_all_counts().get("image", 0)
+
+        if num_images == 0 or enable_hf_prompt_update:
+            # t2i mode or normal flow - use parent implementation
+            return super()._apply_hf_processor_main(
+                prompt=prompt,
+                mm_items=mm_items,
+                hf_processor_mm_kwargs=hf_processor_mm_kwargs,
+                tokenization_kwargs=tokenization_kwargs,
+                enable_hf_prompt_update=enable_hf_prompt_update,
+            )
+
+        # i2i mode with enable_hf_prompt_update=False (cache miss scenario)
+        # We need to build prompt_ids with image placeholders
+        logger.debug(f"_apply_hf_processor_main: i2i mode with enable_hf_prompt_update=False, num_images={num_images}")
+
+        # Get mm data from our overridden _apply_hf_processor_mm_only
+        mm_processed_data = self._apply_hf_processor_mm_only(
+            mm_items=mm_items,
+            hf_processor_mm_kwargs=hf_processor_mm_kwargs,
+            tokenization_kwargs=tokenization_kwargs,
+        )
+
+        # Build prompt_ids with image placeholders
+        # _apply_prompt_updates will replace each [image_token_id] with expanded tokens
+        tokenizer = self.info.get_tokenizer()
+        image_token_id = tokenizer.convert_tokens_to_ids("<|image|>")
+
+        if isinstance(prompt, str):
+            text_ids = tokenizer.encode(prompt, add_special_tokens=False)
+        else:
+            text_ids = list(prompt)
+
+        # Prepend image placeholders - one per image
+        prompt_ids = [image_token_id] * num_images + text_ids
+
+        logger.debug(f"_apply_hf_processor_main: built prompt_ids with {num_images} image placeholders")
+
+        # Return is_update_applied=False so _apply_prompt_updates will expand the placeholders
+        return prompt_ids, mm_processed_data, False
+
     def _get_mm_fields_config(
         self,
         hf_inputs: BatchFeature,
@@ -376,24 +625,74 @@ def _get_mm_fields_config(
         """
         Get the multimodal field configuration.
 
-        For text-to-image: only image_grid_thw is needed (no pixel_values)
-        For image-to-image: both pixel_values and image_grid_thw are needed
+        For GLM-Image i2i mode:
+        - image_grid_thw has been sliced in _call_hf_processor to only include source images
+        - pixel_values has shape [total_patches, C, H, W] - only for source images
+
+        For t2i mode:
+        - No pixel_values, no source images - return empty config
         """
         result = {}
 
-        # image_grid_thw is needed for both t2i and i2i (for M-RoPE position encoding)
-        # For text-to-image, we don't have pixel_values but still need image_grid_thw
-        # Always use batched("image") - the fallback in get_mrope_input_positions will handle
-        # the case where image_grid_thw is not passed through mm_features
-        if "image_grid_thw" in hf_inputs:
-            result["image_grid_thw"] = MultiModalFieldConfig.batched("image")
+        # Debug: log hf_inputs keys
+        logger.debug(f"_get_mm_fields_config: hf_inputs keys: {list(hf_inputs.keys())}")
 
-        # pixel_values only present in image-to-image mode
-        if "pixel_values" in hf_inputs:
-            result["pixel_values"] = MultiModalFieldConfig.batched("image")
+        # Get image_grid_thw if present (already sliced in _call_hf_processor)
+        image_grid_thw = hf_inputs.get("image_grid_thw")
+
+        if "pixel_values" in hf_inputs and image_grid_thw is not None:
+            # i2i mode: pixel_values contains patches for source images
+            # image_grid_thw has already been sliced to only include source grids
+            num_source_images = len(image_grid_thw)
+            logger.debug(
+                f"_get_mm_fields_config: num_source_images={num_source_images}, image_grid_thw={image_grid_thw.shape}"
+            )
+
+            if num_source_images > 0:
+                # Calculate grid sizes for source images
+                image_grid_sizes = image_grid_thw.prod(-1)
+
+                result["pixel_values"] = MultiModalFieldConfig.flat_from_sizes("image", image_grid_sizes)
+
+                # Register image_grid_thw - it's been sliced in _call_hf_processor
+                # to only include source image grids, so batching will work correctly
+                result["image_grid_thw"] = MultiModalFieldConfig.batched("image")
+
+        logger.debug(f"_get_mm_fields_config: result keys: {list(result.keys())}")
 
         return result
 
+    def _hf_processor_applies_updates(
+        self,
+        prompt_text: str,
+        mm_items: MultiModalDataItems,
+        hf_processor_mm_kwargs: Mapping[str, object],
+        tokenization_kwargs: Mapping[str, object],
+    ) -> bool:
+        """
+        Return whether the HF processor applies prompt updates.
+
+        For GLM-Image i2i mode, the HF processor's apply_chat_template already
+        expands <|image|> to N tokens (e.g., 4096 for 64x64 grid).
+
+        By returning True, we tell vLLM that HF processor DID apply prompt updates,
+        so vLLM will use _find_mm_placeholders to locate the expanded tokens
+        instead of trying to apply replacements.
+
+        For t2i mode (no images), there are no image placeholders to expand.
+        """
+        # Check if we have images (i2i mode)
+        num_images = mm_items.get_all_counts().get("image", 0)
+        if num_images > 0:
+            logger.debug(
+                f"_hf_processor_applies_updates: returning True for i2i mode "
+                f"(num_images={num_images}) - HF processor already expanded tokens"
+            )
+            return True
+
+        # t2i mode (no images): there are no placeholders to expand, so True is returned here as well
+        return True
+
     def _get_prompt_updates(
         self,
         mm_items: MultiModalDataItems,
@@ -403,41 +702,93 @@ def _get_prompt_updates(
         """
         Get prompt updates for image tokens.
 
-        GLM-Image replaces each image placeholder with:
-        <|image_start|> + image_tokens + <|image_end|>
+        For GLM-Image image-to-image mode, the HF processor's apply_chat_template
+        already expands each <|image|> placeholder to the correct number of
+        image tokens (grid_h * grid_w tokens per source image).
 
-        Returns empty list if no images (text-to-image mode).
-        """
-        # Check if we have any images
-        if not mm_items.get_count("image", strict=False):
-            return []
+        The HF processor does:
+        1. Replace each <|image|> with num_image_tokens copies of <|placeholder|>
+        2. Replace all <|placeholder|> back to <|image|>
 
+        So the tokenized input already has the expanded tokens. We use
+        target=[image_token_id] to match each occurrence of the image token,
+        similar to how Qwen2VL handles this pattern.
+
+        We use image_grid_thw from out_mm_kwargs to get the actual processed grid
+        size, following the Qwen2VL pattern. This is critical because the HF processor
+        resizes images, so the original image size doesn't match the processed size.
+
+        For t2i mode (no images), we return an empty list since there are no
+        image placeholders to replace.
+        """
         hf_config = self.info.get_hf_config()
 
-        # Get special token IDs from config
+        # Get image token ID - this is the token that appears multiple times
+        # in the tokenized input after HF processor expansion
         image_token_id = getattr(hf_config, "image_token_id", 167855)
-        image_start_id = getattr(hf_config, "image_start_token_id", 16384)
-        image_end_id = getattr(hf_config, "image_end_token_id", 16385)
 
-        # Get image grid info to determine number of tokens per image
-        # For now, use a simple approach based on config
-        vision_config = hf_config.vision_config
-        image_size = getattr(vision_config, "image_size", 2048)
-        patch_size = getattr(vision_config, "patch_size", 16)
+        # Debug: log mm_items info
+        logger.debug(f"_get_prompt_updates: image_token_id={image_token_id}")
+        logger.debug(f"_get_prompt_updates: mm_items modalities={list(mm_items.get_all_counts().keys())}")
+        logger.debug(f"_get_prompt_updates: mm_items counts={mm_items.get_all_counts()}")
+        logger.debug(
+            f"_get_prompt_updates: out_mm_kwargs key={list(out_mm_kwargs.get_data().keys()) if out_mm_kwargs else None}"
+        )
 
-        # Default number of image tokens
-        num_image_tokens = (image_size // patch_size) ** 2
-        image_tokens = [image_token_id] * num_image_tokens
+        # Check if there are any images to process
+        num_images = mm_items.get_count("image", strict=False)
+        if num_images == 0:
+            # t2i mode: no images, no prompt updates needed
+            logger.debug("_get_prompt_updates: no images, returning empty list (t2i mode)")
+            return []
+
+        def get_replacement_glm_image(item_idx: int) -> list[int]:
+            """
+            Return replacement token IDs for an image placeholder.
+
+            For GLM-Image, each source image is represented by grid_h * grid_w tokens.
+            These are placeholder tokens that will be replaced by actual VQ-VAE
+            tokens during model forward pass.
+
+            IMPORTANT: We use image_grid_thw from out_mm_kwargs to get the actual
+            processed grid size. The HF processor resizes images, so the original
+            image size (from mm_items) doesn't match the actual token count.
+            """
+            # Get grid info from out_mm_kwargs (set by _get_mm_fields_config)
+            out_item = out_mm_kwargs["image"][item_idx]
+            grid_thw = out_item.get("image_grid_thw")
+
+            if grid_thw is not None:
+                grid_data = grid_thw.data if hasattr(grid_thw, "data") else grid_thw
+                if isinstance(grid_data, torch.Tensor):
+                    # grid is [t, h, w] - for images, t=1, so num_tokens = h * w
+                    num_tokens = int(grid_data.prod().item())
+                else:
+                    num_tokens = int(grid_data[0] * grid_data[1] * grid_data[2])
+                logger.debug(
+                    f"get_replacement_glm_image: item_idx={item_idx}, "
+                    f"grid={grid_data.tolist() if isinstance(grid_data, torch.Tensor) else grid_data}, "
+                    f"num_tokens={num_tokens}"
+                )
+            else:
+                # Fallback: use default 1024x1024 grid size
+                # (1024/16) * (1024/16) = 64 * 64 = 4096 tokens
+                num_tokens = 64 * 64
+                logger.warning(
+                    f"get_replacement_glm_image: item_idx={item_idx}, "
+                    f"no grid_thw found, using default num_tokens={num_tokens}"
+                )
+
+            return [image_token_id] * num_tokens
 
         return [
             PromptReplacement(
                 modality="image",
+                # Use [token_id] to match each occurrence of image token
+                # The HF processor has already expanded <|image|> to multiple tokens
                 target=[image_token_id],
-                replacement=PromptUpdateDetails.select_token_id(
-                    [image_start_id] + image_tokens + [image_end_id],
-                    embed_token_id=image_token_id,
-                ),
-            )
+                replacement=get_replacement_glm_image,
+            ),
         ]
 
 
@@ -1539,7 +1890,7 @@ def forward(
         inputs_embeds: torch.Tensor | None = None,
         pixel_values: torch.Tensor | None = None,
         image_grid_thw: torch.Tensor | None = None,
-    ) -> torch.Tensor | IntermediateTensors:
+    ) -> tuple[torch.Tensor | IntermediateTensors, dict | None]:
         """
         Forward pass through the GLM-Image model.
 
@@ -1558,16 +1909,20 @@ def forward(
             image_grid_thw: Grid dimensions for source images
 
         Returns:
-            Hidden states or intermediate tensors for PP
+            Tuple of (hidden_states, prior_token_image_ids_info)
+            prior_token_image_ids_info is a dict with VQ-VAE tokens for i2i mode
         """
+        prior_token_image_ids_info = None
+
         # Handle intermediate tensors for pipeline parallelism
         if intermediate_tensors is not None:
-            return self.language_model(
+            hidden_states = self.language_model(
                 input_ids=None,
                 positions=positions,
                 intermediate_tensors=intermediate_tensors,
                 inputs_embeds=None,
             )
+            return hidden_states, None
 
         # Process source images if provided (image-to-image generation)
         if pixel_values is not None and image_grid_thw is not None:
@@ -1577,6 +1932,27 @@ def forward(
             image_tokens = self.get_image_tokens(image_features, image_grid_thw)
             image_tokens = image_tokens.to(input_ids.device)
 
+            # Store prior_token_image_ids for diffusion stage (i2i mode)
+            # The tokens need to be upsampled from d32 to d16 (2x) for the DiT
+            # The 2x nearest-neighbor upsampling is done right below, so the stored tokens are already upsampled
+            split_sizes = (image_grid_thw.prod(dim=-1)).tolist()
+            image_tokens_list = torch.split(image_tokens, split_sizes, dim=0)
+
+            # Upsample each image's tokens for DiT (from d32 to d16)
+            upsampled_token_ids = []
+            for i, tokens in enumerate(image_tokens_list):
+                grid_t, grid_h, grid_w = image_grid_thw[i].tolist()
+                # Reshape to 2D grid
+                tokens_2d = tokens.view(1, 1, grid_h, grid_w)
+                # Upsample by 2x (nearest neighbor)
+                tokens_upsampled = F.interpolate(tokens_2d.float(), scale_factor=2, mode="nearest").to(dtype=torch.long)
+                upsampled_token_ids.append(tokens_upsampled.view(-1))
+
+            prior_token_image_ids_info = {
+                "prior_token_image_ids": upsampled_token_ids,
+                "image_grid_thw": image_grid_thw.tolist(),
+            }
+
             # Replace placeholder tokens with actual image tokens
             special_image_mask = input_ids == self.image_token_id
             if special_image_mask.sum() > 0:
@@ -1596,7 +1972,7 @@ def forward(
             inputs_embeds=inputs_embeds,
         )
 
-        return hidden_states
+        return hidden_states, prior_token_image_ids_info
 
 
 @MULTIMODAL_REGISTRY.register_processor(
@@ -1692,6 +2068,97 @@ def get_image_tokens(
         """Tokenize image features with VQ-VAE."""
         return self.model.get_image_tokens(hidden_states, image_grid_thw)
 
+    def _parse_and_validate_image_input(
+        self,
+        pixel_values: torch.Tensor | None = None,
+        image_grid_thw: torch.Tensor | None = None,
+        **kwargs: object,
+    ) -> dict | None:
+        """Bundle pixel_values and image_grid_thw into a dict; returns None when pixel_values is None."""
+        if pixel_values is None:
+            return None
+        return {
+            "pixel_values": pixel_values,
+            "image_grid_thw": image_grid_thw,
+        }
+
+    def _process_image_input(
+        self,
+        image_input: dict,
+    ) -> list[torch.Tensor]:
+        """
+        Process image input through vision encoder to get embeddings.
+
+        For GLM-Image, we extract features using the vision encoder.
+        These are used for multimodal profiling. The actual VQ-VAE tokenization
+        happens during the forward pass.
+        """
+        pixel_values = image_input["pixel_values"]
+        image_grid_thw = image_input["image_grid_thw"]
+
+        # Get image features from vision encoder
+        image_features = self.model.get_image_features(pixel_values, image_grid_thw)
+
+        # Split by image grid sizes
+        split_sizes = (image_grid_thw.prod(dim=-1)).tolist()
+        image_features_list = torch.split(image_features, split_sizes, dim=0)
+
+        return list(image_features_list)
+
+    def embed_multimodal(
+        self,
+        **kwargs: object,
+    ) -> tuple[torch.Tensor, ...] | None:
+        """
+        Embed multimodal inputs (images) for vLLM's multimodal processing.
+
+        For GLM-Image, this extracts image features using the vision encoder.
+        These embeddings are used by vLLM for multimodal budget profiling.
+        The actual token replacement (via VQ-VAE) happens in the forward pass.
+
+        Returns:
+            Tuple of image embedding tensors, one per image
+        """
+        # Debug: log kwargs keys
+        logger.debug(f"embed_multimodal called with kwargs keys: {list(kwargs.keys())}")
+
+        # Parse image inputs - check for multiple possible keys
+        pixel_values = kwargs.get("pixel_values")
+        image_embeds = kwargs.get("image_embeds")  # Alternative key
+        image_grid_thw = kwargs.get("image_grid_thw")
+
+        # Debug: log what we found
+        logger.debug(f"pixel_values type: {type(pixel_values)}, image_grid_thw type: {type(image_grid_thw)}")
+
+        if pixel_values is None and image_embeds is None:
+            # No image inputs
+            logger.debug("No pixel_values or image_embeds found in kwargs")
+            return ()
+
+        # Use pixel_values if available, otherwise use image_embeds
+        if pixel_values is not None:
+            image_input = self._parse_and_validate_image_input(
+                pixel_values=pixel_values,
+                image_grid_thw=image_grid_thw,
+            )
+        else:
+            # Handle image_embeds case - these are pre-computed embeddings
+            if isinstance(image_embeds, torch.Tensor):
+                # Split by image grid sizes if available
+                if image_grid_thw is not None:
+                    split_sizes = (image_grid_thw.prod(dim=-1)).tolist()
+                    return tuple(torch.split(image_embeds, split_sizes, dim=0))
+                else:
+                    return (image_embeds,)
+            return ()
+
+        if image_input is None:
+            return ()
+
+        # Process images through vision encoder
+        image_embeddings = self._process_image_input(image_input)
+        return tuple(image_embeddings)
+
     def _parse_grid_from_tokens(
         self,
         input_tokens: list[int],
@@ -1932,6 +2399,9 @@ def get_mrope_input_positions(
         mrope_position_delta = (llm_positions.max() + 1 - seq_len).item()
         return llm_positions, mrope_position_delta
 
+    # Flag to indicate this model can output multimodal data (prior_token_image_ids for i2i)
+    have_multimodal_outputs = True
+
     def forward(
         self,
         input_ids: torch.Tensor,
@@ -1941,7 +2411,7 @@ def forward(
         pixel_values: torch.Tensor | None = None,
         image_grid_thw: torch.Tensor | None = None,
         **kwargs: object,
-    ) -> torch.Tensor | IntermediateTensors:
+    ) -> OmniOutput | IntermediateTensors:
         """
         Forward pass through GLM-Image.
 
@@ -1954,12 +2424,12 @@ def forward(
             image_grid_thw: Grid dimensions for images
 
         Returns:
-            Hidden states or intermediate tensors
+            OmniOutput with hidden states and optional prior_token_image_ids for i2i
         """
         if intermediate_tensors is not None:
             inputs_embeds = None
 
-        hidden_states = self.model(
+        hidden_states, prior_token_image_ids_info = self.model(
             input_ids=input_ids,
             positions=positions,
             intermediate_tensors=intermediate_tensors,
@@ -1968,7 +2438,19 @@ def forward(
             image_grid_thw=image_grid_thw,
         )
 
-        return hidden_states
+        # For intermediate tensors (PP), just return hidden states
+        if isinstance(hidden_states, IntermediateTensors):
+            return hidden_states
+
+        # Build multimodal outputs for i2i mode
+        multimodal_outputs = None
+        if prior_token_image_ids_info is not None:
+            multimodal_outputs = prior_token_image_ids_info
+
+        return OmniOutput(
+            text_hidden_states=hidden_states,
+            multimodal_outputs=multimodal_outputs,
+        )
 
     def compute_logits(
         self,
diff --git a/vllm_omni/model_executor/stage_configs/glm_image.yaml b/vllm_omni/model_executor/stage_configs/glm_image.yaml
index 7deca12c9ba..20eea93a8ae 100644
--- a/vllm_omni/model_executor/stage_configs/glm_image.yaml
+++ b/vllm_omni/model_executor/stage_configs/glm_image.yaml
@@ -45,6 +45,7 @@ stage_args:
       process: true
       devices: "1" # Can use different GPU, or same GPU if memory allows
       max_batch_size: 1
+      requires_multimodal_data: true # Required for i2i mode to pass condition images
     engine_args:
       model_stage: dit
       model_arch: GlmImagePipeline # Required for diffusion model class resolution
diff --git a/vllm_omni/model_executor/stage_configs/glm_image_muilticonnector.yaml b/vllm_omni/model_executor/stage_configs/glm_image_muilticonnector.yaml
index d1e10cb4065..9f481463043 100644
--- a/vllm_omni/model_executor/stage_configs/glm_image_muilticonnector.yaml
+++ b/vllm_omni/model_executor/stage_configs/glm_image_muilticonnector.yaml
@@ -47,6 +47,7 @@ stage_args:
       process: true
       devices: "1" # Use separate GPU for diffusion
       max_batch_size: 1
+      requires_multimodal_data: true # Required for i2i mode to pass condition images
     engine_args:
       model_stage: dit
       # Diffusion-specific parameters
diff --git a/vllm_omni/model_executor/stage_input_processors/glm_image.py b/vllm_omni/model_executor/stage_input_processors/glm_image.py
index f8e7aba302c..6d3d1efae2a 100644
--- a/vllm_omni/model_executor/stage_input_processors/glm_image.py
+++ b/vllm_omni/model_executor/stage_input_processors/glm_image.py
@@ -161,15 +161,25 @@ def ar2diffusion(
         prior_token_ids, pixel_h, pixel_w = _parse_generated_tokens(generated_token_ids, height, width)
         t_parse_end = time.perf_counter()
 
+        # Get prior_token_image_ids from AR model output (for i2i mode)
+        # This contains VQ-VAE tokens from input image, used for KV cache conditioning
+        prior_token_image_ids = None
+        if hasattr(output, "multimodal_output") and output.multimodal_output:
+            raw_prior_image_ids = output.multimodal_output.get("prior_token_image_ids")
+            if raw_prior_image_ids is not None:
+                # Wrap in list if it's a single tensor (expected by diffusion pipeline)
+                if isinstance(raw_prior_image_ids, torch.Tensor):
+                    prior_token_image_ids = [raw_prior_image_ids]
+                elif isinstance(raw_prior_image_ids, list):
+                    prior_token_image_ids = raw_prior_image_ids
+
         diffusion_input = {
             "prompt": text_prompt,
             "height": pixel_h,
             "width": pixel_w,
             "extra": {
                 "prior_token_ids": prior_token_ids,
-                "prior_token_image_ids": output.multimodal_output.get("prior_token_image_ids")
-                if hasattr(output, "multimodal_output") and output.multimodal_output
-                else None,
+                "prior_token_image_ids": prior_token_image_ids,
             },
         }