From f446d400bb07b1127fdd860f79d36f289f695d51 Mon Sep 17 00:00:00 2001
From: root <root@hk01dgx028.cm.cluster>
Date: Thu, 29 Jan 2026 03:17:06 +0000
Subject: [PATCH 1/2] debug: use pooler_output and add batch dim for GLM-Image

---
 vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py b/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py
index d222342b51b..7b63223f67d 100644
--- a/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py
+++ b/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py
@@ -491,7 +491,7 @@ def generate_prior_tokens(
             condition_grid = image_grid_thw[:-1]
             prior_token_image_embed = self.vision_language_encoder.get_image_features(
                 inputs["pixel_values"], condition_grid
-            )
+            ).pooler_output
             prior_token_image_embed = torch.cat(prior_token_image_embed, dim=0)
             flat_prior_token_image_ids = self.vision_language_encoder.get_image_tokens(
                 prior_token_image_embed, condition_grid
@@ -802,7 +802,7 @@ def _prepare_condition_image_kv_cache(
 
         # Process each condition image through transformer to populate KV cache
         for condition_image, condition_prior_token_id in zip(condition_images, prior_token_image_ids):
-            condition_image = condition_image.to(device=self.device, dtype=prompt_embeds.dtype)
+            condition_image = condition_image.to(device=self.device, dtype=prompt_embeds.dtype).unsqueeze(0) # [bz=1, 3, H, W]
 
             # Encode condition image to latent space
             # Use argmax (mode) for deterministic encoding of condition images

From 814ce4a4e6c4b347c771bb89a6d7801eb46fe0e2 Mon Sep 17 00:00:00 2001
From: root <root@hk01dgx028.cm.cluster>
Date: Thu, 29 Jan 2026 03:53:18 +0000
Subject: [PATCH 2/2] Refactor: Simplify condition image processing in
 GlmImagePipeline

---
 vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py b/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py
index 7b63223f67d..f5c167c32a9 100644
--- a/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py
+++ b/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py
@@ -802,7 +802,7 @@ def _prepare_condition_image_kv_cache(
 
         # Process each condition image through transformer to populate KV cache
         for condition_image, condition_prior_token_id in zip(condition_images, prior_token_image_ids):
-            condition_image = condition_image.to(device=self.device, dtype=prompt_embeds.dtype).unsqueeze(0) # [bz=1, 3, H, W]
+            condition_image = condition_image.to(device=self.device, dtype=prompt_embeds.dtype)
 
             # Encode condition image to latent space
             # Use argmax (mode) for deterministic encoding of condition images
@@ -859,7 +859,7 @@ def forward(self, req: OmniDiffusionRequest) -> DiffusionOutput:
         preprocessed_images = (
             None
             if isinstance(first_prompt, str)
-            else first_prompt.get("additional_information", {}).get("preprocessed_image")
+            else [first_prompt.get("additional_information", {}).get("preprocessed_image")]
         )
         condition_images = (
             None