From f446d400bb07b1127fdd860f79d36f289f695d51 Mon Sep 17 00:00:00 2001 From: root Date: Thu, 29 Jan 2026 03:17:06 +0000 Subject: [PATCH 1/2] debug: glm image --- vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py b/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py index d222342b51b..7b63223f67d 100644 --- a/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py +++ b/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py @@ -491,7 +491,7 @@ def generate_prior_tokens( condition_grid = image_grid_thw[:-1] prior_token_image_embed = self.vision_language_encoder.get_image_features( inputs["pixel_values"], condition_grid - ) + ).pooler_output prior_token_image_embed = torch.cat(prior_token_image_embed, dim=0) flat_prior_token_image_ids = self.vision_language_encoder.get_image_tokens( prior_token_image_embed, condition_grid @@ -802,7 +802,7 @@ def _prepare_condition_image_kv_cache( # Process each condition image through transformer to populate KV cache for condition_image, condition_prior_token_id in zip(condition_images, prior_token_image_ids): - condition_image = condition_image.to(device=self.device, dtype=prompt_embeds.dtype) + condition_image = condition_image.to(device=self.device, dtype=prompt_embeds.dtype).unsqueeze(0) # [bz=1, 3, H, W] # Encode condition image to latent space # Use argmax (mode) for deterministic encoding of condition images From 814ce4a4e6c4b347c771bb89a6d7801eb46fe0e2 Mon Sep 17 00:00:00 2001 From: root Date: Thu, 29 Jan 2026 03:53:18 +0000 Subject: [PATCH 2/2] Refactor: Simplify condition image processing in GlmImagePipeline --- vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py b/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py index 7b63223f67d..f5c167c32a9 100644 --- a/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py +++ b/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py @@ -802,7 +802,7 @@ def _prepare_condition_image_kv_cache( # Process each condition image through transformer to populate KV cache for condition_image, condition_prior_token_id in zip(condition_images, prior_token_image_ids): - condition_image = condition_image.to(device=self.device, dtype=prompt_embeds.dtype).unsqueeze(0) # [bz=1, 3, H, W] + condition_image = condition_image.to(device=self.device, dtype=prompt_embeds.dtype) # Encode condition image to latent space # Use argmax (mode) for deterministic encoding of condition images @@ -859,7 +859,7 @@ def forward(self, req: OmniDiffusionRequest) -> DiffusionOutput: preprocessed_images = ( None if isinstance(first_prompt, str) - else first_prompt.get("additional_information", {}).get("preprocessed_image") + else [first_prompt.get("additional_information", {}).get("preprocessed_image")] ) condition_images = ( None