diff --git a/vllm_omni/diffusion/models/qwen_image/pipeline_qwen_image_layered.py b/vllm_omni/diffusion/models/qwen_image/pipeline_qwen_image_layered.py index 6b041aece56..ee2f471f5a8 100644 --- a/vllm_omni/diffusion/models/qwen_image/pipeline_qwen_image_layered.py +++ b/vllm_omni/diffusion/models/qwen_image/pipeline_qwen_image_layered.py @@ -95,6 +95,9 @@ def pre_process_func( else: image = cast(PIL.Image.Image | torch.Tensor | np.ndarray, raw_image) + if isinstance(image, PIL.Image.Image) and image.mode != "RGBA": + image = image.convert("RGBA") + # 1. calculate dimensions image_size = image.size assert request.sampling_params.resolution in [640, 1024], ( @@ -652,6 +655,8 @@ def forward( width = req.sampling_params.width else: # fallback to run pre-processing in pipeline (debug only) + if isinstance(image, PIL.Image.Image) and image.mode != "RGBA": + image = image.convert("RGBA") image_size = image[0].size if isinstance(image, list) else image.size assert resolution in [640, 1024], f"resolution must be either 640 or 1024, but got {resolution}" calculated_width, calculated_height = calculate_dimensions( diff --git a/vllm_omni/diffusion/request.py b/vllm_omni/diffusion/request.py index a6005290cdc..56a770461b0 100644 --- a/vllm_omni/diffusion/request.py +++ b/vllm_omni/diffusion/request.py @@ -28,17 +28,23 @@ class OmniDiffusionRequest: def __post_init__(self): """Initialize dependent fields after dataclass initialization.""" + # Detect whether user explicitly provided guidance_scale. + # The sentinel default is 0.0 (false-like); any truthy value means + # the caller set it intentionally. We must resolve this BEFORE + # auto-filling guidance_scale_2, otherwise the sentinel leaks into + # guidance_scale_2. + if self.sampling_params.guidance_scale: + self.sampling_params.guidance_scale_provided = True + else: + self.sampling_params.guidance_scale = 1.0 + # Set do_classifier_free_guidance based on guidance scale and negative prompt if self.sampling_params.guidance_scale > 1.0 and any( (not isinstance(p, str) and p.get("negative_prompt")) for p in self.prompts ): self.sampling_params.do_classifier_free_guidance = True + + # Auto-fill guidance_scale_2 from the (now-resolved) guidance_scale + # so downstream code always has a valid value. if self.sampling_params.guidance_scale_2 is None: self.sampling_params.guidance_scale_2 = self.sampling_params.guidance_scale - - # The dataclass default value is 0 (false-like), used to detect whether user explicitly provides this value - # After this check is done, reset this value to old default 1 - if self.sampling_params.guidance_scale: - self.sampling_params.guidance_scale_provided = True - else: - self.sampling_params.guidance_scale = 1.0 diff --git a/vllm_omni/diffusion/worker/diffusion_model_runner.py b/vllm_omni/diffusion/worker/diffusion_model_runner.py index accb173e1a0..972c95c292c 100644 --- a/vllm_omni/diffusion/worker/diffusion_model_runner.py +++ b/vllm_omni/diffusion/worker/diffusion_model_runner.py @@ -221,6 +221,7 @@ def execute_model(self, req: OmniDiffusionRequest) -> DiffusionOutput: not getattr(req, "skip_cache_refresh", False) and self.cache_backend is not None and self.cache_backend.is_enabled() + and req.sampling_params.num_inference_steps is not None ): self.cache_backend.refresh(self.pipeline, req.sampling_params.num_inference_steps) diff --git a/vllm_omni/entrypoints/async_omni_diffusion.py b/vllm_omni/entrypoints/async_omni_diffusion.py index 08812223db5..52bd6031c64 100644 --- a/vllm_omni/entrypoints/async_omni_diffusion.py +++ b/vllm_omni/entrypoints/async_omni_diffusion.py @@ -172,9 +172,6 @@ async def generate( if request_id is None: request_id = f"diff-{uuid.uuid4().hex[:16]}" - if sampling_params.guidance_scale: - sampling_params.guidance_scale_provided = True - if lora_request is not None: sampling_params.lora_request = lora_request diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py index d0a2d5aaa23..4d93dcf0ab9 100644 --- a/vllm_omni/entrypoints/openai/serving_chat.py +++ b/vllm_omni/entrypoints/openai/serving_chat.py @@ -2050,20 +2050,26 @@ async def _create_diffusion_chat_completion( except ValueError: logger.warning("Invalid size format: %s", extra_body.get("size")) - # Get request parameters from extra_body - # Text-to-image parameters (ref: text_to_image.py) - num_inference_steps = extra_body.get("num_inference_steps", 50) + # Get request parameters from extra_body. + # Avoid hardcoded defaults here — let each pipeline's forward() + # method apply its own model-specific default when the user does + # not provide a value. + num_inference_steps = extra_body.get("num_inference_steps") guidance_scale = extra_body.get("guidance_scale") - true_cfg_scale = extra_body.get("true_cfg_scale") # Qwen-Image specific + true_cfg_scale = extra_body.get("true_cfg_scale") or extra_body.get("cfg_scale") seed = extra_body.get("seed") negative_prompt = extra_body.get("negative_prompt") num_outputs_per_prompt = extra_body.get("num_outputs_per_prompt", 1) # Text-to-video parameters (ref: text_to_video.py) num_frames = extra_body.get("num_frames") - guidance_scale_2 = extra_body.get("guidance_scale_2") # For video high-noise CFG + guidance_scale_2 = extra_body.get("guidance_scale_2") lora_body = extra_body.get("lora") + # Qwen-Image-Layered parameters + layers = extra_body.get("layers") + resolution = extra_body.get("resolution") + logger.info( "Diffusion chat request %s: prompt=%r, ref_images=%d, params=%s", request_id, @@ -2087,25 +2093,27 @@ async def _create_diffusion_chat_completion( "negative_prompt": negative_prompt, } gen_params = OmniDiffusionSamplingParams( - num_inference_steps=num_inference_steps, height=height, width=width, num_outputs_per_prompt=num_outputs_per_prompt, seed=seed, ) + # Only override defaults when the user explicitly provides values + if num_inference_steps is not None: + gen_params.num_inference_steps = num_inference_steps if guidance_scale is not None: gen_params.guidance_scale = guidance_scale - - # Add Qwen-Image specific parameter if true_cfg_scale is not None: gen_params.true_cfg_scale = true_cfg_scale - - # Add video generation parameters if set if num_frames is not None: gen_params.num_frames = num_frames if guidance_scale_2 is not None: gen_params.guidance_scale_2 = guidance_scale_2 + if layers is not None: + gen_params.layers = layers + if resolution is not None: + gen_params.resolution = resolution # Parse per-request LoRA (works for both AsyncOmniDiffusion and AsyncOmni). if lora_body and isinstance(lora_body, dict): diff --git a/vllm_omni/inputs/data.py b/vllm_omni/inputs/data.py index c0f10af2b42..5768c3b6d99 100644 --- a/vllm_omni/inputs/data.py +++ b/vllm_omni/inputs/data.py @@ -234,8 +234,9 @@ class OmniDiffusionSamplingParams: step_index: int | None = None boundary_ratio: float | None = None - # Scheduler parameters - num_inference_steps: int = 50 + # Scheduler parameters – ``None`` means "not explicitly set by the caller"; + # each pipeline's ``forward()`` decides its own model-specific default. + num_inference_steps: int | None = None guidance_scale: float = 0.0 guidance_scale_provided: bool = False guidance_scale_2: float | None = None