From 1939c908cad65bed31f889d33dd5a8a26a4c43aa Mon Sep 17 00:00:00 2001 From: skf1999 <13234016272@163.com> Date: Mon, 18 May 2026 15:04:53 +0000 Subject: [PATCH 01/15] Align Offline and Online Inference Signed-off-by: skf1999 <13234016272@163.com> --- vllm_omni/entrypoints/openai/api_server.py | 27 ++++++++++++++++--- .../entrypoints/openai/protocol/images.py | 21 +++++++++++++++ vllm_omni/entrypoints/openai/serving_chat.py | 20 ++++++++++++-- 3 files changed, 63 insertions(+), 5 deletions(-) diff --git a/vllm_omni/entrypoints/openai/api_server.py b/vllm_omni/entrypoints/openai/api_server.py index c1467f7190a..2941ed85e9a 100644 --- a/vllm_omni/entrypoints/openai/api_server.py +++ b/vllm_omni/entrypoints/openai/api_server.py @@ -1533,6 +1533,12 @@ async def generate_images(request: ImageGenerationRequest, raw_request: Request) # Keep /images validation semantics: invalid LoRA should fail with 400. _parse_lora_request(request.lora) extra_body["lora"] = request.lora + if request.bot_task is not None: + extra_body["bot_task"] = request.bot_task + if request.use_system_prompt is not None: + extra_body["use_system_prompt"] = request.use_system_prompt + if request.system_prompt is not None: + extra_body["system_prompt"] = request.system_prompt generation_result = await chat_handler.generate_diffusion_images( prompt=request.prompt, @@ -1544,9 +1550,10 @@ async def generate_images(request: ImageGenerationRequest, raw_request: Request) status_code=generation_result.error.code if generation_result.error else 400, content=generation_result.model_dump(), ) - flat_images, _, _ = generation_result + flat_images, _, _, cot_output = generation_result image_data = [ImageData(b64_json=encode_image_base64(img), revised_prompt=None) for img in flat_images] - return ImageGenerationResponse(created=int(time.time()), data=image_data) + + return ImageGenerationResponse(created=int(time.time()), data=image_data, cot_output=cot_output) # Build params - pass through user values directly prompt: OmniTextPrompt = {"prompt": request.prompt} @@ -1558,6 +1565,8 @@ async def generate_images(request: ImageGenerationRequest, raw_request: Request) extra_args["use_system_prompt"] = request.use_system_prompt if request.system_prompt is not None: extra_args["system_prompt"] = request.system_prompt + if request.bot_task is not None: + extra_args["bot_task"] = request.bot_task if extra_args: gen_params.extra_args = extra_args # Parse per-request LoRA (compatible with chat's extra_body.lora shape). @@ -1626,6 +1635,15 @@ async def generate_images(request: ImageGenerationRequest, raw_request: Request) # Extract images from result images = _extract_images_from_result(result) + # Extract CoT output from the result if available + cot_output = None + if hasattr(result, "request_output") and result.request_output: + if hasattr(result.request_output, "outputs"): + for output in result.request_output.outputs: + if hasattr(output, "text") and output.text: + cot_output = output.text + break + logger.debug(f"Successfully generated {len(images)} image(s)") # Determine output format (default to png) @@ -1641,6 +1659,7 @@ async def generate_images(request: ImageGenerationRequest, raw_request: Request) "created": int(time.time()), "data": image_data, "output_format": output_format, + "cot_output": cot_output, } if request.size: response_kwargs["size"] = size_str @@ -1725,6 +1744,7 @@ async def edit_images( ) try: # 2. Build prompt & images params + cot_output = None prompt: OmniTextPrompt = {"prompt": prompt} if negative_prompt is not None: prompt["negative_prompt"] = negative_prompt @@ -1935,7 +1955,7 @@ async def edit_images( status_code=generation_result.error.code if generation_result.error else 400, detail=generation_result.message, ) - images, _, _ = generation_result + images, _, _, cot_output = generation_result else: # Single-stage diffusion: use the direct path. result = await _generate_with_async_omni( @@ -1965,6 +1985,7 @@ async def edit_images( data=image_data, output_format=output_format, size=size_str, + cot_output=cot_output, ) except (EngineGenerateError, EngineDeadError) as exc: diff --git a/vllm_omni/entrypoints/openai/protocol/images.py b/vllm_omni/entrypoints/openai/protocol/images.py index 0fb22a548cf..b5dc6c9054e 100644 --- a/vllm_omni/entrypoints/openai/protocol/images.py +++ b/vllm_omni/entrypoints/openai/protocol/images.py @@ -99,6 +99,26 @@ def validate_use_system_prompt(cls, v): raise ValueError(f"Invalid use_system_prompt type: {v}. Must be one of: {valid_types[1:] + [None]}") return v + bot_task: str | None = Field( + default=None, + description=( + "Bot task type. Options: t2i_think, t2i_recaption, t2i_vanilla, " + "it2i_think, it2i_recaption, t2t, i2t, " + "or simplified: think, recaption, vanilla" + ), + ) + + @field_validator("bot_task") + @classmethod + def validate_bot_task(cls, v): + if v is None: + return None + valid_tasks_full = ["t2i_think", "t2i_recaption", "t2i_vanilla", "it2i_think", "it2i_recaption", "t2t", "i2t"] + valid_tasks_simple = ["think", "recaption", "vanilla"] + if v not in valid_tasks_full and v not in valid_tasks_simple: + raise ValueError(f"Invalid bot_task: {v}. Must be one of: {valid_tasks_full + valid_tasks_simple}") + return v + num_inference_steps: int | None = Field( default=None, ge=1, @@ -165,3 +185,4 @@ class ImageGenerationResponse(BaseModel): data: list[ImageData] = Field(..., description="Array of generated images") output_format: str = Field(None, description="The output format of the image generation") size: str = Field(None, description="The size of the image generated") + cot_output: str | None = Field(None, description="Chain-of-Thought output from the model") diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py index 2c375fa2928..27b4df18b27 100644 --- a/vllm_omni/entrypoints/openai/serving_chat.py +++ b/vllm_omni/entrypoints/openai/serving_chat.py @@ -2274,6 +2274,7 @@ def _build_multistage_generation_inputs( "custom_system_prompt": custom_system_prompt, "num_images": len(reference_images) if reference_images else 1, } + if bot_task is not None: build_kwargs["bot_task"] = bot_task elif "bot_task" in extra_body: @@ -2391,7 +2392,7 @@ async def generate_diffusion_images( extra_body: dict[str, Any] | None = None, reference_images: list[str] | None = None, request_id: str | None = None, - ) -> tuple[list[Image.Image], dict[str, Any], float] | ErrorResponse: + ) -> tuple[list[Image.Image], dict[str, Any], float, str | None] | ErrorResponse: """Generate diffusion images and return raw images plus generation stats.""" if request_id is None: request_id = f"chatcmpl-{uuid.uuid4().hex[:16]}" @@ -2497,11 +2498,13 @@ async def generate_diffusion_images( sampling_params_list = [gen_params] result = None + all_outputs = [] async for output in diffusion_engine.generate( prompt=engine_prompt, sampling_params_list=sampling_params_list, request_id=request_id, ): + all_outputs.append(output) result = output if result is None: return self._create_error_response("No output generated from AsyncOmni", status_code=500) @@ -2515,6 +2518,19 @@ async def generate_diffusion_images( images = getattr(result.request_output, "images", []) stage_durations = result.stage_durations peak_memory_mb = result.peak_memory_mb + cot_output = None + + for output in all_outputs: + req_out = getattr(output, "request_output", None) + if req_out: + prompt = getattr(req_out, "prompt", None) + if isinstance(prompt, dict): + extra = prompt.get("extra", {}) + if isinstance(extra, dict): + ar_text = extra.get("ar_generated_text") + if isinstance(ar_text, str) and ar_text.strip(): + cot_output = ar_text + break flat_images: list[Image.Image] = [] for item in images: @@ -2523,7 +2539,7 @@ async def generate_diffusion_images( else: flat_images.append(item) - return flat_images, stage_durations, peak_memory_mb + return flat_images, stage_durations, peak_memory_mb, cot_output async def _create_diffusion_chat_completion( self, From 36705b1976349265d4a9355382411bd3b4cd14bd Mon Sep 17 00:00:00 2001 From: skf1999 <13234016272@163.com> Date: Tue, 19 May 2026 11:40:52 +0000 Subject: [PATCH 02/15] Align Offline and Online Inference Signed-off-by: skf1999 <13234016272@163.com> --- vllm_omni/entrypoints/openai/protocol/images.py | 11 ++++------- vllm_omni/entrypoints/openai/serving_chat.py | 16 +++++++++++++--- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/vllm_omni/entrypoints/openai/protocol/images.py b/vllm_omni/entrypoints/openai/protocol/images.py index b5dc6c9054e..010201a358c 100644 --- a/vllm_omni/entrypoints/openai/protocol/images.py +++ b/vllm_omni/entrypoints/openai/protocol/images.py @@ -102,9 +102,7 @@ def validate_use_system_prompt(cls, v): bot_task: str | None = Field( default=None, description=( - "Bot task type. Options: t2i_think, t2i_recaption, t2i_vanilla, " - "it2i_think, it2i_recaption, t2t, i2t, " - "or simplified: think, recaption, vanilla" + "Bot task type. Options: think, recaption, think_recaption, vanilla" ), ) @@ -113,10 +111,9 @@ def validate_use_system_prompt(cls, v): def validate_bot_task(cls, v): if v is None: return None - valid_tasks_full = ["t2i_think", "t2i_recaption", "t2i_vanilla", "it2i_think", "it2i_recaption", "t2t", "i2t"] - valid_tasks_simple = ["think", "recaption", "vanilla"] - if v not in valid_tasks_full and v not in valid_tasks_simple: - raise ValueError(f"Invalid bot_task: {v}. Must be one of: {valid_tasks_full + valid_tasks_simple}") + valid_tasks = ["think", "recaption", "think_recaption", "vanilla"] + if v not in valid_tasks: + raise ValueError(f"Invalid bot_task: {v}. Must be one of: {valid_tasks}") return v num_inference_steps: int | None = Field( diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py index 27b4df18b27..a05e5eaca18 100644 --- a/vllm_omni/entrypoints/openai/serving_chat.py +++ b/vllm_omni/entrypoints/openai/serving_chat.py @@ -2248,7 +2248,7 @@ def _build_multistage_generation_inputs( layers = extra_body.get("layers") resolution = extra_body.get("resolution") bot_task = extra_body.get("bot_task") - sys_type = extra_body.get("sys_type") + use_system_prompt = extra_body.get("use_system_prompt") or extra_body.get("sys_type") custom_system_prompt = extra_body.get("system_prompt") engine_prompt_data: dict[str, Any] | None = None @@ -2262,7 +2262,7 @@ def _build_multistage_generation_inputs( prompt_token_ids: list[int] | None = None system_prompt_type: str | None = None - if bot_task is not None or sys_type is not None or custom_system_prompt is not None: + if bot_task is not None or use_system_prompt is not None or custom_system_prompt is not None: from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import ( build_prompt, build_prompt_tokens, @@ -2270,7 +2270,7 @@ def _build_multistage_generation_inputs( build_kwargs: dict[str, Any] = { "task": "it2i" if reference_images else "t2i", - "sys_type": sys_type, + "sys_type": use_system_prompt, "custom_system_prompt": custom_system_prompt, "num_images": len(reference_images) if reference_images else 1, } @@ -2307,6 +2307,14 @@ def _build_multistage_generation_inputs( if negative_prompt is not None: engine_prompt["negative_prompt"] = negative_prompt + from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import resolve_stop_token_ids + stop_token_ids = resolve_stop_token_ids( + task="it2i" if reference_images else "t2i", + bot_task=build_kwargs.get("bot_task"), + tokenizer=tokenizer + ) + engine_prompt["stop_token_ids"] = stop_token_ids + mm_processor_kwargs: dict[str, Any] = {} if height is not None: mm_processor_kwargs["target_h"] = height @@ -2509,11 +2517,13 @@ async def generate_diffusion_images( if result is None: return self._create_error_response("No output generated from AsyncOmni", status_code=500) else: + all_outputs = [] result = await engine.generate( prompt=gen_prompt, sampling_params=gen_params, request_id=request_id, ) + all_outputs.append(result) images = getattr(result.request_output, "images", []) stage_durations = result.stage_durations From ce2a843473c92814a9898f92bb2eac59b6f02bbd Mon Sep 17 00:00:00 2001 From: skf1999 <13234016272@163.com> Date: Tue, 19 May 2026 14:19:10 +0800 Subject: [PATCH 03/15] Align Offline and Online Inference Signed-off-by: skf1999 <13234016272@163.com> --- vllm_omni/entrypoints/openai/protocol/images.py | 4 +--- vllm_omni/entrypoints/openai/serving_chat.py | 5 ++--- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/vllm_omni/entrypoints/openai/protocol/images.py b/vllm_omni/entrypoints/openai/protocol/images.py index 010201a358c..50d90b08e76 100644 --- a/vllm_omni/entrypoints/openai/protocol/images.py +++ b/vllm_omni/entrypoints/openai/protocol/images.py @@ -101,9 +101,7 @@ def validate_use_system_prompt(cls, v): bot_task: str | None = Field( default=None, - description=( - "Bot task type. Options: think, recaption, think_recaption, vanilla" - ), + description=("Bot task type. Options: think, recaption, think_recaption, vanilla"), ) @field_validator("bot_task") diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py index a05e5eaca18..540cdd66f89 100644 --- a/vllm_omni/entrypoints/openai/serving_chat.py +++ b/vllm_omni/entrypoints/openai/serving_chat.py @@ -2308,10 +2308,9 @@ def _build_multistage_generation_inputs( engine_prompt["negative_prompt"] = negative_prompt from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import resolve_stop_token_ids + stop_token_ids = resolve_stop_token_ids( - task="it2i" if reference_images else "t2i", - bot_task=build_kwargs.get("bot_task"), - tokenizer=tokenizer + task="it2i" if reference_images else "t2i", bot_task=build_kwargs.get("bot_task"), tokenizer=tokenizer ) engine_prompt["stop_token_ids"] = stop_token_ids From 41412b2fe3ec211bec248d4b736f43a08cee6dbd Mon Sep 17 00:00:00 2001 From: skf1999 <13234016272@163.com> Date: Tue, 19 May 2026 19:31:51 +0000 Subject: [PATCH 04/15] Align Offline and Online Inference Signed-off-by: skf1999 <13234016272@163.com> --- ...est_hunyuan_image3_online_offline_align.py | 189 ++++++++++++++++++ 1 file changed, 189 insertions(+) create mode 100644 tests/e2e/accuracy/test_hunyuan_image3_online_offline_align.py diff --git a/tests/e2e/accuracy/test_hunyuan_image3_online_offline_align.py b/tests/e2e/accuracy/test_hunyuan_image3_online_offline_align.py new file mode 100644 index 00000000000..73b4441ef53 --- /dev/null +++ b/tests/e2e/accuracy/test_hunyuan_image3_online_offline_align.py @@ -0,0 +1,189 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from __future__ import annotations + +import asyncio +import base64 +import io +import shutil +import tempfile +from pathlib import Path + +import pytest +from PIL import Image + +from tests.e2e.accuracy.helpers import assert_similarity, model_output_dir + +pytestmark = [pytest.mark.full_model, pytest.mark.diffusion] + +MODEL_NAME = "/data/HunyuanImage-3.0-Instruct" +SEED = 42 +NUM_INFERENCE_STEPS = 50 +GUIDANCE_SCALE = 5.0 +HEIGHT, WIDTH = 1024, 1024 +PSNR_THRESHOLD = 40.0 +SSIM_THRESHOLD = 0.99 +PROMPT = "A brown and white dog is running on the grass." + + +async def run_offline_inference(model_path: str, output_path: Path) -> tuple[Image.Image, str]: + """Run offline inference using the end2end.py script.""" + import subprocess + import sys + + script_path = Path(__file__).resolve().parents[3] / "examples" / "offline_inference" / "hunyuan_image3" / "end2end.py" + cmd = [ + sys.executable, + str(script_path), + "--modality", "text2img", + "--model", model_path, + "--prompts", PROMPT, + "--bot-task", "think", + "--sys-type", "en_unified", + "--seed", str(SEED), + "--steps", str(NUM_INFERENCE_STEPS), + "--output", str(output_path), + "--deploy-config", "vllm_omni/deploy/hunyuan_image3_dit.yaml", + "--height", str(HEIGHT), + "--width", str(WIDTH), + "--guidance-scale", str(GUIDANCE_SCALE), + ] + + result = subprocess.run(cmd, capture_output=True, text=True) + assert result.returncode == 0, f"Offline inference failed: {result.stderr}" + + image_files = list(output_path.glob("output_0_0.png")) + assert len(image_files) == 1, f"Expected 1 image, found {len(image_files)}" + + image = Image.open(image_files[0]).convert("RGB") + + cot_output = "" + for line in result.stdout.split("\n"): + if "[Output] Text:" in line: + cot_output = "\n".join(result.stdout.split("[Output] Text:\n")[1].split("[Output] Saved image")[0].strip().split("\n")) + break + + return image, cot_output + + +async def run_online_inference(model_path: str) -> tuple[Image.Image, str]: + """Run online inference using the OpenAI-compatible API.""" + import httpx + import subprocess + import sys + import time + + server_cmd = [ + "vllm", "serve", + model_path, + "--omni", + "--host", "localhost", + "--port", "8091", + "--deploy-config", "vllm_omni/deploy/hunyuan_image3_dit.yaml", + "--enforce-eager", + ] + + server_process = subprocess.Popen( + server_cmd, + stdout=sys.stdout, + stderr=sys.stderr, + ) + + health_url = "http://localhost:8091/health" + start_time = time.time() + while time.time() - start_time < 300: + try: + async with httpx.AsyncClient() as client: + resp = await client.get(health_url, timeout=2) + if resp.status_code == 200: + print("Online server ready!") + break + except Exception: + pass + time.sleep(2) + else: + server_process.terminate() + raise RuntimeError("Online server failed to start within 5 minutes") + + try: + payload = { + "prompt": PROMPT, + "use_system_prompt": "en_unified", + "bot_task": "think", + "num_inference_steps": NUM_INFERENCE_STEPS, + "n": 1, + "seed": SEED, + "size": f"{WIDTH}x{HEIGHT}", + "guidance_scale": GUIDANCE_SCALE, + } + + async with httpx.AsyncClient() as client: + response = await client.post( + "http://localhost:8091/v1/images/generations", + json=payload, + timeout=300, + ) + + assert response.status_code == 200, f"Online inference failed: {response.text}" + data = response.json() + assert "data" in data and len(data["data"]) > 0, "No images in response" + + b64_json = data["data"][0]["b64_json"] + assert b64_json, "No b64_json in response" + + img_bytes = base64.b64decode(b64_json) + image = Image.open(io.BytesIO(img_bytes)).convert("RGB") + cot_output = data.get("cot_output", "") + + return image, cot_output + finally: + server_process.terminate() + try: + server_process.wait(timeout=10) + except subprocess.TimeoutExpired: + server_process.kill() + + +async def test_online_offline_align(accuracy_artifact_root: Path) -> None: + """Test alignment between online and offline inference for HunyuanImage-3.""" + output_dir = model_output_dir(accuracy_artifact_root, MODEL_NAME + "-online-offline-align") + output_dir.mkdir(parents=True, exist_ok=True) + + try: + with tempfile.TemporaryDirectory() as tmpdir: + tmp = Path(tmpdir) + + # Run offline inference + print("Running offline inference...") + offline_image, _ = await run_offline_inference(MODEL_NAME, tmp) + offline_image.save(output_dir / "offline_image.png") + + # Run online inference + print("Running online inference...") + online_image, _ = await run_online_inference(MODEL_NAME) + online_image.save(output_dir / "online_image.png") + + # Compare images + print("\n--- Alignment ---") + assert_similarity( + model_name=f"{MODEL_NAME} online vs offline", + vllm_image=online_image, + diffusers_image=offline_image, + ssim_threshold=SSIM_THRESHOLD, + psnr_threshold=PSNR_THRESHOLD, + width=WIDTH, + height=HEIGHT, + ) + + finally: + print(f"\nCleaning up {output_dir}") + if output_dir.exists(): + shutil.rmtree(output_dir, ignore_errors=True) + artifacts_dir = accuracy_artifact_root + if artifacts_dir.exists() and not any(artifacts_dir.iterdir()): + artifacts_dir.rmdir() + + +if __name__ == "__main__": + asyncio.run(test_online_offline_align(Path("./output"))) \ No newline at end of file From 788c27daff8badba96505182039bfe6a92867fe9 Mon Sep 17 00:00:00 2001 From: skf1999 <13234016272@163.com> Date: Tue, 19 May 2026 19:51:54 +0800 Subject: [PATCH 05/15] Align Offline and Online Inference Signed-off-by: skf1999 <13234016272@163.com> --- ...est_hunyuan_image3_online_offline_align.py | 61 +++++++++++++------ 1 file changed, 41 insertions(+), 20 deletions(-) diff --git a/tests/e2e/accuracy/test_hunyuan_image3_online_offline_align.py b/tests/e2e/accuracy/test_hunyuan_image3_online_offline_align.py index 73b4441ef53..52fe12b5b76 100644 --- a/tests/e2e/accuracy/test_hunyuan_image3_online_offline_align.py +++ b/tests/e2e/accuracy/test_hunyuan_image3_online_offline_align.py @@ -32,22 +32,36 @@ async def run_offline_inference(model_path: str, output_path: Path) -> tuple[Ima import subprocess import sys - script_path = Path(__file__).resolve().parents[3] / "examples" / "offline_inference" / "hunyuan_image3" / "end2end.py" + script_path = ( + Path(__file__).resolve().parents[3] / "examples" / "offline_inference" / "hunyuan_image3" / "end2end.py" + ) cmd = [ sys.executable, str(script_path), - "--modality", "text2img", - "--model", model_path, - "--prompts", PROMPT, - "--bot-task", "think", - "--sys-type", "en_unified", - "--seed", str(SEED), - "--steps", str(NUM_INFERENCE_STEPS), - "--output", str(output_path), - "--deploy-config", "vllm_omni/deploy/hunyuan_image3_dit.yaml", - "--height", str(HEIGHT), - "--width", str(WIDTH), - "--guidance-scale", str(GUIDANCE_SCALE), + "--modality", + "text2img", + "--model", + model_path, + "--prompts", + PROMPT, + "--bot-task", + "think", + "--sys-type", + "en_unified", + "--seed", + str(SEED), + "--steps", + str(NUM_INFERENCE_STEPS), + "--output", + str(output_path), + "--deploy-config", + "vllm_omni/deploy/hunyuan_image3_dit.yaml", + "--height", + str(HEIGHT), + "--width", + str(WIDTH), + "--guidance-scale", + str(GUIDANCE_SCALE), ] result = subprocess.run(cmd, capture_output=True, text=True) @@ -61,7 +75,9 @@ async def run_offline_inference(model_path: str, output_path: Path) -> tuple[Ima cot_output = "" for line in result.stdout.split("\n"): if "[Output] Text:" in line: - cot_output = "\n".join(result.stdout.split("[Output] Text:\n")[1].split("[Output] Saved image")[0].strip().split("\n")) + cot_output = "\n".join( + result.stdout.split("[Output] Text:\n")[1].split("[Output] Saved image")[0].strip().split("\n") + ) break return image, cot_output @@ -69,18 +85,23 @@ async def run_offline_inference(model_path: str, output_path: Path) -> tuple[Ima async def run_online_inference(model_path: str) -> tuple[Image.Image, str]: """Run online inference using the OpenAI-compatible API.""" - import httpx import subprocess import sys import time + import httpx + server_cmd = [ - "vllm", "serve", + "vllm", + "serve", model_path, "--omni", - "--host", "localhost", - "--port", "8091", - "--deploy-config", "vllm_omni/deploy/hunyuan_image3_dit.yaml", + "--host", + "localhost", + "--port", + "8091", + "--deploy-config", + "vllm_omni/deploy/hunyuan_image3_dit.yaml", "--enforce-eager", ] @@ -186,4 +207,4 @@ async def test_online_offline_align(accuracy_artifact_root: Path) -> None: if __name__ == "__main__": - asyncio.run(test_online_offline_align(Path("./output"))) \ No newline at end of file + asyncio.run(test_online_offline_align(Path("./output"))) From 126ebe641e309d0c97e6d713305cbbf97419a858 Mon Sep 17 00:00:00 2001 From: skf1999 <13234016272@163.com> Date: Tue, 19 May 2026 20:04:25 +0000 Subject: [PATCH 06/15] Align Offline and Online Inference Signed-off-by: skf1999 <13234016272@163.com> --- ...est_hunyuan_image3_online_offline_align.py | 210 ------------------ 1 file changed, 210 deletions(-) delete mode 100644 tests/e2e/accuracy/test_hunyuan_image3_online_offline_align.py diff --git a/tests/e2e/accuracy/test_hunyuan_image3_online_offline_align.py b/tests/e2e/accuracy/test_hunyuan_image3_online_offline_align.py deleted file mode 100644 index 52fe12b5b76..00000000000 --- a/tests/e2e/accuracy/test_hunyuan_image3_online_offline_align.py +++ /dev/null @@ -1,210 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from __future__ import annotations - -import asyncio -import base64 -import io -import shutil -import tempfile -from pathlib import Path - -import pytest -from PIL import Image - -from tests.e2e.accuracy.helpers import assert_similarity, model_output_dir - -pytestmark = [pytest.mark.full_model, pytest.mark.diffusion] - -MODEL_NAME = "/data/HunyuanImage-3.0-Instruct" -SEED = 42 -NUM_INFERENCE_STEPS = 50 -GUIDANCE_SCALE = 5.0 -HEIGHT, WIDTH = 1024, 1024 -PSNR_THRESHOLD = 40.0 -SSIM_THRESHOLD = 0.99 -PROMPT = "A brown and white dog is running on the grass." - - -async def run_offline_inference(model_path: str, output_path: Path) -> tuple[Image.Image, str]: - """Run offline inference using the end2end.py script.""" - import subprocess - import sys - - script_path = ( - Path(__file__).resolve().parents[3] / "examples" / "offline_inference" / "hunyuan_image3" / "end2end.py" - ) - cmd = [ - sys.executable, - str(script_path), - "--modality", - "text2img", - "--model", - model_path, - "--prompts", - PROMPT, - "--bot-task", - "think", - "--sys-type", - "en_unified", - "--seed", - str(SEED), - "--steps", - str(NUM_INFERENCE_STEPS), - "--output", - str(output_path), - "--deploy-config", - "vllm_omni/deploy/hunyuan_image3_dit.yaml", - "--height", - str(HEIGHT), - "--width", - str(WIDTH), - "--guidance-scale", - str(GUIDANCE_SCALE), - ] - - result = subprocess.run(cmd, capture_output=True, text=True) - assert result.returncode == 0, f"Offline inference failed: {result.stderr}" - - image_files = list(output_path.glob("output_0_0.png")) - assert len(image_files) == 1, f"Expected 1 image, found {len(image_files)}" - - image = Image.open(image_files[0]).convert("RGB") - - cot_output = "" - for line in result.stdout.split("\n"): - if "[Output] Text:" in line: - cot_output = "\n".join( - result.stdout.split("[Output] Text:\n")[1].split("[Output] Saved image")[0].strip().split("\n") - ) - break - - return image, cot_output - - -async def run_online_inference(model_path: str) -> tuple[Image.Image, str]: - """Run online inference using the OpenAI-compatible API.""" - import subprocess - import sys - import time - - import httpx - - server_cmd = [ - "vllm", - "serve", - model_path, - "--omni", - "--host", - "localhost", - "--port", - "8091", - "--deploy-config", - "vllm_omni/deploy/hunyuan_image3_dit.yaml", - "--enforce-eager", - ] - - server_process = subprocess.Popen( - server_cmd, - stdout=sys.stdout, - stderr=sys.stderr, - ) - - health_url = "http://localhost:8091/health" - start_time = time.time() - while time.time() - start_time < 300: - try: - async with httpx.AsyncClient() as client: - resp = await client.get(health_url, timeout=2) - if resp.status_code == 200: - print("Online server ready!") - break - except Exception: - pass - time.sleep(2) - else: - server_process.terminate() - raise RuntimeError("Online server failed to start within 5 minutes") - - try: - payload = { - "prompt": PROMPT, - "use_system_prompt": "en_unified", - "bot_task": "think", - "num_inference_steps": NUM_INFERENCE_STEPS, - "n": 1, - "seed": SEED, - "size": f"{WIDTH}x{HEIGHT}", - "guidance_scale": GUIDANCE_SCALE, - } - - async with httpx.AsyncClient() as client: - response = await client.post( - "http://localhost:8091/v1/images/generations", - json=payload, - timeout=300, - ) - - assert response.status_code == 200, f"Online inference failed: {response.text}" - data = response.json() - assert "data" in data and len(data["data"]) > 0, "No images in response" - - b64_json = data["data"][0]["b64_json"] - assert b64_json, "No b64_json in response" - - img_bytes = base64.b64decode(b64_json) - image = Image.open(io.BytesIO(img_bytes)).convert("RGB") - cot_output = data.get("cot_output", "") - - return image, cot_output - finally: - server_process.terminate() - try: - server_process.wait(timeout=10) - except subprocess.TimeoutExpired: - server_process.kill() - - -async def test_online_offline_align(accuracy_artifact_root: Path) -> None: - """Test alignment between online and offline inference for HunyuanImage-3.""" - output_dir = model_output_dir(accuracy_artifact_root, MODEL_NAME + "-online-offline-align") - output_dir.mkdir(parents=True, exist_ok=True) - - try: - with tempfile.TemporaryDirectory() as tmpdir: - tmp = Path(tmpdir) - - # Run offline inference - print("Running offline inference...") - offline_image, _ = await run_offline_inference(MODEL_NAME, tmp) - offline_image.save(output_dir / "offline_image.png") - - # Run online inference - print("Running online inference...") - online_image, _ = await run_online_inference(MODEL_NAME) - online_image.save(output_dir / "online_image.png") - - # Compare images - print("\n--- Alignment ---") - assert_similarity( - model_name=f"{MODEL_NAME} online vs offline", - vllm_image=online_image, - diffusers_image=offline_image, - ssim_threshold=SSIM_THRESHOLD, - psnr_threshold=PSNR_THRESHOLD, - width=WIDTH, - height=HEIGHT, - ) - - finally: - print(f"\nCleaning up {output_dir}") - if output_dir.exists(): - shutil.rmtree(output_dir, ignore_errors=True) - artifacts_dir = accuracy_artifact_root - if artifacts_dir.exists() and not any(artifacts_dir.iterdir()): - artifacts_dir.rmdir() - - -if __name__ == "__main__": - asyncio.run(test_online_offline_align(Path("./output"))) From 6dd12b58717fddc309e3846354602ef7bf4fb105 Mon Sep 17 00:00:00 2001 From: skf1999 <13234016272@163.com> Date: Thu, 21 May 2026 18:30:47 +0000 Subject: [PATCH 07/15] Align Offline and Online Inference Signed-off-by: skf1999 <13234016272@163.com> --- vllm_omni/entrypoints/openai/api_server.py | 34 ++++++++++----- .../entrypoints/openai/protocol/images.py | 21 ++-------- vllm_omni/entrypoints/openai/serving_chat.py | 42 ++++++++----------- 3 files changed, 44 insertions(+), 53 deletions(-) diff --git a/vllm_omni/entrypoints/openai/api_server.py b/vllm_omni/entrypoints/openai/api_server.py index 2941ed85e9a..956a55e5505 100644 --- a/vllm_omni/entrypoints/openai/api_server.py +++ b/vllm_omni/entrypoints/openai/api_server.py @@ -1533,12 +1533,10 @@ async def generate_images(request: ImageGenerationRequest, raw_request: Request) # Keep /images validation semantics: invalid LoRA should fail with 400. _parse_lora_request(request.lora) extra_body["lora"] = request.lora - if request.bot_task is not None: - extra_body["bot_task"] = request.bot_task - if request.use_system_prompt is not None: - extra_body["use_system_prompt"] = request.use_system_prompt - if request.system_prompt is not None: - extra_body["system_prompt"] = request.system_prompt + if request.model_extra: + for k, v in request.model_extra.items(): + if v is not None and k not in extra_body: + extra_body[k] = v generation_result = await chat_handler.generate_diffusion_images( prompt=request.prompt, @@ -1550,10 +1548,21 @@ async def generate_images(request: ImageGenerationRequest, raw_request: Request) status_code=generation_result.error.code if generation_result.error else 400, content=generation_result.model_dump(), ) - flat_images, _, _, cot_output = generation_result + flat_images, stage_durations, _ = generation_result + cot_output = stage_durations.get("cot_output") if isinstance(stage_durations, dict) else None image_data = [ImageData(b64_json=encode_image_base64(img), revised_prompt=None) for img in flat_images] - return ImageGenerationResponse(created=int(time.time()), data=image_data, cot_output=cot_output) + response_kwargs = { + "created": int(time.time()), + "data": image_data, + "output_format": output_format, + } + if request.size: + response_kwargs["size"] = size_str + if cot_output is not None: + response_kwargs["cot_output"] = cot_output + + return ImageGenerationResponse(**response_kwargs) # Build params - pass through user values directly prompt: OmniTextPrompt = {"prompt": request.prompt} @@ -1565,8 +1574,10 @@ async def generate_images(request: ImageGenerationRequest, raw_request: Request) extra_args["use_system_prompt"] = request.use_system_prompt if request.system_prompt is not None: extra_args["system_prompt"] = request.system_prompt - if request.bot_task is not None: - extra_args["bot_task"] = request.bot_task + if request.model_extra: + for k, v in request.model_extra.items(): + if v is not None and k not in extra_args: + extra_args[k] = v if extra_args: gen_params.extra_args = extra_args # Parse per-request LoRA (compatible with chat's extra_body.lora shape). @@ -1955,7 +1966,8 @@ async def edit_images( status_code=generation_result.error.code if generation_result.error else 400, detail=generation_result.message, ) - images, _, _, cot_output = generation_result + images, stage_durations, _ = generation_result + cot_output = stage_durations.get("cot_output") if isinstance(stage_durations, dict) else None else: # Single-stage diffusion: use the direct path. result = await _generate_with_async_omni( diff --git a/vllm_omni/entrypoints/openai/protocol/images.py b/vllm_omni/entrypoints/openai/protocol/images.py index 50d90b08e76..dab7df4998a 100644 --- a/vllm_omni/entrypoints/openai/protocol/images.py +++ b/vllm_omni/entrypoints/openai/protocol/images.py @@ -10,7 +10,7 @@ from enum import Enum from typing import Any -from pydantic import BaseModel, Field, field_validator +from pydantic import BaseModel, ConfigDict, Field, field_validator from vllm_omni.entrypoints.openai.image_api_utils import validate_layered_layers @@ -30,6 +30,8 @@ class ImageGenerationRequest(BaseModel): for advanced diffusion parameters. """ + model_config = ConfigDict(extra="allow") + # Required fields prompt: str = Field(..., description="Text description of the desired image(s)") @@ -99,21 +101,6 @@ def validate_use_system_prompt(cls, v): raise ValueError(f"Invalid use_system_prompt type: {v}. Must be one of: {valid_types[1:] + [None]}") return v - bot_task: str | None = Field( - default=None, - description=("Bot task type. Options: think, recaption, think_recaption, vanilla"), - ) - - @field_validator("bot_task") - @classmethod - def validate_bot_task(cls, v): - if v is None: - return None - valid_tasks = ["think", "recaption", "think_recaption", "vanilla"] - if v not in valid_tasks: - raise ValueError(f"Invalid bot_task: {v}. Must be one of: {valid_tasks}") - return v - num_inference_steps: int | None = Field( default=None, ge=1, @@ -176,8 +163,8 @@ class ImageGenerationResponse(BaseModel): Returns generated images with metadata. """ + model_config = ConfigDict(extra="allow") created: int = Field(..., description="Unix timestamp of when the generation completed") data: list[ImageData] = Field(..., description="Array of generated images") output_format: str = Field(None, description="The output format of the image generation") size: str = Field(None, description="The size of the image generated") - cot_output: str | None = Field(None, description="Chain-of-Thought output from the model") diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py index 540cdd66f89..29eb0a77315 100644 --- a/vllm_omni/entrypoints/openai/serving_chat.py +++ b/vllm_omni/entrypoints/openai/serving_chat.py @@ -2262,6 +2262,8 @@ def _build_multistage_generation_inputs( prompt_token_ids: list[int] | None = None system_prompt_type: str | None = None + build_kwargs: dict[str, Any] = {} + if bot_task is not None or use_system_prompt is not None or custom_system_prompt is not None: from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import ( build_prompt, @@ -2280,7 +2282,7 @@ def _build_multistage_generation_inputs( elif "bot_task" in extra_body: # Explicit None from the caller is plain-mode; omitted lets # each task fall back to its default trigger. - build_kwargs["bot_task"] = None + build_kwargs["bot_task"] = extra_body["bot_task"] if tokenizer is not None: # Feed segment-tokenized prompt_token_ids so AR matches HF # apply_chat_template byte-for-byte (engine BPE would merge @@ -2307,13 +2309,6 @@ def _build_multistage_generation_inputs( if negative_prompt is not None: engine_prompt["negative_prompt"] = negative_prompt - from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import resolve_stop_token_ids - - stop_token_ids = resolve_stop_token_ids( - task="it2i" if reference_images else "t2i", bot_task=build_kwargs.get("bot_task"), tokenizer=tokenizer - ) - engine_prompt["stop_token_ids"] = stop_token_ids - mm_processor_kwargs: dict[str, Any] = {} if height is not None: mm_processor_kwargs["target_h"] = height @@ -2399,7 +2394,7 @@ async def generate_diffusion_images( extra_body: dict[str, Any] | None = None, reference_images: list[str] | None = None, request_id: str | None = None, - ) -> tuple[list[Image.Image], dict[str, Any], float, str | None] | ErrorResponse: + ) -> tuple[list[Image.Image], dict[str, Any], float] | ErrorResponse: """Generate diffusion images and return raw images plus generation stats.""" if request_id is None: request_id = f"chatcmpl-{uuid.uuid4().hex[:16]}" @@ -2505,41 +2500,35 @@ async def generate_diffusion_images( sampling_params_list = [gen_params] result = None - all_outputs = [] async for output in diffusion_engine.generate( prompt=engine_prompt, sampling_params_list=sampling_params_list, request_id=request_id, ): - all_outputs.append(output) result = output if result is None: return self._create_error_response("No output generated from AsyncOmni", status_code=500) else: - all_outputs = [] result = await engine.generate( prompt=gen_prompt, sampling_params=gen_params, request_id=request_id, ) - all_outputs.append(result) images = getattr(result.request_output, "images", []) stage_durations = result.stage_durations peak_memory_mb = result.peak_memory_mb cot_output = None - for output in all_outputs: - req_out = getattr(output, "request_output", None) - if req_out: - prompt = getattr(req_out, "prompt", None) - if isinstance(prompt, dict): - extra = prompt.get("extra", {}) - if isinstance(extra, dict): - ar_text = extra.get("ar_generated_text") - if isinstance(ar_text, str) and ar_text.strip(): - cot_output = ar_text - break + req_out = getattr(result, "request_output", None) + if req_out: + prompt_obj = getattr(req_out, "prompt", None) + if isinstance(prompt_obj, dict): + extra = prompt_obj.get("extra", {}) + if isinstance(extra, dict): + ar_text = extra.get("ar_generated_text") + if isinstance(ar_text, str) and ar_text.strip(): + cot_output = ar_text flat_images: list[Image.Image] = [] for item in images: @@ -2548,7 +2537,10 @@ async def generate_diffusion_images( else: flat_images.append(item) - return flat_images, stage_durations, peak_memory_mb, cot_output + if cot_output is not None: + stage_durations["cot_output"] = cot_output + + return flat_images, stage_durations, peak_memory_mb async def _create_diffusion_chat_completion( self, From 7b745ee4cc57c45ba3101633f24a14cf363cea50 Mon Sep 17 00:00:00 2001 From: skf1999 <13234016272@163.com> Date: Thu, 21 May 2026 18:45:03 +0000 Subject: [PATCH 08/15] Align Offline and Online Inference Signed-off-by: skf1999 <13234016272@163.com> --- vllm_omni/entrypoints/openai/api_server.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/vllm_omni/entrypoints/openai/api_server.py b/vllm_omni/entrypoints/openai/api_server.py index 956a55e5505..eb5a9f2ab75 100644 --- a/vllm_omni/entrypoints/openai/api_server.py +++ b/vllm_omni/entrypoints/openai/api_server.py @@ -1555,10 +1555,7 @@ async def generate_images(request: ImageGenerationRequest, raw_request: Request) response_kwargs = { "created": int(time.time()), "data": image_data, - "output_format": output_format, } - if request.size: - response_kwargs["size"] = size_str if cot_output is not None: response_kwargs["cot_output"] = cot_output From 576e195bc06bf82c5da03f75a3fc5feaba797857 Mon Sep 17 00:00:00 2001 From: skf1999 <13234016272@163.com> Date: Thu, 21 May 2026 18:51:37 +0800 Subject: [PATCH 09/15] Align Offline and Online Inference Signed-off-by: skf1999 <13234016272@163.com> --- vllm_omni/entrypoints/openai/serving_chat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py index 29eb0a77315..e58f47cec6d 100644 --- a/vllm_omni/entrypoints/openai/serving_chat.py +++ b/vllm_omni/entrypoints/openai/serving_chat.py @@ -2539,7 +2539,7 @@ async def generate_diffusion_images( if cot_output is not None: stage_durations["cot_output"] = cot_output - + return flat_images, stage_durations, peak_memory_mb async def _create_diffusion_chat_completion( From edf704879e9eea62105e3a63a363e43796a102f2 Mon Sep 17 00:00:00 2001 From: skf1999 <13234016272@163.com> Date: Fri, 22 May 2026 11:52:35 +0000 Subject: [PATCH 10/15] Align Offline and Online Inference Signed-off-by: skf1999 <13234016272@163.com> --- vllm_omni/entrypoints/openai/api_server.py | 33 +++++++------------ .../entrypoints/openai/protocol/images.py | 13 ++++++++ vllm_omni/entrypoints/openai/serving_chat.py | 5 +-- 3 files changed, 25 insertions(+), 26 deletions(-) diff --git a/vllm_omni/entrypoints/openai/api_server.py b/vllm_omni/entrypoints/openai/api_server.py index eb5a9f2ab75..c5c9d094908 100644 --- a/vllm_omni/entrypoints/openai/api_server.py +++ b/vllm_omni/entrypoints/openai/api_server.py @@ -1533,6 +1533,12 @@ async def generate_images(request: ImageGenerationRequest, raw_request: Request) # Keep /images validation semantics: invalid LoRA should fail with 400. _parse_lora_request(request.lora) extra_body["lora"] = request.lora + if request.bot_task is not None: + extra_body["bot_task"] = request.bot_task + if request.use_system_prompt is not None: + extra_body["use_system_prompt"] = request.use_system_prompt + if request.system_prompt is not None: + extra_body["system_prompt"] = request.system_prompt if request.model_extra: for k, v in request.model_extra.items(): if v is not None and k not in extra_body: @@ -1548,18 +1554,10 @@ async def generate_images(request: ImageGenerationRequest, raw_request: Request) status_code=generation_result.error.code if generation_result.error else 400, content=generation_result.model_dump(), ) - flat_images, stage_durations, _ = generation_result - cot_output = stage_durations.get("cot_output") if isinstance(stage_durations, dict) else None + flat_images, _, _, _ = generation_result image_data = [ImageData(b64_json=encode_image_base64(img), revised_prompt=None) for img in flat_images] - response_kwargs = { - "created": int(time.time()), - "data": image_data, - } - if cot_output is not None: - response_kwargs["cot_output"] = cot_output - - return ImageGenerationResponse(**response_kwargs) + return ImageGenerationResponse(created=int(time.time()), data=image_data) # Build params - pass through user values directly prompt: OmniTextPrompt = {"prompt": request.prompt} @@ -1571,6 +1569,8 @@ async def generate_images(request: ImageGenerationRequest, raw_request: Request) extra_args["use_system_prompt"] = request.use_system_prompt if request.system_prompt is not None: extra_args["system_prompt"] = request.system_prompt + if request.bot_task is not None: + extra_args["bot_task"] = request.bot_task if request.model_extra: for k, v in request.model_extra.items(): if v is not None and k not in extra_args: @@ -1643,15 +1643,6 @@ async def generate_images(request: ImageGenerationRequest, raw_request: Request) # Extract images from result images = _extract_images_from_result(result) - # Extract CoT output from the result if available - cot_output = None - if hasattr(result, "request_output") and result.request_output: - if hasattr(result.request_output, "outputs"): - for output in result.request_output.outputs: - if hasattr(output, "text") and output.text: - cot_output = output.text - break - logger.debug(f"Successfully generated {len(images)} image(s)") # Determine output format (default to png) @@ -1667,7 +1658,6 @@ async def generate_images(request: ImageGenerationRequest, raw_request: Request) "created": int(time.time()), "data": image_data, "output_format": output_format, - "cot_output": cot_output, } if request.size: response_kwargs["size"] = size_str @@ -1963,8 +1953,7 @@ async def edit_images( status_code=generation_result.error.code if generation_result.error else 400, detail=generation_result.message, ) - images, stage_durations, _ = generation_result - cot_output = stage_durations.get("cot_output") if isinstance(stage_durations, dict) else None + images, _, _, cot_output = generation_result else: # Single-stage diffusion: use the direct path. result = await _generate_with_async_omni( diff --git a/vllm_omni/entrypoints/openai/protocol/images.py b/vllm_omni/entrypoints/openai/protocol/images.py index dab7df4998a..3b949380bf8 100644 --- a/vllm_omni/entrypoints/openai/protocol/images.py +++ b/vllm_omni/entrypoints/openai/protocol/images.py @@ -34,6 +34,19 @@ class ImageGenerationRequest(BaseModel): # Required fields prompt: str = Field(..., description="Text description of the desired image(s)") + bot_task: str | None = Field( + None, + description="Task mode for the model (e.g., 'cot' enables chain-of-thought generation). " + "Only supported by specific diffusion models." + ) + system_prompt: str | None = Field( + None, + description="Custom system prompt to guide the model's behavior." + ) + use_system_prompt: bool | None = Field( + None, + description="Whether to apply the model's default system prompt." + ) # OpenAI standard fields model: str | None = Field( diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py index e58f47cec6d..4f9e7dcc8b8 100644 --- a/vllm_omni/entrypoints/openai/serving_chat.py +++ b/vllm_omni/entrypoints/openai/serving_chat.py @@ -2537,10 +2537,7 @@ async def generate_diffusion_images( else: flat_images.append(item) - if cot_output is not None: - stage_durations["cot_output"] = cot_output - - return flat_images, stage_durations, peak_memory_mb + return flat_images, stage_durations, peak_memory_mb, cot_output async def _create_diffusion_chat_completion( self, From 5e7bb58149a64dce5db2bfbad99d397b72f4ceee Mon Sep 17 00:00:00 2001 From: skf1999 <13234016272@163.com> Date: Fri, 22 May 2026 12:53:46 +0800 Subject: [PATCH 11/15] Align Offline and Online Inference Signed-off-by: skf1999 <13234016272@163.com> --- vllm_omni/entrypoints/openai/protocol/images.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/vllm_omni/entrypoints/openai/protocol/images.py b/vllm_omni/entrypoints/openai/protocol/images.py index 3b949380bf8..41f998a4d89 100644 --- a/vllm_omni/entrypoints/openai/protocol/images.py +++ b/vllm_omni/entrypoints/openai/protocol/images.py @@ -37,16 +37,10 @@ class ImageGenerationRequest(BaseModel): bot_task: str | None = Field( None, description="Task mode for the model (e.g., 'cot' enables chain-of-thought generation). " - "Only supported by specific diffusion models." - ) - system_prompt: str | None = Field( - None, - description="Custom system prompt to guide the model's behavior." - ) - use_system_prompt: bool | None = Field( - None, - description="Whether to apply the model's default system prompt." + "Only supported by specific diffusion models.", ) + system_prompt: str | None = Field(None, description="Custom system prompt to guide the model's behavior.") + use_system_prompt: bool | None = Field(None, description="Whether to apply the model's default system prompt.") # OpenAI standard fields model: str | None = Field( From ec16a36950e943ee0e3ebb512bd37825cdd69775 Mon Sep 17 00:00:00 2001 From: skf1999 <13234016272@163.com> Date: Fri, 22 May 2026 14:13:20 +0000 Subject: [PATCH 12/15] Align Offline and Online Inference Signed-off-by: skf1999 <13234016272@163.com> --- vllm_omni/entrypoints/openai/protocol/images.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/vllm_omni/entrypoints/openai/protocol/images.py b/vllm_omni/entrypoints/openai/protocol/images.py index 41f998a4d89..a4d1aad7d8e 100644 --- a/vllm_omni/entrypoints/openai/protocol/images.py +++ b/vllm_omni/entrypoints/openai/protocol/images.py @@ -30,8 +30,6 @@ class ImageGenerationRequest(BaseModel): for advanced diffusion parameters. """ - model_config = ConfigDict(extra="allow") - # Required fields prompt: str = Field(..., description="Text description of the desired image(s)") bot_task: str | None = Field( @@ -39,8 +37,6 @@ class ImageGenerationRequest(BaseModel): description="Task mode for the model (e.g., 'cot' enables chain-of-thought generation). " "Only supported by specific diffusion models.", ) - system_prompt: str | None = Field(None, description="Custom system prompt to guide the model's behavior.") - use_system_prompt: bool | None = Field(None, description="Whether to apply the model's default system prompt.") # OpenAI standard fields model: str | None = Field( @@ -170,7 +166,6 @@ class ImageGenerationResponse(BaseModel): Returns generated images with metadata. """ - model_config = ConfigDict(extra="allow") created: int = Field(..., description="Unix timestamp of when the generation completed") data: list[ImageData] = Field(..., description="Array of generated images") output_format: str = Field(None, description="The output format of the image generation") From 3d57eedbe2dd4bab7022e4c230336ebbdf66ac54 Mon Sep 17 00:00:00 2001 From: skf1999 <13234016272@163.com> Date: Fri, 22 May 2026 14:46:32 +0000 Subject: [PATCH 13/15] Align Offline and Online Inference Signed-off-by: skf1999 <13234016272@163.com> --- vllm_omni/entrypoints/openai/protocol/images.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/vllm_omni/entrypoints/openai/protocol/images.py b/vllm_omni/entrypoints/openai/protocol/images.py index a4d1aad7d8e..01ea6dbdd06 100644 --- a/vllm_omni/entrypoints/openai/protocol/images.py +++ b/vllm_omni/entrypoints/openai/protocol/images.py @@ -170,3 +170,8 @@ class ImageGenerationResponse(BaseModel): data: list[ImageData] = Field(..., description="Array of generated images") output_format: str = Field(None, description="The output format of the image generation") size: str = Field(None, description="The size of the image generated") + cot_output: str | None = Field( + None, + description="Chain-of-thought text output from the AR stage. " + "Only present for image editing (IT2I) with CoT-enabled models." + ) \ No newline at end of file From de71edd61f446279c0fac6865587a1e375aeea57 Mon Sep 17 00:00:00 2001 From: skf1999 <13234016272@163.com> Date: Fri, 22 May 2026 14:50:03 +0800 Subject: [PATCH 14/15] Align Offline and Online Inference Signed-off-by: skf1999 <13234016272@163.com> --- vllm_omni/entrypoints/openai/protocol/images.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm_omni/entrypoints/openai/protocol/images.py b/vllm_omni/entrypoints/openai/protocol/images.py index 01ea6dbdd06..dbf4c24b348 100644 --- a/vllm_omni/entrypoints/openai/protocol/images.py +++ b/vllm_omni/entrypoints/openai/protocol/images.py @@ -10,7 +10,7 @@ from enum import Enum from typing import Any -from pydantic import BaseModel, ConfigDict, Field, field_validator +from pydantic import BaseModel, Field, field_validator from vllm_omni.entrypoints.openai.image_api_utils import validate_layered_layers @@ -173,5 +173,5 @@ class ImageGenerationResponse(BaseModel): cot_output: str | None = Field( None, description="Chain-of-thought text output from the AR stage. " - "Only present for image editing (IT2I) with CoT-enabled models." - ) \ No newline at end of file + "Only present for image editing (IT2I) with CoT-enabled models.", + ) From 22cc419e9ec63527083127493d63b70e7b575197 Mon Sep 17 00:00:00 2001 From: skf1999 <13234016272@163.com> Date: Fri, 22 May 2026 14:51:41 +0000 Subject: [PATCH 15/15] Align Offline and Online Inference Signed-off-by: skf1999 <13234016272@163.com> --- vllm_omni/entrypoints/openai/api_server.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/vllm_omni/entrypoints/openai/api_server.py b/vllm_omni/entrypoints/openai/api_server.py index c5c9d094908..ddc6e36815a 100644 --- a/vllm_omni/entrypoints/openai/api_server.py +++ b/vllm_omni/entrypoints/openai/api_server.py @@ -1539,10 +1539,6 @@ async def generate_images(request: ImageGenerationRequest, raw_request: Request) extra_body["use_system_prompt"] = request.use_system_prompt if request.system_prompt is not None: extra_body["system_prompt"] = request.system_prompt - if request.model_extra: - for k, v in request.model_extra.items(): - if v is not None and k not in extra_body: - extra_body[k] = v generation_result = await chat_handler.generate_diffusion_images( prompt=request.prompt, @@ -1571,10 +1567,6 @@ async def generate_images(request: ImageGenerationRequest, raw_request: Request) extra_args["system_prompt"] = request.system_prompt if request.bot_task is not None: extra_args["bot_task"] = request.bot_task - if request.model_extra: - for k, v in request.model_extra.items(): - if v is not None and k not in extra_args: - extra_args[k] = v if extra_args: gen_params.extra_args = extra_args # Parse per-request LoRA (compatible with chat's extra_body.lora shape).