vllm-project · Gaohan123 · Apr 22, 2026 · Apr 20, 2026 · Apr 20, 2026 · Apr 20, 2026
@@ -122,6 +122,18 @@ async def async_request_chat_completions(
                                     output.peak_memory_mb = first_item.get("peak_memory_mb", 0.0)
                 except (IndexError, TypeError, AttributeError):
                     pass
+
+                if (not output.stage_durations or output.peak_memory_mb == 0.0) and isinstance(
+                    resp_json.get("metrics"), dict
+                ):
+                    m = resp_json["metrics"]
+                    if not output.stage_durations and isinstance(m.get("stage_durations"), dict):
+                        output.stage_durations = m.get("stage_durations") or {}
+                    if output.peak_memory_mb == 0.0 and m.get("peak_memory_mb") is not None:
+                        try:
+                            output.peak_memory_mb = float(m.get("peak_memory_mb") or 0.0)
+                        except (TypeError, ValueError):
+                            pass
             else:
                 output.error = f"HTTP {response.status}: {await response.text()}"
                 output.success = False

@@ -0,0 +1,107 @@
+import pytest
+
+from benchmarks.diffusion.backends import RequestFuncInput, async_request_chat_completions
+
+
+class _MockResponse:
+    def __init__(self, payload: dict, status: int = 200):
+        self._payload = payload
+        self.status = status
+
+    async def __aenter__(self):
+        return self
+
+    async def __aexit__(self, exc_type, exc, tb):
+        return False
+
+    async def json(self):
+        return self._payload
+
+    async def text(self):
+        return str(self._payload)
+
+
+class _MockSession:
+    def __init__(self, payload: dict):
+        self._payload = payload
+
+    def post(self, *args, **kwargs):
+        return _MockResponse(self._payload)
+
+
+@pytest.mark.core_model
+@pytest.mark.benchmark
+@pytest.mark.cpu
+@pytest.mark.asyncio
+async def test_chat_completions_metrics_fallback_to_top_level():
+    payload = {
+        "choices": [
+            {
+                "message": {
+                    "content": [
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": "data:image/png;base64,abc"},
+                        }
+                    ]
+                }
+            }
+        ],
+        "metrics": {
+            "stage_durations": {"diffusion": 1.25},
+            "peak_memory_mb": 4096.0,
+        },
+    }
+
+    output = await async_request_chat_completions(
+        RequestFuncInput(
+            prompt="draw a cat",
+            api_url="http://test.local/v1/chat/completions",
+            model="ByteDance-Seed/BAGEL-7B-MoT",
+        ),
+        session=_MockSession(payload),
+    )
+
+    assert output.success is True
+    assert output.stage_durations == {"diffusion": 1.25}
+    assert output.peak_memory_mb == 4096.0
+
+
+@pytest.mark.core_model
+@pytest.mark.benchmark
+@pytest.mark.cpu
+@pytest.mark.asyncio
+async def test_chat_completions_metrics_message_level_takes_precedence():
+    payload = {
+        "choices": [
+            {
+                "message": {
+                    "content": [
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": "data:image/png;base64,abc"},
+                            "stage_durations": {"message_stage": 0.7},
+                            "peak_memory_mb": 1234.0,
+                        }
+                    ]
+                }
+            }
+        ],
+        "metrics": {
+            "stage_durations": {"top_level_stage": 9.9},
+            "peak_memory_mb": 9999.0,
+        },
+    }
+
+    output = await async_request_chat_completions(
+        RequestFuncInput(
+            prompt="draw a dog",
+            api_url="http://test.local/v1/chat/completions",
+            model="ByteDance-Seed/BAGEL-7B-MoT",
+        ),
+        session=_MockSession(payload),
+    )
+
+    assert output.success is True
+    assert output.stage_durations == {"message_stage": 0.7}
+    assert output.peak_memory_mb == 1234.0
@@ -107,7 +107,7 @@ def test_bagel_text2img_online(omni_server, openai_client) -> None:
 @hardware_test(res={"cuda": "H100", "rocm": "MI325"})
 @pytest.mark.parametrize("omni_server", test_params, indirect=True)
 def test_bagel_img2img_online(omni_server, openai_client) -> None:
-    """Test Bagel img2img via OpenAI-compatible chat completions API."""
+    """Test Bagel img2img with explicit height/width in chat completions API."""
     input_image = ImageAsset("2560px-Gfp-wisconsin-madison-the-nature-boardwalk").pil_image.convert("RGB")
     buffer = BytesIO()
     input_image.save(buffer, format="JPEG")
@@ -118,6 +118,8 @@ def test_bagel_img2img_online(omni_server, openai_client) -> None:
         "messages": _build_img2img_messages(IMG2IMG_PROMPT, image_b64),
         "modalities": ["image"],
         "extra_body": {
+            "height": 512,
+            "width": 512,
             "num_inference_steps": 2,
             "guidance_scale": 0.0,
             "seed": 42,

@@ -1,6 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 """Unit tests for OmniChatCompletionResponse/StreamResponse metrics field."""
 
+from types import SimpleNamespace
+
 import pytest
 
 pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
@@ -50,3 +52,36 @@ def test_omni_chat_completion_stream_response_metrics():
     )
     assert response.modality == "audio"
     assert response.metrics == {"stage_latency": 0.5}
+
+
+def test_create_image_choice_exposes_diffusion_metrics():
+    """Ensure image chat content exposes profiler metrics for clients."""
+    from PIL import Image
+
+    from vllm_omni.entrypoints.openai.serving_chat import OmniOpenAIServingChat
+
+    stage_durations = {"prefill": 0.12, "diffusion": 1.23}
+    peak_memory_mb = 3210.5
+    omni_outputs = SimpleNamespace(
+        request_output=None,
+        stage_durations=stage_durations,
+        peak_memory_mb=peak_memory_mb,
+        images=[Image.new("RGB", (2, 2), color=(255, 0, 0))],
+    )
+
+    choices = OmniOpenAIServingChat._create_image_choice(  # type: ignore[misc]
+        None,
+        omni_outputs=omni_outputs,
+        role="assistant",
+        request=SimpleNamespace(return_token_ids=False),
+    )
+
+    assert len(choices) == 1
+    content = choices[0].message.content
+    assert isinstance(content, list)
+    assert len(content) == 1
+    first_item = content[0]
+    assert first_item["type"] == "image_url"
+    assert first_item["image_url"]["url"].startswith("data:image/png;base64,")
+    assert first_item["stage_durations"] == stage_durations
+    assert first_item["peak_memory_mb"] == peak_memory_mb
@@ -1594,7 +1594,13 @@ async def chat_completion_full_generator(
                 logger.warning(f"Unsupported final output type: {omni_outputs.final_output_type}")
                 continue
             if omni_outputs.metrics:
-                response_metrics = omni_outputs.metrics
+                response_metrics = dict(omni_outputs.metrics)
+            if omni_outputs.final_output_type == "image":
+                # Expose diffusion profiler metrics on the top-level response for benchmarks / clients.
+                if response_metrics is None:
+                    response_metrics = {}
+                response_metrics.setdefault("stage_durations", omni_outputs.stage_durations or {})
+                response_metrics.setdefault("peak_memory_mb", float(omni_outputs.peak_memory_mb or 0.0))
             choices.extend(choices_data)
 
         response = OmniChatCompletionResponse(

@@ -203,6 +203,13 @@ def _get_subparsers(self):
 class OmniBagelMultiModalProcessor(BaseMultiModalProcessor[OmniBagelProcessingInfo]):
     IMG2IMG_PLACEHOLDER = "<|fim_middle|>"
 
+    @staticmethod
+    def _mm_kwargs_for_bagel_img2img_hf(mm_kwargs: Mapping[str, object]) -> dict[str, object]:
+        # OpenAI / GLM-style serving may pass target_h/target_w for output grid sizing.
+        # BagelProcessor does not accept these in img2img mode; strip here so callers
+        # (e.g. serving_chat) can stay model-agnostic.
+        return {k: v for k, v in mm_kwargs.items() if k not in ("target_h", "target_w")}
+
     def _cached_apply_hf_processor(self, inputs, timing_ctx):
         # img2img: prompt text must be modified based on mm data presence,
         # so text and mm data cannot be tokenized separately — bypass cache.
@@ -248,7 +255,7 @@ def _call_hf_processor(
             if "images" in img2img_data:
                 del img2img_data["images"]
             img2img_data["images"] = img2img_data.pop("pixel_values_img2img")
-            kwargs_img2img = dict(mm_kwargs)
+            kwargs_img2img = self._mm_kwargs_for_bagel_img2img_hf(mm_kwargs)
             kwargs_img2img["is_img2img"] = True
             out_img2img = super()._call_hf_processor(prompt, img2img_data, kwargs_img2img, tok_kwargs)
             if "pixel_values" in out_img2img:
@@ -262,7 +269,7 @@ def _call_hf_processor(
         elif has_img2img:
             mm_data = dict(mm_data)
             mm_data["images"] = mm_data.pop("pixel_values_img2img")
-            mm_kwargs = dict(mm_kwargs)
+            mm_kwargs = self._mm_kwargs_for_bagel_img2img_hf(mm_kwargs)
             mm_kwargs["is_img2img"] = True
             outputs = super()._call_hf_processor(prompt, mm_data, mm_kwargs, tok_kwargs)
             if "pixel_values" in outputs: