diff --git a/benchmarks/diffusion/backends.py b/benchmarks/diffusion/backends.py index 13ce7c8309d..d33160f1377 100644 --- a/benchmarks/diffusion/backends.py +++ b/benchmarks/diffusion/backends.py @@ -122,6 +122,18 @@ async def async_request_chat_completions( output.peak_memory_mb = first_item.get("peak_memory_mb", 0.0) except (IndexError, TypeError, AttributeError): pass + + if (not output.stage_durations or output.peak_memory_mb == 0.0) and isinstance( + resp_json.get("metrics"), dict + ): + m = resp_json["metrics"] + if not output.stage_durations and isinstance(m.get("stage_durations"), dict): + output.stage_durations = m.get("stage_durations") or {} + if output.peak_memory_mb == 0.0 and m.get("peak_memory_mb") is not None: + try: + output.peak_memory_mb = float(m.get("peak_memory_mb") or 0.0) + except (TypeError, ValueError): + pass else: output.error = f"HTTP {response.status}: {await response.text()}" output.success = False diff --git a/tests/benchmarks/test_diffusion_backends_metrics.py b/tests/benchmarks/test_diffusion_backends_metrics.py new file mode 100644 index 00000000000..2d51d0f1d38 --- /dev/null +++ b/tests/benchmarks/test_diffusion_backends_metrics.py @@ -0,0 +1,107 @@ +import pytest + +from benchmarks.diffusion.backends import RequestFuncInput, async_request_chat_completions + + +class _MockResponse: + def __init__(self, payload: dict, status: int = 200): + self._payload = payload + self.status = status + + async def __aenter__(self): + return self + + async def __aexit__(self, exc_type, exc, tb): + return False + + async def json(self): + return self._payload + + async def text(self): + return str(self._payload) + + +class _MockSession: + def __init__(self, payload: dict): + self._payload = payload + + def post(self, *args, **kwargs): + return _MockResponse(self._payload) + + +@pytest.mark.core_model +@pytest.mark.benchmark +@pytest.mark.cpu +@pytest.mark.asyncio +async def test_chat_completions_metrics_fallback_to_top_level(): + payload = { + "choices": [ + { + "message": { + "content": [ + { + "type": "image_url", + "image_url": {"url": "data:image/png;base64,abc"}, + } + ] + } + } + ], + "metrics": { + "stage_durations": {"diffusion": 1.25}, + "peak_memory_mb": 4096.0, + }, + } + + output = await async_request_chat_completions( + RequestFuncInput( + prompt="draw a cat", + api_url="http://test.local/v1/chat/completions", + model="ByteDance-Seed/BAGEL-7B-MoT", + ), + session=_MockSession(payload), + ) + + assert output.success is True + assert output.stage_durations == {"diffusion": 1.25} + assert output.peak_memory_mb == 4096.0 + + +@pytest.mark.core_model +@pytest.mark.benchmark +@pytest.mark.cpu +@pytest.mark.asyncio +async def test_chat_completions_metrics_message_level_takes_precedence(): + payload = { + "choices": [ + { + "message": { + "content": [ + { + "type": "image_url", + "image_url": {"url": "data:image/png;base64,abc"}, + "stage_durations": {"message_stage": 0.7}, + "peak_memory_mb": 1234.0, + } + ] + } + } + ], + "metrics": { + "stage_durations": {"top_level_stage": 9.9}, + "peak_memory_mb": 9999.0, + }, + } + + output = await async_request_chat_completions( + RequestFuncInput( + prompt="draw a dog", + api_url="http://test.local/v1/chat/completions", + model="ByteDance-Seed/BAGEL-7B-MoT", + ), + session=_MockSession(payload), + ) + + assert output.success is True + assert output.stage_durations == {"message_stage": 0.7} + assert output.peak_memory_mb == 1234.0 diff --git a/tests/e2e/online_serving/test_bagel_online.py b/tests/e2e/online_serving/test_bagel_online.py index a3f999f13da..5a6078be29d 100644 --- a/tests/e2e/online_serving/test_bagel_online.py +++ b/tests/e2e/online_serving/test_bagel_online.py @@ -107,7 +107,7 @@ def test_bagel_text2img_online(omni_server, openai_client) -> None: @hardware_test(res={"cuda": "H100", "rocm": "MI325"}) @pytest.mark.parametrize("omni_server", test_params, indirect=True) def test_bagel_img2img_online(omni_server, openai_client) -> None: - """Test Bagel img2img via OpenAI-compatible chat completions API.""" + """Test Bagel img2img with explicit height/width in chat completions API.""" input_image = ImageAsset("2560px-Gfp-wisconsin-madison-the-nature-boardwalk").pil_image.convert("RGB") buffer = BytesIO() input_image.save(buffer, format="JPEG") @@ -118,6 +118,8 @@ def test_bagel_img2img_online(omni_server, openai_client) -> None: "messages": _build_img2img_messages(IMG2IMG_PROMPT, image_b64), "modalities": ["image"], "extra_body": { + "height": 512, + "width": 512, "num_inference_steps": 2, "guidance_scale": 0.0, "seed": 42, diff --git a/tests/entrypoints/openai_api/test_serving_chat_metrics.py b/tests/entrypoints/openai_api/test_serving_chat_metrics.py index d25af6c3843..0647c40a33f 100644 --- a/tests/entrypoints/openai_api/test_serving_chat_metrics.py +++ b/tests/entrypoints/openai_api/test_serving_chat_metrics.py @@ -1,6 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 """Unit tests for OmniChatCompletionResponse/StreamResponse metrics field.""" +from types import SimpleNamespace + import pytest pytestmark = [pytest.mark.core_model, pytest.mark.cpu] @@ -50,3 +52,36 @@ def test_omni_chat_completion_stream_response_metrics(): ) assert response.modality == "audio" assert response.metrics == {"stage_latency": 0.5} + + +def test_create_image_choice_exposes_diffusion_metrics(): + """Ensure image chat content exposes profiler metrics for clients.""" + from PIL import Image + + from vllm_omni.entrypoints.openai.serving_chat import OmniOpenAIServingChat + + stage_durations = {"prefill": 0.12, "diffusion": 1.23} + peak_memory_mb = 3210.5 + omni_outputs = SimpleNamespace( + request_output=None, + stage_durations=stage_durations, + peak_memory_mb=peak_memory_mb, + images=[Image.new("RGB", (2, 2), color=(255, 0, 0))], + ) + + choices = OmniOpenAIServingChat._create_image_choice( # type: ignore[misc] + None, + omni_outputs=omni_outputs, + role="assistant", + request=SimpleNamespace(return_token_ids=False), + ) + + assert len(choices) == 1 + content = choices[0].message.content + assert isinstance(content, list) + assert len(content) == 1 + first_item = content[0] + assert first_item["type"] == "image_url" + assert first_item["image_url"]["url"].startswith("data:image/png;base64,") + assert first_item["stage_durations"] == stage_durations + assert first_item["peak_memory_mb"] == peak_memory_mb diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py index 8cddac6a6c5..125e19bf539 100644 --- a/vllm_omni/entrypoints/openai/serving_chat.py +++ b/vllm_omni/entrypoints/openai/serving_chat.py @@ -1594,7 +1594,13 @@ async def chat_completion_full_generator( logger.warning(f"Unsupported final output type: {omni_outputs.final_output_type}") continue if omni_outputs.metrics: - response_metrics = omni_outputs.metrics + response_metrics = dict(omni_outputs.metrics) + if omni_outputs.final_output_type == "image": + # Expose diffusion profiler metrics on the top-level response for benchmarks / clients. + if response_metrics is None: + response_metrics = {} + response_metrics.setdefault("stage_durations", omni_outputs.stage_durations or {}) + response_metrics.setdefault("peak_memory_mb", float(omni_outputs.peak_memory_mb or 0.0)) choices.extend(choices_data) response = OmniChatCompletionResponse( diff --git a/vllm_omni/model_executor/models/bagel/bagel.py b/vllm_omni/model_executor/models/bagel/bagel.py index cbb775680cc..9f07ef6a05b 100644 --- a/vllm_omni/model_executor/models/bagel/bagel.py +++ b/vllm_omni/model_executor/models/bagel/bagel.py @@ -203,6 +203,13 @@ def _get_subparsers(self): class OmniBagelMultiModalProcessor(BaseMultiModalProcessor[OmniBagelProcessingInfo]): IMG2IMG_PLACEHOLDER = "<|fim_middle|>" + @staticmethod + def _mm_kwargs_for_bagel_img2img_hf(mm_kwargs: Mapping[str, object]) -> dict[str, object]: + # OpenAI / GLM-style serving may pass target_h/target_w for output grid sizing. + # BagelProcessor does not accept these in img2img mode; strip here so callers + # (e.g. serving_chat) can stay model-agnostic. + return {k: v for k, v in mm_kwargs.items() if k not in ("target_h", "target_w")} + def _cached_apply_hf_processor(self, inputs, timing_ctx): # img2img: prompt text must be modified based on mm data presence, # so text and mm data cannot be tokenized separately — bypass cache. @@ -248,7 +255,7 @@ def _call_hf_processor( if "images" in img2img_data: del img2img_data["images"] img2img_data["images"] = img2img_data.pop("pixel_values_img2img") - kwargs_img2img = dict(mm_kwargs) + kwargs_img2img = self._mm_kwargs_for_bagel_img2img_hf(mm_kwargs) kwargs_img2img["is_img2img"] = True out_img2img = super()._call_hf_processor(prompt, img2img_data, kwargs_img2img, tok_kwargs) if "pixel_values" in out_img2img: @@ -262,7 +269,7 @@ def _call_hf_processor( elif has_img2img: mm_data = dict(mm_data) mm_data["images"] = mm_data.pop("pixel_values_img2img") - mm_kwargs = dict(mm_kwargs) + mm_kwargs = self._mm_kwargs_for_bagel_img2img_hf(mm_kwargs) mm_kwargs["is_img2img"] = True outputs = super()._call_hf_processor(prompt, mm_data, mm_kwargs, tok_kwargs) if "pixel_values" in outputs: