Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions benchmarks/diffusion/backends.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,18 @@ async def async_request_chat_completions(
output.peak_memory_mb = first_item.get("peak_memory_mb", 0.0)
except (IndexError, TypeError, AttributeError):
pass

if (not output.stage_durations or output.peak_memory_mb == 0.0) and isinstance(
resp_json.get("metrics"), dict
):
m = resp_json["metrics"]
if not output.stage_durations and isinstance(m.get("stage_durations"), dict):
output.stage_durations = m.get("stage_durations") or {}
if output.peak_memory_mb == 0.0 and m.get("peak_memory_mb") is not None:
try:
output.peak_memory_mb = float(m.get("peak_memory_mb") or 0.0)
except (TypeError, ValueError):
pass
else:
output.error = f"HTTP {response.status}: {await response.text()}"
output.success = False
Expand Down
107 changes: 107 additions & 0 deletions tests/benchmarks/test_diffusion_backends_metrics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
import pytest

from benchmarks.diffusion.backends import RequestFuncInput, async_request_chat_completions


class _MockResponse:
def __init__(self, payload: dict, status: int = 200):
self._payload = payload
self.status = status

async def __aenter__(self):
return self

async def __aexit__(self, exc_type, exc, tb):
return False

async def json(self):
return self._payload

async def text(self):
return str(self._payload)


class _MockSession:
def __init__(self, payload: dict):
self._payload = payload

def post(self, *args, **kwargs):
return _MockResponse(self._payload)


@pytest.mark.core_model
@pytest.mark.benchmark
@pytest.mark.cpu
@pytest.mark.asyncio
async def test_chat_completions_metrics_fallback_to_top_level():
payload = {
"choices": [
{
"message": {
"content": [
{
"type": "image_url",
"image_url": {"url": "data:image/png;base64,abc"},
}
]
}
}
],
"metrics": {
"stage_durations": {"diffusion": 1.25},
"peak_memory_mb": 4096.0,
},
}

output = await async_request_chat_completions(
RequestFuncInput(
prompt="draw a cat",
api_url="http://test.local/v1/chat/completions",
model="ByteDance-Seed/BAGEL-7B-MoT",
),
session=_MockSession(payload),
)

assert output.success is True
assert output.stage_durations == {"diffusion": 1.25}
assert output.peak_memory_mb == 4096.0


@pytest.mark.core_model
@pytest.mark.benchmark
@pytest.mark.cpu
@pytest.mark.asyncio
async def test_chat_completions_metrics_message_level_takes_precedence():
payload = {
"choices": [
{
"message": {
"content": [
{
"type": "image_url",
"image_url": {"url": "data:image/png;base64,abc"},
"stage_durations": {"message_stage": 0.7},
"peak_memory_mb": 1234.0,
}
]
}
}
],
"metrics": {
"stage_durations": {"top_level_stage": 9.9},
"peak_memory_mb": 9999.0,
},
}

output = await async_request_chat_completions(
RequestFuncInput(
prompt="draw a dog",
api_url="http://test.local/v1/chat/completions",
model="ByteDance-Seed/BAGEL-7B-MoT",
),
session=_MockSession(payload),
)

assert output.success is True
assert output.stage_durations == {"message_stage": 0.7}
assert output.peak_memory_mb == 1234.0
4 changes: 3 additions & 1 deletion tests/e2e/online_serving/test_bagel_online.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ def test_bagel_text2img_online(omni_server, openai_client) -> None:
@hardware_test(res={"cuda": "H100", "rocm": "MI325"})
@pytest.mark.parametrize("omni_server", test_params, indirect=True)
def test_bagel_img2img_online(omni_server, openai_client) -> None:
"""Test Bagel img2img via OpenAI-compatible chat completions API."""
"""Test Bagel img2img with explicit height/width in chat completions API."""
input_image = ImageAsset("2560px-Gfp-wisconsin-madison-the-nature-boardwalk").pil_image.convert("RGB")
buffer = BytesIO()
input_image.save(buffer, format="JPEG")
Expand All @@ -118,6 +118,8 @@ def test_bagel_img2img_online(omni_server, openai_client) -> None:
"messages": _build_img2img_messages(IMG2IMG_PROMPT, image_b64),
"modalities": ["image"],
"extra_body": {
"height": 512,
"width": 512,
"num_inference_steps": 2,
"guidance_scale": 0.0,
"seed": 42,
Expand Down
35 changes: 35 additions & 0 deletions tests/entrypoints/openai_api/test_serving_chat_metrics.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# SPDX-License-Identifier: Apache-2.0
"""Unit tests for OmniChatCompletionResponse/StreamResponse metrics field."""

from types import SimpleNamespace

import pytest

pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
Expand Down Expand Up @@ -50,3 +52,36 @@ def test_omni_chat_completion_stream_response_metrics():
)
assert response.modality == "audio"
assert response.metrics == {"stage_latency": 0.5}


def test_create_image_choice_exposes_diffusion_metrics():
"""Ensure image chat content exposes profiler metrics for clients."""
from PIL import Image

from vllm_omni.entrypoints.openai.serving_chat import OmniOpenAIServingChat

stage_durations = {"prefill": 0.12, "diffusion": 1.23}
peak_memory_mb = 3210.5
omni_outputs = SimpleNamespace(
request_output=None,
stage_durations=stage_durations,
peak_memory_mb=peak_memory_mb,
images=[Image.new("RGB", (2, 2), color=(255, 0, 0))],
)

choices = OmniOpenAIServingChat._create_image_choice( # type: ignore[misc]
None,
omni_outputs=omni_outputs,
role="assistant",
request=SimpleNamespace(return_token_ids=False),
)

assert len(choices) == 1
content = choices[0].message.content
assert isinstance(content, list)
assert len(content) == 1
first_item = content[0]
assert first_item["type"] == "image_url"
assert first_item["image_url"]["url"].startswith("data:image/png;base64,")
assert first_item["stage_durations"] == stage_durations
assert first_item["peak_memory_mb"] == peak_memory_mb
8 changes: 7 additions & 1 deletion vllm_omni/entrypoints/openai/serving_chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -1594,7 +1594,13 @@ async def chat_completion_full_generator(
logger.warning(f"Unsupported final output type: {omni_outputs.final_output_type}")
continue
if omni_outputs.metrics:
response_metrics = omni_outputs.metrics
response_metrics = dict(omni_outputs.metrics)
if omni_outputs.final_output_type == "image":
# Expose diffusion profiler metrics on the top-level response for benchmarks / clients.
if response_metrics is None:
response_metrics = {}
response_metrics.setdefault("stage_durations", omni_outputs.stage_durations or {})
response_metrics.setdefault("peak_memory_mb", float(omni_outputs.peak_memory_mb or 0.0))
choices.extend(choices_data)

response = OmniChatCompletionResponse(
Expand Down
11 changes: 9 additions & 2 deletions vllm_omni/model_executor/models/bagel/bagel.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,13 @@ def _get_subparsers(self):
class OmniBagelMultiModalProcessor(BaseMultiModalProcessor[OmniBagelProcessingInfo]):
IMG2IMG_PLACEHOLDER = "<|fim_middle|>"

@staticmethod
def _mm_kwargs_for_bagel_img2img_hf(mm_kwargs: Mapping[str, object]) -> dict[str, object]:
# OpenAI / GLM-style serving may pass target_h/target_w for output grid sizing.
# BagelProcessor does not accept these in img2img mode; strip here so callers
# (e.g. serving_chat) can stay model-agnostic.
return {k: v for k, v in mm_kwargs.items() if k not in ("target_h", "target_w")}

def _cached_apply_hf_processor(self, inputs, timing_ctx):
# img2img: prompt text must be modified based on mm data presence,
# so text and mm data cannot be tokenized separately — bypass cache.
Expand Down Expand Up @@ -248,7 +255,7 @@ def _call_hf_processor(
if "images" in img2img_data:
del img2img_data["images"]
img2img_data["images"] = img2img_data.pop("pixel_values_img2img")
kwargs_img2img = dict(mm_kwargs)
kwargs_img2img = self._mm_kwargs_for_bagel_img2img_hf(mm_kwargs)
kwargs_img2img["is_img2img"] = True
out_img2img = super()._call_hf_processor(prompt, img2img_data, kwargs_img2img, tok_kwargs)
if "pixel_values" in out_img2img:
Expand All @@ -262,7 +269,7 @@ def _call_hf_processor(
elif has_img2img:
mm_data = dict(mm_data)
mm_data["images"] = mm_data.pop("pixel_values_img2img")
mm_kwargs = dict(mm_kwargs)
mm_kwargs = self._mm_kwargs_for_bagel_img2img_hf(mm_kwargs)
mm_kwargs["is_img2img"] = True
outputs = super()._call_hf_processor(prompt, mm_data, mm_kwargs, tok_kwargs)
if "pixel_values" in outputs:
Expand Down
Loading