From c645a6b3718ba041dd27f28289a1721926d6d9d9 Mon Sep 17 00:00:00 2001 From: "Huang, Zeyu" <11222265+fhfuih@users.noreply.github.com> Date: Wed, 25 Mar 2026 17:29:06 +0800 Subject: [PATCH 01/11] [CI] remove SGLang benchmark/testing comparison Signed-off-by: Huang, Zeyu <11222265+fhfuih@users.noreply.github.com> --- benchmarks/diffusion/backends.py | 252 ------------------ .../diffusion/diffusion_benchmark_serving.py | 4 +- .../perf/scripts/run_diffusion_benchmark.py | 172 ++---------- .../test_qwen_image_sglang_diffusion.json | 241 ----------------- 4 files changed, 17 insertions(+), 652 deletions(-) delete mode 100644 tests/dfx/perf/tests/test_qwen_image_sglang_diffusion.json diff --git a/benchmarks/diffusion/backends.py b/benchmarks/diffusion/backends.py index f12995f6cc9..fa53f87aed7 100644 --- a/benchmarks/diffusion/backends.py +++ b/benchmarks/diffusion/backends.py @@ -1,6 +1,5 @@ import asyncio import base64 -import json import mimetypes import os import time @@ -335,263 +334,12 @@ async def async_request_v1_videos( return output -async def async_request_image_sglang( - input: RequestFuncInput, - session: aiohttp.ClientSession, - pbar: tqdm | None = None, -) -> RequestFuncOutput: - output = RequestFuncOutput() - output.start_time = time.perf_counter() - - # Check if we need to use multipart (for image edits with input images) - if input.image_paths and len(input.image_paths) > 0: - # Use multipart/form-data for image edits - data = aiohttp.FormData() - data.add_field("model", input.model) - data.add_field("prompt", input.prompt) - data.add_field("response_format", "b64_json") - - if input.width and input.height: - data.add_field("size", f"{input.width}x{input.height}") - - # Merge extra parameters - for key, value in input.extra_body.items(): - data.add_field(key, str(value)) - - # Add image file(s) - for idx, img_path in enumerate(input.image_paths): - if os.path.exists(img_path): - data.add_field( - "image", - open(img_path, "rb"), - filename=os.path.basename(img_path), - content_type="application/octet-stream", - ) - else: - output.error = f"Image file not found: {img_path}" - output.success = False - if pbar: - pbar.update(1) - return output - - try: - async with session.post(input.api_url, data=data) as response: - if response.status == 200: - resp_json = await response.json() - output.response_body = resp_json - output.success = True - if "peak_memory_mb" in resp_json: - output.peak_memory_mb = resp_json["peak_memory_mb"] - else: - output.error = f"HTTP {response.status}: {await response.text()}" - output.success = False - except Exception as e: - output.error = str(e) - output.success = False - else: - # Use JSON for text-to-image generation - payload = { - "model": input.model, - "prompt": input.prompt, - "n": 1, - "response_format": "b64_json", - } - - if input.width and input.height: - payload["size"] = f"{input.width}x{input.height}" - - if input.num_inference_steps: - payload["num_inference_steps"] = input.num_inference_steps - - payload.update(input.extra_body) - - try: - async with session.post(input.api_url, json=payload) as response: - if response.status == 200: - resp_json = await response.json() - output.response_body = resp_json - output.success = True - if "peak_memory_mb" in resp_json: - output.peak_memory_mb = resp_json["peak_memory_mb"] - else: - output.error = f"HTTP {response.status}: {await response.text()}" - output.success = False - except Exception as e: - output.error = str(e) - output.success = False - - output.latency = time.perf_counter() - output.start_time - - # Check SLO if defined - if input.slo_ms is not None and output.success: - output.slo_achieved = (output.latency * 1000.0) <= input.slo_ms - - if pbar: - pbar.update(1) - return output - - -async def async_request_video_sglang( - input: RequestFuncInput, - session: aiohttp.ClientSession, - pbar: tqdm | None = None, -) -> RequestFuncOutput: - output = RequestFuncOutput() - output.start_time = time.perf_counter() - - # 1. Submit Job - job_id = None - # Check if we need to upload images (Multipart) or just send JSON - if input.image_paths and len(input.image_paths) > 0: - # Use multipart/form-data - data = aiohttp.FormData() - data.add_field("model", input.model) - data.add_field("prompt", input.prompt) - - if input.width and input.height: - data.add_field("size", f"{input.width}x{input.height}") - - # Add extra body fields to form data if possible, or assume simple key-values - # Note: Nested dicts in extra_body might need JSON serialization if API expects it stringified - if input.extra_body: - data.add_field("extra_body", json.dumps(input.extra_body)) - - # Explicitly add fps/num_frames if they are not in extra_body (bench_serving logic overrides) - if input.num_frames: - data.add_field("num_frames", str(input.num_frames)) - if input.fps: - data.add_field("fps", str(input.fps)) - - # Add image file - # Currently only support single image upload as 'input_reference' per API spec - img_path = input.image_paths[0] - if os.path.exists(img_path): - data.add_field( - "input_reference", - open(img_path, "rb"), - filename=os.path.basename(img_path), - content_type="application/octet-stream", - ) - else: - output.error = f"Image file not found: {img_path}" - output.success = False - if pbar: - pbar.update(1) - return output - - try: - async with session.post(input.api_url, data=data) as response: - if response.status == 200: - resp_json = await response.json() - job_id = resp_json.get("id") - else: - output.error = f"Submit failed HTTP {response.status}: {await response.text()}" - output.success = False - if pbar: - pbar.update(1) - return output - except Exception as e: - output.error = f"Submit exception: {str(e)}" - output.success = False - if pbar: - pbar.update(1) - return output - - else: - # Use JSON - payload: dict[str, Any] = { - "model": input.model, - "prompt": input.prompt, - } - if input.width and input.height: - payload["size"] = f"{input.width}x{input.height}" - if input.num_frames: - payload["num_frames"] = input.num_frames - if input.fps: - payload["fps"] = input.fps - if input.num_inference_steps: - payload["num_inference_steps"] = input.num_inference_steps - - payload.update(input.extra_body) - - try: - async with session.post(input.api_url, json=payload) as response: - if response.status == 200: - resp_json = await response.json() - job_id = resp_json.get("id") - else: - output.error = f"Submit failed HTTP {response.status}: {await response.text()}" - output.success = False - if pbar: - pbar.update(1) - return output - except Exception as e: - output.error = f"Submit exception: {str(e)}" - output.success = False - if pbar: - pbar.update(1) - return output - - if not job_id: - output.error = "No job_id returned" - output.success = False - if pbar: - pbar.update(1) - return output - - # 2. Poll for completion - # Assuming the API returns a 'status' field. - # We construct the check URL. Assuming api_url is like .../v1/videos - # The check url should be .../v1/videos/{id} - check_url = f"{input.api_url}/{job_id}" - - while True: - try: - async with session.get(check_url) as response: - if response.status == 200: - status_data = await response.json() - status = status_data.get("status") - if status == "completed": - output.success = True - output.response_body = status_data - if "peak_memory_mb" in status_data: - output.peak_memory_mb = status_data["peak_memory_mb"] - break - elif status == "failed": - output.success = False - output.error = f"Job failed: {status_data.get('error')}" - break - else: - # queued or processing - await asyncio.sleep(1.0) - else: - output.success = False - output.error = f"Poll failed HTTP {response.status}: {await response.text()}" - break - except Exception as e: - output.success = False - output.error = f"Poll exception: {str(e)}" - break - - output.latency = time.perf_counter() - output.start_time - - # Check SLO if defined - if input.slo_ms is not None and output.success: - output.slo_achieved = (output.latency * 1000.0) <= input.slo_ms - - if pbar: - pbar.update(1) - return output - - backends_function_mapping = { "2i": { "vllm-omni": (async_request_chat_completions, "/v1/chat/completions"), "openai": (async_request_openai_images, "/v1/images/generations"), - "sglang": (async_request_image_sglang, "/v1/images/generations"), }, "2v": { "v1/videos": (async_request_v1_videos, "/v1/videos"), - "sglang": (async_request_video_sglang, "/v1/videos"), }, } diff --git a/benchmarks/diffusion/diffusion_benchmark_serving.py b/benchmarks/diffusion/diffusion_benchmark_serving.py index 7178742d306..91f302a0adf 100644 --- a/benchmarks/diffusion/diffusion_benchmark_serving.py +++ b/benchmarks/diffusion/diffusion_benchmark_serving.py @@ -1,4 +1,4 @@ -# adapted from sglang and fastvideo +# adapted from fastvideo # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project @@ -994,7 +994,7 @@ async def limited_request_func(req, session, pbar): "--backend", type=str, default="vllm-omni", - choices=["vllm-omni", "openai", "sglang", "v1/videos"], + choices=["vllm-omni", "openai", "v1/videos"], help="Backend to target the benchmark to.", ) parser.add_argument( diff --git a/tests/dfx/perf/scripts/run_diffusion_benchmark.py b/tests/dfx/perf/scripts/run_diffusion_benchmark.py index 80b5eb03a30..078eaaea33f 100644 --- a/tests/dfx/perf/scripts/run_diffusion_benchmark.py +++ b/tests/dfx/perf/scripts/run_diffusion_benchmark.py @@ -1,21 +1,15 @@ """ Performance benchmark CI runner for diffusion models. -Supports two server backends: +Supports vLLM-Omni server backend: - vllm-omni (default): starts DiffusionServer via vllm_omni.entrypoints.cli.main, benchmarks with diffusion_benchmark_serving.py --backend vllm-omni - - sglang: starts SglangServer via `sglang serve`, - benchmarks with diffusion_benchmark_serving.py --backend sglang A config JSON file is REQUIRED via --config-file: pytest run_diffusion_benchmark.py --config-file tests/dfx/perf/tests/test_qwen_image_vllm_omni.json - pytest run_diffusion_benchmark.py --config-file tests/dfx/perf/tests/test_qwen_image_sglang_diffusion.json -JSON config entries are distinguished by a "server_type" field ("vllm-omni" or "sglang"). -sglang entries support two additional fields under server_params: - - "env": dict of extra environment variables (e.g. SGLANG_CACHE_DIT_ENABLED) - - "cache_dit_config": dict written to a temp YAML and passed as - --cache-dit-config to sglang serve (requires cache-dit == 1.3.0) +JSON config entries use a "server_type" field, and this runner executes +the vllm-omni path. All benchmark results for a session are consolidated into a single JSON file under BENCHMARK_RESULT_DIR (override via the DIFFUSION_BENCHMARK_DIR environment variable). @@ -23,7 +17,6 @@ timestamp) together with the raw metrics returned by the benchmark script. """ -import importlib.metadata import json import os import socket @@ -286,145 +279,18 @@ def __exit__(self, *_): _kill_process_tree(self.proc.pid) -_CACHE_DIT_REQUIRED_VERSION = os.environ.get("CACHE_DIT_VERSION", "1.3.0") - - -def _check_cache_dit_version(required: str = _CACHE_DIT_REQUIRED_VERSION) -> None: - """Verify that the installed cache-dit package matches *required* exactly. - - Raises RuntimeError if the package is not installed or the version differs. - """ - try: - installed = importlib.metadata.version("cache-dit") - except importlib.metadata.PackageNotFoundError: - raise RuntimeError( - f"cache-dit is not installed. Please install version {required}: pip install cache-dit=={required}" - ) - if installed != required: - raise RuntimeError( - f"cache-dit version mismatch: required {required}, " - f"but found {installed}. " - f"Please install the correct version: pip install cache-dit=={required}" - ) - - -class SglangServer: - """Start a sglang serve process for diffusion benchmarking. - - Supports two Cache-DiT activation modes: - 1. Environment variable: pass env={"SGLANG_CACHE_DIT_ENABLED": "true"} - 2. YAML config file: pass cache_dit_config={...} (written to a temp - file and forwarded as --cache-dit-config; requires cache-dit >= 1.3.0) - """ - - server_type = "sglang" - - def __init__( - self, - model: str, - serve_args: list[str], - *, - port: int | None = None, - env_overrides: dict[str, str] | None = None, - cache_dit_config: dict[str, Any] | None = None, - ) -> None: - self.model = model - self.serve_args = serve_args - self.host = "127.0.0.1" - self.port = port if port is not None else _get_open_port() - self.env_overrides = env_overrides or {} - self.cache_dit_config = cache_dit_config - self.proc: subprocess.Popen | None = None - self._tmp_yaml: str | None = None - self.test_name: str = "" - if self.cache_dit_config is not None: - _check_cache_dit_version() - - @staticmethod - def _write_cache_dit_yaml(config: dict[str, Any]) -> str: - """Serialize config dict to a temp YAML file and return its path. - - Tries PyYAML first for clean block-style output; falls back to - json.dump since JSON is valid YAML and correctly handles arbitrary - nesting, lists, booleans, and null values. - """ - tmp = tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) - try: - import yaml # PyYAML - - yaml.dump(config, tmp, default_flow_style=False, allow_unicode=True) - except ImportError: - json.dump(config, tmp, indent=2, ensure_ascii=False) - tmp.write("\n") - tmp.close() - print(f" Cache-DiT config written to: {tmp.name}") - return tmp.name - - def _start_server(self) -> None: - env = os.environ.copy() - env.update(self.env_overrides) - - cmd = [ - "sglang", - "serve", - "--model-path", - self.model, - "--host", - self.host, - "--port", - str(self.port), - ] + self.serve_args - - if self.cache_dit_config is not None: - self._tmp_yaml = self._write_cache_dit_yaml(self.cache_dit_config) - cmd += ["--cache-dit-config", self._tmp_yaml] - - print(f"Launching SglangServer: {' '.join(cmd)}") - if self.env_overrides: - print(f" Extra env: {self.env_overrides}") - - self.proc = subprocess.Popen( - cmd, - env=env, - cwd=str(Path(__file__).parent.parent.parent.parent), - ) - _wait_for_port(self.host, self.port) - print(f"SglangServer ready on {self.host}:{self.port}") - - def __enter__(self): - self._start_server() - return self - - def __exit__(self, *_): - if self.proc: - _kill_process_tree(self.proc.pid) - if self._tmp_yaml: - try: - Path(self._tmp_yaml).unlink(missing_ok=True) - except Exception: - pass - - # --------------------------------------------------------------------------- # Config helpers # --------------------------------------------------------------------------- -def _build_serve_args(serve_args_dict: dict[str, Any], server_type: str = "vllm-omni") -> list[str]: - """Convert a serve_args dict from test.json into a flat CLI argument list. - - Boolean handling differs by server type: - - vllm-omni uses store_true/store_false style: True → add flag only, - False → omit flag entirely. - - sglang accepts explicit boolean values: always emit ``--flag true/false``. - """ +def _build_serve_args(serve_args_dict: dict[str, Any]) -> list[str]: + """Convert a serve_args dict from test.json into a flat CLI argument list.""" args: list[str] = [] for key, value in serve_args_dict.items(): flag = f"--{key}" if isinstance(value, bool): - if server_type == "sglang": - args.extend([flag, str(value).lower()]) - elif value: + if value: args.append(flag) elif isinstance(value, dict): args.extend([flag, json.dumps(value, separators=(",", ":"))]) @@ -442,16 +308,15 @@ def _unique_server_params(configs: list[dict[str, Any]]) -> list[dict[str, Any]] if test_name in seen: continue seen.add(test_name) - server_type = cfg.get("server_type", "vllm-omni") + if cfg.get("server_type", "vllm-omni") != "vllm-omni": + raise ValueError(f"Unsupported server_type in config: {cfg.get('server_type')}") result.append( { "test_name": test_name, - "server_type": server_type, + "server_type": "vllm-omni", "model": cfg["server_params"]["model"], - "serve_args": _build_serve_args(cfg["server_params"].get("serve_args", {}), server_type), - "env_overrides": cfg["server_params"].get("env", {}), - "cache_dit_config": cfg["server_params"].get("cache_dit_config"), - "benchmark_backend": server_type, # "vllm-omni" or "sglang" + "serve_args": _build_serve_args(cfg["server_params"].get("serve_args", {})), + "benchmark_backend": "vllm-omni", } ) return result @@ -466,17 +331,10 @@ def _test_param_mapping(configs: list[dict[str, Any]]) -> dict[str, list[dict]]: return mapping -def _make_server(server_cfg: dict[str, Any]) -> DiffusionServer | SglangServer: - """Factory: return the appropriate server instance for the given config.""" +def _make_server(server_cfg: dict[str, Any]) -> DiffusionServer: + """Factory: return a vLLM-Omni diffusion server instance for the config.""" model = server_cfg["model"] serve_args = server_cfg["serve_args"] - if server_cfg["server_type"] == "sglang": - return SglangServer( - model=model, - serve_args=serve_args, - env_overrides=server_cfg.get("env_overrides", {}), - cache_dit_config=server_cfg.get("cache_dit_config"), - ) return DiffusionServer(model=model, serve_args=serve_args) @@ -496,7 +354,7 @@ def _make_server(server_cfg: dict[str, Any]) -> DiffusionServer | SglangServer: @pytest.fixture(scope="module") def diffusion_server(request): - """Start one server (vllm-omni or sglang) per unique test configuration.""" + """Start one vLLM-Omni server per unique test configuration.""" with _server_lock: server_cfg: dict[str, Any] = request.param test_name = server_cfg["test_name"] @@ -694,7 +552,7 @@ def test_diffusion_performance_benchmark(diffusion_server, benchmark_params): """ test_name = benchmark_params["test_name"] params = benchmark_params["params"] - backend = diffusion_server.server_type # "vllm-omni" or "sglang" + backend = diffusion_server.server_type # "vllm-omni" result = run_benchmark( host=diffusion_server.host, diff --git a/tests/dfx/perf/tests/test_qwen_image_sglang_diffusion.json b/tests/dfx/perf/tests/test_qwen_image_sglang_diffusion.json deleted file mode 100644 index 33119e9028a..00000000000 --- a/tests/dfx/perf/tests/test_qwen_image_sglang_diffusion.json +++ /dev/null @@ -1,241 +0,0 @@ -[ - { - "test_name": "test_sglang_qwen_image_single_device", - "description": "sglang: single-device baseline (no parallelism)", - "server_type": "sglang", - "server_params": { - "model": "Qwen/Qwen-Image", - "serve_args": { - "num-gpus": 1, - "dit-cpu-offload": false, - "text-encoder-cpu-offload": false, - "vae-cpu-offload": false - } - }, - "benchmark_params": [ - { - "name": "512x512_steps20", - "dataset": "random", - "task": "t2i", - "width": 512, - "height": 512, - "num-inference-steps": 20, - "num-prompts": 10, - "max-concurrency": 1, - "enable-negative-prompt": true, - "baseline": { - "throughput_qps": 0.001, - "latency_p99": 1000.0, - "peak_memory_mb_max": 400000, - "peak_memory_mb_mean": 400000 - } - }, - { - "name": "1536x1536_steps35", - "dataset": "random", - "task": "t2i", - "width": 1536, - "height": 1536, - "num-inference-steps": 35, - "num-prompts": 10, - "max-concurrency": 1, - "enable-negative-prompt": true, - "baseline": { - "throughput_qps": 0.001, - "latency_p99": 1000.0, - "peak_memory_mb_max": 400000, - "peak_memory_mb_mean": 400000 - } - }, - { - "name": "mixed_resolution", - "dataset": "random", - "task": "t2i", - "num-inference-steps": 20, - "num-prompts": 10, - "max-concurrency": 1, - "enable-negative-prompt": true, - "random-request-config": [ - {"width": 512, "height": 512, "num_inference_steps": 20, "weight": 0.15}, - {"width": 768, "height": 768, "num_inference_steps": 20, "weight": 0.25}, - {"width": 1024, "height": 1024, "num_inference_steps": 25, "weight": 0.45}, - {"width": 1536, "height": 1536, "num_inference_steps": 35, "weight": 0.15} - ], - "baseline": { - "throughput_qps": 0.001, - "latency_p99": 1000.0, - "peak_memory_mb_max": 400000, - "peak_memory_mb_mean": 400000 - } - } - ] - }, - - { - "test_name": "test_sglang_qwen_image_ulysses2_cfg2_vae_patch4", - "description": "sglang: USP=2 + CFG-parallel=2 + VAE patch parallel via --use-parallel-tiling", - "server_type": "sglang", - "server_params": { - "model": "Qwen/Qwen-Image", - "serve_args": { - "num-gpus": 4, - "sp-degree": 2, - "ulysses-degree": 2, - "enable-cfg-parallel": true, - "use-parallel-tiling": true, - "dit-cpu-offload": false, - "text-encoder-cpu-offload": false, - "vae-cpu-offload": false - } - }, - "benchmark_params": [ - { - "name": "512x512_steps20", - "dataset": "random", - "task": "t2i", - "width": 512, - "height": 512, - "num-inference-steps": 20, - "num-prompts": 10, - "max-concurrency": 1, - "enable-negative-prompt": true, - "baseline": { - "throughput_qps": 0.001, - "latency_p99": 1000.0, - "peak_memory_mb_max": 400000, - "peak_memory_mb_mean": 400000 - } - }, - { - "name": "1536x1536_steps35", - "dataset": "random", - "task": "t2i", - "width": 1536, - "height": 1536, - "num-inference-steps": 35, - "num-prompts": 10, - "max-concurrency": 1, - "enable-negative-prompt": true, - "baseline": { - "throughput_qps": 0.001, - "latency_p99": 1000.0, - "peak_memory_mb_max": 400000, - "peak_memory_mb_mean": 400000 - } - }, - { - "name": "mixed_resolution", - "dataset": "random", - "task": "t2i", - "num-inference-steps": 20, - "num-prompts": 10, - "max-concurrency": 1, - "enable-negative-prompt": true, - "random-request-config": [ - {"width": 512, "height": 512, "num_inference_steps": 20, "weight": 0.15}, - {"width": 768, "height": 768, "num_inference_steps": 20, "weight": 0.25}, - {"width": 1024, "height": 1024, "num_inference_steps": 25, "weight": 0.45}, - {"width": 1536, "height": 1536, "num_inference_steps": 35, "weight": 0.15} - ], - "baseline": { - "throughput_qps": 0.001, - "latency_p99": 1000.0, - "peak_memory_mb_max": 400000, - "peak_memory_mb_mean": 400000 - } - } - ] - }, - - { - "test_name": "test_sglang_qwen_image_ulysses2_cfg2_cache_dit", - "description": "sglang: USP=2 + CFG-parallel=2 + Cache-DiT via YAML config (--cache-dit-config)", - "server_type": "sglang", - "server_params": { - "model": "Qwen/Qwen-Image", - "cache_dit_config": { - "cache_config": { - "Fn_compute_blocks": 1, - "Bn_compute_blocks": 0, - "max_warmup_steps": 4, - "residual_diff_threshold": 0.24, - "max_continuous_cached_steps": 3, - "enable_taylorseer": false, - "taylorseer_order": 1, - "scm_steps_mask_policy": "none", - "scm_steps_policy": "dynamic" - }, - "parallelism_config": { - "ulysses_size": 2, - "attention_backend": "native" - } - }, - "serve_args": { - "num-gpus": 4, - "sp-degree": 2, - "ulysses-degree": 2, - "enable-cfg-parallel": true, - "dit-cpu-offload": false, - "text-encoder-cpu-offload": false, - "vae-cpu-offload": false - } - }, - "benchmark_params": [ - { - "name": "512x512_steps20", - "dataset": "random", - "task": "t2i", - "width": 512, - "height": 512, - "num-inference-steps": 20, - "num-prompts": 10, - "max-concurrency": 1, - "enable-negative-prompt": true, - "baseline": { - "throughput_qps": 0.001, - "latency_p99": 1000.0, - "peak_memory_mb_max": 400000, - "peak_memory_mb_mean": 400000 - } - }, - { - "name": "1536x1536_steps35", - "dataset": "random", - "task": "t2i", - "width": 1536, - "height": 1536, - "num-inference-steps": 35, - "num-prompts": 10, - "max-concurrency": 1, - "enable-negative-prompt": true, - "baseline": { - "throughput_qps": 0.001, - "latency_p99": 1000.0, - "peak_memory_mb_max": 400000, - "peak_memory_mb_mean": 400000 - } - }, - { - "name": "mixed_resolution", - "dataset": "random", - "task": "t2i", - "num-inference-steps": 20, - "num-prompts": 10, - "max-concurrency": 1, - "enable-negative-prompt": true, - "random-request-config": [ - {"width": 512, "height": 512, "num_inference_steps": 20, "weight": 0.15}, - {"width": 768, "height": 768, "num_inference_steps": 20, "weight": 0.25}, - {"width": 1024, "height": 1024, "num_inference_steps": 25, "weight": 0.45}, - {"width": 1536, "height": 1536, "num_inference_steps": 35, "weight": 0.15} - ], - "baseline": { - "throughput_qps": 0.001, - "latency_p99": 1000.0, - "peak_memory_mb_max": 400000, - "peak_memory_mb_mean": 400000 - } - } - ] - } -] From 9bf8d00f1490b6dab2131ae7720edaed6fa7a5d3 Mon Sep 17 00:00:00 2001 From: "Huang, Zeyu" <11222265+fhfuih@users.noreply.github.com> Date: Thu, 26 Mar 2026 10:11:10 +0800 Subject: [PATCH 02/11] trigger nightly CI Signed-off-by: Huang, Zeyu <11222265+fhfuih@users.noreply.github.com> From 034a61f3c8b42fe977c0fad7cdd8dd2a2ab48615 Mon Sep 17 00:00:00 2001 From: "Huang, Zeyu" <11222265+fhfuih@users.noreply.github.com> Date: Thu, 26 Mar 2026 10:25:01 +0800 Subject: [PATCH 03/11] trigger nightly CI once more Signed-off-by: Huang, Zeyu <11222265+fhfuih@users.noreply.github.com> From ecb756e2619c07e8a0567ebc4de6ed407119c001 Mon Sep 17 00:00:00 2001 From: "Huang, Zeyu" <11222265+fhfuih@users.noreply.github.com> Date: Fri, 27 Mar 2026 09:05:51 +0800 Subject: [PATCH 04/11] [CI] lower Qwen Image perf test threshold Based on occassional fails such as https://buildkite.com/vllm/vllm-omni/builds/5232/steps/canvas Signed-off-by: Huang, Zeyu <11222265+fhfuih@users.noreply.github.com> --- tests/dfx/perf/tests/test_qwen_image_vllm_omni.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/dfx/perf/tests/test_qwen_image_vllm_omni.json b/tests/dfx/perf/tests/test_qwen_image_vllm_omni.json index 2921f39edd2..2b95a9a4dfd 100644 --- a/tests/dfx/perf/tests/test_qwen_image_vllm_omni.json +++ b/tests/dfx/perf/tests/test_qwen_image_vllm_omni.json @@ -132,7 +132,7 @@ {"width": 1536, "height": 1536, "num_inference_steps": 35, "weight": 0.15} ], "baseline": { - "throughput_qps": 0.29, + "throughput_qps": 0.20, "latency_p99": 8.5, "peak_memory_mb_max": 61000, "peak_memory_mb_mean": 61000 @@ -215,7 +215,7 @@ {"width": 1536, "height": 1536, "num_inference_steps": 35, "weight": 0.15} ], "baseline": { - "throughput_qps": 0.41, + "throughput_qps": 0.35, "latency_p99": 5.33, "peak_memory_mb_max": 74000, "peak_memory_mb_mean": 74000 From a29fb8939623e60eaf213e0a180ebb0aa6758ecf Mon Sep 17 00:00:00 2001 From: "Huang, Zeyu" <11222265+fhfuih@users.noreply.github.com> Date: Fri, 27 Mar 2026 10:07:50 +0800 Subject: [PATCH 05/11] Further lower threshold Based on https://buildkite.com/vllm/vllm-omni/builds/5318/steps/canvas?sid=019d2cf6-1ec5-4d1c-a30f-e1fa841a1162&tab=output Signed-off-by: Huang, Zeyu <11222265+fhfuih@users.noreply.github.com> --- tests/dfx/perf/tests/test_qwen_image_vllm_omni.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/dfx/perf/tests/test_qwen_image_vllm_omni.json b/tests/dfx/perf/tests/test_qwen_image_vllm_omni.json index 2b95a9a4dfd..66478e73bb7 100644 --- a/tests/dfx/perf/tests/test_qwen_image_vllm_omni.json +++ b/tests/dfx/perf/tests/test_qwen_image_vllm_omni.json @@ -21,7 +21,7 @@ "max-concurrency": 1, "enable-negative-prompt": true, "baseline": { - "throughput_qps": 0.40, + "throughput_qps": 0.30, "latency_p99": 2.55, "peak_memory_mb_max": 67000, "peak_memory_mb_mean": 67000 @@ -178,7 +178,7 @@ "enable-negative-prompt": true, "baseline": { "throughput_qps": 0.52, - "latency_p99": 2.36, + "latency_p99": 2.20, "peak_memory_mb_max": 67000, "peak_memory_mb_mean": 67000 } From ccdbaab4df6d117d53fad82ce36d0f1e9acf94cd Mon Sep 17 00:00:00 2001 From: Didan Deng <33117903+wtomin@users.noreply.github.com> Date: Fri, 27 Mar 2026 11:55:56 +0800 Subject: [PATCH 06/11] remove mixed resolution Signed-off-by: Didan Deng <33117903+wtomin@users.noreply.github.com> --- .../perf/tests/test_qwen_image_vllm_omni.json | 63 ------------------- 1 file changed, 63 deletions(-) diff --git a/tests/dfx/perf/tests/test_qwen_image_vllm_omni.json b/tests/dfx/perf/tests/test_qwen_image_vllm_omni.json index 66478e73bb7..eea180ad03f 100644 --- a/tests/dfx/perf/tests/test_qwen_image_vllm_omni.json +++ b/tests/dfx/perf/tests/test_qwen_image_vllm_omni.json @@ -43,27 +43,6 @@ "peak_memory_mb_max": 74000, "peak_memory_mb_mean": 74000 } - }, - { - "name": "mixed_resolution", - "dataset": "random", - "task": "t2i", - "num-inference-steps": 20, - "num-prompts": 10, - "max-concurrency": 1, - "enable-negative-prompt": true, - "random-request-config": [ - {"width": 512, "height": 512, "num_inference_steps": 20, "weight": 0.15}, - {"width": 768, "height": 768, "num_inference_steps": 20, "weight": 0.25}, - {"width": 1024, "height": 1024, "num_inference_steps": 25, "weight": 0.45}, - {"width": 1536, "height": 1536, "num_inference_steps": 35, "weight": 0.15} - ], - "baseline": { - "throughput_qps": 0.14, - "latency_p99": 25.0, - "peak_memory_mb_max": 74000, - "peak_memory_mb_mean": 74000 - } } ] }, @@ -116,27 +95,6 @@ "peak_memory_mb_max": 61000, "peak_memory_mb_mean": 61000 } - }, - { - "name": "mixed_resolution", - "dataset": "random", - "task": "t2i", - "num-inference-steps": 20, - "num-prompts": 10, - "max-concurrency": 1, - "enable-negative-prompt": true, - "random-request-config": [ - {"width": 512, "height": 512, "num_inference_steps": 20, "weight": 0.15}, - {"width": 768, "height": 768, "num_inference_steps": 20, "weight": 0.25}, - {"width": 1024, "height": 1024, "num_inference_steps": 25, "weight": 0.45}, - {"width": 1536, "height": 1536, "num_inference_steps": 35, "weight": 0.15} - ], - "baseline": { - "throughput_qps": 0.20, - "latency_p99": 8.5, - "peak_memory_mb_max": 61000, - "peak_memory_mb_mean": 61000 - } } ] }, @@ -199,27 +157,6 @@ "peak_memory_mb_max": 74000, "peak_memory_mb_mean": 74000 } - }, - { - "name": "mixed_resolution", - "dataset": "random", - "task": "t2i", - "num-inference-steps": 20, - "num-prompts": 10, - "max-concurrency": 1, - "enable-negative-prompt": true, - "random-request-config": [ - {"width": 512, "height": 512, "num_inference_steps": 20, "weight": 0.15}, - {"width": 768, "height": 768, "num_inference_steps": 20, "weight": 0.25}, - {"width": 1024, "height": 1024, "num_inference_steps": 25, "weight": 0.45}, - {"width": 1536, "height": 1536, "num_inference_steps": 35, "weight": 0.15} - ], - "baseline": { - "throughput_qps": 0.35, - "latency_p99": 5.33, - "peak_memory_mb_max": 74000, - "peak_memory_mb_mean": 74000 - } } ] } From 6eb682684fa559f6aa02aded0e6d0a6bafbe2379 Mon Sep 17 00:00:00 2001 From: Didan Deng <33117903+wtomin@users.noreply.github.com> Date: Fri, 27 Mar 2026 11:56:19 +0800 Subject: [PATCH 07/11] save server params in json Signed-off-by: Didan Deng <33117903+wtomin@users.noreply.github.com> --- tests/dfx/perf/scripts/run_diffusion_benchmark.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/dfx/perf/scripts/run_diffusion_benchmark.py b/tests/dfx/perf/scripts/run_diffusion_benchmark.py index 078eaaea33f..1bd9bf1a143 100644 --- a/tests/dfx/perf/scripts/run_diffusion_benchmark.py +++ b/tests/dfx/perf/scripts/run_diffusion_benchmark.py @@ -317,6 +317,7 @@ def _unique_server_params(configs: list[dict[str, Any]]) -> list[dict[str, Any]] "model": cfg["server_params"]["model"], "serve_args": _build_serve_args(cfg["server_params"].get("serve_args", {})), "benchmark_backend": "vllm-omni", + "server_params": cfg["server_params"], } ) return result @@ -363,6 +364,7 @@ def diffusion_server(request): print(f"\nStarting {server_type} server for test: {test_name}") with _make_server(server_cfg) as server: server.test_name = test_name + server.server_params = server_cfg["server_params"] print(f"{server_type} server started successfully") yield server print(f"{server_type} server stopping…") @@ -400,6 +402,7 @@ def run_benchmark( params: dict[str, Any], test_name: str, backend: str = "vllm-omni", + server_params: dict[str, Any] | None = None, ) -> dict[str, Any]: """Run diffusion_benchmark_serving.py as a subprocess and return parsed metrics. @@ -496,6 +499,7 @@ def run_benchmark( "test_name": test_name, "backend": backend, "timestamp": timestamp, + "server_params": server_params, "benchmark_params": params, "result": metrics, "log_file": str(log_file), @@ -561,6 +565,7 @@ def test_diffusion_performance_benchmark(diffusion_server, benchmark_params): params=params, test_name=test_name, backend=backend, + server_params=diffusion_server.server_params, ) print(f"\n{'=' * 60}") From d7eb734d356d448ab359edb4192a8d4c221e8a12 Mon Sep 17 00:00:00 2001 From: "Huang, Zeyu" <11222265+fhfuih@users.noreply.github.com> Date: Fri, 27 Mar 2026 11:53:15 +0800 Subject: [PATCH 08/11] Further adjust threshold https://buildkite.com/vllm/vllm-omni/builds/5326/steps/canvas?sid=019d2d14-786e-47e0-a10a-274764d38354&tab=output Signed-off-by: Huang, Zeyu <11222265+fhfuih@users.noreply.github.com> --- tests/dfx/perf/tests/test_qwen_image_vllm_omni.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/dfx/perf/tests/test_qwen_image_vllm_omni.json b/tests/dfx/perf/tests/test_qwen_image_vllm_omni.json index 66478e73bb7..e8de50be42d 100644 --- a/tests/dfx/perf/tests/test_qwen_image_vllm_omni.json +++ b/tests/dfx/perf/tests/test_qwen_image_vllm_omni.json @@ -22,7 +22,7 @@ "enable-negative-prompt": true, "baseline": { "throughput_qps": 0.30, - "latency_p99": 2.55, + "latency_p99": 3.50, "peak_memory_mb_max": 67000, "peak_memory_mb_mean": 67000 } @@ -178,7 +178,7 @@ "enable-negative-prompt": true, "baseline": { "throughput_qps": 0.52, - "latency_p99": 2.20, + "latency_p99": 3.50, "peak_memory_mb_max": 67000, "peak_memory_mb_mean": 67000 } From 380d67909a6c5cc30c596393345e0bdc530211ff Mon Sep 17 00:00:00 2001 From: "Huang, Zeyu" <11222265+fhfuih@users.noreply.github.com> Date: Fri, 27 Mar 2026 12:07:12 +0800 Subject: [PATCH 09/11] Replace max latency with mean; remove max memory Signed-off-by: Huang, Zeyu <11222265+fhfuih@users.noreply.github.com> --- .../perf/tests/test_qwen_image_vllm_omni.json | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/tests/dfx/perf/tests/test_qwen_image_vllm_omni.json b/tests/dfx/perf/tests/test_qwen_image_vllm_omni.json index bd6ac7144e5..230ed57e322 100644 --- a/tests/dfx/perf/tests/test_qwen_image_vllm_omni.json +++ b/tests/dfx/perf/tests/test_qwen_image_vllm_omni.json @@ -22,8 +22,7 @@ "enable-negative-prompt": true, "baseline": { "throughput_qps": 0.30, - "latency_p99": 3.50, - "peak_memory_mb_max": 67000, + "latency_mean": 3.50, "peak_memory_mb_mean": 67000 } }, @@ -39,8 +38,7 @@ "enable-negative-prompt": true, "baseline": { "throughput_qps": 0.037, - "latency_p99": 27.0, - "peak_memory_mb_max": 74000, + "latency_mean": 27.0, "peak_memory_mb_mean": 74000 } } @@ -74,8 +72,7 @@ "enable-negative-prompt": true, "baseline": { "throughput_qps": 0.43, - "latency_p99": 2.34, - "peak_memory_mb_max": 61000, + "latency_mean": 2.34, "peak_memory_mb_mean": 61000 } }, @@ -91,8 +88,7 @@ "enable-negative-prompt": true, "baseline": { "throughput_qps": 0.11, - "latency_p99": 9.1, - "peak_memory_mb_max": 61000, + "latency_mean": 9.1, "peak_memory_mb_mean": 61000 } } @@ -136,8 +132,7 @@ "enable-negative-prompt": true, "baseline": { "throughput_qps": 0.52, - "latency_p99": 3.50, - "peak_memory_mb_max": 67000, + "latency_mean": 3.50, "peak_memory_mb_mean": 67000 } }, @@ -153,8 +148,7 @@ "enable-negative-prompt": true, "baseline": { "throughput_qps": 0.17, - "latency_p99": 6.15, - "peak_memory_mb_max": 74000, + "latency_mean": 6.15, "peak_memory_mb_mean": 74000 } } From 7dd626824f66e89363ca85e5a92aa5346539ac33 Mon Sep 17 00:00:00 2001 From: "Huang, Zeyu" <11222265+fhfuih@users.noreply.github.com> Date: Fri, 27 Mar 2026 15:52:21 +0800 Subject: [PATCH 10/11] Further reduce threshold Signed-off-by: Huang, Zeyu <11222265+fhfuih@users.noreply.github.com> --- tests/dfx/perf/tests/test_qwen_image_vllm_omni.json | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/dfx/perf/tests/test_qwen_image_vllm_omni.json b/tests/dfx/perf/tests/test_qwen_image_vllm_omni.json index 230ed57e322..3064d317ef3 100644 --- a/tests/dfx/perf/tests/test_qwen_image_vllm_omni.json +++ b/tests/dfx/perf/tests/test_qwen_image_vllm_omni.json @@ -71,7 +71,7 @@ "max-concurrency": 1, "enable-negative-prompt": true, "baseline": { - "throughput_qps": 0.43, + "throughput_qps": 0.15, "latency_mean": 2.34, "peak_memory_mb_mean": 61000 } @@ -87,7 +87,7 @@ "max-concurrency": 1, "enable-negative-prompt": true, "baseline": { - "throughput_qps": 0.11, + "throughput_qps": 0.15, "latency_mean": 9.1, "peak_memory_mb_mean": 61000 } @@ -131,7 +131,7 @@ "max-concurrency": 1, "enable-negative-prompt": true, "baseline": { - "throughput_qps": 0.52, + "throughput_qps": 0.15, "latency_mean": 3.50, "peak_memory_mb_mean": 67000 } @@ -147,7 +147,7 @@ "max-concurrency": 1, "enable-negative-prompt": true, "baseline": { - "throughput_qps": 0.17, + "throughput_qps": 0.15, "latency_mean": 6.15, "peak_memory_mb_mean": 74000 } From ed64ffce6f3447e4e39ae3387ba23be40759b981 Mon Sep 17 00:00:00 2001 From: "Huang, Zeyu" <11222265+fhfuih@users.noreply.github.com> Date: Fri, 27 Mar 2026 15:56:48 +0800 Subject: [PATCH 11/11] update Signed-off-by: Huang, Zeyu <11222265+fhfuih@users.noreply.github.com> --- tests/dfx/perf/tests/test_qwen_image_vllm_omni.json | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/dfx/perf/tests/test_qwen_image_vllm_omni.json b/tests/dfx/perf/tests/test_qwen_image_vllm_omni.json index 3064d317ef3..387e874ad5f 100644 --- a/tests/dfx/perf/tests/test_qwen_image_vllm_omni.json +++ b/tests/dfx/perf/tests/test_qwen_image_vllm_omni.json @@ -71,7 +71,7 @@ "max-concurrency": 1, "enable-negative-prompt": true, "baseline": { - "throughput_qps": 0.15, + "throughput_qps": 0.1, "latency_mean": 2.34, "peak_memory_mb_mean": 61000 } @@ -87,7 +87,7 @@ "max-concurrency": 1, "enable-negative-prompt": true, "baseline": { - "throughput_qps": 0.15, + "throughput_qps": 0.1, "latency_mean": 9.1, "peak_memory_mb_mean": 61000 } @@ -131,7 +131,7 @@ "max-concurrency": 1, "enable-negative-prompt": true, "baseline": { - "throughput_qps": 0.15, + "throughput_qps": 0.1, "latency_mean": 3.50, "peak_memory_mb_mean": 67000 } @@ -147,7 +147,7 @@ "max-concurrency": 1, "enable-negative-prompt": true, "baseline": { - "throughput_qps": 0.15, + "throughput_qps": 0.1, "latency_mean": 6.15, "peak_memory_mb_mean": 74000 }