diff --git a/benchmarks/diffusion/backends.py b/benchmarks/diffusion/backends.py index f12995f6cc9..fa53f87aed7 100644 --- a/benchmarks/diffusion/backends.py +++ b/benchmarks/diffusion/backends.py @@ -1,6 +1,5 @@ import asyncio import base64 -import json import mimetypes import os import time @@ -335,263 +334,12 @@ async def async_request_v1_videos( return output -async def async_request_image_sglang( - input: RequestFuncInput, - session: aiohttp.ClientSession, - pbar: tqdm | None = None, -) -> RequestFuncOutput: - output = RequestFuncOutput() - output.start_time = time.perf_counter() - - # Check if we need to use multipart (for image edits with input images) - if input.image_paths and len(input.image_paths) > 0: - # Use multipart/form-data for image edits - data = aiohttp.FormData() - data.add_field("model", input.model) - data.add_field("prompt", input.prompt) - data.add_field("response_format", "b64_json") - - if input.width and input.height: - data.add_field("size", f"{input.width}x{input.height}") - - # Merge extra parameters - for key, value in input.extra_body.items(): - data.add_field(key, str(value)) - - # Add image file(s) - for idx, img_path in enumerate(input.image_paths): - if os.path.exists(img_path): - data.add_field( - "image", - open(img_path, "rb"), - filename=os.path.basename(img_path), - content_type="application/octet-stream", - ) - else: - output.error = f"Image file not found: {img_path}" - output.success = False - if pbar: - pbar.update(1) - return output - - try: - async with session.post(input.api_url, data=data) as response: - if response.status == 200: - resp_json = await response.json() - output.response_body = resp_json - output.success = True - if "peak_memory_mb" in resp_json: - output.peak_memory_mb = resp_json["peak_memory_mb"] - else: - output.error = f"HTTP {response.status}: {await response.text()}" - output.success = False - except Exception as e: - output.error = str(e) - output.success = False - else: - # Use JSON for 
text-to-image generation - payload = { - "model": input.model, - "prompt": input.prompt, - "n": 1, - "response_format": "b64_json", - } - - if input.width and input.height: - payload["size"] = f"{input.width}x{input.height}" - - if input.num_inference_steps: - payload["num_inference_steps"] = input.num_inference_steps - - payload.update(input.extra_body) - - try: - async with session.post(input.api_url, json=payload) as response: - if response.status == 200: - resp_json = await response.json() - output.response_body = resp_json - output.success = True - if "peak_memory_mb" in resp_json: - output.peak_memory_mb = resp_json["peak_memory_mb"] - else: - output.error = f"HTTP {response.status}: {await response.text()}" - output.success = False - except Exception as e: - output.error = str(e) - output.success = False - - output.latency = time.perf_counter() - output.start_time - - # Check SLO if defined - if input.slo_ms is not None and output.success: - output.slo_achieved = (output.latency * 1000.0) <= input.slo_ms - - if pbar: - pbar.update(1) - return output - - -async def async_request_video_sglang( - input: RequestFuncInput, - session: aiohttp.ClientSession, - pbar: tqdm | None = None, -) -> RequestFuncOutput: - output = RequestFuncOutput() - output.start_time = time.perf_counter() - - # 1. 
Submit Job - job_id = None - # Check if we need to upload images (Multipart) or just send JSON - if input.image_paths and len(input.image_paths) > 0: - # Use multipart/form-data - data = aiohttp.FormData() - data.add_field("model", input.model) - data.add_field("prompt", input.prompt) - - if input.width and input.height: - data.add_field("size", f"{input.width}x{input.height}") - - # Add extra body fields to form data if possible, or assume simple key-values - # Note: Nested dicts in extra_body might need JSON serialization if API expects it stringified - if input.extra_body: - data.add_field("extra_body", json.dumps(input.extra_body)) - - # Explicitly add fps/num_frames if they are not in extra_body (bench_serving logic overrides) - if input.num_frames: - data.add_field("num_frames", str(input.num_frames)) - if input.fps: - data.add_field("fps", str(input.fps)) - - # Add image file - # Currently only support single image upload as 'input_reference' per API spec - img_path = input.image_paths[0] - if os.path.exists(img_path): - data.add_field( - "input_reference", - open(img_path, "rb"), - filename=os.path.basename(img_path), - content_type="application/octet-stream", - ) - else: - output.error = f"Image file not found: {img_path}" - output.success = False - if pbar: - pbar.update(1) - return output - - try: - async with session.post(input.api_url, data=data) as response: - if response.status == 200: - resp_json = await response.json() - job_id = resp_json.get("id") - else: - output.error = f"Submit failed HTTP {response.status}: {await response.text()}" - output.success = False - if pbar: - pbar.update(1) - return output - except Exception as e: - output.error = f"Submit exception: {str(e)}" - output.success = False - if pbar: - pbar.update(1) - return output - - else: - # Use JSON - payload: dict[str, Any] = { - "model": input.model, - "prompt": input.prompt, - } - if input.width and input.height: - payload["size"] = f"{input.width}x{input.height}" - if 
input.num_frames: - payload["num_frames"] = input.num_frames - if input.fps: - payload["fps"] = input.fps - if input.num_inference_steps: - payload["num_inference_steps"] = input.num_inference_steps - - payload.update(input.extra_body) - - try: - async with session.post(input.api_url, json=payload) as response: - if response.status == 200: - resp_json = await response.json() - job_id = resp_json.get("id") - else: - output.error = f"Submit failed HTTP {response.status}: {await response.text()}" - output.success = False - if pbar: - pbar.update(1) - return output - except Exception as e: - output.error = f"Submit exception: {str(e)}" - output.success = False - if pbar: - pbar.update(1) - return output - - if not job_id: - output.error = "No job_id returned" - output.success = False - if pbar: - pbar.update(1) - return output - - # 2. Poll for completion - # Assuming the API returns a 'status' field. - # We construct the check URL. Assuming api_url is like .../v1/videos - # The check url should be .../v1/videos/{id} - check_url = f"{input.api_url}/{job_id}" - - while True: - try: - async with session.get(check_url) as response: - if response.status == 200: - status_data = await response.json() - status = status_data.get("status") - if status == "completed": - output.success = True - output.response_body = status_data - if "peak_memory_mb" in status_data: - output.peak_memory_mb = status_data["peak_memory_mb"] - break - elif status == "failed": - output.success = False - output.error = f"Job failed: {status_data.get('error')}" - break - else: - # queued or processing - await asyncio.sleep(1.0) - else: - output.success = False - output.error = f"Poll failed HTTP {response.status}: {await response.text()}" - break - except Exception as e: - output.success = False - output.error = f"Poll exception: {str(e)}" - break - - output.latency = time.perf_counter() - output.start_time - - # Check SLO if defined - if input.slo_ms is not None and output.success: - 
output.slo_achieved = (output.latency * 1000.0) <= input.slo_ms - - if pbar: - pbar.update(1) - return output - - backends_function_mapping = { "2i": { "vllm-omni": (async_request_chat_completions, "/v1/chat/completions"), "openai": (async_request_openai_images, "/v1/images/generations"), - "sglang": (async_request_image_sglang, "/v1/images/generations"), }, "2v": { "v1/videos": (async_request_v1_videos, "/v1/videos"), - "sglang": (async_request_video_sglang, "/v1/videos"), }, } diff --git a/benchmarks/diffusion/diffusion_benchmark_serving.py b/benchmarks/diffusion/diffusion_benchmark_serving.py index 0ae35b1ea35..aad955b0d1d 100644 --- a/benchmarks/diffusion/diffusion_benchmark_serving.py +++ b/benchmarks/diffusion/diffusion_benchmark_serving.py @@ -1,4 +1,4 @@ -# adapted from sglang and fastvideo +# adapted from fastvideo # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project @@ -1007,7 +1007,7 @@ async def limited_request_func(req, session, pbar): "--backend", type=str, default="vllm-omni", - choices=["vllm-omni", "openai", "sglang", "v1/videos"], + choices=["vllm-omni", "openai", "v1/videos"], help="Backend to target the benchmark to.", ) parser.add_argument( diff --git a/tests/dfx/perf/scripts/run_diffusion_benchmark.py b/tests/dfx/perf/scripts/run_diffusion_benchmark.py index 80b5eb03a30..1bd9bf1a143 100644 --- a/tests/dfx/perf/scripts/run_diffusion_benchmark.py +++ b/tests/dfx/perf/scripts/run_diffusion_benchmark.py @@ -1,21 +1,15 @@ """ Performance benchmark CI runner for diffusion models. 
-Supports two server backends: +Supports the vLLM-Omni server backend: - vllm-omni (default): starts DiffusionServer via vllm_omni.entrypoints.cli.main, benchmarks with diffusion_benchmark_serving.py --backend vllm-omni - - sglang: starts SglangServer via `sglang serve`, - benchmarks with diffusion_benchmark_serving.py --backend sglang A config JSON file is REQUIRED via --config-file: pytest run_diffusion_benchmark.py --config-file tests/dfx/perf/tests/test_qwen_image_vllm_omni.json - pytest run_diffusion_benchmark.py --config-file tests/dfx/perf/tests/test_qwen_image_sglang_diffusion.json -JSON config entries are distinguished by a "server_type" field ("vllm-omni" or "sglang"). -sglang entries support two additional fields under server_params: - - "env": dict of extra environment variables (e.g. SGLANG_CACHE_DIT_ENABLED) - - "cache_dit_config": dict written to a temp YAML and passed as - --cache-dit-config to sglang serve (requires cache-dit == 1.3.0) +JSON config entries may carry a "server_type" field; this runner accepts only +"vllm-omni" (the default when the field is absent) and rejects any other value. All benchmark results for a session are consolidated into a single JSON file under BENCHMARK_RESULT_DIR (override via the DIFFUSION_BENCHMARK_DIR environment variable). @@ -23,7 +17,6 @@ timestamp) together with the raw metrics returned by the benchmark script. """ -import importlib.metadata import json import os import socket @@ -286,145 +279,18 @@ def __exit__(self, *_): _kill_process_tree(self.proc.pid) -_CACHE_DIT_REQUIRED_VERSION = os.environ.get("CACHE_DIT_VERSION", "1.3.0") - - -def _check_cache_dit_version(required: str = _CACHE_DIT_REQUIRED_VERSION) -> None: - """Verify that the installed cache-dit package matches *required* exactly. - - Raises RuntimeError if the package is not installed or the version differs. - """ - try: - installed = importlib.metadata.version("cache-dit") - except importlib.metadata.PackageNotFoundError: - raise RuntimeError( - f"cache-dit is not installed. 
Please install version {required}: pip install cache-dit=={required}" - ) - if installed != required: - raise RuntimeError( - f"cache-dit version mismatch: required {required}, " - f"but found {installed}. " - f"Please install the correct version: pip install cache-dit=={required}" - ) - - -class SglangServer: - """Start a sglang serve process for diffusion benchmarking. - - Supports two Cache-DiT activation modes: - 1. Environment variable: pass env={"SGLANG_CACHE_DIT_ENABLED": "true"} - 2. YAML config file: pass cache_dit_config={...} (written to a temp - file and forwarded as --cache-dit-config; requires cache-dit >= 1.3.0) - """ - - server_type = "sglang" - - def __init__( - self, - model: str, - serve_args: list[str], - *, - port: int | None = None, - env_overrides: dict[str, str] | None = None, - cache_dit_config: dict[str, Any] | None = None, - ) -> None: - self.model = model - self.serve_args = serve_args - self.host = "127.0.0.1" - self.port = port if port is not None else _get_open_port() - self.env_overrides = env_overrides or {} - self.cache_dit_config = cache_dit_config - self.proc: subprocess.Popen | None = None - self._tmp_yaml: str | None = None - self.test_name: str = "" - if self.cache_dit_config is not None: - _check_cache_dit_version() - - @staticmethod - def _write_cache_dit_yaml(config: dict[str, Any]) -> str: - """Serialize config dict to a temp YAML file and return its path. - - Tries PyYAML first for clean block-style output; falls back to - json.dump since JSON is valid YAML and correctly handles arbitrary - nesting, lists, booleans, and null values. 
- """ - tmp = tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) - try: - import yaml # PyYAML - - yaml.dump(config, tmp, default_flow_style=False, allow_unicode=True) - except ImportError: - json.dump(config, tmp, indent=2, ensure_ascii=False) - tmp.write("\n") - tmp.close() - print(f" Cache-DiT config written to: {tmp.name}") - return tmp.name - - def _start_server(self) -> None: - env = os.environ.copy() - env.update(self.env_overrides) - - cmd = [ - "sglang", - "serve", - "--model-path", - self.model, - "--host", - self.host, - "--port", - str(self.port), - ] + self.serve_args - - if self.cache_dit_config is not None: - self._tmp_yaml = self._write_cache_dit_yaml(self.cache_dit_config) - cmd += ["--cache-dit-config", self._tmp_yaml] - - print(f"Launching SglangServer: {' '.join(cmd)}") - if self.env_overrides: - print(f" Extra env: {self.env_overrides}") - - self.proc = subprocess.Popen( - cmd, - env=env, - cwd=str(Path(__file__).parent.parent.parent.parent), - ) - _wait_for_port(self.host, self.port) - print(f"SglangServer ready on {self.host}:{self.port}") - - def __enter__(self): - self._start_server() - return self - - def __exit__(self, *_): - if self.proc: - _kill_process_tree(self.proc.pid) - if self._tmp_yaml: - try: - Path(self._tmp_yaml).unlink(missing_ok=True) - except Exception: - pass - - # --------------------------------------------------------------------------- # Config helpers # --------------------------------------------------------------------------- -def _build_serve_args(serve_args_dict: dict[str, Any], server_type: str = "vllm-omni") -> list[str]: - """Convert a serve_args dict from test.json into a flat CLI argument list. - - Boolean handling differs by server type: - - vllm-omni uses store_true/store_false style: True → add flag only, - False → omit flag entirely. - - sglang accepts explicit boolean values: always emit ``--flag true/false``. 
- """ +def _build_serve_args(serve_args_dict: dict[str, Any]) -> list[str]: + """Convert a serve_args dict from test.json into a flat CLI argument list.""" args: list[str] = [] for key, value in serve_args_dict.items(): flag = f"--{key}" if isinstance(value, bool): - if server_type == "sglang": - args.extend([flag, str(value).lower()]) - elif value: + if value: args.append(flag) elif isinstance(value, dict): args.extend([flag, json.dumps(value, separators=(",", ":"))]) @@ -442,16 +308,16 @@ def _unique_server_params(configs: list[dict[str, Any]]) -> list[dict[str, Any]] if test_name in seen: continue seen.add(test_name) - server_type = cfg.get("server_type", "vllm-omni") + if cfg.get("server_type", "vllm-omni") != "vllm-omni": + raise ValueError(f"Unsupported server_type in config: {cfg.get('server_type')}") result.append( { "test_name": test_name, - "server_type": server_type, + "server_type": "vllm-omni", "model": cfg["server_params"]["model"], - "serve_args": _build_serve_args(cfg["server_params"].get("serve_args", {}), server_type), - "env_overrides": cfg["server_params"].get("env", {}), - "cache_dit_config": cfg["server_params"].get("cache_dit_config"), - "benchmark_backend": server_type, # "vllm-omni" or "sglang" + "serve_args": _build_serve_args(cfg["server_params"].get("serve_args", {})), + "benchmark_backend": "vllm-omni", + "server_params": cfg["server_params"], } ) return result @@ -466,17 +332,10 @@ def _test_param_mapping(configs: list[dict[str, Any]]) -> dict[str, list[dict]]: return mapping -def _make_server(server_cfg: dict[str, Any]) -> DiffusionServer | SglangServer: - """Factory: return the appropriate server instance for the given config.""" +def _make_server(server_cfg: dict[str, Any]) -> DiffusionServer: + """Factory: return a vLLM-Omni diffusion server instance for the config.""" model = server_cfg["model"] serve_args = server_cfg["serve_args"] - if server_cfg["server_type"] == "sglang": - return SglangServer( - model=model, - 
serve_args=serve_args, - env_overrides=server_cfg.get("env_overrides", {}), - cache_dit_config=server_cfg.get("cache_dit_config"), - ) return DiffusionServer(model=model, serve_args=serve_args) @@ -496,7 +355,7 @@ def _make_server(server_cfg: dict[str, Any]) -> DiffusionServer | SglangServer: @pytest.fixture(scope="module") def diffusion_server(request): - """Start one server (vllm-omni or sglang) per unique test configuration.""" + """Start one vLLM-Omni server per unique test configuration.""" with _server_lock: server_cfg: dict[str, Any] = request.param test_name = server_cfg["test_name"] @@ -505,6 +364,7 @@ def diffusion_server(request): print(f"\nStarting {server_type} server for test: {test_name}") with _make_server(server_cfg) as server: server.test_name = test_name + server.server_params = server_cfg["server_params"] print(f"{server_type} server started successfully") yield server print(f"{server_type} server stopping…") @@ -542,6 +402,7 @@ def run_benchmark( params: dict[str, Any], test_name: str, backend: str = "vllm-omni", + server_params: dict[str, Any] | None = None, ) -> dict[str, Any]: """Run diffusion_benchmark_serving.py as a subprocess and return parsed metrics. 
@@ -638,6 +499,7 @@ def run_benchmark( "test_name": test_name, "backend": backend, "timestamp": timestamp, + "server_params": server_params, "benchmark_params": params, "result": metrics, "log_file": str(log_file), @@ -694,7 +556,7 @@ def test_diffusion_performance_benchmark(diffusion_server, benchmark_params): """ test_name = benchmark_params["test_name"] params = benchmark_params["params"] - backend = diffusion_server.server_type # "vllm-omni" or "sglang" + backend = diffusion_server.server_type # "vllm-omni" result = run_benchmark( host=diffusion_server.host, @@ -703,6 +565,7 @@ def test_diffusion_performance_benchmark(diffusion_server, benchmark_params): params=params, test_name=test_name, backend=backend, + server_params=diffusion_server.server_params, ) print(f"\n{'=' * 60}") diff --git a/tests/dfx/perf/tests/test_qwen_image_sglang_diffusion.json b/tests/dfx/perf/tests/test_qwen_image_sglang_diffusion.json deleted file mode 100644 index 33119e9028a..00000000000 --- a/tests/dfx/perf/tests/test_qwen_image_sglang_diffusion.json +++ /dev/null @@ -1,241 +0,0 @@ -[ - { - "test_name": "test_sglang_qwen_image_single_device", - "description": "sglang: single-device baseline (no parallelism)", - "server_type": "sglang", - "server_params": { - "model": "Qwen/Qwen-Image", - "serve_args": { - "num-gpus": 1, - "dit-cpu-offload": false, - "text-encoder-cpu-offload": false, - "vae-cpu-offload": false - } - }, - "benchmark_params": [ - { - "name": "512x512_steps20", - "dataset": "random", - "task": "t2i", - "width": 512, - "height": 512, - "num-inference-steps": 20, - "num-prompts": 10, - "max-concurrency": 1, - "enable-negative-prompt": true, - "baseline": { - "throughput_qps": 0.001, - "latency_p99": 1000.0, - "peak_memory_mb_max": 400000, - "peak_memory_mb_mean": 400000 - } - }, - { - "name": "1536x1536_steps35", - "dataset": "random", - "task": "t2i", - "width": 1536, - "height": 1536, - "num-inference-steps": 35, - "num-prompts": 10, - "max-concurrency": 1, - 
"enable-negative-prompt": true, - "baseline": { - "throughput_qps": 0.001, - "latency_p99": 1000.0, - "peak_memory_mb_max": 400000, - "peak_memory_mb_mean": 400000 - } - }, - { - "name": "mixed_resolution", - "dataset": "random", - "task": "t2i", - "num-inference-steps": 20, - "num-prompts": 10, - "max-concurrency": 1, - "enable-negative-prompt": true, - "random-request-config": [ - {"width": 512, "height": 512, "num_inference_steps": 20, "weight": 0.15}, - {"width": 768, "height": 768, "num_inference_steps": 20, "weight": 0.25}, - {"width": 1024, "height": 1024, "num_inference_steps": 25, "weight": 0.45}, - {"width": 1536, "height": 1536, "num_inference_steps": 35, "weight": 0.15} - ], - "baseline": { - "throughput_qps": 0.001, - "latency_p99": 1000.0, - "peak_memory_mb_max": 400000, - "peak_memory_mb_mean": 400000 - } - } - ] - }, - - { - "test_name": "test_sglang_qwen_image_ulysses2_cfg2_vae_patch4", - "description": "sglang: USP=2 + CFG-parallel=2 + VAE patch parallel via --use-parallel-tiling", - "server_type": "sglang", - "server_params": { - "model": "Qwen/Qwen-Image", - "serve_args": { - "num-gpus": 4, - "sp-degree": 2, - "ulysses-degree": 2, - "enable-cfg-parallel": true, - "use-parallel-tiling": true, - "dit-cpu-offload": false, - "text-encoder-cpu-offload": false, - "vae-cpu-offload": false - } - }, - "benchmark_params": [ - { - "name": "512x512_steps20", - "dataset": "random", - "task": "t2i", - "width": 512, - "height": 512, - "num-inference-steps": 20, - "num-prompts": 10, - "max-concurrency": 1, - "enable-negative-prompt": true, - "baseline": { - "throughput_qps": 0.001, - "latency_p99": 1000.0, - "peak_memory_mb_max": 400000, - "peak_memory_mb_mean": 400000 - } - }, - { - "name": "1536x1536_steps35", - "dataset": "random", - "task": "t2i", - "width": 1536, - "height": 1536, - "num-inference-steps": 35, - "num-prompts": 10, - "max-concurrency": 1, - "enable-negative-prompt": true, - "baseline": { - "throughput_qps": 0.001, - "latency_p99": 1000.0, - 
"peak_memory_mb_max": 400000, - "peak_memory_mb_mean": 400000 - } - }, - { - "name": "mixed_resolution", - "dataset": "random", - "task": "t2i", - "num-inference-steps": 20, - "num-prompts": 10, - "max-concurrency": 1, - "enable-negative-prompt": true, - "random-request-config": [ - {"width": 512, "height": 512, "num_inference_steps": 20, "weight": 0.15}, - {"width": 768, "height": 768, "num_inference_steps": 20, "weight": 0.25}, - {"width": 1024, "height": 1024, "num_inference_steps": 25, "weight": 0.45}, - {"width": 1536, "height": 1536, "num_inference_steps": 35, "weight": 0.15} - ], - "baseline": { - "throughput_qps": 0.001, - "latency_p99": 1000.0, - "peak_memory_mb_max": 400000, - "peak_memory_mb_mean": 400000 - } - } - ] - }, - - { - "test_name": "test_sglang_qwen_image_ulysses2_cfg2_cache_dit", - "description": "sglang: USP=2 + CFG-parallel=2 + Cache-DiT via YAML config (--cache-dit-config)", - "server_type": "sglang", - "server_params": { - "model": "Qwen/Qwen-Image", - "cache_dit_config": { - "cache_config": { - "Fn_compute_blocks": 1, - "Bn_compute_blocks": 0, - "max_warmup_steps": 4, - "residual_diff_threshold": 0.24, - "max_continuous_cached_steps": 3, - "enable_taylorseer": false, - "taylorseer_order": 1, - "scm_steps_mask_policy": "none", - "scm_steps_policy": "dynamic" - }, - "parallelism_config": { - "ulysses_size": 2, - "attention_backend": "native" - } - }, - "serve_args": { - "num-gpus": 4, - "sp-degree": 2, - "ulysses-degree": 2, - "enable-cfg-parallel": true, - "dit-cpu-offload": false, - "text-encoder-cpu-offload": false, - "vae-cpu-offload": false - } - }, - "benchmark_params": [ - { - "name": "512x512_steps20", - "dataset": "random", - "task": "t2i", - "width": 512, - "height": 512, - "num-inference-steps": 20, - "num-prompts": 10, - "max-concurrency": 1, - "enable-negative-prompt": true, - "baseline": { - "throughput_qps": 0.001, - "latency_p99": 1000.0, - "peak_memory_mb_max": 400000, - "peak_memory_mb_mean": 400000 - } - }, - { - "name": 
"1536x1536_steps35", - "dataset": "random", - "task": "t2i", - "width": 1536, - "height": 1536, - "num-inference-steps": 35, - "num-prompts": 10, - "max-concurrency": 1, - "enable-negative-prompt": true, - "baseline": { - "throughput_qps": 0.001, - "latency_p99": 1000.0, - "peak_memory_mb_max": 400000, - "peak_memory_mb_mean": 400000 - } - }, - { - "name": "mixed_resolution", - "dataset": "random", - "task": "t2i", - "num-inference-steps": 20, - "num-prompts": 10, - "max-concurrency": 1, - "enable-negative-prompt": true, - "random-request-config": [ - {"width": 512, "height": 512, "num_inference_steps": 20, "weight": 0.15}, - {"width": 768, "height": 768, "num_inference_steps": 20, "weight": 0.25}, - {"width": 1024, "height": 1024, "num_inference_steps": 25, "weight": 0.45}, - {"width": 1536, "height": 1536, "num_inference_steps": 35, "weight": 0.15} - ], - "baseline": { - "throughput_qps": 0.001, - "latency_p99": 1000.0, - "peak_memory_mb_max": 400000, - "peak_memory_mb_mean": 400000 - } - } - ] - } -] diff --git a/tests/dfx/perf/tests/test_qwen_image_vllm_omni.json b/tests/dfx/perf/tests/test_qwen_image_vllm_omni.json index 2921f39edd2..387e874ad5f 100644 --- a/tests/dfx/perf/tests/test_qwen_image_vllm_omni.json +++ b/tests/dfx/perf/tests/test_qwen_image_vllm_omni.json @@ -21,9 +21,8 @@ "max-concurrency": 1, "enable-negative-prompt": true, "baseline": { - "throughput_qps": 0.40, - "latency_p99": 2.55, - "peak_memory_mb_max": 67000, + "throughput_qps": 0.30, + "latency_mean": 3.50, "peak_memory_mb_mean": 67000 } }, @@ -39,29 +38,7 @@ "enable-negative-prompt": true, "baseline": { "throughput_qps": 0.037, - "latency_p99": 27.0, - "peak_memory_mb_max": 74000, - "peak_memory_mb_mean": 74000 - } - }, - { - "name": "mixed_resolution", - "dataset": "random", - "task": "t2i", - "num-inference-steps": 20, - "num-prompts": 10, - "max-concurrency": 1, - "enable-negative-prompt": true, - "random-request-config": [ - {"width": 512, "height": 512, "num_inference_steps": 20, 
"weight": 0.15}, - {"width": 768, "height": 768, "num_inference_steps": 20, "weight": 0.25}, - {"width": 1024, "height": 1024, "num_inference_steps": 25, "weight": 0.45}, - {"width": 1536, "height": 1536, "num_inference_steps": 35, "weight": 0.15} - ], - "baseline": { - "throughput_qps": 0.14, - "latency_p99": 25.0, - "peak_memory_mb_max": 74000, + "latency_mean": 27.0, "peak_memory_mb_mean": 74000 } } @@ -94,9 +71,8 @@ "max-concurrency": 1, "enable-negative-prompt": true, "baseline": { - "throughput_qps": 0.43, - "latency_p99": 2.34, - "peak_memory_mb_max": 61000, + "throughput_qps": 0.1, + "latency_mean": 2.34, "peak_memory_mb_mean": 61000 } }, @@ -111,30 +87,8 @@ "max-concurrency": 1, "enable-negative-prompt": true, "baseline": { - "throughput_qps": 0.11, - "latency_p99": 9.1, - "peak_memory_mb_max": 61000, - "peak_memory_mb_mean": 61000 - } - }, - { - "name": "mixed_resolution", - "dataset": "random", - "task": "t2i", - "num-inference-steps": 20, - "num-prompts": 10, - "max-concurrency": 1, - "enable-negative-prompt": true, - "random-request-config": [ - {"width": 512, "height": 512, "num_inference_steps": 20, "weight": 0.15}, - {"width": 768, "height": 768, "num_inference_steps": 20, "weight": 0.25}, - {"width": 1024, "height": 1024, "num_inference_steps": 25, "weight": 0.45}, - {"width": 1536, "height": 1536, "num_inference_steps": 35, "weight": 0.15} - ], - "baseline": { - "throughput_qps": 0.29, - "latency_p99": 8.5, - "peak_memory_mb_max": 61000, + "throughput_qps": 0.1, + "latency_mean": 9.1, "peak_memory_mb_mean": 61000 } } @@ -177,9 +131,8 @@ "max-concurrency": 1, "enable-negative-prompt": true, "baseline": { - "throughput_qps": 0.52, - "latency_p99": 2.36, - "peak_memory_mb_max": 67000, + "throughput_qps": 0.1, + "latency_mean": 3.50, "peak_memory_mb_mean": 67000 } }, @@ -194,30 +147,8 @@ "max-concurrency": 1, "enable-negative-prompt": true, "baseline": { - "throughput_qps": 0.17, - "latency_p99": 6.15, - "peak_memory_mb_max": 74000, - 
"peak_memory_mb_mean": 74000 - } - }, - { - "name": "mixed_resolution", - "dataset": "random", - "task": "t2i", - "num-inference-steps": 20, - "num-prompts": 10, - "max-concurrency": 1, - "enable-negative-prompt": true, - "random-request-config": [ - {"width": 512, "height": 512, "num_inference_steps": 20, "weight": 0.15}, - {"width": 768, "height": 768, "num_inference_steps": 20, "weight": 0.25}, - {"width": 1024, "height": 1024, "num_inference_steps": 25, "weight": 0.45}, - {"width": 1536, "height": 1536, "num_inference_steps": 35, "weight": 0.15} - ], - "baseline": { - "throughput_qps": 0.41, - "latency_p99": 5.33, - "peak_memory_mb_max": 74000, + "throughput_qps": 0.1, + "latency_mean": 6.15, "peak_memory_mb_mean": 74000 } }