diff --git a/benchmarks/diffusion/backends.py b/benchmarks/diffusion/backends.py index 6097cdccca9..f64b0e350ab 100644 --- a/benchmarks/diffusion/backends.py +++ b/benchmarks/diffusion/backends.py @@ -1,3 +1,4 @@ +import asyncio import base64 import mimetypes import os @@ -206,7 +207,140 @@ async def async_request_openai_images( return output +async def async_request_v1_videos( + input: RequestFuncInput, + session: aiohttp.ClientSession, + pbar: tqdm | None = None, +) -> RequestFuncOutput: + output = RequestFuncOutput() + output.start_time = time.perf_counter() + + files = dict(input.extra_body) + if input.prompt: + files.setdefault("prompt", input.prompt) + if input.width and input.height: + files.setdefault("height", input.height) + files.setdefault("width", input.width) + if input.num_frames: + files.setdefault("num_frames", input.num_frames) + if input.num_inference_steps: + files.setdefault("num_inference_steps", input.num_inference_steps) + if input.seed is not None: + files.setdefault("seed", input.seed) + if input.fps: + files.setdefault("fps", input.fps) + + form = aiohttp.FormData() + for k, v in files.items(): + form.add_field(k, str(v)) + + image_file = None + if input.image_paths and len(input.image_paths) > 0: + image_path = input.image_paths[0] + image_file = open(image_path, "rb") + form.add_field( + "input_reference", + image_file, + filename=os.path.basename(image_path), + content_type="application/octet-stream", + ) + + job_id = None + job_status = None + poll_json = {} + resp_json = {} + + try: + # invoke a post request (POST /v1/videos) + async with session.post(input.api_url, data=form) as response: + if response.status == 200: + resp_json = await response.json() + job_id = resp_json.get("id") + job_status = resp_json.get("status") + if not job_id or not job_status: + output.error = "API response missing job 'id' or 'status' field." + output.success = False + return output + else: + output.error = f"HTTP {response.status}: {await response.text()}" + output.success = False + return output + + # invoke a poll request (GET /v1/videos/{video_id}) + poll_interval = 2.0 # Unit(s) + timeout_seconds = 600.0 + deadline = time.perf_counter() + timeout_seconds + job_url = f"{input.api_url}/{job_id}" + + while job_status not in {"completed", "failed"}: + await asyncio.sleep(poll_interval) + + async with session.get(job_url) as poll_response: + if poll_response.status != 200: + output.error = f"Polling failed HTTP {poll_response.status}: {await poll_response.text()}" + output.success = False + return output + + poll_json = await poll_response.json() + job_status = poll_json.get("status") + + if time.perf_counter() >= deadline: + output.error = f"Timed out waiting for video job {job_id} to complete." + output.success = False + return output + + if job_status == "failed": + output.error = f"Video job failed: {poll_json}" + output.success = False + return output + + # invoke a get request (GET /v1/videos/{video_id}/content) + content_url = f"{job_url}/content" + async with session.get(content_url) as content_response: + if content_response.status != 200: + output.error = ( + f"Content retrieval failed HTTP {content_response.status}: {await content_response.text()}" + ) + output.success = False + return output + + video_bytes = await content_response.read() + output.response_body = video_bytes + output.success = True + if "peak_memory_mb" in poll_json: + output.peak_memory_mb = poll_json["peak_memory_mb"] + elif "peak_memory_mb" in resp_json: + output.peak_memory_mb = resp_json["peak_memory_mb"] + except Exception as e: + output.error = str(e) + output.success = False + finally: + if image_file is not None: + image_file.close() + + if job_id is not None: + try: + async with session.delete(f"{input.api_url}/{job_id}") as _: + pass + except Exception as e: + print(f"Failed to clean up video job {job_id}: {e}") + + output.latency = time.perf_counter() - output.start_time + + if output.success and input.slo_ms is not None: + output.slo_achieved = (output.latency * 1000.0) <= float(input.slo_ms) + + if pbar: + pbar.update(1) + return output + + backends_function_mapping = { - "vllm-omni": (async_request_chat_completions, "/v1/chat/completions"), - "openai": (async_request_openai_images, "/v1/images/generations"), + "2i": { + "vllm-omni": (async_request_chat_completions, "/v1/chat/completions"), + "openai": (async_request_openai_images, "/v1/images/generations"), + }, + "2v": { + "v1/videos": (async_request_v1_videos, "/v1/videos"), + }, } diff --git a/benchmarks/diffusion/diffusion_benchmark_serving.py b/benchmarks/diffusion/diffusion_benchmark_serving.py index a00e975b0a2..602f9938e69 100644 --- a/benchmarks/diffusion/diffusion_benchmark_serving.py +++ b/benchmarks/diffusion/diffusion_benchmark_serving.py @@ -9,6 +9,7 @@ Supports multiple backends: - vllm-omni: Uses /v1/chat/completions endpoint (default) - openai: Uses /v1/images/generations endpoint + - v1/videos: Use /v1/videos endpoint Usage: # Video (vllm-omni backend) @@ -28,6 +29,16 @@ --backend vllm-omni --dataset vbench --task t2i --num-prompts 10 \ --height 1024 --width 1024 + python3 benchmarks/diffusion/diffusion_benchmark_serving.py \ + --backend vllm-omni --dataset random --task t2i --num-prompts 1 \ + --max-concurrency 1 --enable-negative-prompt \ + --random-request-config '[ + {"width":512,"height":512,"num_inference_steps":20,"weight":0.15}, + {"width":768,"height":768,"num_inference_steps":20,"weight":0.25}, + {"width":1024,"height":1024,"num_inference_steps":25,"weight":0.45}, + {"width":1536,"height":1536,"num_inference_steps":35,"weight":0.15} + ]' + i2i: python3 benchmarks/diffusion/diffusion_benchmark_serving.py \ --backend vllm-omni --dataset vbench --task i2i --num-prompts 10 @@ -38,6 +49,16 @@ --backend openai --dataset vbench --task t2i --num-prompts 10 \ --height 1024 --width 1024 --port 3000 + # Video (v1/vedeos) + t2v: + python3 benchmarks/diffusion/diffusion_benchmark_serving.py \ + --backend v1/videos --dataset random --task t2v --num-prompts 1 \ + --max-concurrency 1 --enable-negative-prompt \ + --random-request-config '[ + {"width":854,"height":480,"num_inference_steps":18,"num_frames":120,"fps":24,"weight":1} + ]' + + """ import argparse @@ -45,7 +66,10 @@ import asyncio import glob import json +import logging import os +import random +import tempfile import time import uuid from abc import ABC, abstractmethod @@ -57,8 +81,11 @@ import numpy as np import requests from backends import RequestFuncInput, RequestFuncOutput, backends_function_mapping +from PIL import Image from tqdm.asyncio import tqdm +logger = logging.getLogger(__name__) + class BaseDataset(ABC): def __init__(self, args, api_url: str, model: str): @@ -516,26 +543,66 @@ def get_requests(self) -> list[RequestFuncInput]: class RandomDataset(BaseDataset): - def __init__(self, args, api_url: str, model: str): - self.args = args - self.api_url = api_url - self.model = model + def __init__(self, args, api_url: str, model: str, enable_negative_prompt: bool = False): + super().__init__(args, api_url, model) self.num_prompts = args.num_prompts + self.enable_negative_prompt = enable_negative_prompt + self.random_request_config = getattr(args, "random_request_config", None) + if self.random_request_config: + self.random_request_config = json.loads(self.random_request_config) + self._weights = [p["weight"] for p in self.random_request_config] + + self.random_request_config = [ + {k: v for k, v in p.items() if k != "weight"} for p in self.random_request_config + ] + + seed = getattr(args, "random_request_seed", 42) + self._rng = random.Random(seed) + + self._sampled_requests = self._rng.choices( + self.random_request_config, + weights=self._weights, + k=self.num_prompts, + ) + else: + self._sampled_requests = None + + # Random image generate + if self.args.task in ["i2v", "ti2v", "ti2i", "i2i"]: + img = Image.new("RGB", (512, 512), (255, 255, 255)) + + image_path = os.path.join(tempfile.gettempdir(), "diffusion_benchmark_random_image.png") + self._random_image_path = [image_path] + img.save(image_path) + else: + self._random_image_path = None def __len__(self) -> int: return self.num_prompts def __getitem__(self, idx: int) -> RequestFuncInput: + extra_body = {} + if self.enable_negative_prompt: + extra_body["negative_prompt"] = f"Negative prompt {idx} for benchmarking diffusion models" + + params = { + "width": self.args.width, + "height": self.args.height, + "num_frames": self.args.num_frames, + "num_inference_steps": self.args.num_inference_steps, + "fps": self.args.fps, + } + if self._sampled_requests: + profile = self._sampled_requests[idx] + params.update(profile) return RequestFuncInput( prompt=f"Random prompt {idx} for benchmarking diffusion models", api_url=self.api_url, model=self.model, - width=self.args.width, - height=self.args.height, - num_frames=self.args.num_frames, - num_inference_steps=self.args.num_inference_steps, seed=self.args.seed, - fps=self.args.fps, + extra_body=extra_body, + image_paths=self._random_image_path, + **params, ) def get_requests(self) -> list[RequestFuncInput]: @@ -686,6 +753,7 @@ def calculate_metrics( "latency_mean": np.mean(latencies) if latencies else 0, "latency_median": np.median(latencies) if latencies else 0, "latency_p99": np.percentile(latencies, 99) if latencies else 0, + "latency_p95": np.percentile(latencies, 95) if latencies else 0, "latency_p50": np.percentile(latencies, 50) if latencies else 0, "peak_memory_mb_max": max(peak_memories) if peak_memories else 0, "peak_memory_mb_mean": np.mean(peak_memories) if peak_memories else 0, @@ -742,8 +810,32 @@ async def benchmark(args): if args.base_url is None: args.base_url = f"http://{args.host}:{args.port}" + VIDEO_TASKS = {"t2v", "i2v", "ti2v"} + IMAGE_TASKS = {"t2i", "i2i", "ti2i"} + + if args.task in VIDEO_TASKS: + task_type = "2v" + elif args.task in IMAGE_TASKS: + task_type = "2i" + else: + raise ValueError( + f"Unsupported task: '{args.task}'. " + f"Valid video tasks: {sorted(VIDEO_TASKS)}, " + f"Valid image tasks: {sorted(IMAGE_TASKS)}" + ) + + valid_backends = sorted(backends_function_mapping[task_type].keys()) + + if args.backend not in valid_backends: + logger.error( + f"Invalid backend '{args.backend}' for task '{args.task}' (task type: '{task_type}').\n" + f"Valid backends for this task type: {valid_backends}\n" + f"Example usage: --task {args.task} --backend {valid_backends[0]}" + ) + raise ValueError("Backend validation failed. See log above for valid options.") + # Setup API URL and request function based on backend - request_func, api_url = backends_function_mapping[args.backend] + request_func, api_url = backends_function_mapping[task_type][args.backend] api_url = f"{args.base_url}{api_url}" if args.dataset == "vbench": @@ -751,7 +843,7 @@ async def benchmark(args): elif args.dataset == "trace": dataset = TraceDataset(args, api_url, args.model) elif args.dataset == "random": - dataset = RandomDataset(args, api_url, args.model) + dataset = RandomDataset(args, api_url, args.model, args.enable_negative_prompt) else: raise ValueError(f"Unknown dataset: {args.dataset}") @@ -847,6 +939,7 @@ async def limited_request_func(req, session, pbar): print("{:<40} {:<15.4f}".format("Latency Mean (s):", metrics["latency_mean"])) print("{:<40} {:<15.4f}".format("Latency Median (s):", metrics["latency_median"])) print("{:<40} {:<15.4f}".format("Latency P99 (s):", metrics["latency_p99"])) + print("{:<40} {:<15.4f}".format("Latency P95 (s):", metrics["latency_p95"])) if args.slo: print(f"{'-' * 50}") @@ -883,7 +976,7 @@ async def limited_request_func(req, session, pbar): "--backend", type=str, default="vllm-omni", - choices=["vllm-omni", "openai"], + choices=["vllm-omni", "openai", "v1/videos"], help="Backend to target the benchmark to.", ) parser.add_argument( @@ -972,6 +1065,25 @@ async def limited_request_func(req, session, pbar): help="SLO target multiplier: slo_ms = estimated_exec_time_ms * slo_scale (default: 3).", ) parser.add_argument("--disable-tqdm", action="store_true", help="Disable progress bar.") + parser.add_argument( + "--enable-negative-prompt", + action="store_true", + default=False, + help="Generate negative prompts when using the random dataset.", + ) + parser.add_argument( + "--random-request-config", + type=str, + default=None, + help=( + "JSON string defining random request profiles. " + "Each profile may contain: width, height, num_inference_steps, etc. " + "The 'weight' field controls sampling probability (relative weight). " + "Example: " + '[{"width":512,"height":512,"num_inference_steps":20,"weight":0.15},' + '{"width":768,"height":768,"num_inference_steps":20,"weight":0.85}]' + ), + ) args = parser.parse_args() diff --git a/benchmarks/diffusion/performance_dashboard/qwen_image_serving_performance.md b/benchmarks/diffusion/performance_dashboard/qwen_image_serving_performance.md new file mode 100644 index 00000000000..ce022f1a8d9 --- /dev/null +++ b/benchmarks/diffusion/performance_dashboard/qwen_image_serving_performance.md @@ -0,0 +1,169 @@ +# Qwen-Image Serving Performance Dashboard + +This document describes how to deploy and benchmark **Qwen-Image** using vLLM-Omni. It includes service startup configuration, acceleration-related options, benchmark methodology, dataset settings, and performance results. + +--- + +# 1. Overview + +Qwen-Image is a multimodal text-to-image generation model served through the vLLM-Omni infrastructure. + +This document covers: + +* Service launch configuration (including acceleration options) +* Benchmark scripts and usage +* Dataset and workload settings +* Performance measurement results +* Reproducibility guidelines + +--- + +# 2. Test Environment +| Component | Specification | +|------------|----------------| +| GPU | NVIDIA A100-SXM4-80GB | +| Diffusion Attention Backend | FlashAttention | + +# 3. Service Launch Configuration + +## 3.1 Basic Serving Command + +```bash +vllm serve Qwen/Qwen-Image --omni \ + --port 8091 +``` + +## 3.2 Key Parameters + +| Parameter | Description | +| --------------------- | ------------------------ | +| `--cfg-parallel-size` | CFG parallelism degree | +| `--ulysses-degree` | Ulysses parallel degree | +| `--vae-patch-parallel-size` | VAE parallel degree | +| `--tensor-parallel-size` | Tensor parallelism degree | + +Record these parameters when reporting performance results. + +--- + +# 4. Benchmark Script + +## 4.1 Benchmark Entry + +```bash +python benchmarks/diffusion/diffusion_benchmark_serving.py \ + --backend vllm-omni \ + --dataset \ + --task t2i \ + --num-prompts \ + --max-concurrency \ + --enable-negative-prompt \ + --random-request-config +``` + +## 4.2 Key Benchmark Arguments + +| Parameter | Description | +| ---------------------- | --------------------------------- | +| `--backend` | Serving backend (use `vllm-omni`) | +| `--dataset` | Dataset name (`random` or custom) | +| `--task` | Task type (e.g., `t2i`) | +| `--num-prompts` | Total number of requests | +| `--max-concurrency` | Client-side concurrency | +| `--random-request-config`| JSON string defining random request | + +--- + +# 5. Dataset & Workload Settings + +## 5.1 Recommended Evaluation Configurations + +### Dataset A ( 512 Resolution) + +* Dataset: `random` +* Task: t2i +* Concurrency: 1 +* Mix Resolution +``` +[ + {"width":512,"height":512,"num_inference_steps":20,"weight":1} +] +``` + +### Dataset B (1536 Resolution) + +* Dataset: `random` +* Task: t2i +* Concurrency: 1 +* Mix Resolution +``` +[ + {"width":1536,"height":1536,"num_inference_steps":35,"weight":1} +] +``` + +### Dataset C (Mix Resolution) + +* Dataset: `random` +* Task: t2i +* Concurrency: 1 +* Mix Resolution +``` +[ + {"width":512,"height":512,"num_inference_steps":20,"weight":0.15}, + {"width":768,"height":768,"num_inference_steps":20,"weight":0.25}, + {"width":1024,"height":1024,"num_inference_steps":25,"weight":0.45}, + {"width":1536,"height":1536,"num_inference_steps":35,"weight":0.15} +] +``` +--- + +## 5.2 Example Benchmark Command + +```bash +python benchmarks/diffusion/diffusion_benchmark_serving.py \ + --backend vllm-omni \ + --dataset random \ + --task t2i \ + --num-prompts 1 \ + --max-concurrency 1 \ + --enable-negative-prompt \ + --random-request-config '[ + {"width":512,"height":512,"num_inference_steps":20,"weight":1} + ]' +``` + +--- + +# 6. Performance Metrics + +The following metrics are collected during benchmarking: + +| Metric | Description | Unit | +| ------------------ | ----------------------------- | ------- | +| Mean Latency | Mean of latency | seconds | +| P99 Latency | P99 of latency | seconds | + +--- + +# 7. Performance Results + +| Dataset Configuration | Max Concur. | CFG | Usp | Tp | VAE Parallel | Mean Latency (s) | P99 Latency (s) | +|-----------------------|-----|-----|-----|----|--------------|------------------|------------------| +| Dataset A | 1 | 2 | 2 | Off | Off | 2.2087 | 2.2087 | +| Dataset B | 1 | 2 | 2 | Off | Off | 19.6739 | 19.6739 | +| Dataset C | 1 | 2 | 2 | Off | Off | 5.67259 | 18.6234 | +--- + +# 8. Reproducibility Checklist + +To ensure consistent and comparable benchmark results: + +* Record GPU type +* Record parallel configuration +* Record benchmark parameters (resolution, concurrency, number of prompts) +* Ensure no background workload on GPUs during testing + +--- + +This document serves as the official Qwen-Image serving performance reference under vLLM-Omni. diff --git a/benchmarks/diffusion/performance_dashboard/wan_2_2_serving_performance.md b/benchmarks/diffusion/performance_dashboard/wan_2_2_serving_performance.md new file mode 100644 index 00000000000..9d6c40ece36 --- /dev/null +++ b/benchmarks/diffusion/performance_dashboard/wan_2_2_serving_performance.md @@ -0,0 +1,170 @@ +# Wan2.2 Serving Performance Dashboard + +This document describes how to deploy and benchmark **Wan-AI/Wan2.2-T2V-A14B-Diffusers** using vLLM-Omni. It includes service startup configuration, acceleration-related options, benchmark methodology, dataset settings, and performance results. + +--- + +# 1. Overview + +Wan-AI/Wan2.2-T2V-A14B-Diffusers is a multimodal text-to-video generation model served through the vLLM-Omni infrastructure. + +This document covers: + +* Service launch configuration (including acceleration options) +* Benchmark scripts and usage +* Dataset and workload settings +* Performance measurement results +* Reproducibility guidelines + +--- + +# 2. Test Environment +| Component | Specification | +|------------|----------------| +| GPU | NVIDIA A100-SXM4-80GB | +| Diffusion Attention Backend | FlashAttention | + +# 3. Service Launch Configuration + +## 3.1 Basic Serving Command + +```bash +vllm serve Wan-AI/Wan2.2-T2V-A14B-Diffusers --omni \ + --port 8091 +``` + +## 3.2 Key Parameters + +| Parameter | Description | +| --------------------- | ------------------------ | +| `--cfg-parallel-size` | CFG parallelism degree | +| `--ulysses-degree` | Ulysses parallel degree | +| `--vae-patch-parallel-size` | VAE parallel degree | +| `--tensor-parallel-size` | Tensor parallelism degree | +| `--use-hsdp` | Enable Hybrid Sharded Data Parallel to shard model weights across GPUs | + +Record these parameters when reporting performance results. + +--- + +# 4. Benchmark Script + +## 4.1 Benchmark Entry + +```bash +python benchmarks/diffusion/diffusion_benchmark_serving.py \ + --backend v1/videos \ + --dataset \ + --task t2v \ + --num-prompts \ + --max-concurrency \ + --enable-negative-prompt \ + --random-request-config +``` + +## 4.2 Key Benchmark Arguments + +| Parameter | Description | +| ---------------------- | --------------------------------- | +| `--backend` | Serving backend (use `v1/videos`) | +| `--dataset` | Dataset name (`random` or custom) | +| `--task` | Task type (e.g., `t2v`) | +| `--num-prompts` | Total number of requests | +| `--max-concurrency` | Client-side concurrency | +| `--random-request-config`| JSON string defining random request | + +--- + +# 5. Dataset & Workload Settings + +## 5.1 Recommended Evaluation Configurations + +### Dataset A (480p) + +* Dataset: `random` +* Task: t2v +* Concurrency: 1 +* Mix Resolution +``` +[ + {"width":854,"height":480,"num_inference_steps":3,"num_frames":80,"fps":16,"weight":1} +] +``` +### Dataset B (720p) + +* Dataset: `random` +* Task: t2v +* Concurrency: 1 +* Mix Resolution +``` +[ + {"width":1280,"height":720,"num_inference_steps":6,"num_frames":80,"fps":16,"weight":1} +] +``` +### Dataset C (Mix Resolution) + +* Dataset: `random` +* Task: t2v +* Concurrency: 1 +* Mix Resolution +``` +[ + {"width":854,"height":480,"num_inference_steps":3,"num_frames":80,"fps":16,"weight":0.15}, + {"width":854,"height":480,"num_inference_steps":4,"num_frames":120,"fps":24,"weight":0.25}, + {"width":1280,"height":720,"num_inference_steps":6,"num_frames":80,"fps":16,"weight":0.6} +] +``` +--- + +## 5.2 Example Benchmark Command + +```bash +python benchmarks/diffusion/diffusion_benchmark_serving.py \ + --backend v1/videos \ + --dataset random \ + --task t2v \ + --num-prompts 1 \ + --max-concurrency 1 \ + --enable-negative-prompt \ + --random-request-config '[ + {"width":854,"height":480,"num_inference_steps":18,"num_frames": 33,"fps":16",weight":1} + ]' +``` + +--- + +# 6. Performance Metrics + +The following metrics are collected during benchmarking: + +| Metric | Description | Unit | +| ------------------ | ----------------------------- | ------- | +| Mean Latency | Mean of latency | seconds | +| P99 Latency | P99 of latency | seconds | + +--- + +# 7. Performance Results + +| Dataset Configuration | Max Concur. | CFG | Usp | Tp | Hsdp | VAE Parallel | Mean Latency (s) | P99 Latency (s) | +|-----------------------|-----|-----|-----|-----|----|--------------|------------------|------------------| +| Dataset A | 1 | 2 | 2 | 1 | On | 1 | 24.6766 | 24.6766 | +| Dataset A | 1 | 2 | 2 | 1 | On | 4 | 21.6810 | 21.6810 | +| Dataset B | 1 | 2 | 2 | 1 | On | 1 | 124.6639 | 124.6639 | +| Dataset B | 1 | 2 | 2 | 1 | On | 4 | 117.44 | 117.44 | +| Dataset C | 1 | 2 | 2 | 1 | On | 1 | 79.2175 | 124.2565 | +| Dataset C | 1 | 2 | 2 | 1 | On | 4 | 74.4977 | 117.710 | +--- + +# 8. Reproducibility Checklist + +To ensure consistent and comparable benchmark results: + +* Record GPU type +* Record parallel configuration +* Record benchmark parameters (resolution, concurrency, number of prompts) +* Ensure no background workload on GPUs during testing + +--- + +This document serves as the official Wan2.2 serving performance reference under vLLM-Omni.