diff --git a/docs/cli/README.md b/docs/cli/README.md
index 8069110b3ca..1fcfdb14eac 100644
--- a/docs/cli/README.md
+++ b/docs/cli/README.md
@@ -22,3 +22,21 @@ If you have custom stage configs file, launch the server with command below
 ```bash
 vllm serve Qwen/Qwen2.5-Omni-7B --omni --stage-configs-path /path/to/stage_configs_file
 ```
+
+
+## bench
+
+Run benchmark tests for online serving throughput.
+Available Commands:
+
+```bash
+vllm bench serve --omni \
+    --model Qwen/Qwen2.5-Omni-7B \
+    --host server-host \
+    --port server-port \
+    --random-input-len 32 \
+    --random-output-len 4  \
+    --num-prompts  5
+```
+
+See [vllm bench serve](./bench/serve.md) for the full reference of all available arguments.
diff --git a/docs/cli/bench/serve.md b/docs/cli/bench/serve.md
new file mode 100644
index 00000000000..bcb1df74cd9
--- /dev/null
+++ b/docs/cli/bench/serve.md
@@ -0,0 +1,359 @@
+# vLLM-Omni Benchmark CLI Guide
+The vllm bench command launches the vLLM-Omni benchmark to evaluate the performance of multimodal models.
+
+## Notes
+We currently only support using the "openai-chat-omni" backend.
+
+## Basic Parameter Description
+You can use `vllm bench serve --omni --help=all` to get descriptions of all parameters. The commonly used parameters are described below:
+- `--omni`  
+  Enable Omni (multimodal) mode, supporting multimodal inputs and outputs such as images, videos, and audio.
+
+- `--backend`  
+  Specify the backend adapter as openai-chat-omni, using OpenAI Chat compatible API behavior as the protocol. Currently only openai-chat-omni is supported.
+
+- `--model`  
+  The model identifier to load, filled according to the models supported by vLLM-Omni.
+
+- `--endpoint`  
+  The API endpoint exposed externally, to which clients send their requests.
+
+- `--dataset-name`  
+  The name of the dataset used; random-mm indicates generating random multimodal inputs (images, videos, audio).
+
+- `--num-prompts`  
+  The total number of requests to send, an integer.
+
+- `--max-concurrency`  
+  Maximum number of concurrent requests. This can be used
+  to help simulate an environment where a higher-level component
+  is enforcing a maximum number of concurrent requests. While the
+  `--request-rate` argument controls the rate at which requests are
+  initiated, this argument controls how many are actually allowed
+  to execute at a time. This means that when used in combination, the
+  actual request rate may be lower than specified with `--request-rate`
+  if the server is not processing requests fast enough to keep up.
+
+- `--request-rate`  
+  Number of requests per second. If this is inf,
+  then all the requests are sent at time 0.
+  Otherwise, a Poisson process or gamma distribution
+  is used to synthesize the request arrival times.
+
+- `--ignore-eos`  
+  Set the `ignore_eos` flag when sending the benchmark request.
+
+- `--metric-percentiles`  
+  Comma-separated list of percentiles for selected metrics.
+  To report 25-th, 50-th, and 75-th percentiles, use "25,50,75".
+  Default value is "99".
+  Use `--percentile-metrics` to select metrics.
+
+- `--percentile-metrics`  
+  Comma-separated list of selected metrics to report percentiles.
+  Percentiles are reported only for the metrics listed here.
+  Allowed metric names are "ttft", "tpot", "itl", "e2el", "audio_ttfp", "audio_rtf".
+
+- `--save-result`  
+  Save the benchmark results to a JSON file.
+
+- `--save-detailed`  
+  When saving the results, whether to include per-request
+  information such as response, error, TTFTs, TPOTs, etc.
+
+- `--result-dir`  
+  Directory in which to save the benchmark JSON results.
+  If not specified, results are saved in the current directory.
+
+- `--result-filename`  
+  Filename to use when saving the benchmark JSON results.
+  If not specified, results will be saved in
+  `{label}-{args.request_rate}qps-{base_model_id}-{current_dt}.json`.
+
+- `--random-prefix-len`  
+  Number of fixed prefix tokens before the random context in a request.
+  The total input length is the sum of random-prefix-len and a random
+  context length sampled from [input_len * (1 - range_ratio),
+  input_len * (1 + range_ratio)]. Only the random and random-mm modes
+  support this parameter.
+
+- `--random-input-len`  
+  Number of input tokens per request. Only the random and random-mm modes support this parameter.
+
+- `--random-output-len`  
+  Number of output tokens per request. Only the random and random-mm modes support this parameter.
+
+- `--random-range-ratio`  
+  Range ratio for sampling input/output length,
+  used only for random sampling. Must be in the range [0, 1) to define
+  a symmetric sampling range
+  [length * (1 - range_ratio), length * (1 + range_ratio)].
+  Only the random and random-mm modes support this parameter.
+
+- `--random-mm-base-items-per-request`  
+  Base number of multimodal items per request for random-mm.
+  Actual per-request count is sampled around this base using
+  --random-mm-num-mm-items-range-ratio.
+  Only the random-mm mode supports this parameter.
+
+- `--random-mm-limit-mm-per-prompt`  
+  Per-modality hard caps for items attached per request, e.g.
+  '{"image": 3, "video": 1, "audio": 1}'. The sampled per-request item
+  count is clamped to the sum of these limits. When a modality
+  reaches its cap, its buckets are excluded and probabilities are
+  renormalized.
+  Only the random-mm mode supports this parameter.
+
+- `--random-mm-num-mm-items-range-ratio`  
+  Range ratio r in [0, 1] for sampling items per request.
+  We sample uniformly from the closed integer range
+  [floor(n*(1-r)), ceil(n*(1+r))]
+  where n is the base items per request.
+  r=0 keeps it fixed; r=1 allows 0 items. The maximum is clamped
+  to the sum of per-modality limits from
+  --random-mm-limit-mm-per-prompt.
+  An error is raised if the computed min exceeds the max.
+  Only the random-mm mode supports this parameter.
+
+- `--random-mm-bucket-config`  
+  The bucket config is a dictionary mapping a multimodal item
+  sampling configuration to a probability.
+  Currently allows for 3 modalities: audio, images and videos.
+  A bucket key is a tuple of (height, width, num_frames); audio items use (0, duration_seconds, num_channels).
+  The value is the probability of sampling that specific item.
+  Example:
+  --random-mm-bucket-config
+  "{(256, 256, 1): 0.5, (720, 1280, 16): 0.4, (0, 1, 5): 0.10}"
+  First item: images with resolution 256x256 w.p. 0.5
+  Second item: videos with resolution 720x1280 and 16 frames w.p. 0.4
+  Third item: audio with 1s duration and 5 channels w.p. 0.1
+  Note: If the probabilities do not sum to 1, they are normalized.
+  Only the random-mm mode supports this parameter.
+
+## Usage Examples
+
+### Online Benchmark
+<details class="admonition abstract" markdown="1">
+<summary>Show more</summary>
+
+First start serving your model:
+
+```bash
+vllm serve Qwen/Qwen2.5-Omni-7B --omni
+```
+
+Then run the benchmarking for sharegpt:
+
+```bash
+# download dataset
+# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
+vllm bench serve \
+  --omni \
+  --port 43845 \
+  --model /home/models/Qwen/Qwen3-Omni-30B-A3B-Instruct \
+  --endpoint /v1/chat/completions \
+  --backend openai-chat-omni \
+  --num-prompts 2 \
+  --dataset-name sharegpt \
+  --dataset-path ShareGPT_V3_unfiltered_cleaned_split.json \
+  --percentile-metrics ttft,tpot,itl,e2el
+```
+If successful, you will see the following output:
+```text
+============ Serving Benchmark Result ============
+Successful requests:                     2
+Failed requests:                         0
+Benchmark duration (s):                  81.63
+Request throughput (req/s):              0.02
+Peak concurrent requests:                2.00
+----------------End-to-end Latency----------------
+Mean E2EL (ms):                          56966.13
+Median E2EL (ms):                        56966.13
+P99 E2EL (ms):                           81016.80
+================== Text Result ===================
+Total input tokens:                      36
+Total generated tokens:                  5926
+Output token throughput (tok/s):         72.60
+Peak output token throughput (tok/s):    103.00
+Peak concurrent requests:                2.00
+Total Token throughput (tok/s):          73.04
+---------------Time to First Token----------------
+Mean TTFT (ms):                          124.76
+Median TTFT (ms):                        124.76
+P99 TTFT (ms):                           156.10
+-----Time per Output Token (excl. 1st token)------
+Mean TPOT (ms):                          481.30
+Median TPOT (ms):                        481.30
+P99 TPOT (ms):                           947.55
+---------------Inter-token Latency----------------
+Mean ITL (ms):                           25.11
+Median ITL (ms):                         0.33
+P99 ITL (ms):                            25.17
+================== Audio Result ==================
+Total audio duration generated(s):       3.95
+Total audio frames generated:            94890
+Audio throughput(audio duration/s):      0.05
+==================================================
+```
+
+Or run the benchmarking for random:
+
+```bash
+vllm bench serve \
+  --omni \
+  --port 43845 \
+  --endpoint /v1/chat/completions \
+  --backend openai-chat-omni \
+  --model /home/models/Qwen/Qwen3-Omni-30B-A3B-Instruct \
+  --dataset-name random \
+  --num-prompts 2 \
+  --random-prefix-len 5 \
+  --random-input-len 10 \
+  --random-output-len 100 \
+  --percentile-metrics ttft,tpot,itl,e2el,audio_ttfp,audio_rtf \
+  --ignore-eos
+```
+
+If successful, you will see the following output:
+
+```text
+============ Serving Benchmark Result ============
+Successful requests:                     2
+Failed requests:                         0
+Benchmark duration (s):                  24.35
+Request throughput (req/s):              0.08
+Peak concurrent requests:                2.00
+----------------End-to-end Latency----------------
+Mean E2EL (ms):                          22576.23
+Median E2EL (ms):                        22576.23
+P99 E2EL (ms):                           24205.72
+================== Text Result ===================
+Total input tokens:                      30
+Total generated tokens:                  8973
+Output token throughput (tok/s):         368.52
+Peak output token throughput (tok/s):    81.00
+Peak concurrent requests:                2.00
+Total Token throughput (tok/s):          369.76
+---------------Time to First Token----------------
+Mean TTFT (ms):                          125.16
+Median TTFT (ms):                        125.16
+P99 TTFT (ms):                           155.88
+-----Time per Output Token (excl. 1st token)------
+Mean TPOT (ms):                          5.01
+Median TPOT (ms):                        5.01
+P99 TPOT (ms):                           5.42
+---------------Inter-token Latency----------------
+Mean ITL (ms):                           34.15
+Median ITL (ms):                         0.01
+P99 ITL (ms):                            376.19
+================== Audio Result ==================
+Total audio duration generated(s):       3.95
+Total audio frames generated:            94890
+Audio throughput(audio duration/s):      0.16
+---------------Time to First Packet---------------
+Mean AUDIO_TTFP (ms):                    11756.89
+Median AUDIO_TTFP (ms):                  11756.89
+P99 AUDIO_TTFP (ms):                     20854.25
+-----------------Real Time Factor-----------------
+Mean AUDIO_RTF:                          3.75
+Median AUDIO_RTF:                        3.75
+P99 AUDIO_RTF:                           7.39
+==================================================
+```
+Notes:
+We use (audio generation time - first packet latency) / audio duration to calculate RTF.
+
+</details>
+
+### Multi-Modal Benchmark
+
+<details class="admonition abstract" markdown="1">
+<summary>Show more</summary>
+
+Benchmark the performance of multi-modal requests in vLLM-Omni.
+
+Generate synthetic image, video, and audio inputs alongside random text prompts to stress-test multimodal models without external datasets.
+
+Notes:
+
+- Works only with online benchmark via the OpenAI backend (`--backend openai-chat-omni`) and endpoint `/v1/chat/completions`.
+
+Start the server (example):
+
+```bash
+vllm serve Qwen/Qwen2.5-Omni-7B --omni
+```
+
+It is recommended to use the flag `--ignore-eos` to simulate real responses. You can set the size of the output via the `--random-output-len` argument.
+
+Then run the benchmarking script:
+```bash
+vllm bench serve \
+    --omni \
+  --dataset-name random-mm \
+  --port 40849 \
+  --model /home/models/Qwen/Qwen3-Omni-30B-A3B-Instruct \
+  --endpoint /v1/chat/completions \
+  --backend openai-chat-omni \
+  --request-rate 1 \
+  --num-prompts 1 \
+  --random-input-len 10 \
+  --random-range-ratio 0.0 \
+  --random-mm-base-items-per-request 2 \
+  --random-mm-num-mm-items-range-ratio 0 \
+  --random-mm-limit-mm-per-prompt '{"image":1,"video":1, "audio": 1}' \
+  --random-mm-bucket-config '{"(32, 32, 1)": 0.5, "(0, 1, 1)": 0.1, "(32, 32, 2)":0.4}' \
+  --ignore-eos \
+  --percentile-metrics ttft,tpot,itl \
+  --random-output-len 2 \
+  --extra_body '{"modalities": ["text"]}'
+```
+
+If successful, you will see the following output:
+
+```text
+============ Serving Benchmark Result ============
+Successful requests:                     1
+Failed requests:                         0
+Request rate configured (RPS):           1.00
+Benchmark duration (s):                  1.21
+Request throughput (req/s):              0.83
+Peak concurrent requests:                1.00
+================== Text Result ===================
+Total input tokens:                      10
+Total generated tokens:                  3
+Output token throughput (tok/s):         2.49
+Peak output token throughput (tok/s):    3.00
+Peak concurrent requests:                1.00
+Total Token throughput (tok/s):          10.77
+---------------Time to First Token----------------
+Mean TTFT (ms):                          179.74
+Median TTFT (ms):                        179.74
+P99 TTFT (ms):                           179.74
+-----Time per Output Token (excl. 1st token)------
+Mean TPOT (ms):                          12.76
+Median TPOT (ms):                        12.76
+P99 TPOT (ms):                           12.76
+---------------Inter-token Latency----------------
+Mean ITL (ms):                           12.76
+Median ITL (ms):                         12.76
+P99 ITL (ms):                            25.24
+================== Audio Result ==================
+Total audio duration generated(s):       0.00
+Total audio frames generated:            0
+Audio throughput(audio duration/s):      0.00
+==================================================
+```
+
+Behavioral notes:
+
+- If the requested base item count cannot be satisfied under the provided per-prompt limits, the tool raises an error rather than silently clamping.
+
+How sampling works:
+
+- Determine per-request item count k by sampling uniformly from the integer range defined by `--random-mm-base-items-per-request` and `--random-mm-num-mm-items-range-ratio`, then clamp k to at most the sum of per-modality limits.
+- For each of the k items, sample a bucket (H, W, T) according to the normalized probabilities in `--random-mm-bucket-config`, while tracking how many items of each modality have been added.
+- If a modality (e.g., image) reaches its limit from `--random-mm-limit-mm-per-prompt`, all buckets of that modality are excluded and the remaining bucket probabilities are renormalized before continuing.
+  This should be seen as an edge case, and this behavior can be avoided by setting `--random-mm-limit-mm-per-prompt` to a large number. Note that this might result in errors due to engine config `--limit-mm-per-prompt`.
+- The resulting request contains synthetic image data in `multi_modal_data` (OpenAI Chat format). When `random-mm` is used with the OpenAI Chat backend, prompts remain text and MM content is attached via `multi_modal_data`.
+</details>
diff --git a/vllm_omni/benchmarks/data_modules/__init__.py b/vllm_omni/benchmarks/data_modules/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/vllm_omni/benchmarks/data_modules/random_multi_modal_dataset.py b/vllm_omni/benchmarks/data_modules/random_multi_modal_dataset.py
new file mode 100644
index 00000000000..14ba86cc4cb
--- /dev/null
+++ b/vllm_omni/benchmarks/data_modules/random_multi_modal_dataset.py
@@ -0,0 +1,152 @@
+import base64
+import io
+import logging
+from collections.abc import Mapping
+from typing import Any
+
+import numpy as np
+import soundfile as sf
+import torch
+from vllm.benchmarks.datasets import RandomMultiModalDataset, process_image, process_video
+
+logger = logging.getLogger(__name__)
+
+
+def process_audio(audio: Any) -> Mapping[str, Any]:
+    """
+    Convert one audio input into an ``audio_url`` content dictionary.
+
+    Accepted inputs:
+
+    1. A dict holding raw audio under the ``"bytes"`` key. The bytes are
+       base64-encoded and embedded in a ``data:audio/mpeg`` URL.
+
+    2. A string naming a remote URL or a local file path. Strings that do
+       not already start with "http://", "https://" or "file://" are
+       treated as local paths and get a "file://" prefix.
+
+    Raises:
+        ValueError: If the input is neither of the supported types.
+    """
+    if isinstance(audio, str):
+        has_scheme = audio.startswith(("http://", "https://", "file://"))
+        url = audio if has_scheme else f"file://{audio}"
+        return {"type": "audio_url", "audio_url": {"url": url}}
+
+    # Raw bytes are inlined into the request as a base64 data URL.
+    if isinstance(audio, dict) and "bytes" in audio:
+        encoded = base64.b64encode(audio["bytes"]).decode("utf-8")
+        data_url = f"data:audio/mpeg;base64,{encoded}"
+        return {"type": "audio_url", "audio_url": {"url": data_url}}
+
+    raise ValueError(
+        f"Invalid audio input {audio}. Must be a string of local path/remote url, "
+        f"or a dictionary with raw audio bytes in the form of `{{'bytes': raw_audio_bytes}}`."
+    )
+
+
+# -----------------------------------------------------------------------------
+# MultiModalDataset Implementation
+# -----------------------------------------------------------------------------
+class OmniRandomMultiModalDataset(RandomMultiModalDataset):
+    """Random multimodal dataset that can also produce synthetic audio.
+
+    A bucket tuple whose first element is 0 describes audio and is read as
+    ``(0, duration_seconds, num_channels)``; otherwise a last element of 1
+    means an image and a last element above 1 means a video.
+    """
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    def generate_synthetic_audio(
+        self,
+        duration: int,  # seconds
+        num_channels: int,  # e.g. 1: mono, 2: stereo
+    ) -> dict[str, Any]:
+        """Generate synthetic audio with random values.
+
+        The samples are written as a 48000 Hz WAV file and returned as
+        ``{"bytes": <encoded file>}``.
+        """
+        sample_rate = 48000
+        num_samples = int(sample_rate * duration)
+        # Samples lie in [-0.5, 0.5), so no clipping is needed, and the
+        # (num_samples, num_channels) array is handed to soundfile directly.
+        audio_data = self._rng.uniform(-0.5, 0.5, (num_samples, num_channels)).astype(np.float32)
+        with io.BytesIO() as buffer:
+            sf.write(buffer, audio_data, sample_rate, format="wav")
+            return {"bytes": buffer.getvalue()}
+
+    def generate_mm_item(
+        self,
+        mm_item_config: tuple[int, int, int],
+    ) -> Mapping[str, Any]:
+        """
+        Create a synthetic image, video or audio item and apply
+        process_image/process_video/process_audio respectively.
+        This follows the OpenAI API chat completions
+        https://github.com/openai/openai-python
+
+        Raises:
+            ValueError: If the configuration matches no modality.
+        """
+        # Resolve the modality once rather than once per branch.
+        modality = self.map_config_to_modality(mm_item_config)
+        if modality == "image":
+            return process_image(self.generate_synthetic_image(mm_item_config[1], mm_item_config[0]))
+        if modality == "video":
+            return process_video(self.generate_synthetic_video(mm_item_config[1], mm_item_config[0], mm_item_config[2]))
+        if modality == "audio":
+            return process_audio(self.generate_synthetic_audio(mm_item_config[1], mm_item_config[2]))
+        raise ValueError(f"Invalid multimodal item configuration: {mm_item_config}")
+
+    def generate_synthetic_video(self, width: int, height: int, num_frames: int) -> Any:
+        """Generate synthetic video with random values.
+
+        The frames are encoded as MP4 and returned as ``{"bytes": ...}``.
+        """
+        import imageio
+
+        video_data = self._rng.integers(
+            0,
+            256,
+            (num_frames, height, width, 3),
+            dtype=np.uint8,
+        )
+        writer_kwargs = {
+            "format": "mp4",
+            "fps": 30,
+            "codec": "libx264",
+            "quality": 7,
+            "pixelformat": "yuv420p",
+            "macro_block_size": 16,
+            "ffmpeg_params": [
+                "-preset",
+                "medium",
+                "-crf",
+                "23",
+                "-movflags",
+                "+faststart",
+                "-pix_fmt",
+                "yuv420p",
+                "-vf",
+                f"scale={width}:{height}",
+            ],
+        }
+        with io.BytesIO() as buffer:
+            with imageio.get_writer(buffer, **writer_kwargs) as writer:
+                for frame in video_data:
+                    writer.append_data(frame)
+            return {"bytes": buffer.getvalue()}
+
+    def map_config_to_modality(self, config: tuple[int, int, int]) -> str:
+        """Map the configuration to "audio", "image" or "video"; raise ValueError otherwise."""
+        if config[0] == 0:
+            return "audio"
+        elif config[-1] == 1:
+            return "image"
+        elif config[-1] > 1:
+            return "video"
+        else:
+            raise ValueError(f"Invalid multimodal item configuration: {config}")
diff --git a/vllm_omni/benchmarks/metrics/__init__.py b/vllm_omni/benchmarks/metrics/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/vllm_omni/benchmarks/metrics/metrics.py b/vllm_omni/benchmarks/metrics/metrics.py
new file mode 100644
index 00000000000..f404a12f8e6
--- /dev/null
+++ b/vllm_omni/benchmarks/metrics/metrics.py
@@ -0,0 +1,330 @@
+import warnings
+from dataclasses import dataclass
+
+import numpy as np
+from transformers import PreTrainedTokenizerBase
+from vllm.benchmarks.datasets import SampleRequest
+from vllm.benchmarks.lib.endpoint_request_func import RequestFuncOutput
+from vllm.benchmarks.serve import MILLISECONDS_TO_SECONDS_CONVERSION, TERM_PLOTLIB_AVAILABLE, BenchmarkMetrics, TaskType
+
+
+@dataclass
+class MultiModalsBenchmarkMetrics(BenchmarkMetrics):  # BenchmarkMetrics extended with audio-output statistics
+    mean_audio_ttfp_ms: float = 0.0  # "audio_ttfp" is reported as "Time to First Packet"
+    median_audio_ttfp_ms: float = 0.0
+    std_audio_ttfp_ms: float = 0.0
+    percentiles_audio_ttfp_ms: list[tuple[float, float]] = None  # (percentile, value) pairs; None until set
+    total_audio_duration_ms: float = 0.0  # NOTE(review): named _ms but printed under an "(s)" label - confirm unit
+    total_audio_frames: int = 0
+    audio_throughput: float = 0.0  # printed as "Audio throughput(audio duration/s)"
+    mean_audio_rtf: float = 0.0  # "audio_rtf" is reported as "Real Time Factor"
+    median_audio_rtf: float = 0.0
+    std_audio_rtf: float = 0.0
+    percentiles_audio_rtf: list[tuple[float, float]] = None  # (percentile, value) pairs; None until set
+
+
+def print_metrics(  # Print the complete serving-benchmark report to stdout.
+    task_type,  # compared against TaskType.GENERATION to choose the sections shown
+    selected_percentile_metrics,  # names of the metrics whose percentile blocks are printed
+    max_concurrency,  # printed only when not None
+    request_rate,  # printed unless it equals float("inf")
+    benchmark_duration,  # printed as "Benchmark duration (s)"
+    goodput_config_dict,  # a truthy value enables the goodput row
+    metrics: MultiModalsBenchmarkMetrics,  # aggregated statistics to report
+):
+    print(f"{' Serving Benchmark Result ':=^50}")
+    print(f"{'Successful requests:':<40} {metrics.completed:<10}")
+    print(f"{'Failed requests:':<40} {metrics.failed:<10}")
+    if max_concurrency is not None:
+        print(f"{'Maximum request concurrency:':<40} {max_concurrency:<10}")
+    if request_rate != float("inf"):
+        print(f"{'Request rate configured (RPS):':<40} {request_rate:<10.2f}")
+    print(f"{'Benchmark duration (s):':<40} {benchmark_duration:<10.2f}")
+    print(f"{'Request throughput (req/s):':<40} {metrics.request_throughput:<10.2f}")
+    if goodput_config_dict:
+        print(f"{'Request goodput (req/s):':<40} {metrics.request_goodput:<10.2f}")
+    if isinstance(metrics, MultiModalsBenchmarkMetrics):
+        print(f"{'Peak concurrent requests:':<40} {metrics.max_concurrent_requests:<10.2f}")
+    if task_type != TaskType.GENERATION or "e2el" in selected_percentile_metrics:
+        process_one_metric("e2el", metrics)
+    print_text_metrics(task_type, selected_percentile_metrics, metrics)
+    if task_type == TaskType.GENERATION:
+        print_audio_metrics(selected_percentile_metrics, metrics)
+    print("=" * 50)
+
+
+def print_text_metrics(task_type, selected_percentile_metrics, metrics: MultiModalsBenchmarkMetrics):
+    """Print the "Text Result" section and, for generation tasks, the non-audio percentile metrics."""
+    print(f"{' Text Result ':=^50}")
+    print(f"{'Total input tokens:':<40} {metrics.total_input:<10}")
+    if isinstance(metrics, MultiModalsBenchmarkMetrics):
+        print(f"{'Total generated tokens:':<40} {metrics.total_output:<10}")
+        print(f"{'Output token throughput (tok/s):':<40} {metrics.output_throughput:<10.2f}")
+        print(f"{'Peak output token throughput (tok/s):':<40} {metrics.max_output_tokens_per_s:<10.2f}")
+        print(f"{'Peak concurrent requests:':<40} {metrics.max_concurrent_requests:<10.2f}")
+    print(f"{'Total Token throughput (tok/s):':<40} {metrics.total_token_throughput:<10.2f}")
+
+    if task_type == TaskType.GENERATION:
+        # "e2el" is reported by print_metrics and audio metrics by print_audio_metrics.
+        for metric in selected_percentile_metrics:
+            if metric != "e2el" and not metric.startswith("audio"):
+                process_one_metric(metric, metrics)
+
+
+def print_audio_metrics(selected_percentile_metrics, metrics: MultiModalsBenchmarkMetrics):
+    """Print the "Audio Result" section followed by the selected audio-prefixed percentile metrics."""
+    print(f"{' Audio Result ':=^50}")
+    print(f"{'Total audio duration generated(s):':<40} {metrics.total_audio_duration_ms:<10.2f}")
+    print(f"{'Total audio frames generated:':<40} {metrics.total_audio_frames:<10}")
+    print(f"{'Audio throughput(audio duration/s):':<40} {metrics.audio_throughput:<10.2f}")
+    for metric in (name for name in selected_percentile_metrics if name.startswith("audio")):
+        process_one_metric(metric, metrics)
+
+
+def process_one_metric(
+    metric_attribute_name: str,
+    metrics: MultiModalsBenchmarkMetrics,
+):
+    """Print the mean, median and percentile rows for one metric.
+
+    Values are read from ``mean_<name>``, ``median_<name>`` and
+    ``percentiles_<name>`` on ``metrics``; every metric except "audio_rtf"
+    uses a ``_ms`` attribute suffix and a " (ms)" label suffix.
+
+    Args:
+        metric_attribute_name: Metric key such as "ttft" or "audio_rtf".
+        metrics: Object holding the aggregated statistics.
+    """
+    metric_header_map = {
+        "ttft": "Time to First Token",
+        "tpot": "Time per Output Token (excl. 1st token)",
+        "itl": "Inter-token Latency",
+        "e2el": "End-to-end Latency",
+        "audio_ttfp": "Time to First Packet",
+        "audio_rtf": "Real Time Factor",
+    }
+    print(f"{metric_header_map.get(metric_attribute_name, metric_attribute_name):-^50}")
+    is_audio_rtf = metric_attribute_name == "audio_rtf"
+    suffix = "" if is_audio_rtf else "_ms"
+    display_name = metric_attribute_name.upper() + ("" if is_audio_rtf else " (ms)")
+    for stat in ("mean", "median"):
+        value = getattr(metrics, f"{stat}_{metric_attribute_name}{suffix}", 0.0)
+        print(f"{f'{stat.capitalize()} {display_name}:':<40} {value:<10.2f}")
+
+    # A percentile field may exist but still hold None; treat that as "no percentiles".
+    percentiles = getattr(metrics, f"percentiles_{metric_attribute_name}{suffix}", None) or []
+    for percentile, value in percentiles:
+        # float() lets int percentiles (e.g. 99) through; int.is_integer needs Python 3.12+.
+        p_str = str(int(percentile)) if float(percentile).is_integer() else str(percentile)
+        print(f"{f'P{p_str} {display_name}:':<40} {value:<10.2f}")
+
+
+def calculate_metrics(
+    input_requests: list[SampleRequest],
+    outputs: list[RequestFuncOutput],
+    dur_s: float,
+    tokenizer: PreTrainedTokenizerBase,
+    selected_percentiles: list[float],
+    goodput_config_dict: dict[str, float],
+    task_type,
+    selected_percentile_metrics,
+    max_concurrency,
+    request_rate,
+    benchmark_duration,
+) -> tuple[BenchmarkMetrics, list[int]]:
+    """Calculate the metrics for the benchmark.
+
+    Args:
+        input_requests: The input requests.
+        outputs: The outputs of the requests.
+        dur_s: The duration of the benchmark.
+        tokenizer: The tokenizer to use.
+        selected_percentiles: The percentiles to select.
+        goodput_config_dict: The goodput configuration.
+
+    Returns:
+        A tuple of the benchmark metrics and the actual output lengths.
+    """
+    actual_output_lens: list[int] = []
+    total_input = 0
+    completed = 0
+    good_completed = 0
+    itls: list[float] = []
+    tpots: list[float] = []
+    all_tpots: list[float] = []
+    ttfts: list[float] = []
+    e2els: list[float] = []
+    audio_ttfps: list[float] = []
+    audio_rtfs: list[float] = []
+    audio_duration: list[float] = []
+    audio_frames: list[int] = []
+    for i in range(len(outputs)):
+        if outputs[i].success:
+            output_len = outputs[i].output_tokens
+
+            if not output_len:
+                # We use the tokenizer to count the number of output tokens
+                # for some serving backends instead of looking at
+                # len(outputs[i].itl) since multiple output tokens may be
+                # bundled together
+                # Note : this may inflate the output token count slightly
+                output_len = len(tokenizer(outputs[i].generated_text, add_special_tokens=False).input_ids)
+            actual_output_lens.append(output_len)
+            total_input += input_requests[i].prompt_len
+            tpot = 0
+            if output_len > 1:
+                latency_minus_ttft = outputs[i].latency - outputs[i].ttft
+                tpot = latency_minus_ttft / (output_len - 1)
+                tpots.append(tpot)
+            # Note: if output_len <= 1, we regard tpot as 0 for goodput
+            all_tpots.append(tpot)
+            itls += outputs[i].itl
+            ttfts.append(outputs[i].ttft)
+            audio_ttfps.append(getattr(outputs[i], "audio_ttfp", 0.0))
+            audio_rtfs.append(getattr(outputs[i], "audio_rtf", 0.0))
+            audio_duration.append(getattr(outputs[i], "audio_duration", 0.0))
+            audio_frames.append(getattr(outputs[i], "audio_frames", 0.0))
+            e2els.append(outputs[i].latency)
+            completed += 1
+        else:
+            actual_output_lens.append(0)
+
+    if goodput_config_dict:
+        valid_metrics = []
+        slo_values = []
+
+        if "ttft" in goodput_config_dict:
+            valid_metrics.append(ttfts)
+            slo_values.append(goodput_config_dict["ttft"] / MILLISECONDS_TO_SECONDS_CONVERSION)
+        if "audio_ttft" in goodput_config_dict:
+            valid_metrics.append(audio_ttfps)
+            slo_values.append(goodput_config_dict["audio_ttft"] / MILLISECONDS_TO_SECONDS_CONVERSION)
+        if "tpot" in goodput_config_dict:
+            valid_metrics.append(all_tpots)
+            slo_values.append(goodput_config_dict["tpot"] / MILLISECONDS_TO_SECONDS_CONVERSION)
+        if "e2el" in goodput_config_dict:
+            valid_metrics.append(e2els)
+            slo_values.append(goodput_config_dict["e2el"] / MILLISECONDS_TO_SECONDS_CONVERSION)
+
+        for req_metric in zip(*valid_metrics):
+            is_good_req = all([s >= r for s, r in zip(slo_values, req_metric)])
+            if is_good_req:
+                good_completed += 1
+
+    if completed == 0:
+        warnings.warn(
+            "All requests failed. This is likely due to a misconfiguration on the benchmark arguments.",
+            stacklevel=2,
+        )
+
+    # Calculate max output tokens per second metric
+    max_output_tokens_per_s = 0.0
+    max_concurrent_requests = 0
+
+    # Find the time range across all successful requests
+    successful_outputs = [output for output in outputs if output.success]
+    failed_outputs = [output for output in outputs if not output.success]
+    if successful_outputs:
+        min_start_time = min(output.start_time for output in successful_outputs)
+        max_end_time = max(output.start_time + output.latency for output in successful_outputs)
+
+        # Create second buckets (ceiling to ensure we capture all time)
+        duration_seconds = int(np.ceil(max_end_time - min_start_time)) + 1
+        tokens_per_second = np.zeros(duration_seconds)
+        concurrent_requests_per_second = np.zeros(duration_seconds)
+
+        for i, output in enumerate(successful_outputs):
+            # Calculate token generation timestamp using
+            # start_time, ttft, and itl
+            token_times = [output.start_time + output.ttft]
+            current_time = token_times[0]
+            for itl_value in output.itl:
+                current_time += itl_value
+                token_times.append(current_time)
+
+            # Add tokens to second buckets
+            for token_time in token_times:
+                second_bucket = int(token_time - min_start_time)
+                if 0 <= second_bucket < duration_seconds:
+                    tokens_per_second[second_bucket] += 1
+
+            # Track concurrent requests for each second this request was active
+            request_start_second = int(output.start_time - min_start_time)
+            request_end_second = int((output.start_time + output.latency) - min_start_time)
+
+            for second in range(request_start_second, request_end_second + 1):
+                concurrent_requests_per_second[second] += 1
+
+        # Find the maximum tokens per second and corresponding
+        # concurrent requests
+        if len(tokens_per_second) > 0:
+            max_output_tokens_per_s = float(np.max(tokens_per_second))
+            max_concurrent_requests = int(np.max(concurrent_requests_per_second))
+
+        if TERM_PLOTLIB_AVAILABLE:
+            import termplotlib as tpl
+
+            fig = tpl.figure()
+            fig.plot(
+                np.arange(len(tokens_per_second)),
+                tokens_per_second,
+                title="Output tokens per second",
+            )
+            fig.plot(
+                np.arange(len(concurrent_requests_per_second)),
+                concurrent_requests_per_second,
+                title="Concurrent requests per second",
+            )
+            fig.show()
+        else:
+            print("tip: install termplotlib and gnuplot to plot the metrics")
+
+    metrics = MultiModalsBenchmarkMetrics(
+        completed=completed,
+        failed=len(failed_outputs),
+        total_input=total_input,
+        total_output=sum(actual_output_lens),
+        request_throughput=completed / dur_s,
+        request_goodput=good_completed / dur_s,
+        output_throughput=sum(actual_output_lens) / dur_s,
+        total_token_throughput=(total_input + sum(actual_output_lens)) / dur_s,
+        mean_ttft_ms=np.mean(ttfts or 0) * 1000,  # ttfts is empty if streaming is not supported by the endpoint
+        std_ttft_ms=np.std(ttfts or 0) * 1000,
+        median_ttft_ms=np.median(ttfts or 0) * 1000,
+        percentiles_ttft_ms=[(p, np.percentile(ttfts or 0, p) * 1000) for p in selected_percentiles],
+        mean_audio_ttfp_ms=np.mean(audio_ttfps or 0) * 1000,
+        std_audio_ttfp_ms=np.std(audio_ttfps or 0) * 1000,
+        median_audio_ttfp_ms=np.median(audio_ttfps or 0) * 1000,
+        percentiles_audio_ttfp_ms=[(p, np.percentile(audio_ttfps or 0, p) * 1000) for p in selected_percentiles],
+        total_audio_duration_ms=sum(audio_duration),
+        total_audio_frames=sum(audio_frames),
+        audio_throughput=sum(audio_duration) / dur_s,
+        mean_audio_rtf=np.mean(audio_rtfs or 0),
+        std_audio_rtf=np.std(audio_rtfs or 0),
+        median_audio_rtf=np.median(audio_rtfs or 0),
+        percentiles_audio_rtf=[(p, np.percentile(audio_rtfs or 0, p)) for p in selected_percentiles],
+        mean_tpot_ms=np.mean(tpots or 0) * 1000,
+        std_tpot_ms=np.std(tpots or 0) * 1000,
+        median_tpot_ms=np.median(tpots or 0) * 1000,
+        percentiles_tpot_ms=[(p, np.percentile(tpots or 0, p) * 1000) for p in selected_percentiles],
+        mean_itl_ms=np.mean(itls or 0) * 1000,
+        std_itl_ms=np.std(itls or 0) * 1000,
+        median_itl_ms=np.median(itls or 0) * 1000,
+        percentiles_itl_ms=[(p, np.percentile(itls or 0, p) * 1000) for p in selected_percentiles],
+        mean_e2el_ms=np.mean(e2els or 0) * 1000,
+        std_e2el_ms=np.std(e2els or 0) * 1000,
+        median_e2el_ms=np.median(e2els or 0) * 1000,
+        percentiles_e2el_ms=[(p, np.percentile(e2els or 0, p) * 1000) for p in selected_percentiles],
+        max_output_tokens_per_s=max_output_tokens_per_s,
+        max_concurrent_requests=max_concurrent_requests,
+    )
+    print_metrics(
+        task_type,
+        selected_percentile_metrics,
+        max_concurrency,
+        request_rate,
+        benchmark_duration,
+        goodput_config_dict,
+        metrics,
+    )
+    return metrics, actual_output_lens
diff --git a/vllm_omni/benchmarks/patch/__init__.py b/vllm_omni/benchmarks/patch/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/vllm_omni/benchmarks/patch/patch.py b/vllm_omni/benchmarks/patch/patch.py
new file mode 100644
index 00000000000..6591af0148f
--- /dev/null
+++ b/vllm_omni/benchmarks/patch/patch.py
@@ -0,0 +1,538 @@
+import asyncio
+import base64
+import contextlib
+import io
+import json
+import os
+import random
+import sys
+import time
+import traceback
+from collections.abc import Iterable
+from dataclasses import dataclass
+from datetime import datetime
+from typing import Literal
+
+import aiohttp
+from pydub import AudioSegment
+from tqdm.asyncio import tqdm
+from transformers import PreTrainedTokenizerBase
+from vllm.benchmarks import datasets
+from vllm.benchmarks.datasets import SampleRequest
+from vllm.benchmarks.lib.endpoint_request_func import (
+    ASYNC_REQUEST_FUNCS,
+    OPENAI_COMPATIBLE_BACKENDS,
+    RequestFuncInput,
+    RequestFuncOutput,
+    StreamedResponseHandler,
+    _get_chat_content,
+    _update_headers_common,
+    _update_payload_common,
+    _validate_api_url,
+)
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+from vllm_omni.benchmarks.data_modules.random_multi_modal_dataset import OmniRandomMultiModalDataset
+
+get_samples_old = datasets.get_samples  # keep the upstream sampler so get_samples() below can delegate to it
+
+
+def get_samples(args, tokenizer):
+    """Return benchmark requests, sampling "random-mm" here and delegating other datasets."""
+    if args.backend != "openai-chat-omni":
+        raise ValueError("benchmark is only supported on 'openai-chat-omni' backend.")
+    if args.dataset_name != "random-mm":
+        return get_samples_old(args, tokenizer)
+    # Random multimodal inputs come from the omni dataset class rather than the upstream sampler.
+    mm_dataset = OmniRandomMultiModalDataset(random_seed=args.seed, dataset_path=args.dataset_path)
+    return mm_dataset.sample(
+        tokenizer=tokenizer,
+        num_requests=args.num_prompts,
+        prefix_len=args.random_prefix_len,
+        range_ratio=args.random_range_ratio,
+        input_len=args.random_input_len,
+        output_len=args.random_output_len,
+        base_items_per_request=args.random_mm_base_items_per_request,
+        limit_mm_per_prompt=args.random_mm_limit_mm_per_prompt,
+        num_mm_items_range_ratio=args.random_mm_num_mm_items_range_ratio,
+        bucket_config=args.random_mm_bucket_config,
+        request_id_prefix=args.request_id_prefix,
+        no_oversample=args.no_oversample,
+    )
+
+
+datasets.get_samples = get_samples  # monkey-patch: vllm.benchmarks.datasets.get_samples now points at the wrapper above
+
+
+@dataclass
+class MixRequestFuncOutput(RequestFuncOutput):  # RequestFuncOutput extended with per-request audio metrics
+    audio_ttfp: float = 0.0  # seconds from request start to the first audio chunk
+    audio_duration: float = 0.0  # length of the decoded audio in seconds (pydub length in ms / 1000)
+    audio_frames: int = 0  # raw audio byte count // frame width
+    audio_rtf: float = 0.0  # (last audio chunk time - first audio chunk time) / audio_duration
+
+
+async def async_request_openai_chat_omni_completions(
+    request_func_input: RequestFuncInput,
+    session: aiohttp.ClientSession,
+    pbar: tqdm | None = None,
+    mm_position: Literal["first", "last"] = "last",
+) -> MixRequestFuncOutput:
+    """Send one streaming chat-completions request and collect text and audio timing metrics.
+
+    Text chunks drive TTFT/ITL; audio chunks (base64 in ``delta.content``) are decoded with
+    pydub to derive first-packet latency, duration, frame count and RTF. A non-200 status
+    or any exception is reported through ``success``/``error`` on the result, never raised.
+    """
+    api_url = request_func_input.api_url
+    _validate_api_url(api_url, "OpenAI Chat Completions API", "chat/completions")
+    content = _get_chat_content(request_func_input, mm_position=mm_position)
+    payload = {
+        "model": request_func_input.model_name if request_func_input.model_name else request_func_input.model,
+        "messages": [
+            {"role": "user", "content": content},
+        ],
+        "temperature": 0.0,
+        "max_tokens": request_func_input.output_len,
+        "stream": True,
+        "stream_options": {
+            "include_usage": True,
+        },
+    }
+    _update_payload_common(payload, request_func_input)
+
+    headers = {
+        "Content-Type": "application/json",
+        "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
+    }
+    _update_headers_common(headers, request_func_input)
+
+    output = MixRequestFuncOutput()
+    output.prompt_len = request_func_input.prompt_len
+    generated_text = ""
+    generated_audio = None  # AudioSegment accumulator; stays None for text-only responses
+    ttft = 0.0
+    st = time.perf_counter()
+    output.start_time = st
+    most_recent_timestamp = st
+    audio_generate_time = 0.0
+    audio_first_timestamp = st
+    try:
+        async with session.post(url=api_url, json=payload, headers=headers) as response:
+            if response.status == 200:
+                handler = StreamedResponseHandler()
+                async for chunk_bytes in response.content.iter_any():
+                    chunk_bytes = chunk_bytes.strip()
+                    if not chunk_bytes:
+                        continue
+
+                    messages = handler.add_chunk(chunk_bytes)
+                    for message in messages:
+                        # NOTE: SSE comments (often used as pings) start with
+                        # a colon. These are not JSON data payload and should
+                        # be skipped.
+                        if message.startswith(":"):
+                            continue
+
+                        chunk = message.removeprefix("data: ")
+
+                        if chunk != "[DONE]":
+                            timestamp = time.perf_counter()
+                            data = json.loads(chunk)
+                            if choices := data.get("choices"):
+                                modality = data.get("modality")
+                                content = choices[0]["delta"].get("content")
+                                if modality == "text":
+                                    # First token
+                                    if ttft == 0.0:
+                                        ttft = timestamp - st
+                                        output.ttft = ttft
+                                    else:
+                                        output.itl.append(timestamp - most_recent_timestamp)
+                                    generated_text += content or ""
+                                elif modality == "audio":
+                                    if output.audio_ttfp == 0.0:
+                                        audio_first_timestamp = timestamp
+                                        output.audio_ttfp = timestamp - st
+                                    audio_generate_time = timestamp - audio_first_timestamp
+                                    if content:  # skip empty or missing (None) audio payloads
+                                        audio_bytes = base64.b64decode(content)
+                                        seg = AudioSegment.from_file(io.BytesIO(audio_bytes))
+                                        if seg is not None:
+                                            if generated_audio is None:
+                                                generated_audio = seg
+                                            else:
+                                                generated_audio = generated_audio + seg  # keep chunks in arrival order
+                            elif usage := data.get("usage"):
+                                output.output_tokens = usage.get("completion_tokens")
+                            most_recent_timestamp = timestamp
+
+                output.generated_text = generated_text
+                if generated_audio is not None:
+                    output.audio_duration = len(generated_audio) / 1000.0
+                    frame_width = generated_audio.frame_width
+                    if frame_width > 0:
+                        output.audio_frames = len(generated_audio.raw_data) // frame_width
+                    else:
+                        output.audio_frames = 0
+                        logger.warning("Audio frame width is zero")
+                    if output.audio_duration > 0:
+                        output.audio_rtf = audio_generate_time / output.audio_duration
+                    else:
+                        output.audio_rtf = 0
+                        logger.warning("Audio duration is zero")
+
+                output.success = True
+                output.latency = most_recent_timestamp - st
+            else:
+                output.error = response.reason or ""
+                output.success = False
+    except Exception:
+        output.success = False
+        output.error = "".join(traceback.format_exception(*sys.exc_info()))
+        logger.error(f"ERROR: send request failed, reason is: {output.error}")
+
+    if pbar:
+        pbar.update(1)
+    return output
+
+
+ASYNC_REQUEST_FUNCS["openai-chat-omni"] = async_request_openai_chat_omni_completions  # register the omni backend
+if "openai-chat-omni" not in OPENAI_COMPATIBLE_BACKENDS:  # avoid appending a duplicate entry
+    OPENAI_COMPATIBLE_BACKENDS.append("openai-chat-omni")
+
+# ruff: noqa: E402
+# Prevent import order from causing patch failures
+from vllm.benchmarks import serve
+from vllm.benchmarks.serve import TaskType, calculate_metrics_for_embeddings, get_request, wait_for_endpoint
+
+from vllm_omni.benchmarks.metrics.metrics import MultiModalsBenchmarkMetrics, calculate_metrics
+
+# ruff: noqa: E402
+
+benchmark_old = serve.benchmark  # reference to the upstream benchmark(); not used again in this module
+
+
+async def benchmark(  # omni variant; assigned to vllm.benchmarks.serve.benchmark at the end of this module
+    task_type: TaskType,
+    endpoint_type: str,
+    api_url: str,
+    base_url: str,
+    model_id: str,
+    model_name: str,
+    tokenizer: PreTrainedTokenizerBase,
+    input_requests: list[SampleRequest],
+    logprobs: int | None,
+    request_rate: float,
+    burstiness: float,
+    disable_tqdm: bool,
+    num_warmups: int,
+    profile: bool,
+    selected_percentile_metrics: list[str],
+    selected_percentiles: list[float],
+    ignore_eos: bool,
+    goodput_config_dict: dict[str, float],
+    max_concurrency: int | None,
+    lora_modules: Iterable[str] | None,
+    extra_headers: dict | None,
+    extra_body: dict | None,
+    ramp_up_strategy: Literal["linear", "exponential"] | None = None,
+    ramp_up_start_rps: int | None = None,
+    ramp_up_end_rps: int | None = None,
+    ready_check_timeout_sec: int = 600,
+):
+    try:
+        request_func = ASYNC_REQUEST_FUNCS[endpoint_type]
+    except KeyError:
+        raise ValueError(f"Unknown backend: {endpoint_type}") from None
+
+    # Reuses connections across requests to reduce TLS handshake overhead.
+    connector = aiohttp.TCPConnector(
+        limit=max_concurrency or 0,  # None/0 -> 0, which aiohttp treats as "no limit"
+        limit_per_host=max_concurrency or 0,
+        ttl_dns_cache=300,
+        use_dns_cache=True,
+        keepalive_timeout=60,
+        enable_cleanup_closed=True,
+        force_close=False,
+        ssl=("https://" in api_url),  # True only when the URL contains "https://"
+    )
+
+    session = aiohttp.ClientSession(
+        connector=connector,
+        trust_env=True,
+        timeout=aiohttp.ClientTimeout(total=6 * 60 * 60),  # 6 hours total per request
+    )
+
+    print("Starting initial single prompt test run...")
+    test_prompt, test_prompt_len, test_output_len, test_mm_content = (
+        input_requests[0].prompt,
+        input_requests[0].prompt_len,
+        input_requests[0].expected_output_len,
+        input_requests[0].multi_modal_data,
+    )
+
+    assert (
+        test_mm_content is None
+        or isinstance(test_mm_content, dict)
+        or (isinstance(test_mm_content, list) and all(isinstance(item, dict) for item in test_mm_content))
+    ), "multi_modal_data must be a dict or list[dict]"
+    test_input = RequestFuncInput(
+        model=model_id,
+        model_name=model_name,
+        prompt=test_prompt,
+        api_url=api_url,
+        prompt_len=test_prompt_len,
+        output_len=test_output_len,
+        logprobs=logprobs,
+        multi_modal_content=test_mm_content,
+        ignore_eos=ignore_eos,
+        extra_headers=extra_headers,
+        extra_body=extra_body,
+    )
+    # A non-positive ready_check_timeout_sec skips the probe request entirely.
+    if ready_check_timeout_sec > 0:
+        test_output = await wait_for_endpoint(
+            request_func,
+            test_input,
+            session,
+            timeout_seconds=ready_check_timeout_sec,
+        )
+        if not test_output.success:
+            raise ValueError(
+                "Initial test run failed - Please make sure benchmark "
+                "arguments are correctly specified. "
+                f"Error: {test_output.error}"
+            )
+        else:
+            print("Initial test run completed.")
+    else:
+        print("Skipping endpoint ready check.")
+
+    if num_warmups > 0:
+        print(f"Warming up with {num_warmups} requests...")
+        warmup_pbar = None if disable_tqdm else tqdm(total=num_warmups)
+        warmup_semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else contextlib.nullcontext()
+        warmup_tasks = []
+
+        async def warmup_limited_request_func():
+            async with warmup_semaphore:
+                return await request_func(request_func_input=test_input, session=session, pbar=warmup_pbar)
+
+        for _ in range(num_warmups):  # every warmup request reuses the first sample (test_input)
+            request_task = asyncio.create_task(warmup_limited_request_func())
+            warmup_tasks.append(request_task)
+        _ = await asyncio.gather(*warmup_tasks)
+
+        if warmup_pbar is not None:
+            warmup_pbar.close()
+        print("Warmup run completed.")
+
+    print("Starting main benchmark run...")
+
+    if lora_modules:
+        # For each input request, choose a LoRA module at random.
+        lora_modules = iter([random.choice(lora_modules) for _ in range(len(input_requests))])
+
+    if profile:
+        print("Starting profiler...")
+        profile_input = RequestFuncInput(
+            model=model_id,
+            model_name=model_name,
+            prompt=test_prompt,
+            api_url=base_url + "/start_profile",
+            prompt_len=test_prompt_len,
+            output_len=test_output_len,
+            logprobs=logprobs,
+            multi_modal_content=test_mm_content,
+            ignore_eos=ignore_eos,
+            extra_headers=extra_headers,
+            extra_body=extra_body,
+        )
+        profile_output = await request_func(request_func_input=profile_input, session=session)
+        if profile_output.success:
+            print("Profiler started")
+
+    distribution = "Poisson process" if burstiness == 1.0 else "Gamma distribution"
+
+    if ramp_up_strategy is not None:
+        print(f"Traffic ramp-up strategy: {ramp_up_strategy}.")
+        print(
+            f"Will increase RPS from {ramp_up_start_rps} to {ramp_up_end_rps} RPS over the duration of the benchmark."
+        )
+    else:
+        print(f"Traffic request rate: {request_rate}")
+
+    print(f"Burstiness factor: {burstiness} ({distribution})")
+    print(f"Maximum request concurrency: {max_concurrency}")
+
+    pbar = None if disable_tqdm else tqdm(total=len(input_requests))
+
+    semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else contextlib.nullcontext()
+    # Each request enters the semaphore first, so at most max_concurrency run at once (no cap if unset).
+    async def limited_request_func(request_func_input, session, pbar):
+        async with semaphore:
+            return await request_func(request_func_input=request_func_input, session=session, pbar=pbar)
+
+    benchmark_start_time = time.perf_counter()
+    tasks: list[asyncio.Task] = []
+    # With ramp-up enabled, record a timestamped event for each integer RPS level reached.
+    rps_change_events = []
+    last_int_rps = -1
+    if ramp_up_strategy is not None and ramp_up_start_rps is not None:
+        last_int_rps = ramp_up_start_rps
+        rps_change_events.append(
+            {
+                "rps": last_int_rps,
+                "timestamp": datetime.now().isoformat(),
+            }
+        )
+
+    async for request, current_request_rate in get_request(
+        input_requests,
+        request_rate,
+        burstiness,
+        ramp_up_strategy,
+        ramp_up_start_rps,
+        ramp_up_end_rps,
+    ):
+        if ramp_up_strategy is not None:
+            current_int_rps = int(current_request_rate)
+            if current_int_rps > last_int_rps:
+                timestamp = datetime.now().isoformat()
+                for rps_val in range(last_int_rps + 1, current_int_rps + 1):
+                    rps_change_events.append({"rps": rps_val, "timestamp": timestamp})
+                last_int_rps = current_int_rps
+        prompt, prompt_len, output_len, mm_content, request_id = (
+            request.prompt,
+            request.prompt_len,
+            request.expected_output_len,
+            request.multi_modal_data,
+            request.request_id,
+        )
+        req_model_id, req_model_name = model_id, model_name
+        if lora_modules:
+            req_lora_module = next(lora_modules)
+            req_model_id, req_model_name = req_lora_module, req_lora_module
+
+        request_func_input = RequestFuncInput(
+            model=req_model_id,
+            model_name=req_model_name,
+            prompt=prompt,
+            api_url=api_url,
+            prompt_len=prompt_len,
+            output_len=output_len,
+            logprobs=logprobs,
+            multi_modal_content=mm_content,
+            ignore_eos=ignore_eos,
+            extra_headers=extra_headers,
+            extra_body=extra_body,
+            request_id=request_id,
+        )
+        tasks.append(
+            asyncio.create_task(limited_request_func(request_func_input=request_func_input, session=session, pbar=pbar))
+        )
+    outputs: list[MixRequestFuncOutput] = await asyncio.gather(*tasks)  # results keep task-creation order
+
+    if pbar is not None:
+        pbar.close()
+
+    benchmark_duration = time.perf_counter() - benchmark_start_time
+
+    if task_type == TaskType.GENERATION:
+        metrics, actual_output_lens = calculate_metrics(
+            input_requests=input_requests,
+            outputs=outputs,
+            dur_s=benchmark_duration,
+            tokenizer=tokenizer,
+            selected_percentiles=selected_percentiles,
+            goodput_config_dict=goodput_config_dict,
+            task_type=task_type,
+            selected_percentile_metrics=selected_percentile_metrics,
+            max_concurrency=max_concurrency,
+            request_rate=request_rate,
+            benchmark_duration=benchmark_duration,
+        )
+    else:
+        metrics = calculate_metrics_for_embeddings(
+            outputs=outputs,
+            dur_s=benchmark_duration,
+            selected_percentiles=selected_percentiles,
+        )
+        actual_output_lens = 0  # placeholder; only the MultiModalsBenchmarkMetrics branch below reads it
+
+    if isinstance(metrics, MultiModalsBenchmarkMetrics):
+        result = {
+            "duration": benchmark_duration,
+            "completed": metrics.completed,
+            "failed": metrics.failed,
+            "total_input_tokens": metrics.total_input,
+            "total_output_tokens": metrics.total_output,
+            "request_throughput": metrics.request_throughput,
+            "request_goodput": metrics.request_goodput if goodput_config_dict else None,
+            "output_throughput": metrics.output_throughput,
+            "total_token_throughput": metrics.total_token_throughput,
+            "input_lens": [output.prompt_len for output in outputs],
+            "output_lens": actual_output_lens,
+            "ttfts": [output.ttft for output in outputs],
+            "itls": [output.itl for output in outputs],
+            "generated_texts": [output.generated_text for output in outputs],
+            "errors": [output.error for output in outputs],
+            "max_output_tokens_per_s": metrics.max_output_tokens_per_s,
+            "max_concurrent_requests": metrics.max_concurrent_requests,
+        }
+    else:
+        result = {
+            "duration": benchmark_duration,
+            "completed": metrics.completed,
+            "total_input_tokens": metrics.total_input,
+            "request_throughput": metrics.request_throughput,
+            "total_token_throughput": metrics.total_token_throughput,
+            "input_lens": [output.prompt_len for output in outputs],
+            "errors": [output.error for output in outputs],
+        }
+
+    if rps_change_events:
+        result["rps_change_events"] = rps_change_events
+
+    def process_one_metric(
+        # E.g., "ttft"
+        metric_attribute_name: str,
+    ):
+        # This function prints and adds statistics of the specified
+        # metric.
+        if metric_attribute_name not in selected_percentile_metrics:
+            return
+        is_audio_rtf = metric_attribute_name == "audio_rtf"
+        # audio_rtf is a ratio, so its percentile attribute and result keys carry no "_ms" suffix.
+        suffix = "" if is_audio_rtf else "_ms"
+        for p, value in getattr(metrics, f"percentiles_{metric_attribute_name}{suffix}"):
+            p_word = str(int(p)) if int(p) == p else str(p)
+            result[f"p{p_word}_{metric_attribute_name}{suffix}"] = value
+
+    if task_type == TaskType.GENERATION:
+        for metric in selected_percentile_metrics:
+            process_one_metric(metric)
+    else:
+        process_one_metric("e2el")
+
+    if profile:
+        print("Stopping profiler...")
+        profile_input = RequestFuncInput(
+            model=model_id,
+            prompt=test_prompt,
+            api_url=base_url + "/stop_profile",
+            prompt_len=test_prompt_len,
+            output_len=test_output_len,
+            logprobs=logprobs,
+        )
+        profile_output = await request_func(request_func_input=profile_input, session=session)
+        if profile_output.success:
+            print("Profiler stopped")
+    # NOTE(review): the session is not closed if an exception propagates before this point.
+    await session.close()
+    return result
+
+
+serve.benchmark = benchmark  # monkey-patch: vllm.benchmarks.serve.benchmark now points at the function above
diff --git a/vllm_omni/benchmarks/serve.py b/vllm_omni/benchmarks/serve.py
new file mode 100644
index 00000000000..fe946036931
--- /dev/null
+++ b/vllm_omni/benchmarks/serve.py
@@ -0,0 +1,9 @@
+import argparse
+import asyncio
+from typing import Any
+
+from vllm.benchmarks.serve import main_async
+
+
+def main(args: argparse.Namespace) -> dict[str, Any]:  # synchronous entry point: runs main_async(args) to completion
+    return asyncio.run(main_async(args))
diff --git a/vllm_omni/entrypoints/cli/__init__.py b/vllm_omni/entrypoints/cli/__init__.py
index b233a71e6d2..2ffba613055 100644
--- a/vllm_omni/entrypoints/cli/__init__.py
+++ b/vllm_omni/entrypoints/cli/__init__.py
@@ -1,5 +1,13 @@
 """CLI helpers for vLLM-Omni entrypoints."""
 
+# To ensure patch imports work properly, disable unused import checks
+# ruff: noqa: E402, F401
+# isort: off
+from vllm_omni.benchmarks.patch import patch
+# isort: on
+
+from vllm_omni.entrypoints.cli.benchmark.serve import OmniBenchmarkServingSubcommand
+
 from .serve import OmniServeCommand
 
-__all__ = ["OmniServeCommand"]
+__all__ = ["OmniServeCommand", "OmniBenchmarkServingSubcommand"]
diff --git a/vllm_omni/entrypoints/cli/benchmark/__init__.py b/vllm_omni/entrypoints/cli/benchmark/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/vllm_omni/entrypoints/cli/benchmark/base.py b/vllm_omni/entrypoints/cli/benchmark/base.py
new file mode 100644
index 00000000000..6a6f97eb1e8
--- /dev/null
+++ b/vllm_omni/entrypoints/cli/benchmark/base.py
@@ -0,0 +1,23 @@
+import argparse
+
+from vllm.entrypoints.cli.types import CLISubcommand
+
+
+class OmniBenchmarkSubcommandBase(CLISubcommand):
+    """Abstract base for `vllm bench` subcommands; concrete subclasses override both hooks below."""
+
+    help: str  # one-line description of the subcommand
+
+    @classmethod
+    def add_cli_args(cls, parser: argparse.ArgumentParser) -> None:
+        """Register this subcommand's CLI arguments on ``parser``; must be overridden."""
+        raise NotImplementedError
+
+    @staticmethod
+    def cmd(args: argparse.Namespace) -> None:
+        """Run the benchmark; must be overridden.
+
+        Args:
+            args: The parsed command-line arguments for the subcommand.
+        """
+        raise NotImplementedError
diff --git a/vllm_omni/entrypoints/cli/benchmark/main.py b/vllm_omni/entrypoints/cli/benchmark/main.py
new file mode 100644
index 00000000000..8880e35c7cf
--- /dev/null
+++ b/vllm_omni/entrypoints/cli/benchmark/main.py
@@ -0,0 +1,55 @@
+from __future__ import annotations
+
+import argparse
+import typing
+
+from vllm.entrypoints.cli.types import CLISubcommand
+from vllm.entrypoints.utils import VLLM_SUBCMD_PARSER_EPILOG
+
+from vllm_omni.entrypoints.cli.benchmark.base import OmniBenchmarkSubcommandBase
+
+if typing.TYPE_CHECKING:
+    from vllm.utils import FlexibleArgumentParser
+
+
+class OmniBenchmarkSubcommand(CLISubcommand):
+    """`bench` command for the vLLM CLI; hosts every OmniBenchmarkSubcommandBase subclass."""
+
+    name = "bench"
+    help = "vLLM-omni bench subcommand."
+
+    @staticmethod
+    def cmd(args: argparse.Namespace) -> None:
+        """Invoke the handler that the chosen bench subparser stored on ``args``."""
+        handler = args.dispatch_function
+        handler(args)
+
+    def validate(self, args: argparse.Namespace) -> None:
+        """Accept ``args`` unchanged; nothing is validated here."""
+
+    def subparser_init(self, subparsers: argparse._SubParsersAction) -> FlexibleArgumentParser:
+        """Create the `bench` parser with one nested subparser per benchmark type."""
+        root = subparsers.add_parser(
+            self.name, description=self.help, usage=f"vllm {self.name} <bench_type> [options]"
+        )
+        nested = root.add_subparsers(required=True, dest="bench_type")
+        for bench_cls in OmniBenchmarkSubcommandBase.__subclasses__():
+            full_name = f"{self.name} {bench_cls.name}"
+            sub = nested.add_parser(
+                bench_cls.name,
+                help=bench_cls.help,
+                description=bench_cls.help,
+                usage=f"vllm {full_name} [--omni] [options]",
+            )
+            omni_help = "Enable benchmark-Omni mode (always enabled for omni commands)"
+            sub.add_argument("--omni", action="store_true", help=omni_help)
+            sub.set_defaults(dispatch_function=bench_cls.cmd)
+            bench_cls.add_cli_args(sub)
+            # The epilog is attached last, once all arguments are registered.
+            sub.epilog = VLLM_SUBCMD_PARSER_EPILOG.format(subcmd=full_name)
+
+        return root
+
+
+def cmd_init() -> list[CLISubcommand]:  # returns a one-element list holding the `bench` command instance
+    return [OmniBenchmarkSubcommand()]
diff --git a/vllm_omni/entrypoints/cli/benchmark/serve.py b/vllm_omni/entrypoints/cli/benchmark/serve.py
new file mode 100644
index 00000000000..906e8851a4a
--- /dev/null
+++ b/vllm_omni/entrypoints/cli/benchmark/serve.py
@@ -0,0 +1,51 @@
+import argparse
+
+from vllm.benchmarks.serve import add_cli_args
+
+from vllm_omni.benchmarks.serve import main
+from vllm_omni.entrypoints.cli.benchmark.base import OmniBenchmarkSubcommandBase
+
+
+class OmniBenchmarkServingSubcommand(OmniBenchmarkSubcommandBase):
+    """The `serve` subcommand for vllm bench."""
+
+    name = "serve"
+    help = "Benchmark the online serving throughput."
+
+    @classmethod
+    def add_cli_args(cls, parser: argparse.ArgumentParser) -> None:
+        """Add vLLM's serving-benchmark options, then rewrite the help of three of them for omni."""
+        add_cli_args(parser)
+        for action in parser._actions:
+            if action.dest == "percentile_metrics":
+                action.help = (
+                    "Comma-separated list of selected metrics to report percentiles. "
+                    "This argument specifies the metrics to report percentiles. "
+                    'Allowed metric names are "ttft", "tpot", "itl", "e2el", "audio_ttfp", "audio_rtf". '
+                )
+            elif action.dest == "random_mm_limit_mm_per_prompt":
+                action.help = (
+                    "Per-modality hard caps for items attached per request, e.g. "
+                    '\'{"image": 3, "video": 0, "audio": 1}\'. The sampled per-request item '
+                    "count is clamped to the sum of these limits. When a modality "
+                    "reaches its cap, its buckets are excluded and probabilities are renormalized."
+                )
+            elif action.dest == "random_mm_bucket_config":
+                action.help = (
+                    "The bucket config is a dictionary mapping a multimodal item "
+                    "sampling configuration to a probability. "
+                    "Currently allows for 3 modalities: audio, images and videos. "
+                    "A bucket key is a tuple of (height, width, num_frames). "
+                    "The value is the probability of sampling that specific item. "
+                    "Example: --random-mm-bucket-config "
+                    "{(256, 256, 1): 0.5, (720, 1280, 16): 0.4, (0, 1, 5): 0.10} "
+                    "First item: images with resolution 256x256 w.p. 0.5. "
+                    "Second item: videos with resolution 720x1280 and 16 frames w.p. 0.4. "
+                    "Third item: audios with 1s duration and 5 channels w.p. 0.1. "
+                    "OBS.: If the probabilities do not sum to 1, they are normalized."
+                )
+
+    @staticmethod
+    def cmd(args: argparse.Namespace) -> None:
+        """Run the omni serving benchmark via `vllm_omni.benchmarks.serve.main`."""
+        main(args)
diff --git a/vllm_omni/entrypoints/cli/main.py b/vllm_omni/entrypoints/cli/main.py
index 6a65d9d6cde..629a4641cce 100644
--- a/vllm_omni/entrypoints/cli/main.py
+++ b/vllm_omni/entrypoints/cli/main.py
@@ -18,10 +18,12 @@ def main():
         from vllm.entrypoints.utils import VLLM_SUBCMD_PARSER_EPILOG, cli_env_setup
         from vllm.utils.argparse_utils import FlexibleArgumentParser
 
+        import vllm_omni.entrypoints.cli.benchmark.main
         import vllm_omni.entrypoints.cli.serve
 
         CMD_MODULES = [
             vllm_omni.entrypoints.cli.serve,
+            vllm_omni.entrypoints.cli.benchmark.main,
         ]
 
         cli_env_setup()