From f37e2c4fe49ddd78decda62da7add8b258def30e Mon Sep 17 00:00:00 2001
From: Yasmin Moslem <48152713+ymoslem@users.noreply.github.com>
Date: Sun, 3 May 2026 22:47:47 +0100
Subject: [PATCH 01/15] Implement custom dataset class for benchmarking

Added audio processing functionality and a custom dataset class for ASR benchmarking. The new features support various audio input formats and allow for sampling from a JSONL dataset.

Signed-off-by: Yasmin Moslem <48152713+ymoslem@users.noreply.github.com>
---
 vllm/benchmarks/datasets/datasets.py | 76 ++++++++++++++++++++++++++++
 1 file changed, 76 insertions(+)

diff --git a/vllm/benchmarks/datasets/datasets.py b/vllm/benchmarks/datasets/datasets.py
index 419275d2e6ae..2f224516fa73 100644
--- a/vllm/benchmarks/datasets/datasets.py
+++ b/vllm/benchmarks/datasets/datasets.py
@@ -31,6 +31,7 @@
 
 import numpy as np
 import pybase64 as base64
+import soundfile as sf
 from huggingface_hub import snapshot_download
 from PIL import Image
 from typing_extensions import deprecated
@@ -441,6 +442,27 @@ def process_video(video: Any) -> Mapping[str, Any]:
     )
 
 
+def process_audio(audio: Any) -> tuple:
+    """
+    Process a single audio input and return a (array, sample_rate) tuple.
+
+    Supports:
+    1. String: treated as a file path, loaded with soundfile.
+    2. Dict with 'array' and 'sampling_rate' keys: HuggingFace audio format.
+    3. Tuple (array, sr): passed through directly.
+    """
+    if isinstance(audio, str):
+        return sf.read(audio)
+    if isinstance(audio, dict) and "array" in audio and "sampling_rate" in audio:
+        return audio["array"], audio["sampling_rate"]
+    if isinstance(audio, tuple) and len(audio) == 2:
+        return audio
+    raise ValueError(
+        f"Invalid audio input {audio}. Must be a file path string, "
+        "a dict with 'array' and 'sampling_rate', or a (array, sr) tuple."
+    )
+
+
 def gen_prompt_decode_to_target_len(
     tokenizer: TokenizerLike,
     token_sequence: list[int],
@@ -1400,6 +1422,7 @@ def add_dataset_parser(parser: FlexibleArgumentParser):
             "hf",
             "custom",
             "custom_mm",
+            "custom_audio",
             "prefix_repetition",
             "spec_bench",
             "speed_bench",
@@ -1827,6 +1850,18 @@ def get_samples(args, tokenizer: TokenizerLike) -> list[SampleRequest]:
             no_oversample=args.no_oversample,
         )
 
+    elif args.dataset_name == "custom_audio":
+        dataset = CustomAudioDataset(
+            dataset_path=args.dataset_path, disable_shuffle=args.disable_shuffle
+        )
+        input_requests = dataset.sample(
+            num_requests=args.num_prompts,
+            tokenizer=tokenizer,
+            output_len=args.custom_output_len,
+            request_id_prefix=args.request_id_prefix,
+            no_oversample=args.no_oversample,
+        )
+
     elif args.dataset_name == "sonnet":
         dataset = SonnetDataset(
             dataset_path=args.dataset_path, disable_shuffle=args.disable_shuffle
@@ -2324,6 +2359,47 @@ def sample(
         return sampled_requests
 
 
+class CustomAudioDataset(CustomDataset):
+    """
+    Custom dataset for ASR benchmarking. Loads data from a JSONL file. E.g.,
+    {"prompt": "", "audio": "/path/to/audio.wav"}
+    """
+    IS_MULTIMODAL = True
+
+    def sample(
+        self,
+        tokenizer: TokenizerLike,
+        num_requests: int,
+        output_len: int | None = None,
+        request_id_prefix: str = "",
+        no_oversample: bool = False,
+        **kwargs,
+    ) -> list[SampleRequest]:
+        self.num_available_samples = len(self.data)
+        sampled_requests = []
+
+        for i, item in enumerate(self.data):
+            if len(sampled_requests) >= num_requests:
+                break
+            prompt = item.get("prompt", "")
+            prompt_len = len(tokenizer(prompt).input_ids)
+            y, sr = process_audio(item["audio"])
+            mm_content = {"audio": (y, sr)}
+            sampled_requests.append(
+                SampleRequest(
+                    prompt=prompt,
+                    prompt_len=prompt_len,
+                    expected_output_len=output_len if output_len is not None else 256,
+                    multi_modal_data=mm_content,
+                    request_id=request_id_prefix + str(i),
+                )
+            )
+        self.maybe_oversample_requests(
+            sampled_requests, num_requests, request_id_prefix, no_oversample
+        )
+        return sampled_requests
+
+
 # -----------------------------------------------------------------------------
 # Spec Bench Dataset Implementation
 # -----------------------------------------------------------------------------

From 612968760d581959a9660105ffb98c8e9e971a80 Mon Sep 17 00:00:00 2001
From: Yasmin Moslem <48152713+ymoslem@users.noreply.github.com>
Date: Mon, 4 May 2026 00:28:59 +0100
Subject: [PATCH 02/15] Apply suggestion from @gemini-code-assist[bot]

The soundfile library is imported at the top level without an ImportError check. This will cause the entire datasets module to fail to load if soundfile is not installed, even for users running non-audio benchmarks. Please follow the existing pattern in this file (e.g., for pandas or datasets) by using a try...except block or placeholder module.

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Signed-off-by: Yasmin Moslem <48152713+ymoslem@users.noreply.github.com>
---
 vllm/benchmarks/datasets/datasets.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/vllm/benchmarks/datasets/datasets.py b/vllm/benchmarks/datasets/datasets.py
index 2f224516fa73..f77d6b953cdc 100644
--- a/vllm/benchmarks/datasets/datasets.py
+++ b/vllm/benchmarks/datasets/datasets.py
@@ -31,7 +31,10 @@
 
 import numpy as np
 import pybase64 as base64
-import soundfile as sf
+try:
+    import soundfile as sf
+except ImportError:
+    sf = PlaceholderModule("soundfile")
 from huggingface_hub import snapshot_download
 from PIL import Image
 from typing_extensions import deprecated

From 3b2803bdc63bc3661268eb377325cf00588b711a Mon Sep 17 00:00:00 2001
From: Yasmin Moslem <48152713+ymoslem@users.noreply.github.com>
Date: Mon, 4 May 2026 00:29:31 +0100
Subject: [PATCH 03/15] Apply suggestion from @gemini-code-assist[bot]

The sample call for custom_audio is missing the skip_chat_template argument. This prevents the --skip-chat-template CLI flag from working correctly for this dataset type, which is inconsistent with the custom dataset implementation.

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Signed-off-by: Yasmin Moslem <48152713+ymoslem@users.noreply.github.com>
---
 vllm/benchmarks/datasets/datasets.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/vllm/benchmarks/datasets/datasets.py b/vllm/benchmarks/datasets/datasets.py
index f77d6b953cdc..8525bbadc6cb 100644
--- a/vllm/benchmarks/datasets/datasets.py
+++ b/vllm/benchmarks/datasets/datasets.py
@@ -1861,6 +1861,7 @@ def get_samples(args, tokenizer: TokenizerLike) -> list[SampleRequest]:
             num_requests=args.num_prompts,
             tokenizer=tokenizer,
             output_len=args.custom_output_len,
+            skip_chat_template=args.skip_chat_template,
             request_id_prefix=args.request_id_prefix,
             no_oversample=args.no_oversample,
         )

From 47de46c5dce44567a76b9b05d1a9fced83cc9447 Mon Sep 17 00:00:00 2001
From: Yasmin Moslem <48152713+ymoslem@users.noreply.github.com>
Date: Mon, 4 May 2026 00:30:04 +0100
Subject: [PATCH 04/15] Apply suggestion from @gemini-code-assist[bot]

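The sample call for custom_audio is missing the skip_chat_template argument. This prevents the --skip-chat-template CLI flag from working correctly for this dataset type, which is inconsistent with the custom dataset implementation.

Note: the diff in this patch does not add that argument (PATCH 03 already did); it only inserts a duplicate `input_requests = dataset.sample(` line, which PATCH 05 removes.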

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Signed-off-by: Yasmin Moslem <48152713+ymoslem@users.noreply.github.com>
---
 vllm/benchmarks/datasets/datasets.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/vllm/benchmarks/datasets/datasets.py b/vllm/benchmarks/datasets/datasets.py
index 8525bbadc6cb..88fd415ff4e2 100644
--- a/vllm/benchmarks/datasets/datasets.py
+++ b/vllm/benchmarks/datasets/datasets.py
@@ -1857,6 +1857,7 @@ def get_samples(args, tokenizer: TokenizerLike) -> list[SampleRequest]:
         dataset = CustomAudioDataset(
             dataset_path=args.dataset_path, disable_shuffle=args.disable_shuffle
         )
+        input_requests = dataset.sample(
         input_requests = dataset.sample(
             num_requests=args.num_prompts,
             tokenizer=tokenizer,

From 23c2252d4da482031b0ecfd454180554e6879f73 Mon Sep 17 00:00:00 2001
From: Yasmin Moslem <48152713+ymoslem@users.noreply.github.com>
Date: Mon, 4 May 2026 01:04:47 +0100
Subject: [PATCH 05/15] Refine soundfile import and the audio sampling function

Add try... except to the soundfile import, and add guards to the audio sample function

Signed-off-by: Yasmin Moslem <48152713+ymoslem@users.noreply.github.com>
---
 vllm/benchmarks/datasets/datasets.py | 41 ++++++++++++++++++++++------
 1 file changed, 33 insertions(+), 8 deletions(-)

diff --git a/vllm/benchmarks/datasets/datasets.py b/vllm/benchmarks/datasets/datasets.py
index 88fd415ff4e2..2e2a7e5e4ab0 100644
--- a/vllm/benchmarks/datasets/datasets.py
+++ b/vllm/benchmarks/datasets/datasets.py
@@ -31,10 +31,7 @@
 
 import numpy as np
 import pybase64 as base64
-try:
-    import soundfile as sf
-except ImportError:
-    sf = PlaceholderModule("soundfile")
+
 from huggingface_hub import snapshot_download
 from PIL import Image
 from typing_extensions import deprecated
@@ -64,6 +61,11 @@
 except ImportError:
     pd = PlaceholderModule("pandas")
 
+try:
+    import soundfile as sf
+except ImportError:
+    sf = PlaceholderModule("soundfile")
+
 
 logger = logging.getLogger(__name__)
 
@@ -1857,7 +1859,6 @@ def get_samples(args, tokenizer: TokenizerLike) -> list[SampleRequest]:
         dataset = CustomAudioDataset(
             dataset_path=args.dataset_path, disable_shuffle=args.disable_shuffle
         )
-        input_requests = dataset.sample(
         input_requests = dataset.sample(
             num_requests=args.num_prompts,
             tokenizer=tokenizer,
@@ -2378,23 +2379,47 @@ def sample(
         output_len: int | None = None,
         request_id_prefix: str = "",
         no_oversample: bool = False,
+        skip_chat_template: bool = False,
         **kwargs,
     ) -> list[SampleRequest]:
         self.num_available_samples = len(self.data)
-        sampled_requests = []
+        if num_requests <= 0:
+            num_requests = self.num_available_samples
 
+        sampled_requests = []
         for i, item in enumerate(self.data):
             if len(sampled_requests) >= num_requests:
                 break
             prompt = item.get("prompt", "")
-            prompt_len = len(tokenizer(prompt).input_ids)
             y, sr = process_audio(item["audio"])
             mm_content = {"audio": (y, sr)}
+
+            if tokenizer is None:
+                prompt_len = 1
+                new_output_len = output_len if output_len is not None and output_len != -1 else 256
+            else:
+                if not skip_chat_template:
+                    prompt = tokenizer.apply_chat_template(
+                        [{"role": "user", "content": prompt}],
+                        add_generation_prompt=True,
+                        tokenize=False,
+                    )
+                prompt_len = len(tokenizer(prompt).input_ids)
+
+                new_output_len = output_len
+                if output_len is None or output_len == -1:
+                    if "output_tokens" not in item:
+                        raise ValueError(
+                            "If no output length is provided the "
+                            "custom dataset must contain an 'output_tokens' field."
+                        )
+                    new_output_len = int(item["output_tokens"])
+
             sampled_requests.append(
                 SampleRequest(
                     prompt=prompt,
                     prompt_len=prompt_len,
-                    expected_output_len=output_len if output_len is not None else 256,
+                    expected_output_len=new_output_len,
                     multi_modal_data=mm_content,
                     request_id=request_id_prefix + str(i),
                 )

From 791daf3998d4a6e917ff8b644f7d18b30aee42b1 Mon Sep 17 00:00:00 2001
From: Yasmin Moslem <48152713+ymoslem@users.noreply.github.com>
Date: Sun, 10 May 2026 23:22:01 +0100
Subject: [PATCH 06/15] Support Audio models

- Adding "custom_audio" to the `--dataset-name` choices, CustomAudioDataset class and process_audio function:
  - Support ASR models (Whisper tested)
  - Support Multimodal (text + audio) models requiring a chat template (Qwen2-Audio tested)
- Change "custom_mm" to "custom_image" and CustomMMDataset to CustomImageDataset:
  - For now, both "custom_mm" and "custom_image" are accepted to keep backward compatibility.


Signed-off-by: Yasmin Moslem <48152713+ymoslem@users.noreply.github.com>
---
 vllm/benchmarks/datasets/datasets.py | 79 ++++++++++++++++++++--------
 1 file changed, 56 insertions(+), 23 deletions(-)

diff --git a/vllm/benchmarks/datasets/datasets.py b/vllm/benchmarks/datasets/datasets.py
index cdc0764dc390..02826d965456 100644
--- a/vllm/benchmarks/datasets/datasets.py
+++ b/vllm/benchmarks/datasets/datasets.py
@@ -1426,8 +1426,9 @@ def add_dataset_parser(parser: FlexibleArgumentParser):
             "random-rerank",
             "hf",
             "custom",
-            "custom_mm",
             "custom_audio",
+            "custom_image",
+            "custom_mm",
             "prefix_repetition",
             "spec_bench",
             "speed_bench",
@@ -1844,8 +1845,8 @@ def get_samples(args, tokenizer: TokenizerLike) -> list[SampleRequest]:
             no_oversample=args.no_oversample,
         )
 
-    elif args.dataset_name == "custom_mm":
-        dataset = CustomMMDataset(
+    elif args.dataset_name in ("custom_image", "custom_mm"):
+        dataset = CustomImageDataset(
             dataset_path=args.dataset_path,
             disable_shuffle=args.disable_shuffle,
             random_seed=args.seed,
@@ -1861,13 +1862,15 @@ def get_samples(args, tokenizer: TokenizerLike) -> list[SampleRequest]:
 
     elif args.dataset_name == "custom_audio":
         dataset = CustomAudioDataset(
-            dataset_path=args.dataset_path, disable_shuffle=args.disable_shuffle
+            dataset_path=args.dataset_path,
+            disable_shuffle=args.disable_shuffle,
+            random_seed=args.seed,
         )
         input_requests = dataset.sample(
             num_requests=args.num_prompts,
             tokenizer=tokenizer,
             output_len=args.custom_output_len,
-            skip_chat_template=args.skip_chat_template,
+            enable_multimodal_chat=args.enable_multimodal_chat,
             request_id_prefix=args.request_id_prefix,
             no_oversample=args.no_oversample,
         )
@@ -2290,9 +2293,9 @@ def sample(
         return sampled_requests
 
 
-class CustomMMDataset(CustomDataset):
+class CustomImageDataset(CustomDataset):
     """
-    Implements the Custom MultiModal dataset. Loads data from a JSONL file and generates
+    Implements the Custom image dataset. Loads data from a JSONL file and generates
     sample requests based on conversation turns. E.g.,
     ```
     {
@@ -2371,8 +2374,12 @@ def sample(
 
 class CustomAudioDataset(CustomDataset):
     """
-    Custom dataset for ASR benchmarking. Loads data from a JSONL file. E.g.,
-    {"prompt": "", "audio": "/path/to/audio.wav"}
+    Custom dataset for audio benchmarking. Loads data from a JSONL file. E.g.,
+    {"prompt": "Transcribe the audio.", "audio": "/path/to/audio.wav"}
+
+    Supports both:
+    - Dedicated ASR models (e.g., Whisper) via openai-audio / /v1/audio/transcriptions
+    - Chat-based audio models (e.g., Qwen2-Audio) via openai-chat / /v1/chat/completions
     """
     IS_MULTIMODAL = True
 
@@ -2384,32 +2391,59 @@ def sample(
         request_id_prefix: str = "",
         no_oversample: bool = False,
         skip_chat_template: bool = False,
+        enable_multimodal_chat: bool = False,
         **kwargs,
     ) -> list[SampleRequest]:
         self.num_available_samples = len(self.data)
         if num_requests <= 0:
             num_requests = self.num_available_samples
-
         sampled_requests = []
         for i, item in enumerate(self.data):
             if len(sampled_requests) >= num_requests:
                 break
             prompt = item.get("prompt", "")
-            y, sr = process_audio(item["audio"])
-            mm_content = {"audio": (y, sr)}
-
             if tokenizer is None:
                 prompt_len = 1
-                new_output_len = output_len if output_len is not None and output_len != -1 else 256
+                new_output_len = output_len if output_len not in (None, -1) else 256
+                mm_content = None
             else:
-                if not skip_chat_template:
-                    prompt = tokenizer.apply_chat_template(
-                        [{"role": "user", "content": prompt}],
-                        add_generation_prompt=True,
-                        tokenize=False,
-                    )
-                prompt_len = len(tokenizer(prompt).input_ids)
-
+                use_chat_template = (
+                    not skip_chat_template
+                    and hasattr(tokenizer, "chat_template")
+                    and tokenizer.chat_template is not None
+                )
+                if enable_multimodal_chat:
+                    # Chat-based audio models (e.g., Qwen2-Audio):
+                    # encode audio as base64; serve.py assembles the chat message
+                    # as: {"role": "user", "content": [
+                    #     {"type": "text", "text": prompt},
+                    #     {"type": "input_audio", "input_audio": {...}}
+                    # ]}
+                    y, sr = process_audio(item["audio"])
+                    buf = io.BytesIO()
+                    sf.write(buf, y, sr, format="WAV")
+                    audio_base64 = base64.b64encode(buf.getvalue()).decode("utf-8")
+                    mm_content = {
+                        "type": "input_audio",
+                        "input_audio": {
+                            "data": audio_base64,
+                            "format": "wav",
+                        },
+                    }
+                    # prompt stays as plain string; serve.py handles wrapping
+                else:
+                    # Whisper-style models: load audio array locally
+                    y, sr = process_audio(item["audio"])
+                    mm_content = {"audio": (y, sr)}
+                    if use_chat_template:
+                        # ASR models with a chat template but not multimodal chat
+                        prompt = tokenizer.apply_chat_template(
+                            [{"role": "user", "content": prompt}],
+                            add_generation_prompt=True,
+                            tokenize=False,
+                        )
+                    # else: plain prompt for Whisper-style models
+                prompt_len = len(tokenizer(prompt).input_ids) if isinstance(prompt, str) else 1
                 new_output_len = output_len
                 if output_len is None or output_len == -1:
                     if "output_tokens" not in item:
@@ -2418,7 +2452,6 @@ def sample(
                             "custom dataset must contain an 'output_tokens' field."
                         )
                     new_output_len = int(item["output_tokens"])
-
             sampled_requests.append(
                 SampleRequest(
                     prompt=prompt,

From 775ffd37151df8d6e4c4299afff4ede6669886d7 Mon Sep 17 00:00:00 2001
From: Yasmin Moslem <48152713+ymoslem@users.noreply.github.com>
Date: Sun, 10 May 2026 23:27:55 +0100
Subject: [PATCH 07/15] Add CustomAudioDataset and CustomImageDataset

Match changes in datasets.py

Signed-off-by: Yasmin Moslem <48152713+ymoslem@users.noreply.github.com>
---
 vllm/benchmarks/datasets/__init__.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/vllm/benchmarks/datasets/__init__.py b/vllm/benchmarks/datasets/__init__.py
index 5d5e172e7b46..7d00499ccdf2 100644
--- a/vllm/benchmarks/datasets/__init__.py
+++ b/vllm/benchmarks/datasets/__init__.py
@@ -10,7 +10,8 @@
     BurstGPTDataset,
     ConversationDataset,
     CustomDataset,
-    CustomMMDataset,
+    CustomAudioDataset,
+    CustomImageDataset,
     HuggingFaceDataset,
     InstructCoderDataset,
     MLPerfDataset,
@@ -36,6 +37,7 @@
     is_valid_sequence,
     lora_path_on_disk,
     lora_tokenizer_cache,
+    process_audio,
     process_image,
     process_video,
     zeta_prompt,
@@ -51,7 +53,8 @@
     "BurstGPTDataset",
     "ConversationDataset",
     "CustomDataset",
-    "CustomMMDataset",
+    "CustomAudioDataset",
+    "CustomImageDataset",
     "HuggingFaceDataset",
     "InstructCoderDataset",
     "MLPerfDataset",
@@ -77,6 +80,7 @@
     "is_valid_sequence",
     "lora_path_on_disk",
     "lora_tokenizer_cache",
+    "process_audio",
     "process_image",
     "process_video",
     "RangeRatio",

From 23ad51512bdb9a9f6b1124eebaa1b571633b1f5c Mon Sep 17 00:00:00 2001
From: Yasmin Moslem <48152713+ymoslem@users.noreply.github.com>
Date: Mon, 11 May 2026 00:02:49 +0100
Subject: [PATCH 08/15] pre-commit check

Signed-off-by: Yasmin Moslem <48152713+ymoslem@users.noreply.github.com>
---
 vllm/benchmarks/datasets/__init__.py | 2 +-
 vllm/benchmarks/datasets/datasets.py | 6 ++++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/vllm/benchmarks/datasets/__init__.py b/vllm/benchmarks/datasets/__init__.py
index 7d00499ccdf2..b989958edcf6 100644
--- a/vllm/benchmarks/datasets/__init__.py
+++ b/vllm/benchmarks/datasets/__init__.py
@@ -9,8 +9,8 @@
     BlazeditDataset,
     BurstGPTDataset,
     ConversationDataset,
-    CustomDataset,
     CustomAudioDataset,
+    CustomDataset,
     CustomImageDataset,
     HuggingFaceDataset,
     InstructCoderDataset,
diff --git a/vllm/benchmarks/datasets/datasets.py b/vllm/benchmarks/datasets/datasets.py
index 02826d965456..b43fbf6c4764 100644
--- a/vllm/benchmarks/datasets/datasets.py
+++ b/vllm/benchmarks/datasets/datasets.py
@@ -31,7 +31,6 @@
 
 import numpy as np
 import pybase64 as base64
-
 from huggingface_hub import snapshot_download
 from PIL import Image
 from typing_extensions import deprecated
@@ -2381,6 +2380,7 @@ class CustomAudioDataset(CustomDataset):
     - Dedicated ASR models (e.g., Whisper) via openai-audio / /v1/audio/transcriptions
     - Chat-based audio models (e.g., Qwen2-Audio) via openai-chat / /v1/chat/completions
     """
+
     IS_MULTIMODAL = True
 
     def sample(
@@ -2443,7 +2443,9 @@ def sample(
                             tokenize=False,
                         )
                     # else: plain prompt for Whisper-style models
-                prompt_len = len(tokenizer(prompt).input_ids) if isinstance(prompt, str) else 1
+                prompt_len = (
+                    len(tokenizer(prompt).input_ids) if isinstance(prompt, str) else 1
+                )
                 new_output_len = output_len
                 if output_len is None or output_len == -1:
                     if "output_tokens" not in item:

From fa3c497757d23c7d65f76c7d39f8856580554e54 Mon Sep 17 00:00:00 2001
From: Yasmin Moslem <48152713+ymoslem@users.noreply.github.com>
Date: Mon, 11 May 2026 00:16:47 +0100
Subject: [PATCH 09/15] Deprecate 'custom_mm' dataset name with warning

Added a deprecation warning for 'custom_mm' dataset. Use '--dataset-name custom_image' instead

Signed-off-by: Yasmin Moslem <48152713+ymoslem@users.noreply.github.com>
---
 vllm/benchmarks/datasets/datasets.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/vllm/benchmarks/datasets/datasets.py b/vllm/benchmarks/datasets/datasets.py
index b43fbf6c4764..48485430dad7 100644
--- a/vllm/benchmarks/datasets/datasets.py
+++ b/vllm/benchmarks/datasets/datasets.py
@@ -1845,6 +1845,11 @@ def get_samples(args, tokenizer: TokenizerLike) -> list[SampleRequest]:
         )
 
     elif args.dataset_name in ("custom_image", "custom_mm"):
+        if args.dataset_name == "custom_mm":
+            logger.warning(
+                "Dataset name 'custom_mm' is deprecated and will be removed "
+                "in 3 minor versions. Use '--dataset-name custom_image' instead."
+            )
         dataset = CustomImageDataset(
             dataset_path=args.dataset_path,
             disable_shuffle=args.disable_shuffle,

From 08e09bcbfef08b45f4e10f36d3cc82ed479648d1 Mon Sep 17 00:00:00 2001
From: Yasmin Moslem <48152713+ymoslem@users.noreply.github.com>
Date: Mon, 11 May 2026 01:07:24 +0100
Subject: [PATCH 10/15] Update deprecation warning for custom_mm dataset

Signed-off-by: Yasmin Moslem <48152713+ymoslem@users.noreply.github.com>
---
 vllm/benchmarks/datasets/datasets.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/benchmarks/datasets/datasets.py b/vllm/benchmarks/datasets/datasets.py
index 48485430dad7..be1b69277f65 100644
--- a/vllm/benchmarks/datasets/datasets.py
+++ b/vllm/benchmarks/datasets/datasets.py
@@ -1847,8 +1847,8 @@ def get_samples(args, tokenizer: TokenizerLike) -> list[SampleRequest]:
     elif args.dataset_name in ("custom_image", "custom_mm"):
         if args.dataset_name == "custom_mm":
             logger.warning(
-                "Dataset name 'custom_mm' is deprecated and will be removed "
-                "in 3 minor versions. Use '--dataset-name custom_image' instead."
+                "Dataset name 'custom_mm' is deprecated and will be removed in v0.24. "
+                "Use '--dataset-name custom_image' instead."
             )
         dataset = CustomImageDataset(
             dataset_path=args.dataset_path,

From d9323132d8c0d34f6c11c547f7c52c0d2539253f Mon Sep 17 00:00:00 2001
From: Yasmin Moslem <48152713+ymoslem@users.noreply.github.com>
Date: Mon, 11 May 2026 18:34:58 +0100
Subject: [PATCH 11/15] Rename Custom MM to Custom Image in CLI docs

Updated the documentation to reflect changes in dataset naming and usage for image datasets.

Signed-off-by: Yasmin Moslem <48152713+ymoslem@users.noreply.github.com>
---
 docs/benchmarking/cli.md | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/docs/benchmarking/cli.md b/docs/benchmarking/cli.md
index 4f0c933c4ac8..59bbc139dbed 100644
--- a/docs/benchmarking/cli.md
+++ b/docs/benchmarking/cli.md
@@ -39,7 +39,8 @@ th {
 | Spec Bench | ✅ | ✅ | `wget https://raw.githubusercontent.com/hemingkx/Spec-Bench/refs/heads/main/data/spec_bench/question.jsonl` |
 | SPEED-Bench | ✅ | ✅ | `curl -LsSf https://raw.githubusercontent.com/NVIDIA-NeMo/Skills/refs/heads/main/nemo_skills/dataset/speed-bench/prepare.py \| python3 -` |
 | Custom | ✅ | ✅ | Local file: `data.jsonl` |
-| Custom MM | ✅ | ✅ | Local file: `mm_data.jsonl` |
+| Custom Audio | ✅ | ✅ | Local file: `audio_data.jsonl` |
+| Custom Image | ✅ | ✅ | Local file: `image_data.jsonl` |
 
 Legend:
 
@@ -173,9 +174,9 @@ vllm bench serve --port 9001 --save-result --save-detailed \
 
 You can skip applying chat template if your data already has it by using `--custom-skip-chat-template`.
 
-#### Custom multimodal dataset
+#### Custom image dataset
 
-If the multimodal dataset you want to benchmark is not supported yet in vLLM, then you can benchmark on it using `CustomMMDataset`. Your data needs to be in `.jsonl` format and needs to have "prompt" and "image_files" field per entry, e.g., `mm_data.jsonl`:
+If the image dataset you want to benchmark is not supported yet in vLLM, then you can benchmark on it using `CustomImageDataset`. Your data needs to be in `.jsonl` format and needs to have "prompt" and "image_files" field per entry, e.g., `image_data.jsonl`:
 
 ```json
 {"prompt": "How many animals are present in the given image?", "image_files": ["/path/to/image/folder/horsepony.jpg"]}
@@ -193,8 +194,8 @@ vllm bench serve--save-result --save-detailed \
   --backend openai-chat \
   --model Qwen/Qwen2-VL-7B-Instruct \
   --endpoint /v1/chat/completions \
-  --dataset-name custom_mm \
-  --dataset-path <path-to-your-mm-data-jsonl> \
+  --dataset-name custom_image \
+  --dataset-path <path-to-your-image-data-jsonl> \
   --allowed-local-media-path /path/to/image/folder
 ```
 

From f7fcabec1ce1ee7dc37c21ae51935026d140817f Mon Sep 17 00:00:00 2001
From: Yasmin Moslem <48152713+ymoslem@users.noreply.github.com>
Date: Mon, 11 May 2026 23:05:23 +0100
Subject: [PATCH 12/15] Update CLI documentation for CustomAudioDataset

Added instructions for benchmarking with CustomAudioDataset, including examples for Whisper and Qwen2-Audio models.

Signed-off-by: Yasmin Moslem <48152713+ymoslem@users.noreply.github.com>
---
 docs/benchmarking/cli.md | 61 ++++++++++++++++++++++++++++++++++++++--
 1 file changed, 58 insertions(+), 3 deletions(-)

diff --git a/docs/benchmarking/cli.md b/docs/benchmarking/cli.md
index 59bbc139dbed..1253dd1aada5 100644
--- a/docs/benchmarking/cli.md
+++ b/docs/benchmarking/cli.md
@@ -143,7 +143,7 @@ Example output: ![Dataset Statistics](../assets/contributing/vllm_bench_serve_da
 
 #### Custom Dataset
 
-If the dataset you want to benchmark is not supported yet in vLLM, even then you can benchmark on it using `CustomDataset`. Your data needs to be in `.jsonl` format and needs to have "prompt" field per entry, e.g., data.jsonl
+If the dataset you want to benchmark is not yet supported in vLLM, you can still benchmark on it using `CustomDataset`. At inference time, use the option `--dataset-name custom`. Your data needs to be in the `.jsonl` format and needs to have a "prompt" field per entry, e.g., `data.jsonl`:
 
 ```json
 {"prompt": "What is the capital of India?"}
@@ -174,9 +174,64 @@ vllm bench serve --port 9001 --save-result --save-detailed \
 
 You can skip applying chat template if your data already has it by using `--custom-skip-chat-template`.
 
-#### Custom image dataset
 
-If the image dataset you want to benchmark is not supported yet in vLLM, then you can benchmark on it using `CustomImageDataset`. Your data needs to be in `.jsonl` format and needs to have "prompt" and "image_files" field per entry, e.g., `image_data.jsonl`:
+#### Custom Audio Dataset
+
+If the audio dataset you want to benchmark is not supported yet in vLLM, then you can benchmark on it using `CustomAudioDataset`. At inference time, use the option `--dataset-name custom_audio`. Your data needs to be in the `.jsonl` format and needs to have "prompt" and "audio" fields per entry, e.g., `audio_data.jsonl`:
+
+```json
+{"prompt": "What does this audio say?", "audio": "/path/to/audio_1.wav"}
+{"prompt": "Transcribe the audio.", "audio": "/path/to/audio_2.wav"}
+```
+
+* **Supported models:** The `CustomAudioDataset` class supports two types of audio models: ASR models (e.g. Whisper) which do not require a "prompt" field; and multimodal audio-text chat models (e.g. Qwen2-Audio). Since these model types require different arguments at inference, we are giving two examples.
+
+* **Example 1: Whisper**
+
+Whisper is a dedicated ASR encoder-decoder model, so it uses `--backend openai-audio` and `--endpoint /v1/audio/transcriptions`.
+
+```bash
+# start server
+vllm serve openai/whisper-tiny
+```
+
+```bash
+vllm bench serve \
+  --model openai/whisper-tiny \
+  --backend openai-audio \
+  --endpoint /v1/audio/transcriptions \
+  --dataset-name custom_audio \
+  --dataset-path audio_data.jsonl \
+  --no-oversample \
+  --custom-output-len 256 \
+  --save-result \
+  --save-detailed \
+  --result-filename whisper_bench.json
+```
+
+* **Example 2: Qwen2-Audio**
+
+Qwen2-Audio is a multimodal chat model that can do ASR and speech analysis, so it uses `--backend openai-chat`, and `--endpoint /v1/chat/completions`. It also requires `--enable-multimodal-chat` to enable multimodal chat transformation.
+
+```bash
+vllm bench serve \
+  --model Qwen/Qwen2-Audio-7B-Instruct \
+  --backend openai-chat \
+  --endpoint /v1/chat/completions \
+  --dataset-name custom_audio \
+  --dataset-path audio_data.jsonl \
+  --no-oversample \
+  --custom-output-len 256 \
+  --enable-multimodal-chat \
+  --save-result \
+  --save-detailed \
+  --result-filename qwen_bench.json
+```
+
+
+#### Custom Image Dataset
+
+If the image dataset you want to benchmark is not supported yet in vLLM, then you can benchmark on it using `CustomImageDataset`. At inference time, use the option `--dataset-name custom_image`. Your data needs to be in the `.jsonl` format and needs to have "prompt" and "image_files" fields per entry, e.g., `image_data.jsonl`:
 
 ```json
 {"prompt": "How many animals are present in the given image?", "image_files": ["/path/to/image/folder/horsepony.jpg"]}

From 2804410e71edc6f6d474affef7beb1080b21f54e Mon Sep 17 00:00:00 2001
From: Yasmin Moslem <48152713+ymoslem@users.noreply.github.com>
Date: Mon, 11 May 2026 23:19:28 +0100
Subject: [PATCH 13/15] Fix formatting of model support descriptions

Signed-off-by: Yasmin Moslem <48152713+ymoslem@users.noreply.github.com>
---
 vllm/benchmarks/datasets/datasets.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/benchmarks/datasets/datasets.py b/vllm/benchmarks/datasets/datasets.py
index be1b69277f65..be581caa43a8 100644
--- a/vllm/benchmarks/datasets/datasets.py
+++ b/vllm/benchmarks/datasets/datasets.py
@@ -2382,8 +2382,8 @@ class CustomAudioDataset(CustomDataset):
     {"prompt": "Transcribe the audio.", "audio": "/path/to/audio.wav"}
 
     Supports both:
-    - Dedicated ASR models (e.g., Whisper) via openai-audio / /v1/audio/transcriptions
-    - Chat-based audio models (e.g., Qwen2-Audio) via openai-chat / /v1/chat/completions
+    - Dedicated ASR models (e.g., Whisper) via 'openai-audio' and '/v1/audio/transcriptions'
+    - Chat-based audio models (e.g., Qwen2-Audio) via 'openai-chat' and '/v1/chat/completions'
     """
 
     IS_MULTIMODAL = True

From 1b168baacb8253a0317e2a26c2a07a9862bdcd84 Mon Sep 17 00:00:00 2001
From: Yasmin Moslem <48152713+ymoslem@users.noreply.github.com>
Date: Mon, 11 May 2026 23:30:58 +0100
Subject: [PATCH 14/15] Update cli.md

Signed-off-by: Yasmin Moslem <48152713+ymoslem@users.noreply.github.com>
---
 docs/benchmarking/cli.md | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/docs/benchmarking/cli.md b/docs/benchmarking/cli.md
index 1253dd1aada5..1520e92f3dd3 100644
--- a/docs/benchmarking/cli.md
+++ b/docs/benchmarking/cli.md
@@ -174,7 +174,6 @@ vllm bench serve --port 9001 --save-result --save-detailed \
 
 You can skip applying chat template if your data already has it by using `--custom-skip-chat-template`.
 
-
 #### Custom Audio Dataset
 
 If the audio dataset you want to benchmark is not supported yet in vLLM, then you can benchmark on it using `CustomAudioDataset`. At inference time, use the option `--dataset-name custom_audio`. Your data needs to be in the `.jsonl` format and needs to have "prompt" and "audio" fields per entry, e.g., `audio_data.jsonl`:
@@ -184,9 +183,9 @@ If the audio dataset you want to benchmark is not supported yet in vLLM, then yo
 {"prompt": "Transcribe the audio.", "audio": "/path/to/audio_2.wav"}
 ```
 
-* **Supported models:** The `CustomAudioDataset` class supports two types of audio models: ASR models (e.g. Whisper) which do not require a "prompt" field; and multimodal audio-text chat models (e.g. Qwen2-Audio). Since these model types require different arguments at inference, we are giving two examples.
+- **Supported models:** The `CustomAudioDataset` class supports two types of audio models: ASR models (e.g. Whisper), which do not require a "prompt" field, and multimodal audio-text chat models (e.g. Qwen2-Audio). Since these model types require different arguments at inference time, two examples are given below.
 
-* **Example 1: Whisper**
+- **Example 1: Whisper**
 
 Whisper is a dedicated ASR encoder-decoder model, so it uses `--backend openai-audio` and `--endpoint /v1/audio/transcriptions`.
 
@@ -209,7 +208,7 @@ vllm bench serve \
   --result-filename whisper_bench.json
 ```
 
-* **Example 2: Qwen2-Audio**
+- **Example 2: Qwen2-Audio**
 
 Qwen2-Audio is a multimodal chat model that can do ASR and speech analysis, so it uses `--backend openai-chat`, and `--endpoint /v1/chat/completions`. It also requires `--enable-multimodal-chat` to enable multimodal chat transformation.
 
@@ -228,7 +227,6 @@ vllm bench serve \
   --result-filename qwen_bench.json
 ```
 
-
 #### Custom Image Dataset
 
 If the image dataset you want to benchmark is not supported yet in vLLM, then you can benchmark on it using `CustomImageDataset`. At inference time, use the option `--dataset-name custom_image`. Your data needs to be in the `.jsonl` format and needs to have "prompt" and "image_files" fields per entry, e.g., `image_data.jsonl`:

From a9bc9d4743aa9a2946aa9cd0711816b9f08a861f Mon Sep 17 00:00:00 2001
From: Yasmin Moslem <48152713+ymoslem@users.noreply.github.com>
Date: Mon, 11 May 2026 23:41:24 +0100
Subject: [PATCH 15/15] Update datasets.py

Signed-off-by: Yasmin Moslem <48152713+ymoslem@users.noreply.github.com>
---
 vllm/benchmarks/datasets/datasets.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/benchmarks/datasets/datasets.py b/vllm/benchmarks/datasets/datasets.py
index be581caa43a8..e71b4a313586 100644
--- a/vllm/benchmarks/datasets/datasets.py
+++ b/vllm/benchmarks/datasets/datasets.py
@@ -2382,8 +2382,8 @@ class CustomAudioDataset(CustomDataset):
     {"prompt": "Transcribe the audio.", "audio": "/path/to/audio.wav"}
 
     Supports both:
-    - Dedicated ASR models (e.g., Whisper) via 'openai-audio' and '/v1/audio/transcriptions'
-    - Chat-based audio models (e.g., Qwen2-Audio) via 'openai-chat' and '/v1/chat/completions'
+    - Dedicated ASR models (e.g. Whisper) via openai-audio & /v1/audio/transcriptions
+    - Chat-based audio models (e.g. Qwen2-Audio) via openai-chat & /v1/chat/completions
     """
 
     IS_MULTIMODAL = True