Skip to content
Merged
Changes from 1 commit
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
f37e2c4
Implement custom dataset class for benchmarking
ymoslem May 3, 2026
6129687
Apply suggestion from @gemini-code-assist[bot]
ymoslem May 3, 2026
3b2803b
Apply suggestion from @gemini-code-assist[bot]
ymoslem May 3, 2026
47de46c
Apply suggestion from @gemini-code-assist[bot]
ymoslem May 3, 2026
23c2252
Refine soundfile import and the audio sampling function
ymoslem May 4, 2026
2f60e3a
Merge branch 'main' into custom-audio-dataset
ymoslem May 4, 2026
362bca1
Merge branch 'main' into custom-audio-dataset
ymoslem May 4, 2026
2bb7766
Merge branch 'main' into custom-audio-dataset
ymoslem May 5, 2026
2d8c055
Merge branch 'main' into custom-audio-dataset
ymoslem May 7, 2026
cd3191a
Merge branch 'main' into custom-audio-dataset
ymoslem May 10, 2026
791daf3
Support Audio models
ymoslem May 10, 2026
775ffd3
Add CustomAudioDataset and CustomImageDataset
ymoslem May 10, 2026
23ad515
pre-commit check
ymoslem May 10, 2026
fa3c497
Deprecate 'custom_mm' dataset name with warning
ymoslem May 10, 2026
08e09bc
Update deprecation warning for custom_mm dataset
ymoslem May 11, 2026
395aaa5
Merge branch 'main' into custom-audio-dataset
ymoslem May 11, 2026
982be6f
Merge branch 'main' into custom-audio-dataset
ymoslem May 11, 2026
d932313
Rename Custom MM to Custom Image in CLI docs
ymoslem May 11, 2026
f7fcabe
Update CLI documentation for CustomAudioDataset
ymoslem May 11, 2026
69219ae
Merge branch 'main' into custom-audio-dataset
ymoslem May 11, 2026
2804410
Fix formatting of model support descriptions
ymoslem May 11, 2026
1b168ba
Update cli.md
ymoslem May 11, 2026
a9bc9d4
Update datasets.py
ymoslem May 11, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 76 additions & 0 deletions vllm/benchmarks/datasets/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@

import numpy as np
import pybase64 as base64
import soundfile as sf
Comment thread
ymoslem marked this conversation as resolved.
Outdated
Comment thread
ymoslem marked this conversation as resolved.
Outdated
from huggingface_hub import snapshot_download
from PIL import Image
from typing_extensions import deprecated
Expand Down Expand Up @@ -441,6 +442,27 @@ def process_video(video: Any) -> Mapping[str, Any]:
)


def process_audio(audio: Any) -> tuple:
    """
    Normalize one audio input into an (array, sample_rate) pair.

    Accepted inputs:
    1. str: treated as a file path and read with ``sf.read``; whatever
       that call returns is handed back as-is.
    2. dict containing both 'array' and 'sampling_rate' keys
       (HuggingFace audio format): the two values are returned as a tuple.
    3. tuple of length 2: returned unchanged.

    Raises:
        ValueError: if ``audio`` matches none of the forms above.
    """
    # File path: delegate decoding to soundfile.
    if isinstance(audio, str):
        return sf.read(audio)

    # HuggingFace-style mapping: both keys must be present.
    if isinstance(audio, dict):
        has_required_keys = all(
            key in audio for key in ("array", "sampling_rate")
        )
        if has_required_keys:
            return audio["array"], audio["sampling_rate"]

    # Already an (array, sr) pair: nothing to do.
    if isinstance(audio, tuple):
        if len(audio) == 2:
            return audio

    raise ValueError(
        f"Invalid audio input {audio}. Must be a file path string, "
        "a dict with 'array' and 'sampling_rate', or a (array, sr) tuple."
    )


def gen_prompt_decode_to_target_len(
tokenizer: TokenizerLike,
token_sequence: list[int],
Expand Down Expand Up @@ -1400,6 +1422,7 @@ def add_dataset_parser(parser: FlexibleArgumentParser):
"hf",
"custom",
"custom_mm",
"custom_audio",
Comment thread
ymoslem marked this conversation as resolved.
"prefix_repetition",
"spec_bench",
"speed_bench",
Expand Down Expand Up @@ -1827,6 +1850,18 @@ def get_samples(args, tokenizer: TokenizerLike) -> list[SampleRequest]:
no_oversample=args.no_oversample,
)

elif args.dataset_name == "custom_audio":
dataset = CustomAudioDataset(
dataset_path=args.dataset_path, disable_shuffle=args.disable_shuffle
)
input_requests = dataset.sample(
num_requests=args.num_prompts,
tokenizer=tokenizer,
output_len=args.custom_output_len,
request_id_prefix=args.request_id_prefix,
no_oversample=args.no_oversample,
)
Comment thread
ymoslem marked this conversation as resolved.

elif args.dataset_name == "sonnet":
dataset = SonnetDataset(
dataset_path=args.dataset_path, disable_shuffle=args.disable_shuffle
Expand Down Expand Up @@ -2324,6 +2359,47 @@ def sample(
return sampled_requests


class CustomAudioDataset(CustomDataset):
    """
    Custom dataset for ASR benchmarking, backed by a JSONL file.

    Each record must provide an "audio" entry in a form accepted by
    ``process_audio`` and may provide a "prompt" string, e.g.
    {"prompt": "", "audio": "/path/to/audio.wav"}
    """

    # Class-level flag marking this dataset as multimodal.
    IS_MULTIMODAL = True

    def sample(
        self,
        tokenizer: TokenizerLike,
        num_requests: int,
        output_len: int | None = None,
        request_id_prefix: str = "",
        no_oversample: bool = False,
        **kwargs,
    ) -> list[SampleRequest]:
        """
        Build up to ``num_requests`` audio requests from ``self.data``.

        Args:
            tokenizer: Callable whose result exposes ``input_ids``; used
                to measure the prompt length.
            num_requests: Maximum number of records to convert.
            output_len: Expected output length for every request; 256 is
                used when this is None.
            request_id_prefix: Prepended to the record index to form each
                request id.
            no_oversample: Forwarded to ``maybe_oversample_requests``.
            **kwargs: Accepted but not used here.

        Returns:
            The list of built requests, after it has been passed to
            ``maybe_oversample_requests``.
        """
        self.num_available_samples = len(self.data)

        # The expected output length is the same for every request.
        target_output_len = 256 if output_len is None else output_len

        requests = []
        for index, record in enumerate(self.data):
            # Stop once enough requests have been collected.
            if len(requests) >= num_requests:
                break

            text = record.get("prompt", "")
            text_len = len(tokenizer(text).input_ids)

            waveform, sample_rate = process_audio(record["audio"])

            request = SampleRequest(
                prompt=text,
                prompt_len=text_len,
                expected_output_len=target_output_len,
                multi_modal_data={"audio": (waveform, sample_rate)},
                request_id=request_id_prefix + str(index),
            )
            requests.append(request)

        self.maybe_oversample_requests(
            requests, num_requests, request_id_prefix, no_oversample
        )
        return requests
Comment thread
ymoslem marked this conversation as resolved.


# -----------------------------------------------------------------------------
# Spec Bench Dataset Implementation
# -----------------------------------------------------------------------------
Expand Down
Loading