From 561bb2b56cde4b9ebb57d335b13b20c53f4f2250 Mon Sep 17 00:00:00 2001 From: jdebache Date: Wed, 28 Jan 2026 23:52:02 -0800 Subject: [PATCH 01/10] added txt slices dataset and made some small typing fixes Signed-off-by: jdebache --- tests/benchmarks/test_txt_slices_dataset.py | 54 ++++ vllm/benchmarks/datasets.py | 267 +++++++++++++++----- 2 files changed, 258 insertions(+), 63 deletions(-) create mode 100644 tests/benchmarks/test_txt_slices_dataset.py diff --git a/tests/benchmarks/test_txt_slices_dataset.py b/tests/benchmarks/test_txt_slices_dataset.py new file mode 100644 index 000000000000..8c676ce158ae --- /dev/null +++ b/tests/benchmarks/test_txt_slices_dataset.py @@ -0,0 +1,54 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import os +import tempfile + +import pytest +from transformers import AutoTokenizer, PreTrainedTokenizerBase + +from vllm.benchmarks.datasets import TxtSlicesDataset + + +@pytest.fixture(scope="session") +def hf_tokenizer() -> PreTrainedTokenizerBase: + # Use a small, commonly available tokenizer + return AutoTokenizer.from_pretrained("gpt2") + + +text_content = """ +Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor +incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud +exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. +Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat +nulla pariatur. Excepteur sint occaecat cupidatat non proident, +sunt in culpa qui officia deserunt mollit anim id est laborum. 
+""" + + +@pytest.mark.benchmark +def test_txt_slices(hf_tokenizer: PreTrainedTokenizerBase) -> None: + # Write the text content to a temporary file + # Use delete=False for Python 3.10 compatibility (delete_on_close is 3.12+) + with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".txt") as f: + f.write(text_content) + f.close() + temp_file_path = f.name + + try: + dataset = TxtSlicesDataset(dataset_path=temp_file_path) + + samples = dataset.sample( + hf_tokenizer, num_requests=10, input_len=10, output_len=10 + ) + + assert len(samples) == 10 + assert all(sample.prompt_len == 10 for sample in samples) + assert all(sample.expected_output_len == 10 for sample in samples) + + for sample in samples: + tokenized_prompt = hf_tokenizer( + sample.prompt, add_special_tokens=True + ).input_ids + assert len(tokenized_prompt) == 10 + finally: + os.unlink(f.name) diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py index dd71762b5ba7..7454fea0c252 100644 --- a/vllm/benchmarks/datasets.py +++ b/vllm/benchmarks/datasets.py @@ -10,6 +10,7 @@ - BurstGPT - HuggingFace - VisionArena + - TxtSlices """ import argparse @@ -19,11 +20,11 @@ import logging import math import random +import urllib from abc import ABC, abstractmethod from collections.abc import Callable, Iterator, Mapping from contextlib import suppress -from copy import deepcopy -from dataclasses import dataclass +from dataclasses import dataclass, replace from functools import cache from io import BytesIO from tempfile import NamedTemporaryFile @@ -65,15 +66,15 @@ # ----------------------------------------------------------------------------- -@dataclass +@dataclass(frozen=True) class SampleRequest: """ Represents a single inference request for benchmarking. 
""" - prompt: str | list[str] + prompt: str | list[str] | list[dict] prompt_len: int - expected_output_len: int + expected_output_len: int | None multi_modal_data: MultiModalDataDict | dict | list[dict] | None = None lora_request: LoRARequest | None = None request_id: str | None = None @@ -110,7 +111,7 @@ def __init__( # default seed. self.random_seed = random_seed if random_seed is not None else self.DEFAULT_SEED self.disable_shuffle = disable_shuffle - self.data = None + self.data: Any def apply_multimodal_chat_transformation( self, @@ -249,6 +250,7 @@ def sample( num_requests: int, request_id_prefix: str = "", no_oversample: bool = False, + **kwargs, ) -> list[SampleRequest]: """ Abstract method to generate sample requests from the dataset. @@ -296,8 +298,10 @@ def maybe_oversample_requests( needed = num_requests - len(requests) additional = [] for i in range(needed): - req = deepcopy(random.choice(requests)) - req.request_id = request_id_prefix + str(len(requests) + i) + req = replace( + random.choice(requests), + request_id=request_id_prefix + str(len(requests) + i), + ) additional.append(req) requests.extend(additional) logger.info("Oversampled requests to reach %d total samples.", num_requests) @@ -776,8 +780,11 @@ def sample( tokenizer: TokenizerLike, num_requests: int, request_id_prefix: str = "", + no_oversample: bool = False, + prefix_len: int = RandomDataset.DEFAULT_PREFIX_LEN, range_ratio: float = RandomDataset.DEFAULT_RANGE_RATIO, input_len: int = RandomDataset.DEFAULT_INPUT_LEN, + output_len: int = RandomDataset.DEFAULT_OUTPUT_LEN, batchsize: int = 1, is_reranker: bool = True, **kwargs, @@ -868,6 +875,95 @@ def sample( return batch_requests +# ----------------------------------------------------------------------------- +# TxtSlicesDataset Implementation +# ----------------------------------------------------------------------------- + + +class TxtSlicesDataset(BenchmarkDataset): + """ + Implements the TxtSlices dataset. 
Takes a URL or file path to a text file, + tokenizes the entire content, and generates sample requests by randomly + slicing from the tokenized sequence with cycling support. + """ + + def __init__( + self, + **kwargs, + ) -> None: + super().__init__(**kwargs) + dataset_path = kwargs.get("dataset_path") + if dataset_path is None: + raise ValueError( + "dataset_path must be provided to create a TxtSlicesDataset." + ) + self.text = self.load_data(dataset_path) + if len(self.text) == 0: + raise ValueError("The text file is empty and cannot be sampled from.") + + self.rng = random.Random(self.random_seed) + + @staticmethod + def load_data(dataset_path: str) -> str: + if dataset_path.startswith(("http://", "https://")): + with urllib.request.urlopen(dataset_path) as response: + return response.read().decode("utf-8") + else: + with open(dataset_path, encoding="utf-8") as f: + return f.read() + + def get_token_ids(self, tokenizer: TokenizerLike) -> tuple[int, ...]: + tokenized = tokenizer(self.text, add_special_tokens=False) + token_ids = tokenized.input_ids + if len(token_ids) == 0: + raise ValueError("The text is empty and cannot be sampled from.") + return token_ids + + def generate_prompt( + self, + tokenizer: TokenizerLike, + token_ids: tuple[int, ...], + input_len: int, + ) -> str: + num_available_tokens = len(token_ids) + + # Randomly select a start position + start_pos = self.rng.randint(0, num_available_tokens - 1) + + # Extract tokens with cycling if necessary + prompt_token_ids = tuple( + token_ids[(start_pos + j) % num_available_tokens] for j in range(input_len) + ) + + # Decode the tokens to get the prompt + return tokenizer.decode(prompt_token_ids, skip_special_tokens=False) + + def sample( + self, + tokenizer: TokenizerLike, + num_requests: int, + request_id_prefix: str = "", + no_oversample: bool = False, + input_len: int = 1024, + output_len: int = 128, + **kwargs, + ) -> list[SampleRequest]: + # Tokenize the entire text content + token_ids = 
self.get_token_ids(tokenizer) + num_special_tokens = int(tokenizer.num_special_tokens_to_add()) + non_special_length = input_len - num_special_tokens + + return [ + SampleRequest( + prompt=self.generate_prompt(tokenizer, token_ids, non_special_length), + prompt_len=input_len, + expected_output_len=output_len, + request_id=request_id_prefix + str(i), + ) + for i in range(num_requests) + ] + + # ----------------------------------------------------------------------------- # MultiModalDataset Implementation # ----------------------------------------------------------------------------- @@ -1178,6 +1274,7 @@ def sample( range_ratio: float = RandomDataset.DEFAULT_RANGE_RATIO, input_len: int = RandomDataset.DEFAULT_INPUT_LEN, output_len: int = RandomDataset.DEFAULT_OUTPUT_LEN, + batchsize: int = 1, limit_mm_per_prompt: dict[str, int] = DEFAULT_LIMIT_MM_PER_PROMPT, base_items_per_request: int = DEFAULT_BASE_ITEMS_PER_REQUEST, num_mm_items_range_ratio: float = DEFAULT_NUM_MM_ITEMS_RANGE_RATIO, @@ -1187,6 +1284,10 @@ def sample( enable_multimodal_chat: bool = DEFAULT_ENABLE_MULTIMODAL_CHAT, **kwargs, ) -> list[SampleRequest]: + if batchsize != 1: + raise NotImplementedError( + "batchsize > 1 is not supported for RandomMultiModalDataset." 
+ ) # Get the sampling parameters for the dataset input_lens, output_lens, offsets = self.get_sampling_params( num_requests, range_ratio, input_len, output_len, tokenizer @@ -1326,16 +1427,16 @@ def sample( self, tokenizer: TokenizerLike, num_requests: int, + request_id_prefix: str = "", + no_oversample: bool = False, lora_path: str | None = None, max_loras: int | None = None, output_len: int | None = None, enable_multimodal_chat: bool = False, - request_id_prefix: str = "", - no_oversample: bool = False, lora_assignment: str = "random", **kwargs, - ) -> list: - samples: list = [] + ) -> list[SampleRequest]: + samples: list[SampleRequest] = [] ind = 0 for entry in self.data: if len(samples) >= num_requests: @@ -1436,6 +1537,7 @@ def add_dataset_parser(parser: FlexibleArgumentParser): "custom_mm", "prefix_repetition", "spec_bench", + "txt-slices", ], help="Name of the dataset to benchmark on.", ) @@ -1449,8 +1551,8 @@ def add_dataset_parser(parser: FlexibleArgumentParser): type=str, default=None, action=_ValidateDatasetArgs, - help="Path to the sharegpt/sonnet dataset. " - "Or the huggingface dataset ID if using HF dataset.", + help="Path to the sharegpt/sonnet dataset, the HF dataset ID if using HF " + "dataset, or the path/URL to a txt file for the txt-slices dataset.", ) parser.add_argument( "--no-oversample", @@ -1630,6 +1732,7 @@ def add_random_dataset_base_args( - random (random dataset) - random-mm (random multimodal dataset) - random-rerank (random dataset for reranking) + - txt-slices (txt-slices dataset) Args: parser_or_group: Either a parser or an argument group to add arguments to. 
@@ -2073,6 +2176,18 @@ def get_samples(args, tokenizer: TokenizerLike) -> list[SampleRequest]: request_id_prefix=args.request_id_prefix, no_oversample=args.no_oversample, ), + "txt-slices": lambda: TxtSlicesDataset( + random_seed=args.seed, + dataset_path=args.dataset_path, + disable_shuffle=args.disable_shuffle, + ).sample( + tokenizer=tokenizer, + num_requests=args.num_prompts, + input_len=args.random_input_len, + output_len=args.random_output_len, + request_id_prefix=args.request_id_prefix, + no_oversample=args.no_oversample, + ), } try: @@ -2120,7 +2235,7 @@ def load_data(self) -> None: # This will be the standardized format which load_data() # has to convert into depending on the filetype of dataset_path. # sample() will assume this standardized format of self.data - self.data = [] + self.data: list[dict] = [] # Load the JSONL file if self.dataset_path.endswith(".jsonl"): @@ -2149,15 +2264,15 @@ def sample( self, tokenizer: TokenizerLike, num_requests: int, + request_id_prefix: str = "", + no_oversample: bool = False, lora_path: str | None = None, max_loras: int | None = None, output_len: int | None = None, enable_multimodal_chat: bool = False, skip_chat_template: bool = False, - request_id_prefix: str = "", - no_oversample: bool = False, **kwargs, - ) -> list: + ) -> list[SampleRequest]: # load all data if needed self.num_available_samples = len(self.data) if num_requests <= 0: @@ -2168,7 +2283,7 @@ def sample( num_requests, ) - sampled_requests = [] + sampled_requests: list[SampleRequest] = [] for i, item in enumerate(self.data): if len(sampled_requests) >= num_requests: break @@ -2340,9 +2455,32 @@ def load_data(self) -> None: if not getattr(self, "disable_shuffle", False): random.shuffle(self.data) - def sample(self, **kwargs) -> list: + def sample( + self, + tokenizer: TokenizerLike, + num_requests: int, + request_id_prefix: str = "", + no_oversample: bool = False, + lora_path: str | None = None, + max_loras: int | None = None, + output_len: int | None = 
None, + enable_multimodal_chat: bool = False, + skip_chat_template: bool = False, + **kwargs, + ) -> list[SampleRequest]: # leverage CustomDataset sample - return super().sample(**kwargs) + return super().sample( + tokenizer, + num_requests, + request_id_prefix=request_id_prefix, + no_oversample=no_oversample, + lora_path=lora_path, + max_loras=max_loras, + output_len=output_len, + enable_multimodal_chat=enable_multimodal_chat, + skip_chat_template=skip_chat_template, + **kwargs, + ) # ----------------------------------------------------------------------------- @@ -2381,14 +2519,14 @@ def sample( self, tokenizer: TokenizerLike, num_requests: int, + request_id_prefix: str = "", + no_oversample: bool = False, prefix_len: int = DEFAULT_PREFIX_LEN, input_len: int = DEFAULT_INPUT_LEN, output_len: int = DEFAULT_OUTPUT_LEN, return_prompt_formatted: bool = False, - request_id_prefix: str = "", - no_oversample: bool = False, **kwargs, - ) -> list: + ) -> list[SampleRequest]: # Calculate average token length for a poem line. 
tokenized_lines = [tokenizer(line).input_ids for line in self.data] avg_len = sum(len(tokens) for tokens in tokenized_lines) / len(tokenized_lines) @@ -2411,7 +2549,7 @@ def sample( num_prefix_lines = max(round((prefix_len - base_offset) / avg_len), 0) prefix_lines = self.data[:num_prefix_lines] - samples = [] + samples: list[SampleRequest] = [] ind = 0 while len(samples) < num_requests: extra_lines = random.choices( @@ -2482,11 +2620,11 @@ def sample( self, tokenizer: TokenizerLike, num_requests: int, - max_loras: int | None = None, - lora_path: str | None = None, request_id_prefix: str = "", no_oversample: bool = False, lora_assignment: str = "random", + max_loras: int | None = None, + lora_path: str | None = None, **kwargs, ) -> list[SampleRequest]: samples = [] @@ -2574,15 +2712,15 @@ def sample( self, tokenizer: TokenizerLike, num_requests: int, - output_len: int | None = None, - enable_multimodal_chat: bool = False, request_id_prefix: str = "", no_oversample: bool = False, + output_len: int | None = None, + enable_multimodal_chat: bool = False, **kwargs, - ) -> list: + ) -> list[SampleRequest]: # Filter examples with at least 2 conversations filtered_data = self.data.filter(lambda x: len(x["conversations"]) >= 2) - sampled_requests = [] + sampled_requests: list[SampleRequest] = [] ind = 0 dynamic_output = output_len is None @@ -2634,15 +2772,15 @@ def sample( self, tokenizer: TokenizerLike, num_requests: int, - output_len: int | None = None, - enable_multimodal_chat: bool = False, request_id_prefix: str = "", no_oversample: bool = False, + output_len: int | None = None, + enable_multimodal_chat: bool = False, **kwargs, - ) -> list: + ) -> list[SampleRequest]: # Filter examples with at least 2 conversations filtered_data = self.data.filter(lambda x: len(x["conversations"]) >= 2) - sampled_requests = [] + sampled_requests: list[SampleRequest] = [] ind = 0 dynamic_output = output_len is None @@ -2703,12 +2841,12 @@ def sample( self, tokenizer: TokenizerLike, 
num_requests: int, - output_len: int | None = None, - enable_multimodal_chat: bool = False, request_id_prefix: str = "", no_oversample: bool = False, + output_len: int | None = None, + enable_multimodal_chat: bool = False, **kwargs, - ) -> list: + ) -> list[SampleRequest]: parser_fn = self.SUPPORTED_DATASET_PATHS.get(self.hf_name) if parser_fn is None: raise ValueError(f"Unsupported dataset path: {self.hf_name}") @@ -2753,9 +2891,11 @@ class MMVUDataset(HuggingFaceDataset): DEFAULT_OUTPUT_LEN = 128 SUPPORTED_DATASET_PATHS = { - "yale-nlp/MMVU": lambda x: x["question"] - + " " - + (" ".join(f"{k}.{v}" for k, v in x["choices"].items())), + "yale-nlp/MMVU": lambda x: ( + x["question"] + + " " + + (" ".join(f"{k}.{v}" for k, v in x["choices"].items())) + ), } def __init__(self, **kwargs) -> None: @@ -2770,12 +2910,12 @@ def sample( self, tokenizer: TokenizerLike, num_requests: int, - output_len: int | None = None, - enable_multimodal_chat: bool = False, request_id_prefix: str = "", no_oversample: bool = False, + output_len: int | None = None, + enable_multimodal_chat: bool = False, **kwargs, - ) -> list: + ) -> list[SampleRequest]: parser_fn = self.SUPPORTED_DATASET_PATHS.get(self.hf_name) if parser_fn is None: raise ValueError(f"Unsupported dataset path: {self.hf_name}") @@ -2838,15 +2978,15 @@ def sample( self, tokenizer: TokenizerLike, num_requests: int, + request_id_prefix: str = "", + no_oversample: bool = False, output_len: int | None = None, enable_multimodal_chat: bool = False, skip_chat_template: bool = False, - request_id_prefix: str = "", - no_oversample: bool = False, **kwargs, ) -> list[SampleRequest]: output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN - sampled_requests = [] + sampled_requests: list[SampleRequest] = [] for i, prompt in enumerate(self.sample_prompts(n=num_requests)): # apply template if not skip_chat_template: @@ -2903,15 +3043,15 @@ def sample( self, tokenizer: TokenizerLike, num_requests: int, + 
request_id_prefix: str = "", + no_oversample: bool = False, output_len: int | None = None, enable_multimodal_chat: bool = False, skip_chat_template: bool = False, - request_id_prefix: str = "", - no_oversample: bool = False, **kwargs, - ) -> list: + ) -> list[SampleRequest]: output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN - sampled_requests = [] + sampled_requests: list[SampleRequest] = [] for i, item in enumerate(self.data): if len(sampled_requests) >= num_requests: @@ -3050,12 +3190,12 @@ def sample( self, tokenizer: TokenizerLike, num_requests: int, - output_len: int | None = None, request_id_prefix: str = "", no_oversample: bool = False, + output_len: int | None = None, **kwargs, - ) -> list: - sampled_requests = [] + ) -> list[SampleRequest]: + sampled_requests: list[SampleRequest] = [] ind = 0 dynamic_output = output_len is None @@ -3228,18 +3368,18 @@ def sample( self, tokenizer: TokenizerLike, num_requests: int, - output_len: int | None = None, request_id_prefix: str = "", no_oversample: bool = False, + output_len: int | None = None, **kwargs, - ) -> list: + ) -> list[SampleRequest]: output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN if "openai" in getattr(tokenizer, "name_or_path", ""): prompt = "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>" else: prompt = "" prompt_len = len(tokenizer(prompt).input_ids) - sampled_requests = [] + sampled_requests: list[SampleRequest] = [] ind = 0 skipped = 0 asr_min_audio_len_sec = kwargs.get("asr_min_audio_len_sec") @@ -3326,9 +3466,9 @@ def sample( self, tokenizer: TokenizerLike, num_requests: int, - output_len: int | None = None, request_id_prefix: str = "", no_oversample: bool = False, + output_len: int | None = None, **kwargs, ) -> list[SampleRequest]: # Force dynamic output length based on reference completion. 
@@ -3405,12 +3545,12 @@ def sample( self, tokenizer: TokenizerLike, num_requests: int, + request_id_prefix: str = "", + no_oversample: bool = False, prefix_len: int = DEFAULT_PREFIX_LEN, suffix_len: int = DEFAULT_SUFFIX_LEN, num_prefixes: int = DEFAULT_NUM_PREFIXES, output_len: int = DEFAULT_OUTPUT_LEN, - request_id_prefix: str = "", - no_oversample: bool = False, **kwargs, ) -> list[SampleRequest]: vocab_size = tokenizer.vocab_size @@ -3421,7 +3561,7 @@ def sample( f"to num_prefixes ({num_prefixes})" ) - def _generate_exact_length_tokens(target_length: int) -> list[int]: + def _generate_exact_length_tokens(target_length: int) -> tuple[list[int], int]: """Generate tokens that decode and re-encode to exactly target_length.""" # Generate random tokens @@ -3491,10 +3631,10 @@ def sample( self, tokenizer: TokenizerLike, num_requests: int, - output_len: int | None = None, - enable_multimodal_chat: bool = False, request_id_prefix: str = "", no_oversample: bool = False, + output_len: int | None = None, + enable_multimodal_chat: bool = False, **kwargs, ) -> list[SampleRequest]: # If --hf-output-len is not set, use the default output length. @@ -3516,6 +3656,7 @@ def sample( # if enable_multimodal_chat is False). 
prompt_len = len(tokenizer(question_text).input_ids) + prompt: str | list[dict] if enable_multimodal_chat: # If multimodal content should be embedded in the chat message, # convert to [{"role":"user","content":[...]}] From 56515317954e76e2c080cab3a8837b7a55ae7336 Mon Sep 17 00:00:00 2001 From: jdebache Date: Sun, 1 Mar 2026 16:27:09 +0000 Subject: [PATCH 02/10] factor out get_sampling_params to use it to implement range_ratio for txt_slices Signed-off-by: jdebache --- vllm/benchmarks/datasets.py | 155 ++++++++++++++++++++---------------- 1 file changed, 87 insertions(+), 68 deletions(-) diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py index 7454fea0c252..ad217122cee3 100644 --- a/vllm/benchmarks/datasets.py +++ b/vllm/benchmarks/datasets.py @@ -503,6 +503,55 @@ def gen_prompt_decode_to_target_len( # ----------------------------------------------------------------------------- +def get_sampling_params( + rng: np.random.Generator, + num_requests: int, + range_ratio: float, + input_len: int, + output_len: int, + tokenizer: TokenizerLike, +) -> tuple[np.ndarray, np.ndarray, np.ndarray]: + """ + Get the sampling parameters for the dataset. + """ + # Enforce range_ratio < 1 + if not (0.0 <= range_ratio < 1.0): + raise ValueError("range_ratio must be in [0, 1).") + num_special_tokens = int(tokenizer.num_special_tokens_to_add()) + real_input_len = max(0, int(input_len) - num_special_tokens) + # Bounds use floor for low and ceil for high + input_low = math.floor(real_input_len * (1 - range_ratio)) + input_high = math.ceil(real_input_len * (1 + range_ratio)) + output_low = math.floor(output_len * (1 - range_ratio)) + output_high = math.ceil(output_len * (1 + range_ratio)) + # Ensure the lower bound for output length is at least 1 to + # prevent sampling 0 tokens. 
+ output_low = max(output_low, 1) + output_high = max(output_high, 1) + + if input_low > input_high: + raise ValueError( + f"Invalid input sampling interval: low={input_low} > high={input_high}" + ) + if output_low > output_high: + raise ValueError( + f"Invalid output sampling interval: low={output_low} > high={output_high}" + ) + + logger.info( + "Sampling input_len from [%s, %s] and output_len from [%s, %s]", + input_low, + input_high, + output_low, + output_high, + ) + + input_lens = rng.integers(input_low, input_high + 1, size=num_requests) + output_lens = rng.integers(output_low, output_high + 1, size=num_requests) + offsets = rng.integers(0, tokenizer.vocab_size, size=num_requests) + return input_lens, output_lens, offsets + + class RandomDataset(BenchmarkDataset): """ Synthetic text-only dataset for serving/throughput benchmarks. @@ -562,8 +611,8 @@ def sample( "* (1 - range_ratio) >= 1." ) - input_lens, output_lens, offsets = self.get_sampling_params( - num_requests, range_ratio, input_len, output_len, tokenizer + input_lens, output_lens, offsets = get_sampling_params( + self._rng, num_requests, range_ratio, input_len, output_len, tokenizer ) vocab_size = tokenizer.vocab_size @@ -574,7 +623,7 @@ def sample( # Generate prefix once prefix_token_ids = self.get_prefix(tokenizer, allowed_tokens, prefix_len) - requests = [] + requests: list[SampleRequest] = [] token_mismatch_total = 0 for i in range(num_requests): prompt, total_input_len, token_mismatch = self.generate_token_sequence( # noqa: E501 @@ -665,55 +714,6 @@ def get_prefix( ) return adjusted_tokens - def get_sampling_params( - self, - num_requests: int, - range_ratio: float, - input_len: int, - output_len: int, - tokenizer: TokenizerLike, - ) -> tuple[np.ndarray, np.ndarray, np.ndarray]: - """ - Get the sampling parameters for the dataset. 
- """ - # Enforce range_ratio < 1 - if not (0.0 <= range_ratio < 1.0): - raise ValueError("range_ratio must be in [0, 1).") - num_special_tokens = int(tokenizer.num_special_tokens_to_add()) - real_input_len = max(0, int(input_len) - num_special_tokens) - # Bounds use floor for low and ceil for high - input_low = math.floor(real_input_len * (1 - range_ratio)) - input_high = math.ceil(real_input_len * (1 + range_ratio)) - output_low = math.floor(output_len * (1 - range_ratio)) - output_high = math.ceil(output_len * (1 + range_ratio)) - # Ensure the lower bound for output length is at least 1 to - # prevent sampling 0 tokens. - output_low = max(output_low, 1) - output_high = max(output_high, 1) - - if input_low > input_high: - raise ValueError( - f"Invalid input sampling interval: low={input_low} > high={input_high}" - ) - if output_low > output_high: - raise ValueError( - "Invalid output sampling interval: " - f"low={output_low} > high={output_high}" - ) - - logger.info( - "Sampling input_len from [%s, %s] and output_len from [%s, %s]", - input_low, - input_high, - output_low, - output_high, - ) - - input_lens = self._rng.integers(input_low, input_high + 1, size=num_requests) - output_lens = self._rng.integers(output_low, output_high + 1, size=num_requests) - offsets = self._rng.integers(0, tokenizer.vocab_size, size=num_requests) - return input_lens, output_lens, offsets - def generate_token_sequence( self, *, @@ -793,8 +793,8 @@ def sample( query_len_param = (input_len // 2) - n_sep_tokens if is_reranker else input_len - query_lens, _, query_offsets = self.get_sampling_params( - 1, range_ratio, query_len_param, 0, tokenizer + query_lens, _, query_offsets = get_sampling_params( + self._rng, 1, range_ratio, query_len_param, 0, tokenizer ) query_len = int(query_lens[0]) @@ -807,8 +807,8 @@ def sample( else: doc_len_param = input_len - query_len - n_sep_tokens - doc_lens, _, doc_offsets = self.get_sampling_params( - num_requests, range_ratio, doc_len_param, 0, 
tokenizer + doc_lens, _, doc_offsets = get_sampling_params( + self._rng, num_requests, range_ratio, doc_len_param, 0, tokenizer ) vocab_size = tokenizer.vocab_size @@ -880,7 +880,7 @@ def sample( # ----------------------------------------------------------------------------- -class TxtSlicesDataset(BenchmarkDataset): +class TxtSlicesDataset(RandomDataset): """ Implements the TxtSlices dataset. Takes a URL or file path to a text file, tokenizes the entire content, and generates sample requests by randomly @@ -901,7 +901,7 @@ def __init__( if len(self.text) == 0: raise ValueError("The text file is empty and cannot be sampled from.") - self.rng = random.Random(self.random_seed) + self._rng = np.random.default_rng(self.random_seed) @staticmethod def load_data(dataset_path: str) -> str: @@ -924,12 +924,11 @@ def generate_prompt( tokenizer: TokenizerLike, token_ids: tuple[int, ...], input_len: int, + start_pos: int, + output_len: int, ) -> str: num_available_tokens = len(token_ids) - # Randomly select a start position - start_pos = self.rng.randint(0, num_available_tokens - 1) - # Extract tokens with cycling if necessary prompt_token_ids = tuple( token_ids[(start_pos + j) % num_available_tokens] for j in range(input_len) @@ -946,18 +945,37 @@ def sample( no_oversample: bool = False, input_len: int = 1024, output_len: int = 128, + range_ratio: float = RandomDataset.DEFAULT_RANGE_RATIO, **kwargs, ) -> list[SampleRequest]: # Tokenize the entire text content token_ids = self.get_token_ids(tokenizer) - num_special_tokens = int(tokenizer.num_special_tokens_to_add()) - non_special_length = input_len - num_special_tokens + # Get the sampling parameters for input length and output length. + # We don't need the offsets. + input_lens, output_lens, _ = get_sampling_params( + self._rng, + num_requests, + range_ratio, + input_len, + output_len, + tokenizer, + ) + # Additionally, get the starting positions in the input text. 
+ start_positions = self._rng.integers(0, len(token_ids), size=num_requests) + + # Put it all together. return [ SampleRequest( - prompt=self.generate_prompt(tokenizer, token_ids, non_special_length), - prompt_len=input_len, - expected_output_len=output_len, + prompt=self.generate_prompt( + tokenizer, + token_ids, + int(input_lens[i]), + int(start_positions[i]), + int(output_lens[i]), + ), + prompt_len=int(input_lens[i]), + expected_output_len=int(output_lens[i]), request_id=request_id_prefix + str(i), ) for i in range(num_requests) @@ -1289,8 +1307,8 @@ def sample( "batchsize > 1 is not supported for RandomMultiModalDataset." ) # Get the sampling parameters for the dataset - input_lens, output_lens, offsets = self.get_sampling_params( - num_requests, range_ratio, input_len, output_len, tokenizer + input_lens, output_lens, offsets = get_sampling_params( + self._rng, num_requests, range_ratio, input_len, output_len, tokenizer ) ( @@ -2185,6 +2203,7 @@ def get_samples(args, tokenizer: TokenizerLike) -> list[SampleRequest]: num_requests=args.num_prompts, input_len=args.random_input_len, output_len=args.random_output_len, + range_ratio=args.random_range_ratio, request_id_prefix=args.request_id_prefix, no_oversample=args.no_oversample, ), From 12b55ed259db3a20b25871de1a828a398d078cf2 Mon Sep 17 00:00:00 2001 From: jdebache Date: Thu, 2 Apr 2026 12:40:54 +0000 Subject: [PATCH 03/10] support different distribution between ISL and OSL Signed-off-by: jdebache --- vllm/benchmarks/datasets.py | 181 +++++++++++++++++++++++++--------- vllm/benchmarks/throughput.py | 18 ++++ 2 files changed, 151 insertions(+), 48 deletions(-) diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py index ad217122cee3..c2b057673edc 100644 --- a/vllm/benchmarks/datasets.py +++ b/vllm/benchmarks/datasets.py @@ -61,10 +61,6 @@ DEFAULT_NUM_PROMPTS = 1000 -# ----------------------------------------------------------------------------- -# Data Classes -# 
----------------------------------------------------------------------------- - @dataclass(frozen=True) class SampleRequest: @@ -506,24 +502,34 @@ def gen_prompt_decode_to_target_len( def get_sampling_params( rng: np.random.Generator, num_requests: int, - range_ratio: float, + input_range_ratio: float, + output_range_ratio: float, input_len: int, output_len: int, tokenizer: TokenizerLike, ) -> tuple[np.ndarray, np.ndarray, np.ndarray]: """ - Get the sampling parameters for the dataset. + Sample per-request input/output token lengths and vocab offsets. + + Lengths are drawn uniformly from integer ranges around the configured + means, controlled by ``input_range_ratio`` and ``output_range_ratio``. + Tokenizer special tokens are subtracted from ``input_len`` before + computing the sampling interval. + + Returns: + (input_lens, output_lens, offsets) – three 1-D ``np.ndarray`` of + shape ``(num_requests,)``. """ - # Enforce range_ratio < 1 - if not (0.0 <= range_ratio < 1.0): - raise ValueError("range_ratio must be in [0, 1).") + if not (0.0 <= input_range_ratio < 1.0): + raise ValueError("input_range_ratio must be in [0, 1).") + if not (0.0 <= output_range_ratio < 1.0): + raise ValueError("output_range_ratio must be in [0, 1).") num_special_tokens = int(tokenizer.num_special_tokens_to_add()) real_input_len = max(0, int(input_len) - num_special_tokens) - # Bounds use floor for low and ceil for high - input_low = math.floor(real_input_len * (1 - range_ratio)) - input_high = math.ceil(real_input_len * (1 + range_ratio)) - output_low = math.floor(output_len * (1 - range_ratio)) - output_high = math.ceil(output_len * (1 + range_ratio)) + input_low = math.floor(real_input_len * (1 - input_range_ratio)) + input_high = math.ceil(real_input_len * (1 + input_range_ratio)) + output_low = math.floor(output_len * (1 - output_range_ratio)) + output_high = math.ceil(output_len * (1 + output_range_ratio)) # Ensure the lower bound for output length is at least 1 to # prevent sampling 0 
tokens. output_low = max(output_low, 1) @@ -587,6 +593,8 @@ def sample( no_oversample: bool = False, prefix_len: int = DEFAULT_PREFIX_LEN, range_ratio: float = DEFAULT_RANGE_RATIO, + input_range_ratio: float | None = None, + output_range_ratio: float | None = None, input_len: int = DEFAULT_INPUT_LEN, output_len: int = DEFAULT_OUTPUT_LEN, batchsize: int = 1, @@ -595,24 +603,39 @@ def sample( lora_assignment: str = "random", **kwargs, ) -> list[SampleRequest]: - # validate total input tokens (prefix + sampled) is at least 1. + resolved_input_rr = ( + input_range_ratio if input_range_ratio is not None else range_ratio + ) + resolved_output_rr = ( + output_range_ratio if output_range_ratio is not None else range_ratio + ) + num_special = int(tokenizer.num_special_tokens_to_add()) real_input_len = max(0, int(input_len) - num_special) - min_sampled_input = math.floor(real_input_len * (1.0 - float(range_ratio))) + min_sampled_input = math.floor( + real_input_len * (1.0 - float(resolved_input_rr)) + ) min_total_input = int(prefix_len) + min_sampled_input if min_total_input < 1: raise ValueError( "--random-input-len is too small: with tokenizer special " - f"tokens {num_special} and --random-range-ratio {range_ratio}, " + f"tokens {num_special} and " + f"--random-input-range-ratio {resolved_input_rr}, " "the minimum possible total input tokens (prefix + sampled) is " f"{min_total_input}. Increase --random-input-len and/or " - "--random-prefix-len, or decrease --random-range-ratio so that " - "prefix_len + floor(max(0, random_input_len - num_special)) " - "* (1 - range_ratio) >= 1." + "--random-prefix-len, or decrease --random-input-range-ratio " + "so that prefix_len + floor(max(0, random_input_len - " + "num_special)) * (1 - input_range_ratio) >= 1." 
) input_lens, output_lens, offsets = get_sampling_params( - self._rng, num_requests, range_ratio, input_len, output_len, tokenizer + self._rng, + num_requests, + resolved_input_rr, + resolved_output_rr, + input_len, + output_len, + tokenizer, ) vocab_size = tokenizer.vocab_size @@ -623,7 +646,7 @@ def sample( # Generate prefix once prefix_token_ids = self.get_prefix(tokenizer, allowed_tokens, prefix_len) - requests: list[SampleRequest] = [] + requests = [] token_mismatch_total = 0 for i in range(num_requests): prompt, total_input_len, token_mismatch = self.generate_token_sequence( # noqa: E501 @@ -783,18 +806,33 @@ def sample( no_oversample: bool = False, prefix_len: int = RandomDataset.DEFAULT_PREFIX_LEN, range_ratio: float = RandomDataset.DEFAULT_RANGE_RATIO, + input_range_ratio: float | None = None, + output_range_ratio: float | None = None, input_len: int = RandomDataset.DEFAULT_INPUT_LEN, output_len: int = RandomDataset.DEFAULT_OUTPUT_LEN, batchsize: int = 1, is_reranker: bool = True, **kwargs, ) -> list[SampleRequest]: + resolved_input_rr = ( + input_range_ratio if input_range_ratio is not None else range_ratio + ) + resolved_output_rr = ( + output_range_ratio if output_range_ratio is not None else range_ratio + ) + n_sep_tokens = int(is_reranker) query_len_param = (input_len // 2) - n_sep_tokens if is_reranker else input_len query_lens, _, query_offsets = get_sampling_params( - self._rng, 1, range_ratio, query_len_param, 0, tokenizer + self._rng, + 1, + resolved_input_rr, + resolved_output_rr, + query_len_param, + 0, + tokenizer, ) query_len = int(query_lens[0]) @@ -808,7 +846,13 @@ def sample( doc_len_param = input_len - query_len - n_sep_tokens doc_lens, _, doc_offsets = get_sampling_params( - self._rng, num_requests, range_ratio, doc_len_param, 0, tokenizer + self._rng, + num_requests, + resolved_input_rr, + resolved_output_rr, + doc_len_param, + 0, + tokenizer, ) vocab_size = tokenizer.vocab_size @@ -880,7 +924,7 @@ def sample( # 
----------------------------------------------------------------------------- -class TxtSlicesDataset(RandomDataset): +class TxtSlicesDataset(BenchmarkDataset): """ Implements the TxtSlices dataset. Takes a URL or file path to a text file, tokenizes the entire content, and generates sample requests by randomly @@ -902,6 +946,7 @@ def __init__( raise ValueError("The text file is empty and cannot be sampled from.") self._rng = np.random.default_rng(self.random_seed) + self.rng = random.Random(self.random_seed) @staticmethod def load_data(dataset_path: str) -> str: @@ -924,11 +969,12 @@ def generate_prompt( tokenizer: TokenizerLike, token_ids: tuple[int, ...], input_len: int, - start_pos: int, - output_len: int, ) -> str: num_available_tokens = len(token_ids) + # Randomly select a start position + start_pos = self.rng.randint(0, num_available_tokens - 1) + # Extract tokens with cycling if necessary prompt_token_ids = tuple( token_ids[(start_pos + j) % num_available_tokens] for j in range(input_len) @@ -945,36 +991,35 @@ def sample( no_oversample: bool = False, input_len: int = 1024, output_len: int = 128, - range_ratio: float = RandomDataset.DEFAULT_RANGE_RATIO, + range_ratio: float = 0.0, + input_range_ratio: float | None = None, + output_range_ratio: float | None = None, **kwargs, ) -> list[SampleRequest]: - # Tokenize the entire text content + resolved_input_rr = ( + input_range_ratio if input_range_ratio is not None else range_ratio + ) + resolved_output_rr = ( + output_range_ratio if output_range_ratio is not None else range_ratio + ) + token_ids = self.get_token_ids(tokenizer) + num_special_tokens = int(tokenizer.num_special_tokens_to_add()) - # Get the sampling parameters for input length and output length. - # We don't need the offsets. 
input_lens, output_lens, _ = get_sampling_params( self._rng, num_requests, - range_ratio, + resolved_input_rr, + resolved_output_rr, input_len, output_len, tokenizer, ) - # Additionally, get the starting positions in the input text. - start_positions = self._rng.integers(0, len(token_ids), size=num_requests) - # Put it all together. return [ SampleRequest( - prompt=self.generate_prompt( - tokenizer, - token_ids, - int(input_lens[i]), - int(start_positions[i]), - int(output_lens[i]), - ), - prompt_len=int(input_lens[i]), + prompt=self.generate_prompt(tokenizer, token_ids, int(input_lens[i])), + prompt_len=int(input_lens[i]) + num_special_tokens, expected_output_len=int(output_lens[i]), request_id=request_id_prefix + str(i), ) @@ -1290,6 +1335,8 @@ def sample( no_oversample: bool = False, prefix_len: int = RandomDataset.DEFAULT_PREFIX_LEN, range_ratio: float = RandomDataset.DEFAULT_RANGE_RATIO, + input_range_ratio: float | None = None, + output_range_ratio: float | None = None, input_len: int = RandomDataset.DEFAULT_INPUT_LEN, output_len: int = RandomDataset.DEFAULT_OUTPUT_LEN, batchsize: int = 1, @@ -1306,9 +1353,21 @@ def sample( raise NotImplementedError( "batchsize > 1 is not supported for RandomMultiModalDataset." ) - # Get the sampling parameters for the dataset + resolved_input_rr = ( + input_range_ratio if input_range_ratio is not None else range_ratio + ) + resolved_output_rr = ( + output_range_ratio if output_range_ratio is not None else range_ratio + ) + input_lens, output_lens, offsets = get_sampling_params( - self._rng, num_requests, range_ratio, input_len, output_len, tokenizer + self._rng, + num_requests, + resolved_input_rr, + resolved_output_rr, + input_len, + output_len, + tokenizer, ) ( @@ -1772,9 +1831,27 @@ def add_random_dataset_base_args( type=float, default=0.0, help="Range ratio for sampling input/output length, " - "used only for random sampling. 
Must be in the range [0, 1) to define " - "a symmetric sampling range" - "[length * (1 - range_ratio), length * (1 + range_ratio)].", + "used only for random sampling. Sets both input and output range " + "ratios unless overridden by --random-input-range-ratio or " + "--random-output-range-ratio. Must be in [0, 1).", + ) + parser_or_group.add_argument( + "--random-input-range-ratio", + type=float, + default=None, + help="Range ratio for sampling input length, used only for random " + "sampling. Overrides --random-range-ratio for input lengths. " + "Must be in [0, 1). Defines the sampling range " + "[input_len * (1 - ratio), input_len * (1 + ratio)].", + ) + parser_or_group.add_argument( + "--random-output-range-ratio", + type=float, + default=None, + help="Range ratio for sampling output length, used only for random " + "sampling. Overrides --random-range-ratio for output lengths. " + "Must be in [0, 1). Defines the sampling range " + "[output_len * (1 - ratio), output_len * (1 + ratio)].", ) parser_or_group.add_argument( "--random-prefix-len", @@ -2144,6 +2221,8 @@ def get_samples(args, tokenizer: TokenizerLike) -> list[SampleRequest]: input_len=args.random_input_len, output_len=args.random_output_len, range_ratio=args.random_range_ratio, + input_range_ratio=args.random_input_range_ratio, + output_range_ratio=args.random_output_range_ratio, request_id_prefix=args.request_id_prefix, batchsize=args.random_batch_size, no_oversample=args.no_oversample, @@ -2157,6 +2236,8 @@ def get_samples(args, tokenizer: TokenizerLike) -> list[SampleRequest]: num_requests=args.num_prompts, prefix_len=args.random_prefix_len, range_ratio=args.random_range_ratio, + input_range_ratio=args.random_input_range_ratio, + output_range_ratio=args.random_output_range_ratio, input_len=args.random_input_len, output_len=args.random_output_len, base_items_per_request=args.random_mm_base_items_per_request, @@ -2176,6 +2257,8 @@ def get_samples(args, tokenizer: TokenizerLike) -> list[SampleRequest]: 
num_requests=args.num_prompts, input_len=args.random_input_len, range_ratio=args.random_range_ratio, + input_range_ratio=args.random_input_range_ratio, + output_range_ratio=args.random_output_range_ratio, request_id_prefix=args.request_id_prefix, batchsize=args.random_batch_size, is_reranker=not args.no_reranker, @@ -2204,6 +2287,8 @@ def get_samples(args, tokenizer: TokenizerLike) -> list[SampleRequest]: input_len=args.random_input_len, output_len=args.random_output_len, range_ratio=args.random_range_ratio, + input_range_ratio=args.random_input_range_ratio, + output_range_ratio=args.random_output_range_ratio, request_id_prefix=args.request_id_prefix, no_oversample=args.no_oversample, ), diff --git a/vllm/benchmarks/throughput.py b/vllm/benchmarks/throughput.py index 42a8132ffe6e..82245a2b036a 100644 --- a/vllm/benchmarks/throughput.py +++ b/vllm/benchmarks/throughput.py @@ -357,6 +357,12 @@ def get_requests(args, tokenizer): and args.dataset_name not in {"prefix_repetition", "random-mm", "random-rerank"} ): sample_kwargs["range_ratio"] = args.random_range_ratio + sample_kwargs["input_range_ratio"] = getattr( + args, "random_input_range_ratio", None + ) + sample_kwargs["output_range_ratio"] = getattr( + args, "random_output_range_ratio", None + ) # prefer random_* arguments, fall back to regular arguments random_prefix_len = getattr(args, "random_prefix_len", None) sample_kwargs["prefix_len"] = ( @@ -459,6 +465,12 @@ def get_requests(args, tokenizer): random_prefix_len if random_prefix_len is not None else prefix_len ) sample_kwargs["range_ratio"] = args.random_range_ratio + sample_kwargs["input_range_ratio"] = getattr( + args, "random_input_range_ratio", None + ) + sample_kwargs["output_range_ratio"] = getattr( + args, "random_output_range_ratio", None + ) elif args.dataset_name == "random-rerank": dataset_cls = RandomDatasetForReranking # prefer random_* arguments, fall back to regular arguments @@ -477,6 +489,12 @@ def get_requests(args, tokenizer): 
sample_kwargs["batchsize"] = getattr(args, "random_batch_size", 1) sample_kwargs["is_reranker"] = not getattr(args, "no_reranker", False) sample_kwargs["range_ratio"] = args.random_range_ratio + sample_kwargs["input_range_ratio"] = getattr( + args, "random_input_range_ratio", None + ) + sample_kwargs["output_range_ratio"] = getattr( + args, "random_output_range_ratio", None + ) else: raise ValueError(f"Unknown dataset name: {args.dataset_name}") # Remove None values From dedb7395e5e857f181433899cf151912892c8d90 Mon Sep 17 00:00:00 2001 From: jdebache Date: Mon, 13 Apr 2026 18:15:38 +0000 Subject: [PATCH 04/10] address review comments Signed-off-by: jdebache --- tests/benchmarks/test_sampling_params.py | 250 +++++++++++++++++++ tests/benchmarks/test_txt_slices_dataset.py | 49 +++- vllm/benchmarks/create_txt_slices_dataset.py | 223 +++++++++++++++++ vllm/benchmarks/datasets.py | 191 +------------- vllm/benchmarks/shared.py | 73 ++++++ 5 files changed, 585 insertions(+), 201 deletions(-) create mode 100644 tests/benchmarks/test_sampling_params.py create mode 100644 vllm/benchmarks/create_txt_slices_dataset.py create mode 100644 vllm/benchmarks/shared.py diff --git a/tests/benchmarks/test_sampling_params.py b/tests/benchmarks/test_sampling_params.py new file mode 100644 index 000000000000..94ce46189a07 --- /dev/null +++ b/tests/benchmarks/test_sampling_params.py @@ -0,0 +1,250 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import numpy as np +import pytest + +from vllm.benchmarks.shared import get_sampling_params +from vllm.tokenizers import TokenizerLike + + +class _FakeTokenizer(TokenizerLike): + """Minimal tokenizer implementing the TokenizerLike protocol + for testing get_sampling_params.""" + + def __init__(self, vocab_size: int = 1000, num_special_tokens: int = 0) -> None: + self._vocab_size = vocab_size + self._num_special_tokens = num_special_tokens + + # -- Properties required by TokenizerLike 
-- + + @classmethod + def from_pretrained(cls, path_or_repo_id, *a, **kw): # type: ignore[override] + return cls() + + @property + def vocab_size(self) -> int: + return self._vocab_size + + @property + def all_special_tokens(self) -> list[str]: + return [] + + @property + def all_special_ids(self) -> list[int]: + return [] + + @property + def bos_token_id(self) -> int: + return 0 + + @property + def eos_token_id(self) -> int: + return 1 + + @property + def pad_token_id(self) -> int: + return 2 + + @property + def is_fast(self) -> bool: + return False + + @property + def max_token_id(self) -> int: + return self._vocab_size - 1 + + @property + def max_chars_per_token(self) -> int: + return 4 + + @property + def truncation_side(self) -> str: + return "right" + + def num_special_tokens_to_add(self) -> int: + return self._num_special_tokens + + def __call__(self, text, text_pair=None, **kw): # type: ignore[override] + raise NotImplementedError + + def get_vocab(self) -> dict[str, int]: + return {} + + def get_added_vocab(self) -> dict[str, int]: + return {} + + def encode(self, text, **kw) -> list[int]: # type: ignore[override] + raise NotImplementedError + + def apply_chat_template(self, messages, **kw): # type: ignore[override] + raise NotImplementedError + + def convert_tokens_to_ids(self, tokens): # type: ignore[override] + raise NotImplementedError + + def convert_tokens_to_string(self, tokens: list[str]) -> str: + raise NotImplementedError + + def decode(self, ids, skip_special_tokens: bool = False) -> str: # type: ignore[override] + raise NotImplementedError + + def convert_ids_to_tokens( # type: ignore[override] + self, ids, skip_special_tokens: bool = False + ) -> list[str]: + raise NotImplementedError + + +class TestGetSamplingParams: + """Tests for ``get_sampling_params`` in ``vllm.benchmarks.shared``.""" + + # -- helpers -- + + @staticmethod + def _tok(vocab_size: int = 1000, num_special: int = 0) -> _FakeTokenizer: + return 
_FakeTokenizer(vocab_size=vocab_size, num_special_tokens=num_special) + + # -- return shape / dtype -- + + def test_returns_three_arrays(self): + rng = np.random.default_rng(0) + result = get_sampling_params(rng, 5, 0.0, 0.0, 100, 50, self._tok()) + assert len(result) == 3 + for arr in result: + assert isinstance(arr, np.ndarray) + + @pytest.mark.parametrize("n", [1, 10, 100]) + def test_output_length_matches_num_requests(self, n: int): + rng = np.random.default_rng(42) + input_lens, output_lens, offsets = get_sampling_params( + rng, n, 0.0, 0.0, 64, 32, self._tok() + ) + assert input_lens.shape == (n,) + assert output_lens.shape == (n,) + assert offsets.shape == (n,) + + # -- fixed lengths (range_ratio = 0) -- + + def test_zero_range_ratio_gives_constant_lengths(self): + rng = np.random.default_rng(7) + input_lens, output_lens, _ = get_sampling_params( + rng, 20, 0.0, 0.0, 128, 64, self._tok() + ) + assert np.all(input_lens == 128) + assert np.all(output_lens == 64) + + def test_special_tokens_subtracted_from_input(self): + rng = np.random.default_rng(7) + input_lens, _, _ = get_sampling_params( + rng, 10, 0.0, 0.0, 100, 50, self._tok(num_special=4) + ) + # real_input_len = 100 - 4 = 96, range_ratio 0 → all 96 + assert np.all(input_lens == 96) + + def test_special_tokens_not_subtracted_from_output(self): + rng = np.random.default_rng(7) + _, output_lens, _ = get_sampling_params( + rng, 10, 0.0, 0.0, 100, 50, self._tok(num_special=4) + ) + assert np.all(output_lens == 50) + + # -- range ratios -- + + def test_input_range_bounds(self): + rng = np.random.default_rng(0) + ratio = 0.5 + base = 200 + input_lens, _, _ = get_sampling_params( + rng, 500, ratio, 0.0, base, 50, self._tok() + ) + lo = int(np.floor(base * (1 - ratio))) + hi = int(np.ceil(base * (1 + ratio))) + assert np.all(input_lens >= lo) + assert np.all(input_lens <= hi) + + def test_output_range_bounds(self): + rng = np.random.default_rng(0) + ratio = 0.3 + base = 100 + _, output_lens, _ = 
get_sampling_params( + rng, 500, 0.0, ratio, 50, base, self._tok() + ) + lo = max(1, int(np.floor(base * (1 - ratio)))) + hi = int(np.ceil(base * (1 + ratio))) + assert np.all(output_lens >= lo) + assert np.all(output_lens <= hi) + + def test_output_low_clamped_to_one(self): + """Even with a high ratio that would push output_low to 0, + the function clamps it to 1.""" + rng = np.random.default_rng(0) + # output_len=1, ratio=0.99 → floor(1*0.01)=0, should clamp to 1 + _, output_lens, _ = get_sampling_params(rng, 50, 0.0, 0.99, 100, 1, self._tok()) + assert np.all(output_lens >= 1) + + # -- offsets bounded by vocab_size -- + + @pytest.mark.parametrize("vocab", [100, 32000, 128256]) + def test_offsets_within_vocab(self, vocab: int): + rng = np.random.default_rng(0) + _, _, offsets = get_sampling_params( + rng, 200, 0.0, 0.0, 64, 32, self._tok(vocab_size=vocab) + ) + assert np.all(offsets >= 0) + assert np.all(offsets < vocab) + + # -- reproducibility -- + + def test_same_seed_same_results(self): + tok = self._tok() + a = get_sampling_params(np.random.default_rng(42), 50, 0.3, 0.2, 256, 64, tok) + b = get_sampling_params(np.random.default_rng(42), 50, 0.3, 0.2, 256, 64, tok) + for arr_a, arr_b in zip(a, b): + np.testing.assert_array_equal(arr_a, arr_b) + + def test_different_seed_different_results(self): + tok = self._tok() + a = get_sampling_params(np.random.default_rng(0), 50, 0.3, 0.2, 256, 64, tok) + b = get_sampling_params(np.random.default_rng(1), 50, 0.3, 0.2, 256, 64, tok) + # Extremely unlikely all three arrays match with different seeds + assert not all(np.array_equal(arr_a, arr_b) for arr_a, arr_b in zip(a, b)) + + # -- validation / error paths -- + + @pytest.mark.parametrize("bad_ratio", [-0.1, 1.0, 1.5]) + def test_invalid_input_range_ratio(self, bad_ratio: float): + rng = np.random.default_rng(0) + with pytest.raises(ValueError, match="input_range_ratio"): + get_sampling_params(rng, 10, bad_ratio, 0.0, 100, 50, self._tok()) + + 
@pytest.mark.parametrize("bad_ratio", [-0.1, 1.0, 1.5]) + def test_invalid_output_range_ratio(self, bad_ratio: float): + rng = np.random.default_rng(0) + with pytest.raises(ValueError, match="output_range_ratio"): + get_sampling_params(rng, 10, 0.0, bad_ratio, 100, 50, self._tok()) + + def test_input_len_zero_with_special_tokens(self): + """input_len < num_special_tokens → real_input_len = 0, which is fine + (range [0, 0]).""" + rng = np.random.default_rng(0) + input_lens, _, _ = get_sampling_params( + rng, 5, 0.0, 0.0, 5, 50, self._tok(num_special=10) + ) + # real_input_len = max(0, 5 - 10) = 0 + assert np.all(input_lens == 0) + + # -- edge cases -- + + def test_single_request(self): + rng = np.random.default_rng(0) + i, o, off = get_sampling_params(rng, 1, 0.0, 0.0, 100, 50, self._tok()) + assert i.shape == (1,) + assert o.shape == (1,) + assert off.shape == (1,) + + def test_large_num_requests(self): + rng = np.random.default_rng(0) + i, o, off = get_sampling_params(rng, 10_000, 0.5, 0.5, 512, 128, self._tok()) + assert i.shape == (10_000,) + assert o.shape == (10_000,) + assert off.shape == (10_000,) diff --git a/tests/benchmarks/test_txt_slices_dataset.py b/tests/benchmarks/test_txt_slices_dataset.py index 8c676ce158ae..3312b8bf9e76 100644 --- a/tests/benchmarks/test_txt_slices_dataset.py +++ b/tests/benchmarks/test_txt_slices_dataset.py @@ -1,12 +1,14 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import json import os import tempfile import pytest from transformers import AutoTokenizer, PreTrainedTokenizerBase -from vllm.benchmarks.datasets import TxtSlicesDataset +from vllm.benchmarks.create_txt_slices_dataset import create_txt_slices_jsonl +from vllm.benchmarks.datasets import CustomDataset @pytest.fixture(scope="session") @@ -26,29 +28,50 @@ def hf_tokenizer() -> PreTrainedTokenizerBase: @pytest.mark.benchmark -def test_txt_slices(hf_tokenizer: PreTrainedTokenizerBase) -> None: +def 
test_create_txt_slices_jsonl(hf_tokenizer: PreTrainedTokenizerBase) -> None: + """Test that create_txt_slices_jsonl produces valid JSONL for CustomDataset.""" # Write the text content to a temporary file # Use delete=False for Python 3.10 compatibility (delete_on_close is 3.12+) with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".txt") as f: f.write(text_content) f.close() - temp_file_path = f.name + txt_path = f.name + + jsonl_path = txt_path + ".jsonl" try: - dataset = TxtSlicesDataset(dataset_path=temp_file_path) + create_txt_slices_jsonl( + input_path=txt_path, + output_path=jsonl_path, + tokenizer_name="gpt2", + num_prompts=10, + input_len=10, + output_len=10, + ) + + # Verify the JSONL file is valid and has the expected structure + with open(jsonl_path) as jf: + records = [json.loads(line) for line in jf] + assert len(records) == 10 + for record in records: + assert "prompt" in record + assert "output_tokens" in record + assert isinstance(record["prompt"], str) + assert record["output_tokens"] == 10 + + # Verify the JSONL file can be loaded by CustomDataset + dataset = CustomDataset(dataset_path=jsonl_path) samples = dataset.sample( - hf_tokenizer, num_requests=10, input_len=10, output_len=10 + tokenizer=hf_tokenizer, + num_requests=10, + output_len=10, + skip_chat_template=True, ) assert len(samples) == 10 - assert all(sample.prompt_len == 10 for sample in samples) assert all(sample.expected_output_len == 10 for sample in samples) - - for sample in samples: - tokenized_prompt = hf_tokenizer( - sample.prompt, add_special_tokens=True - ).input_ids - assert len(tokenized_prompt) == 10 finally: - os.unlink(f.name) + os.unlink(txt_path) + if os.path.exists(jsonl_path): + os.unlink(jsonl_path) diff --git a/vllm/benchmarks/create_txt_slices_dataset.py b/vllm/benchmarks/create_txt_slices_dataset.py new file mode 100644 index 000000000000..a630820cc872 --- /dev/null +++ b/vllm/benchmarks/create_txt_slices_dataset.py @@ -0,0 +1,223 @@ +# 
SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Convert a plain-text file (local path or URL) into a JSONL dataset +compatible with ``CustomDataset`` (``--dataset-name custom``). + +Each line of the output JSONL contains a ``prompt`` (decoded from a random +slice of the tokenized source text) and an ``output_tokens`` count. + +Usage +----- +:: + + python -m vllm.benchmarks.create_txt_slices_dataset \\ + --input sonnet.txt \\ + --output sonnet_dataset.jsonl \\ + --tokenizer gpt2 \\ + --num-prompts 1000 \\ + --input-len 1024 \\ + --output-len 128 + +The resulting JSONL file can then be used with the serving benchmark:: + + python -m vllm.benchmarks.serve \\ + --dataset-name custom \\ + --dataset-path sonnet_dataset.jsonl \\ + ... +""" + +from __future__ import annotations + +import argparse +import json +import logging +import random +import urllib.request + +import numpy as np +from transformers import AutoTokenizer + +from vllm.benchmarks.shared import get_sampling_params + +logger = logging.getLogger(__name__) + + +def load_text(path: str) -> str: + """Load text from a local file or URL.""" + if path.startswith(("http://", "https://")): + with urllib.request.urlopen(path) as response: + return response.read().decode("utf-8") + with open(path, encoding="utf-8") as f: + return f.read() + + +def create_txt_slices_jsonl( + *, + input_path: str, + output_path: str, + tokenizer_name: str, + num_prompts: int, + input_len: int, + output_len: int, + range_ratio: float = 0.0, + input_range_ratio: float | None = None, + output_range_ratio: float | None = None, + seed: int = 0, + trust_remote_code: bool = False, +) -> None: + """Read *input_path*, slice it into prompts, and write JSONL to + *output_path*.""" + + tokenizer = AutoTokenizer.from_pretrained( + tokenizer_name, trust_remote_code=trust_remote_code + ) + + text = load_text(input_path) + if not text: + raise ValueError("The text file is empty and cannot 
be sampled from.") + + token_ids = tokenizer(text, add_special_tokens=False).input_ids + if not token_ids: + raise ValueError("Tokenizing the text produced zero tokens; cannot sample.") + + resolved_input_rr = ( + input_range_ratio if input_range_ratio is not None else range_ratio + ) + resolved_output_rr = ( + output_range_ratio if output_range_ratio is not None else range_ratio + ) + + rng_np = np.random.default_rng(seed) + rng_py = random.Random(seed) + + input_lens, output_lens, _ = get_sampling_params( + rng_np, + num_prompts, + resolved_input_rr, + resolved_output_rr, + input_len, + output_len, + tokenizer, + ) + + num_available_tokens = len(token_ids) + + records: list[dict[str, object]] = [] + for i in range(num_prompts): + req_input_len = int(input_lens[i]) + req_output_len = int(output_lens[i]) + + # Randomly select a start position and slice with cycling + start_pos = rng_py.randint(0, num_available_tokens - 1) + prompt_token_ids = [ + token_ids[(start_pos + j) % num_available_tokens] + for j in range(req_input_len) + ] + prompt = tokenizer.decode(prompt_token_ids, skip_special_tokens=False) + + records.append({"prompt": prompt, "output_tokens": req_output_len}) + + with open(output_path, "w", encoding="utf-8") as f: + for record in records: + f.write(json.dumps(record, ensure_ascii=False) + "\n") + + logger.info( + "Wrote %d prompts to %s", + len(records), + output_path, + ) + + +def main(argv: list[str] | None = None) -> None: + parser = argparse.ArgumentParser( + description="Convert a plain-text file into a JSONL dataset " + "for CustomDataset (--dataset-name custom).", + ) + parser.add_argument( + "--input", + required=True, + help="Path or URL to the source text file.", + ) + parser.add_argument( + "--output", + required=True, + help="Path for the output JSONL file.", + ) + parser.add_argument( + "--tokenizer", + required=True, + help="HuggingFace tokenizer name or path.", + ) + parser.add_argument( + "--num-prompts", + type=int, + default=1000, + 
help="Number of prompt samples to generate (default: 1000).", + ) + parser.add_argument( + "--input-len", + type=int, + default=1024, + help="Target number of input tokens per prompt (default: 1024).", + ) + parser.add_argument( + "--output-len", + type=int, + default=128, + help="Target number of output tokens per prompt (default: 128).", + ) + parser.add_argument( + "--range-ratio", + type=float, + default=0.0, + help="Range ratio for both input and output length sampling " + "(default: 0.0). Must be in [0, 1).", + ) + parser.add_argument( + "--input-range-ratio", + type=float, + default=None, + help="Range ratio for input length sampling. " + "Overrides --range-ratio for inputs.", + ) + parser.add_argument( + "--output-range-ratio", + type=float, + default=None, + help="Range ratio for output length sampling. " + "Overrides --range-ratio for outputs.", + ) + parser.add_argument( + "--seed", + type=int, + default=0, + help="Random seed for reproducibility (default: 0).", + ) + parser.add_argument( + "--trust-remote-code", + action="store_true", + help="Trust remote code from HuggingFace.", + ) + + args = parser.parse_args(argv) + + logging.basicConfig(level=logging.INFO) + + create_txt_slices_jsonl( + input_path=args.input, + output_path=args.output, + tokenizer_name=args.tokenizer, + num_prompts=args.num_prompts, + input_len=args.input_len, + output_len=args.output_len, + range_ratio=args.range_ratio, + input_range_ratio=args.input_range_ratio, + output_range_ratio=args.output_range_ratio, + seed=args.seed, + trust_remote_code=args.trust_remote_code, + ) + + +if __name__ == "__main__": + main() diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py index c2b057673edc..545d623d71c6 100644 --- a/vllm/benchmarks/datasets.py +++ b/vllm/benchmarks/datasets.py @@ -10,7 +10,6 @@ - BurstGPT - HuggingFace - VisionArena - - TxtSlices """ import argparse @@ -20,7 +19,6 @@ import logging import math import random -import urllib from abc import ABC, 
abstractmethod from collections.abc import Callable, Iterator, Mapping from contextlib import suppress @@ -36,6 +34,7 @@ from PIL import Image from typing_extensions import deprecated +from vllm.benchmarks.shared import get_sampling_params from vllm.inputs import MultiModalDataDict from vllm.lora.request import LoRARequest from vllm.lora.utils import get_adapter_absolute_path @@ -499,65 +498,6 @@ def gen_prompt_decode_to_target_len( # ----------------------------------------------------------------------------- -def get_sampling_params( - rng: np.random.Generator, - num_requests: int, - input_range_ratio: float, - output_range_ratio: float, - input_len: int, - output_len: int, - tokenizer: TokenizerLike, -) -> tuple[np.ndarray, np.ndarray, np.ndarray]: - """ - Sample per-request input/output token lengths and vocab offsets. - - Lengths are drawn uniformly from integer ranges around the configured - means, controlled by ``input_range_ratio`` and ``output_range_ratio``. - Tokenizer special tokens are subtracted from ``input_len`` before - computing the sampling interval. - - Returns: - (input_lens, output_lens, offsets) – three 1-D ``np.ndarray`` of - shape ``(num_requests,)``. - """ - if not (0.0 <= input_range_ratio < 1.0): - raise ValueError("input_range_ratio must be in [0, 1).") - if not (0.0 <= output_range_ratio < 1.0): - raise ValueError("output_range_ratio must be in [0, 1).") - num_special_tokens = int(tokenizer.num_special_tokens_to_add()) - real_input_len = max(0, int(input_len) - num_special_tokens) - input_low = math.floor(real_input_len * (1 - input_range_ratio)) - input_high = math.ceil(real_input_len * (1 + input_range_ratio)) - output_low = math.floor(output_len * (1 - output_range_ratio)) - output_high = math.ceil(output_len * (1 + output_range_ratio)) - # Ensure the lower bound for output length is at least 1 to - # prevent sampling 0 tokens. 
- output_low = max(output_low, 1) - output_high = max(output_high, 1) - - if input_low > input_high: - raise ValueError( - f"Invalid input sampling interval: low={input_low} > high={input_high}" - ) - if output_low > output_high: - raise ValueError( - f"Invalid output sampling interval: low={output_low} > high={output_high}" - ) - - logger.info( - "Sampling input_len from [%s, %s] and output_len from [%s, %s]", - input_low, - input_high, - output_low, - output_high, - ) - - input_lens = rng.integers(input_low, input_high + 1, size=num_requests) - output_lens = rng.integers(output_low, output_high + 1, size=num_requests) - offsets = rng.integers(0, tokenizer.vocab_size, size=num_requests) - return input_lens, output_lens, offsets - - class RandomDataset(BenchmarkDataset): """ Synthetic text-only dataset for serving/throughput benchmarks. @@ -919,114 +859,6 @@ def sample( return batch_requests -# ----------------------------------------------------------------------------- -# TxtSlicesDataset Implementation -# ----------------------------------------------------------------------------- - - -class TxtSlicesDataset(BenchmarkDataset): - """ - Implements the TxtSlices dataset. Takes a URL or file path to a text file, - tokenizes the entire content, and generates sample requests by randomly - slicing from the tokenized sequence with cycling support. - """ - - def __init__( - self, - **kwargs, - ) -> None: - super().__init__(**kwargs) - dataset_path = kwargs.get("dataset_path") - if dataset_path is None: - raise ValueError( - "dataset_path must be provided to create a TxtSlicesDataset." 
- ) - self.text = self.load_data(dataset_path) - if len(self.text) == 0: - raise ValueError("The text file is empty and cannot be sampled from.") - - self._rng = np.random.default_rng(self.random_seed) - self.rng = random.Random(self.random_seed) - - @staticmethod - def load_data(dataset_path: str) -> str: - if dataset_path.startswith(("http://", "https://")): - with urllib.request.urlopen(dataset_path) as response: - return response.read().decode("utf-8") - else: - with open(dataset_path, encoding="utf-8") as f: - return f.read() - - def get_token_ids(self, tokenizer: TokenizerLike) -> tuple[int, ...]: - tokenized = tokenizer(self.text, add_special_tokens=False) - token_ids = tokenized.input_ids - if len(token_ids) == 0: - raise ValueError("The text is empty and cannot be sampled from.") - return token_ids - - def generate_prompt( - self, - tokenizer: TokenizerLike, - token_ids: tuple[int, ...], - input_len: int, - ) -> str: - num_available_tokens = len(token_ids) - - # Randomly select a start position - start_pos = self.rng.randint(0, num_available_tokens - 1) - - # Extract tokens with cycling if necessary - prompt_token_ids = tuple( - token_ids[(start_pos + j) % num_available_tokens] for j in range(input_len) - ) - - # Decode the tokens to get the prompt - return tokenizer.decode(prompt_token_ids, skip_special_tokens=False) - - def sample( - self, - tokenizer: TokenizerLike, - num_requests: int, - request_id_prefix: str = "", - no_oversample: bool = False, - input_len: int = 1024, - output_len: int = 128, - range_ratio: float = 0.0, - input_range_ratio: float | None = None, - output_range_ratio: float | None = None, - **kwargs, - ) -> list[SampleRequest]: - resolved_input_rr = ( - input_range_ratio if input_range_ratio is not None else range_ratio - ) - resolved_output_rr = ( - output_range_ratio if output_range_ratio is not None else range_ratio - ) - - token_ids = self.get_token_ids(tokenizer) - num_special_tokens = int(tokenizer.num_special_tokens_to_add()) - 
- input_lens, output_lens, _ = get_sampling_params( - self._rng, - num_requests, - resolved_input_rr, - resolved_output_rr, - input_len, - output_len, - tokenizer, - ) - - return [ - SampleRequest( - prompt=self.generate_prompt(tokenizer, token_ids, int(input_lens[i])), - prompt_len=int(input_lens[i]) + num_special_tokens, - expected_output_len=int(output_lens[i]), - request_id=request_id_prefix + str(i), - ) - for i in range(num_requests) - ] - - # ----------------------------------------------------------------------------- # MultiModalDataset Implementation # ----------------------------------------------------------------------------- @@ -1614,7 +1446,6 @@ def add_dataset_parser(parser: FlexibleArgumentParser): "custom_mm", "prefix_repetition", "spec_bench", - "txt-slices", ], help="Name of the dataset to benchmark on.", ) @@ -1628,8 +1459,8 @@ def add_dataset_parser(parser: FlexibleArgumentParser): type=str, default=None, action=_ValidateDatasetArgs, - help="Path to the sharegpt/sonnet dataset, the HF dataset ID if using HF " - "dataset, or the path/URL to a txt file for the txt-slices dataset.", + help="Path to the sharegpt/sonnet dataset or the HF dataset ID if " + "using HF dataset.", ) parser.add_argument( "--no-oversample", @@ -1809,7 +1640,6 @@ def add_random_dataset_base_args( - random (random dataset) - random-mm (random multimodal dataset) - random-rerank (random dataset for reranking) - - txt-slices (txt-slices dataset) Args: parser_or_group: Either a parser or an argument group to add arguments to. 
@@ -2277,21 +2107,6 @@ def get_samples(args, tokenizer: TokenizerLike) -> list[SampleRequest]: request_id_prefix=args.request_id_prefix, no_oversample=args.no_oversample, ), - "txt-slices": lambda: TxtSlicesDataset( - random_seed=args.seed, - dataset_path=args.dataset_path, - disable_shuffle=args.disable_shuffle, - ).sample( - tokenizer=tokenizer, - num_requests=args.num_prompts, - input_len=args.random_input_len, - output_len=args.random_output_len, - range_ratio=args.random_range_ratio, - input_range_ratio=args.random_input_range_ratio, - output_range_ratio=args.random_output_range_ratio, - request_id_prefix=args.request_id_prefix, - no_oversample=args.no_oversample, - ), } try: diff --git a/vllm/benchmarks/shared.py b/vllm/benchmarks/shared.py new file mode 100644 index 000000000000..0737619bc0a4 --- /dev/null +++ b/vllm/benchmarks/shared.py @@ -0,0 +1,73 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Shared utilities for benchmark dataset sampling. +""" + +import logging +import math + +import numpy as np + +from vllm.tokenizers import TokenizerLike + +logger = logging.getLogger(__name__) + + +def get_sampling_params( + rng: np.random.Generator, + num_requests: int, + input_range_ratio: float, + output_range_ratio: float, + input_len: int, + output_len: int, + tokenizer: TokenizerLike, +) -> tuple[np.ndarray, np.ndarray, np.ndarray]: + """ + Sample per-request input/output token lengths and vocab offsets. + + Lengths are drawn uniformly from integer ranges around the configured + means, controlled by ``input_range_ratio`` and ``output_range_ratio``. + Tokenizer special tokens are subtracted from ``input_len`` before + computing the sampling interval. + + Returns: + (input_lens, output_lens, offsets) – three 1-D ``np.ndarray`` of + shape ``(num_requests,)``. 
+ """ + if not (0.0 <= input_range_ratio < 1.0): + raise ValueError("input_range_ratio must be in [0, 1).") + if not (0.0 <= output_range_ratio < 1.0): + raise ValueError("output_range_ratio must be in [0, 1).") + num_special_tokens = int(tokenizer.num_special_tokens_to_add()) + real_input_len = max(0, int(input_len) - num_special_tokens) + input_low = math.floor(real_input_len * (1 - input_range_ratio)) + input_high = math.ceil(real_input_len * (1 + input_range_ratio)) + output_low = math.floor(output_len * (1 - output_range_ratio)) + output_high = math.ceil(output_len * (1 + output_range_ratio)) + # Ensure the lower bound for output length is at least 1 to + # prevent sampling 0 tokens. + output_low = max(output_low, 1) + output_high = max(output_high, 1) + + if input_low > input_high: + raise ValueError( + f"Invalid input sampling interval: low={input_low} > high={input_high}" + ) + if output_low > output_high: + raise ValueError( + f"Invalid output sampling interval: low={output_low} > high={output_high}" + ) + + logger.info( + "Sampling input_len from [%s, %s] and output_len from [%s, %s]", + input_low, + input_high, + output_low, + output_high, + ) + + input_lens = rng.integers(input_low, input_high + 1, size=num_requests) + output_lens = rng.integers(output_low, output_high + 1, size=num_requests) + offsets = rng.integers(0, tokenizer.vocab_size, size=num_requests) + return input_lens, output_lens, offsets From 7e49d99f47a1c4c0d21b1ab1f993115b97034f25 Mon Sep 17 00:00:00 2001 From: jdebache Date: Mon, 13 Apr 2026 18:21:34 +0000 Subject: [PATCH 05/10] improve doc a bit Signed-off-by: jdebache --- vllm/benchmarks/create_txt_slices_dataset.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/benchmarks/create_txt_slices_dataset.py b/vllm/benchmarks/create_txt_slices_dataset.py index a630820cc872..49903d7db103 100644 --- a/vllm/benchmarks/create_txt_slices_dataset.py +++ b/vllm/benchmarks/create_txt_slices_dataset.py @@ -2,7 +2,8 @@ # 
SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Convert a plain-text file (local path or URL) into a JSONL dataset -compatible with ``CustomDataset`` (``--dataset-name custom``). +compatible with ``CustomDataset`` (``--dataset-name custom``), by +randomly slicing the tokenized text into prompts. Each line of the output JSONL contains a ``prompt`` (decoded from a random slice of the tokenized source text) and an ``output_tokens`` count. From 7f4468238fcf1eaaa8db883a276382306d4cb097 Mon Sep 17 00:00:00 2001 From: jdebache Date: Tue, 14 Apr 2026 07:02:58 +0000 Subject: [PATCH 06/10] address review comments Signed-off-by: jdebache --- tests/benchmarks/test_sampling_params.py | 62 ++++++----- tests/benchmarks/test_txt_slices_dataset.py | 77 ++++++------- vllm/benchmarks/datasets/__init__.py | 84 ++++++++++++++ .../create_txt_slices_dataset.py | 55 ++++------ vllm/benchmarks/{ => datasets}/datasets.py | 103 ++++++------------ vllm/benchmarks/{ => datasets}/shared.py | 34 +++++- 6 files changed, 239 insertions(+), 176 deletions(-) create mode 100644 vllm/benchmarks/datasets/__init__.py rename vllm/benchmarks/{ => datasets}/create_txt_slices_dataset.py (80%) rename vllm/benchmarks/{ => datasets}/datasets.py (97%) rename vllm/benchmarks/{ => datasets}/shared.py (67%) diff --git a/tests/benchmarks/test_sampling_params.py b/tests/benchmarks/test_sampling_params.py index 94ce46189a07..a56357264a57 100644 --- a/tests/benchmarks/test_sampling_params.py +++ b/tests/benchmarks/test_sampling_params.py @@ -4,7 +4,7 @@ import numpy as np import pytest -from vllm.benchmarks.shared import get_sampling_params +from vllm.benchmarks.datasets.shared import get_sampling_params from vllm.tokenizers import TokenizerLike @@ -96,7 +96,7 @@ def convert_ids_to_tokens( # type: ignore[override] class TestGetSamplingParams: - """Tests for ``get_sampling_params`` in ``vllm.benchmarks.shared``.""" + """Tests for ``get_sampling_params`` in ``vllm.benchmarks.datasets.shared``.""" 
# -- helpers -- @@ -108,7 +108,7 @@ def _tok(vocab_size: int = 1000, num_special: int = 0) -> _FakeTokenizer: def test_returns_three_arrays(self): rng = np.random.default_rng(0) - result = get_sampling_params(rng, 5, 0.0, 0.0, 100, 50, self._tok()) + result = get_sampling_params(rng, 5, 0.0, 100, 50, self._tok()) assert len(result) == 3 for arr in result: assert isinstance(arr, np.ndarray) @@ -117,7 +117,7 @@ def test_returns_three_arrays(self): def test_output_length_matches_num_requests(self, n: int): rng = np.random.default_rng(42) input_lens, output_lens, offsets = get_sampling_params( - rng, n, 0.0, 0.0, 64, 32, self._tok() + rng, n, 0.0, 64, 32, self._tok() ) assert input_lens.shape == (n,) assert output_lens.shape == (n,) @@ -128,24 +128,19 @@ def test_output_length_matches_num_requests(self, n: int): def test_zero_range_ratio_gives_constant_lengths(self): rng = np.random.default_rng(7) input_lens, output_lens, _ = get_sampling_params( - rng, 20, 0.0, 0.0, 128, 64, self._tok() + rng, 20, 0.0, 128, 64, self._tok() ) assert np.all(input_lens == 128) assert np.all(output_lens == 64) - def test_special_tokens_subtracted_from_input(self): + def test_special_tokens_subtracted_from_input_only(self): rng = np.random.default_rng(7) - input_lens, _, _ = get_sampling_params( - rng, 10, 0.0, 0.0, 100, 50, self._tok(num_special=4) + input_lens, output_lens, _ = get_sampling_params( + rng, 10, 0.0, 100, 50, self._tok(num_special=4) ) # real_input_len = 100 - 4 = 96, range_ratio 0 → all 96 assert np.all(input_lens == 96) - - def test_special_tokens_not_subtracted_from_output(self): - rng = np.random.default_rng(7) - _, output_lens, _ = get_sampling_params( - rng, 10, 0.0, 0.0, 100, 50, self._tok(num_special=4) - ) + # special tokens are not subtracted from output length assert np.all(output_lens == 50) # -- range ratios -- @@ -155,7 +150,7 @@ def test_input_range_bounds(self): ratio = 0.5 base = 200 input_lens, _, _ = get_sampling_params( - rng, 500, ratio, 0.0, base, 50, 
self._tok() + rng, 500, {"input": ratio, "output": 0.0}, base, 50, self._tok() ) lo = int(np.floor(base * (1 - ratio))) hi = int(np.ceil(base * (1 + ratio))) @@ -167,7 +162,7 @@ def test_output_range_bounds(self): ratio = 0.3 base = 100 _, output_lens, _ = get_sampling_params( - rng, 500, 0.0, ratio, 50, base, self._tok() + rng, 500, {"input": 0.0, "output": ratio}, 50, base, self._tok() ) lo = max(1, int(np.floor(base * (1 - ratio)))) hi = int(np.ceil(base * (1 + ratio))) @@ -179,7 +174,9 @@ def test_output_low_clamped_to_one(self): the function clamps it to 1.""" rng = np.random.default_rng(0) # output_len=1, ratio=0.99 → floor(1*0.01)=0, should clamp to 1 - _, output_lens, _ = get_sampling_params(rng, 50, 0.0, 0.99, 100, 1, self._tok()) + _, output_lens, _ = get_sampling_params( + rng, 50, {"input": 0.0, "output": 0.99}, 100, 1, self._tok() + ) assert np.all(output_lens >= 1) # -- offsets bounded by vocab_size -- @@ -188,7 +185,7 @@ def test_output_low_clamped_to_one(self): def test_offsets_within_vocab(self, vocab: int): rng = np.random.default_rng(0) _, _, offsets = get_sampling_params( - rng, 200, 0.0, 0.0, 64, 32, self._tok(vocab_size=vocab) + rng, 200, 0.0, 64, 32, self._tok(vocab_size=vocab) ) assert np.all(offsets >= 0) assert np.all(offsets < vocab) @@ -197,15 +194,17 @@ def test_offsets_within_vocab(self, vocab: int): def test_same_seed_same_results(self): tok = self._tok() - a = get_sampling_params(np.random.default_rng(42), 50, 0.3, 0.2, 256, 64, tok) - b = get_sampling_params(np.random.default_rng(42), 50, 0.3, 0.2, 256, 64, tok) + rr = {"input": 0.3, "output": 0.2} + a = get_sampling_params(np.random.default_rng(42), 50, rr, 256, 64, tok) + b = get_sampling_params(np.random.default_rng(42), 50, rr, 256, 64, tok) for arr_a, arr_b in zip(a, b): np.testing.assert_array_equal(arr_a, arr_b) def test_different_seed_different_results(self): tok = self._tok() - a = get_sampling_params(np.random.default_rng(0), 50, 0.3, 0.2, 256, 64, tok) - b = 
get_sampling_params(np.random.default_rng(1), 50, 0.3, 0.2, 256, 64, tok) + rr = {"input": 0.3, "output": 0.2} + a = get_sampling_params(np.random.default_rng(0), 50, rr, 256, 64, tok) + b = get_sampling_params(np.random.default_rng(1), 50, rr, 256, 64, tok) # Extremely unlikely all three arrays match with different seeds assert not all(np.array_equal(arr_a, arr_b) for arr_a, arr_b in zip(a, b)) @@ -215,20 +214,29 @@ def test_different_seed_different_results(self): def test_invalid_input_range_ratio(self, bad_ratio: float): rng = np.random.default_rng(0) with pytest.raises(ValueError, match="input_range_ratio"): - get_sampling_params(rng, 10, bad_ratio, 0.0, 100, 50, self._tok()) + get_sampling_params( + rng, 10, {"input": bad_ratio, "output": 0.0}, 100, 50, self._tok() + ) @pytest.mark.parametrize("bad_ratio", [-0.1, 1.0, 1.5]) def test_invalid_output_range_ratio(self, bad_ratio: float): rng = np.random.default_rng(0) with pytest.raises(ValueError, match="output_range_ratio"): - get_sampling_params(rng, 10, 0.0, bad_ratio, 100, 50, self._tok()) + get_sampling_params( + rng, 10, {"input": 0.0, "output": bad_ratio}, 100, 50, self._tok() + ) + + def test_invalid_dict_missing_keys(self): + rng = np.random.default_rng(0) + with pytest.raises(ValueError, match="input.*output"): + get_sampling_params(rng, 10, {"input": 0.1}, 100, 50, self._tok()) def test_input_len_zero_with_special_tokens(self): """input_len < num_special_tokens → real_input_len = 0, which is fine (range [0, 0]).""" rng = np.random.default_rng(0) input_lens, _, _ = get_sampling_params( - rng, 5, 0.0, 0.0, 5, 50, self._tok(num_special=10) + rng, 5, 0.0, 5, 50, self._tok(num_special=10) ) # real_input_len = max(0, 5 - 10) = 0 assert np.all(input_lens == 0) @@ -237,14 +245,14 @@ def test_input_len_zero_with_special_tokens(self): def test_single_request(self): rng = np.random.default_rng(0) - i, o, off = get_sampling_params(rng, 1, 0.0, 0.0, 100, 50, self._tok()) + i, o, off = get_sampling_params(rng, 1, 
0.0, 100, 50, self._tok()) assert i.shape == (1,) assert o.shape == (1,) assert off.shape == (1,) def test_large_num_requests(self): rng = np.random.default_rng(0) - i, o, off = get_sampling_params(rng, 10_000, 0.5, 0.5, 512, 128, self._tok()) + i, o, off = get_sampling_params(rng, 10_000, 0.5, 512, 128, self._tok()) assert i.shape == (10_000,) assert o.shape == (10_000,) assert off.shape == (10_000,) diff --git a/tests/benchmarks/test_txt_slices_dataset.py b/tests/benchmarks/test_txt_slices_dataset.py index 3312b8bf9e76..7821e9a925a2 100644 --- a/tests/benchmarks/test_txt_slices_dataset.py +++ b/tests/benchmarks/test_txt_slices_dataset.py @@ -1,14 +1,13 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json -import os -import tempfile +from pathlib import Path import pytest from transformers import AutoTokenizer, PreTrainedTokenizerBase -from vllm.benchmarks.create_txt_slices_dataset import create_txt_slices_jsonl from vllm.benchmarks.datasets import CustomDataset +from vllm.benchmarks.datasets.create_txt_slices_dataset import create_txt_slices_jsonl @pytest.fixture(scope="session") @@ -28,50 +27,42 @@ def hf_tokenizer() -> PreTrainedTokenizerBase: @pytest.mark.benchmark -def test_create_txt_slices_jsonl(hf_tokenizer: PreTrainedTokenizerBase) -> None: +def test_create_txt_slices_jsonl( + hf_tokenizer: PreTrainedTokenizerBase, tmp_path: Path +) -> None: """Test that create_txt_slices_jsonl produces valid JSONL for CustomDataset.""" - # Write the text content to a temporary file - # Use delete=False for Python 3.10 compatibility (delete_on_close is 3.12+) - with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".txt") as f: - f.write(text_content) - f.close() - txt_path = f.name + txt_path = tmp_path / "input.txt" + jsonl_path = tmp_path / "input.txt.jsonl" - jsonl_path = txt_path + ".jsonl" + txt_path.write_text(text_content) - try: - create_txt_slices_jsonl( - input_path=txt_path, - 
output_path=jsonl_path, - tokenizer_name="gpt2", - num_prompts=10, - input_len=10, - output_len=10, - ) + create_txt_slices_jsonl( + input_path=str(txt_path), + output_path=str(jsonl_path), + tokenizer_name="gpt2", + num_prompts=10, + input_len=10, + output_len=10, + ) - # Verify the JSONL file is valid and has the expected structure - with open(jsonl_path) as jf: - records = [json.loads(line) for line in jf] + # Verify the JSONL file is valid and has the expected structure + records = [json.loads(line) for line in jsonl_path.read_text().splitlines()] - assert len(records) == 10 - for record in records: - assert "prompt" in record - assert "output_tokens" in record - assert isinstance(record["prompt"], str) - assert record["output_tokens"] == 10 + assert len(records) == 10 + for record in records: + assert "prompt" in record + assert "output_tokens" in record + assert isinstance(record["prompt"], str) + assert record["output_tokens"] == 10 - # Verify the JSONL file can be loaded by CustomDataset - dataset = CustomDataset(dataset_path=jsonl_path) - samples = dataset.sample( - tokenizer=hf_tokenizer, - num_requests=10, - output_len=10, - skip_chat_template=True, - ) + # Verify the JSONL file can be loaded by CustomDataset + dataset = CustomDataset(dataset_path=str(jsonl_path)) + samples = dataset.sample( + tokenizer=hf_tokenizer, + num_requests=10, + output_len=10, + skip_chat_template=True, + ) - assert len(samples) == 10 - assert all(sample.expected_output_len == 10 for sample in samples) - finally: - os.unlink(txt_path) - if os.path.exists(jsonl_path): - os.unlink(jsonl_path) + assert len(samples) == 10 + assert all(sample.expected_output_len == 10 for sample in samples) diff --git a/vllm/benchmarks/datasets/__init__.py b/vllm/benchmarks/datasets/__init__.py new file mode 100644 index 000000000000..84d66f381cfa --- /dev/null +++ b/vllm/benchmarks/datasets/__init__.py @@ -0,0 +1,84 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright 
contributors to the vLLM project + +from vllm.benchmarks.datasets.datasets import ( + DEFAULT_NUM_PROMPTS, + AIMODataset, + ASRDataset, + BenchmarkDataset, + BlazeditDataset, + BurstGPTDataset, + ConversationDataset, + CustomDataset, + CustomMMDataset, + HuggingFaceDataset, + InstructCoderDataset, + MLPerfDataset, + MMStarDataset, + MMVUDataset, + MTBenchDataset, + MultiModalConversationDataset, + NextEditPredictionDataset, + PrefixRepetitionRandomDataset, + RandomDataset, + RandomDatasetForReranking, + RandomMultiModalDataset, + SampleRequest, + ShareGPTDataset, + SonnetDataset, + SpecBench, + VisionArenaDataset, + add_dataset_parser, + add_random_dataset_base_args, + add_random_multimodal_dataset_args, + gen_prompt_decode_to_target_len, + get_samples, + is_valid_sequence, + lora_path_on_disk, + lora_tokenizer_cache, + process_image, + process_video, + zeta_prompt, +) +from vllm.benchmarks.datasets.shared import RangeRatio + +__all__ = [ + "DEFAULT_NUM_PROMPTS", + "AIMODataset", + "ASRDataset", + "BenchmarkDataset", + "BlazeditDataset", + "BurstGPTDataset", + "ConversationDataset", + "CustomDataset", + "CustomMMDataset", + "HuggingFaceDataset", + "InstructCoderDataset", + "MLPerfDataset", + "MMStarDataset", + "MMVUDataset", + "MTBenchDataset", + "MultiModalConversationDataset", + "NextEditPredictionDataset", + "PrefixRepetitionRandomDataset", + "RandomDataset", + "RandomDatasetForReranking", + "RandomMultiModalDataset", + "SampleRequest", + "ShareGPTDataset", + "SonnetDataset", + "SpecBench", + "VisionArenaDataset", + "add_dataset_parser", + "add_random_dataset_base_args", + "add_random_multimodal_dataset_args", + "gen_prompt_decode_to_target_len", + "get_samples", + "is_valid_sequence", + "lora_path_on_disk", + "lora_tokenizer_cache", + "process_image", + "process_video", + "RangeRatio", + "zeta_prompt", +] diff --git a/vllm/benchmarks/create_txt_slices_dataset.py b/vllm/benchmarks/datasets/create_txt_slices_dataset.py similarity index 80% rename from 
vllm/benchmarks/create_txt_slices_dataset.py rename to vllm/benchmarks/datasets/create_txt_slices_dataset.py index 49903d7db103..0c80386e882b 100644 --- a/vllm/benchmarks/create_txt_slices_dataset.py +++ b/vllm/benchmarks/datasets/create_txt_slices_dataset.py @@ -12,7 +12,7 @@ ----- :: - python -m vllm.benchmarks.create_txt_slices_dataset \\ + python -m vllm.benchmarks.datasets.create_txt_slices_dataset \\ --input sonnet.txt \\ --output sonnet_dataset.jsonl \\ --tokenizer gpt2 \\ @@ -39,7 +39,7 @@ import numpy as np from transformers import AutoTokenizer -from vllm.benchmarks.shared import get_sampling_params +from vllm.benchmarks.datasets.shared import RangeRatio, get_sampling_params logger = logging.getLogger(__name__) @@ -61,9 +61,7 @@ def create_txt_slices_jsonl( num_prompts: int, input_len: int, output_len: int, - range_ratio: float = 0.0, - input_range_ratio: float | None = None, - output_range_ratio: float | None = None, + range_ratio: RangeRatio = 0.0, seed: int = 0, trust_remote_code: bool = False, ) -> None: @@ -82,21 +80,13 @@ def create_txt_slices_jsonl( if not token_ids: raise ValueError("Tokenizing the text produced zero tokens; cannot sample.") - resolved_input_rr = ( - input_range_ratio if input_range_ratio is not None else range_ratio - ) - resolved_output_rr = ( - output_range_ratio if output_range_ratio is not None else range_ratio - ) - rng_np = np.random.default_rng(seed) rng_py = random.Random(seed) input_lens, output_lens, _ = get_sampling_params( rng_np, num_prompts, - resolved_input_rr, - resolved_output_rr, + range_ratio, input_len, output_len, tokenizer, @@ -170,24 +160,12 @@ def main(argv: list[str] | None = None) -> None: ) parser.add_argument( "--range-ratio", - type=float, - default=0.0, - help="Range ratio for both input and output length sampling " - "(default: 0.0). Must be in [0, 1).", - ) - parser.add_argument( - "--input-range-ratio", - type=float, - default=None, - help="Range ratio for input length sampling. 
" - "Overrides --range-ratio for inputs.", - ) - parser.add_argument( - "--output-range-ratio", - type=float, - default=None, - help="Range ratio for output length sampling. " - "Overrides --range-ratio for outputs.", + type=str, + default="0.0", + help="Range ratio for input/output length sampling (default: 0.0). " + "A single float applies to both ISL and OSL. " + 'A JSON dict like \'{"input": 0.3, "output": 0.5}\' sets them ' + "independently. Values must be in [0, 1).", ) parser.add_argument( "--seed", @@ -205,6 +183,15 @@ def main(argv: list[str] | None = None) -> None: logging.basicConfig(level=logging.INFO) + # Parse --range-ratio: try float first, then JSON dict. + range_ratio: RangeRatio + try: + range_ratio = float(args.range_ratio) + except ValueError: + import json as _json + + range_ratio = _json.loads(args.range_ratio) + create_txt_slices_jsonl( input_path=args.input, output_path=args.output, @@ -212,9 +199,7 @@ def main(argv: list[str] | None = None) -> None: num_prompts=args.num_prompts, input_len=args.input_len, output_len=args.output_len, - range_ratio=args.range_ratio, - input_range_ratio=args.input_range_ratio, - output_range_ratio=args.output_range_ratio, + range_ratio=range_ratio, seed=args.seed, trust_remote_code=args.trust_remote_code, ) diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets/datasets.py similarity index 97% rename from vllm/benchmarks/datasets.py rename to vllm/benchmarks/datasets/datasets.py index 545d623d71c6..2170d324647a 100644 --- a/vllm/benchmarks/datasets.py +++ b/vllm/benchmarks/datasets/datasets.py @@ -34,7 +34,11 @@ from PIL import Image from typing_extensions import deprecated -from vllm.benchmarks.shared import get_sampling_params +from vllm.benchmarks.datasets.shared import ( + RangeRatio, + _resolve_range_ratios, + get_sampling_params, +) from vllm.inputs import MultiModalDataDict from vllm.lora.request import LoRARequest from vllm.lora.utils import get_adapter_absolute_path @@ -532,9 +536,7 @@ def 
sample( request_id_prefix: str = "", no_oversample: bool = False, prefix_len: int = DEFAULT_PREFIX_LEN, - range_ratio: float = DEFAULT_RANGE_RATIO, - input_range_ratio: float | None = None, - output_range_ratio: float | None = None, + range_ratio: RangeRatio = DEFAULT_RANGE_RATIO, input_len: int = DEFAULT_INPUT_LEN, output_len: int = DEFAULT_OUTPUT_LEN, batchsize: int = 1, @@ -543,12 +545,7 @@ def sample( lora_assignment: str = "random", **kwargs, ) -> list[SampleRequest]: - resolved_input_rr = ( - input_range_ratio if input_range_ratio is not None else range_ratio - ) - resolved_output_rr = ( - output_range_ratio if output_range_ratio is not None else range_ratio - ) + resolved_input_rr, _ = _resolve_range_ratios(range_ratio) num_special = int(tokenizer.num_special_tokens_to_add()) real_input_len = max(0, int(input_len) - num_special) @@ -560,10 +557,10 @@ def sample( raise ValueError( "--random-input-len is too small: with tokenizer special " f"tokens {num_special} and " - f"--random-input-range-ratio {resolved_input_rr}, " + f"input range ratio {resolved_input_rr}, " "the minimum possible total input tokens (prefix + sampled) is " f"{min_total_input}. Increase --random-input-len and/or " - "--random-prefix-len, or decrease --random-input-range-ratio " + "--random-prefix-len, or decrease the input range ratio " "so that prefix_len + floor(max(0, random_input_len - " "num_special)) * (1 - input_range_ratio) >= 1." 
) @@ -571,8 +568,7 @@ def sample( input_lens, output_lens, offsets = get_sampling_params( self._rng, num_requests, - resolved_input_rr, - resolved_output_rr, + range_ratio, input_len, output_len, tokenizer, @@ -745,22 +741,13 @@ def sample( request_id_prefix: str = "", no_oversample: bool = False, prefix_len: int = RandomDataset.DEFAULT_PREFIX_LEN, - range_ratio: float = RandomDataset.DEFAULT_RANGE_RATIO, - input_range_ratio: float | None = None, - output_range_ratio: float | None = None, + range_ratio: RangeRatio = RandomDataset.DEFAULT_RANGE_RATIO, input_len: int = RandomDataset.DEFAULT_INPUT_LEN, output_len: int = RandomDataset.DEFAULT_OUTPUT_LEN, batchsize: int = 1, is_reranker: bool = True, **kwargs, ) -> list[SampleRequest]: - resolved_input_rr = ( - input_range_ratio if input_range_ratio is not None else range_ratio - ) - resolved_output_rr = ( - output_range_ratio if output_range_ratio is not None else range_ratio - ) - n_sep_tokens = int(is_reranker) query_len_param = (input_len // 2) - n_sep_tokens if is_reranker else input_len @@ -768,8 +755,7 @@ def sample( query_lens, _, query_offsets = get_sampling_params( self._rng, 1, - resolved_input_rr, - resolved_output_rr, + range_ratio, query_len_param, 0, tokenizer, @@ -788,8 +774,7 @@ def sample( doc_lens, _, doc_offsets = get_sampling_params( self._rng, num_requests, - resolved_input_rr, - resolved_output_rr, + range_ratio, doc_len_param, 0, tokenizer, @@ -1166,9 +1151,7 @@ def sample( request_id_prefix: str = "", no_oversample: bool = False, prefix_len: int = RandomDataset.DEFAULT_PREFIX_LEN, - range_ratio: float = RandomDataset.DEFAULT_RANGE_RATIO, - input_range_ratio: float | None = None, - output_range_ratio: float | None = None, + range_ratio: RangeRatio = RandomDataset.DEFAULT_RANGE_RATIO, input_len: int = RandomDataset.DEFAULT_INPUT_LEN, output_len: int = RandomDataset.DEFAULT_OUTPUT_LEN, batchsize: int = 1, @@ -1185,18 +1168,11 @@ def sample( raise NotImplementedError( "batchsize > 1 is not supported 
for RandomMultiModalDataset." ) - resolved_input_rr = ( - input_range_ratio if input_range_ratio is not None else range_ratio - ) - resolved_output_rr = ( - output_range_ratio if output_range_ratio is not None else range_ratio - ) input_lens, output_lens, offsets = get_sampling_params( self._rng, num_requests, - resolved_input_rr, - resolved_output_rr, + range_ratio, input_len, output_len, tokenizer, @@ -1658,30 +1634,12 @@ def add_random_dataset_base_args( ) parser_or_group.add_argument( "--random-range-ratio", - type=float, - default=0.0, + type=str, + default="0.0", help="Range ratio for sampling input/output length, " - "used only for random sampling. Sets both input and output range " - "ratios unless overridden by --random-input-range-ratio or " - "--random-output-range-ratio. Must be in [0, 1).", - ) - parser_or_group.add_argument( - "--random-input-range-ratio", - type=float, - default=None, - help="Range ratio for sampling input length, used only for random " - "sampling. Overrides --random-range-ratio for input lengths. " - "Must be in [0, 1). Defines the sampling range " - "[input_len * (1 - ratio), input_len * (1 + ratio)].", - ) - parser_or_group.add_argument( - "--random-output-range-ratio", - type=float, - default=None, - help="Range ratio for sampling output length, used only for random " - "sampling. Overrides --random-range-ratio for output lengths. " - "Must be in [0, 1). Defines the sampling range " - "[output_len * (1 - ratio), output_len * (1 + ratio)].", + "used only for random sampling. A single float applies to both " + 'ISL and OSL. A JSON dict like \'{"input": 0.3, "output": 0.5}\' ' + "sets them independently. Values must be in [0, 1).", ) parser_or_group.add_argument( "--random-prefix-len", @@ -1814,10 +1772,25 @@ def normalize(d: dict) -> dict[tuple[int, int, int], float]: ) +def _parse_range_ratio(value: str) -> RangeRatio: + """Parse a ``--random-range-ratio`` CLI string. 
+ + Accepts either a plain float (``"0.3"``) or a JSON dict + (``'{"input": 0.3, "output": 0.5}'``). + """ + try: + return float(value) + except ValueError: + return json.loads(value) + + def get_samples(args, tokenizer: TokenizerLike) -> list[SampleRequest]: if not hasattr(args, "request_id_prefix"): args.request_id_prefix = "" + if hasattr(args, "random_range_ratio") and isinstance(args.random_range_ratio, str): + args.random_range_ratio = _parse_range_ratio(args.random_range_ratio) + if args.dataset_name == "custom": dataset = CustomDataset( dataset_path=args.dataset_path, disable_shuffle=args.disable_shuffle @@ -2051,8 +2024,6 @@ def get_samples(args, tokenizer: TokenizerLike) -> list[SampleRequest]: input_len=args.random_input_len, output_len=args.random_output_len, range_ratio=args.random_range_ratio, - input_range_ratio=args.random_input_range_ratio, - output_range_ratio=args.random_output_range_ratio, request_id_prefix=args.request_id_prefix, batchsize=args.random_batch_size, no_oversample=args.no_oversample, @@ -2066,8 +2037,6 @@ def get_samples(args, tokenizer: TokenizerLike) -> list[SampleRequest]: num_requests=args.num_prompts, prefix_len=args.random_prefix_len, range_ratio=args.random_range_ratio, - input_range_ratio=args.random_input_range_ratio, - output_range_ratio=args.random_output_range_ratio, input_len=args.random_input_len, output_len=args.random_output_len, base_items_per_request=args.random_mm_base_items_per_request, @@ -2087,8 +2056,6 @@ def get_samples(args, tokenizer: TokenizerLike) -> list[SampleRequest]: num_requests=args.num_prompts, input_len=args.random_input_len, range_ratio=args.random_range_ratio, - input_range_ratio=args.random_input_range_ratio, - output_range_ratio=args.random_output_range_ratio, request_id_prefix=args.request_id_prefix, batchsize=args.random_batch_size, is_reranker=not args.no_reranker, diff --git a/vllm/benchmarks/shared.py b/vllm/benchmarks/datasets/shared.py similarity index 67% rename from 
vllm/benchmarks/shared.py rename to vllm/benchmarks/datasets/shared.py index 0737619bc0a4..bc5a4340dd62 100644 --- a/vllm/benchmarks/shared.py +++ b/vllm/benchmarks/datasets/shared.py @@ -13,12 +13,35 @@ logger = logging.getLogger(__name__) +# Type alias: a single float applies to both ISL and OSL; a dict allows +# specifying them independently via ``{"input": …, "output": …}``. +RangeRatio = float | dict[str, float] + + +def _resolve_range_ratios( + range_ratio: RangeRatio, +) -> tuple[float, float]: + """Return ``(input_range_ratio, output_range_ratio)`` from *range_ratio*. + + *range_ratio* is either a single float (used for both input and output) + or a dict with ``"input"`` and ``"output"`` keys. + """ + if isinstance(range_ratio, dict): + try: + return float(range_ratio["input"]), float(range_ratio["output"]) + except KeyError as exc: + raise ValueError( + "When range_ratio is a dict it must contain 'input' and " + f"'output' keys, got: {sorted(range_ratio)}" + ) from exc + ratio = float(range_ratio) + return ratio, ratio + def get_sampling_params( rng: np.random.Generator, num_requests: int, - input_range_ratio: float, - output_range_ratio: float, + range_ratio: RangeRatio, input_len: int, output_len: int, tokenizer: TokenizerLike, @@ -27,7 +50,10 @@ def get_sampling_params( Sample per-request input/output token lengths and vocab offsets. Lengths are drawn uniformly from integer ranges around the configured - means, controlled by ``input_range_ratio`` and ``output_range_ratio``. + means, controlled by *range_ratio*. It may be a single ``float`` + (applied to both input and output) or a ``dict`` with ``"input"`` and + ``"output"`` keys for independent control. + Tokenizer special tokens are subtracted from ``input_len`` before computing the sampling interval. @@ -35,6 +61,8 @@ def get_sampling_params( (input_lens, output_lens, offsets) – three 1-D ``np.ndarray`` of shape ``(num_requests,)``. 
""" + input_range_ratio, output_range_ratio = _resolve_range_ratios(range_ratio) + if not (0.0 <= input_range_ratio < 1.0): raise ValueError("input_range_ratio must be in [0, 1).") if not (0.0 <= output_range_ratio < 1.0): From 63b4470d36124a6e9fc3d99d37eb3278ca0d6b14 Mon Sep 17 00:00:00 2001 From: jdebache Date: Tue, 14 Apr 2026 07:07:31 +0000 Subject: [PATCH 07/10] address review comments Signed-off-by: jdebache --- vllm/benchmarks/datasets/datasets.py | 21 +-------------------- 1 file changed, 1 insertion(+), 20 deletions(-) diff --git a/vllm/benchmarks/datasets/datasets.py b/vllm/benchmarks/datasets/datasets.py index 2170d324647a..8b739fbb3036 100644 --- a/vllm/benchmarks/datasets/datasets.py +++ b/vllm/benchmarks/datasets/datasets.py @@ -65,7 +65,7 @@ DEFAULT_NUM_PROMPTS = 1000 -@dataclass(frozen=True) +@dataclass class SampleRequest: """ Represents a single inference request for benchmarking. @@ -2342,29 +2342,10 @@ def load_data(self) -> None: random.shuffle(self.data) def sample( - self, - tokenizer: TokenizerLike, - num_requests: int, - request_id_prefix: str = "", - no_oversample: bool = False, - lora_path: str | None = None, - max_loras: int | None = None, - output_len: int | None = None, - enable_multimodal_chat: bool = False, - skip_chat_template: bool = False, **kwargs, ) -> list[SampleRequest]: # leverage CustomDataset sample return super().sample( - tokenizer, - num_requests, - request_id_prefix=request_id_prefix, - no_oversample=no_oversample, - lora_path=lora_path, - max_loras=max_loras, - output_len=output_len, - enable_multimodal_chat=enable_multimodal_chat, - skip_chat_template=skip_chat_template, **kwargs, ) From f857a0c9ca571f75bf6165722ec9d9c143f4c6ac Mon Sep 17 00:00:00 2001 From: jdebache Date: Tue, 14 Apr 2026 07:18:02 +0000 Subject: [PATCH 08/10] apply changes to input/output range ratio changes to throughput.py by reverting changes made there Signed-off-by: jdebache --- vllm/benchmarks/throughput.py | 18 ------------------ 1 file 
changed, 18 deletions(-) diff --git a/vllm/benchmarks/throughput.py b/vllm/benchmarks/throughput.py index 82245a2b036a..42a8132ffe6e 100644 --- a/vllm/benchmarks/throughput.py +++ b/vllm/benchmarks/throughput.py @@ -357,12 +357,6 @@ def get_requests(args, tokenizer): and args.dataset_name not in {"prefix_repetition", "random-mm", "random-rerank"} ): sample_kwargs["range_ratio"] = args.random_range_ratio - sample_kwargs["input_range_ratio"] = getattr( - args, "random_input_range_ratio", None - ) - sample_kwargs["output_range_ratio"] = getattr( - args, "random_output_range_ratio", None - ) # prefer random_* arguments, fall back to regular arguments random_prefix_len = getattr(args, "random_prefix_len", None) sample_kwargs["prefix_len"] = ( @@ -465,12 +459,6 @@ def get_requests(args, tokenizer): random_prefix_len if random_prefix_len is not None else prefix_len ) sample_kwargs["range_ratio"] = args.random_range_ratio - sample_kwargs["input_range_ratio"] = getattr( - args, "random_input_range_ratio", None - ) - sample_kwargs["output_range_ratio"] = getattr( - args, "random_output_range_ratio", None - ) elif args.dataset_name == "random-rerank": dataset_cls = RandomDatasetForReranking # prefer random_* arguments, fall back to regular arguments @@ -489,12 +477,6 @@ def get_requests(args, tokenizer): sample_kwargs["batchsize"] = getattr(args, "random_batch_size", 1) sample_kwargs["is_reranker"] = not getattr(args, "no_reranker", False) sample_kwargs["range_ratio"] = args.random_range_ratio - sample_kwargs["input_range_ratio"] = getattr( - args, "random_input_range_ratio", None - ) - sample_kwargs["output_range_ratio"] = getattr( - args, "random_output_range_ratio", None - ) else: raise ValueError(f"Unknown dataset name: {args.dataset_name}") # Remove None values From 2cbd9a029c5e0c9aaab705b0dc0bff94d588dd56 Mon Sep 17 00:00:00 2001 From: jdebache Date: Tue, 14 Apr 2026 07:20:53 +0000 Subject: [PATCH 09/10] rename datasets sampling shared logic file to utils.py 
Signed-off-by: jdebache --- tests/benchmarks/test_sampling_params.py | 2 +- vllm/benchmarks/datasets/__init__.py | 2 +- vllm/benchmarks/datasets/create_txt_slices_dataset.py | 2 +- vllm/benchmarks/datasets/datasets.py | 2 +- vllm/benchmarks/datasets/{shared.py => utils.py} | 0 5 files changed, 4 insertions(+), 4 deletions(-) rename vllm/benchmarks/datasets/{shared.py => utils.py} (100%) diff --git a/tests/benchmarks/test_sampling_params.py b/tests/benchmarks/test_sampling_params.py index a56357264a57..3bc34a84b377 100644 --- a/tests/benchmarks/test_sampling_params.py +++ b/tests/benchmarks/test_sampling_params.py @@ -4,7 +4,7 @@ import numpy as np import pytest -from vllm.benchmarks.datasets.shared import get_sampling_params +from vllm.benchmarks.datasets.utils import get_sampling_params from vllm.tokenizers import TokenizerLike diff --git a/vllm/benchmarks/datasets/__init__.py b/vllm/benchmarks/datasets/__init__.py index 84d66f381cfa..5d5e172e7b46 100644 --- a/vllm/benchmarks/datasets/__init__.py +++ b/vllm/benchmarks/datasets/__init__.py @@ -40,7 +40,7 @@ process_video, zeta_prompt, ) -from vllm.benchmarks.datasets.shared import RangeRatio +from vllm.benchmarks.datasets.utils import RangeRatio __all__ = [ "DEFAULT_NUM_PROMPTS", diff --git a/vllm/benchmarks/datasets/create_txt_slices_dataset.py b/vllm/benchmarks/datasets/create_txt_slices_dataset.py index 0c80386e882b..3f7c5028a205 100644 --- a/vllm/benchmarks/datasets/create_txt_slices_dataset.py +++ b/vllm/benchmarks/datasets/create_txt_slices_dataset.py @@ -39,7 +39,7 @@ import numpy as np from transformers import AutoTokenizer -from vllm.benchmarks.datasets.shared import RangeRatio, get_sampling_params +from vllm.benchmarks.datasets.utils import RangeRatio, get_sampling_params logger = logging.getLogger(__name__) diff --git a/vllm/benchmarks/datasets/datasets.py b/vllm/benchmarks/datasets/datasets.py index 8b739fbb3036..f426d2d82877 100644 --- a/vllm/benchmarks/datasets/datasets.py +++ 
b/vllm/benchmarks/datasets/datasets.py @@ -34,7 +34,7 @@ from PIL import Image from typing_extensions import deprecated -from vllm.benchmarks.datasets.shared import ( +from vllm.benchmarks.datasets.utils import ( RangeRatio, _resolve_range_ratios, get_sampling_params, diff --git a/vllm/benchmarks/datasets/shared.py b/vllm/benchmarks/datasets/utils.py similarity index 100% rename from vllm/benchmarks/datasets/shared.py rename to vllm/benchmarks/datasets/utils.py From 8204e31c3beba3bb646a014f944c48b00b751330 Mon Sep 17 00:00:00 2001 From: jdebache Date: Tue, 14 Apr 2026 07:36:51 +0000 Subject: [PATCH 10/10] address review comments Signed-off-by: jdebache --- vllm/benchmarks/datasets/datasets.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/benchmarks/datasets/datasets.py b/vllm/benchmarks/datasets/datasets.py index f426d2d82877..d7ba9d8787ab 100644 --- a/vllm/benchmarks/datasets/datasets.py +++ b/vllm/benchmarks/datasets/datasets.py @@ -110,7 +110,7 @@ def __init__( # default seed. self.random_seed = random_seed if random_seed is not None else self.DEFAULT_SEED self.disable_shuffle = disable_shuffle - self.data: Any + self.data: Any | None = None def apply_multimodal_chat_transformation( self, @@ -2253,7 +2253,7 @@ def sample( request_id_prefix: str = "", no_oversample: bool = False, **kwargs, - ) -> list: + ) -> list[SampleRequest]: # load all data if needed self.num_available_samples = len(self.data) if num_requests <= 0: @@ -2983,7 +2983,7 @@ def sample( min_distance: float = 0.0, max_distance: float = 1.0, **kwargs, - ) -> list: + ) -> list[SampleRequest]: output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN sampled_requests = []