[CI/Build][Doc] Fully deprecate old bench scripts for serving / throughput / latency #24411
Merged
Changes from all commits (5 commits)
The diff shown replaces the full benchmark implementation in `benchmark_latency.py` (191 lines) with a 17-line deprecation stub (`@@ -1,191 +1,17 @@`). The SPDX headers and module docstring are kept; the rest of the script is removed.

Removed implementation:

```python
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Benchmark the latency of processing a single batch of requests."""

import argparse
import dataclasses
import json
import os
import time
from typing import Any, Optional

import numpy as np
from tqdm import tqdm
from typing_extensions import deprecated

import vllm.envs as envs
from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import EngineArgs
from vllm.inputs import PromptType
from vllm.sampling_params import BeamSearchParams
from vllm.utils import FlexibleArgumentParser


def save_to_pytorch_benchmark_format(
    args: argparse.Namespace, results: dict[str, Any]
) -> None:
    pt_records = convert_to_pytorch_benchmark_format(
        args=args,
        metrics={"latency": results["latencies"]},
        extra_info={k: results[k] for k in ["avg_latency", "percentiles"]},
    )
    if pt_records:
        pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json"
        write_to_json(pt_file, pt_records)


@deprecated(
    "benchmark_latency.py is deprecated and will be removed in a "
    "future version. Please use 'vllm bench latency' instead.",
)
def main(args: argparse.Namespace):
    print(args)

    engine_args = EngineArgs.from_cli_args(args)

    # NOTE(woosuk): If the request cannot be processed in a single batch,
    # the engine will automatically process the request in multiple batches.
    llm = LLM(**dataclasses.asdict(engine_args))
    assert llm.llm_engine.model_config.max_model_len >= (
        args.input_len + args.output_len
    ), (
        "Please ensure that max_model_len is greater than"
        " the sum of input_len and output_len."
    )

    sampling_params = SamplingParams(
        n=args.n,
        temperature=1.0,
        top_p=1.0,
        ignore_eos=True,
        max_tokens=args.output_len,
        detokenize=not args.disable_detokenize,
    )
    print(sampling_params)
    dummy_prompt_token_ids = np.random.randint(
        10000, size=(args.batch_size, args.input_len)
    )
    dummy_prompts: list[PromptType] = [
        {"prompt_token_ids": batch} for batch in dummy_prompt_token_ids.tolist()
    ]

    def llm_generate():
        if not args.use_beam_search:
            llm.generate(dummy_prompts, sampling_params=sampling_params, use_tqdm=False)
        else:
            llm.beam_search(
                dummy_prompts,
                BeamSearchParams(
                    beam_width=args.n,
                    max_tokens=args.output_len,
                    ignore_eos=True,
                ),
            )

    def run_to_completion(profile_dir: Optional[str] = None):
        if profile_dir:
            llm.start_profile()
            llm_generate()
            llm.stop_profile()
        else:
            start_time = time.perf_counter()
            llm_generate()
            end_time = time.perf_counter()
            latency = end_time - start_time
            return latency

    print("Warming up...")
    for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"):
        run_to_completion(profile_dir=None)

    if args.profile:
        profile_dir = envs.VLLM_TORCH_PROFILER_DIR
        print(f"Profiling (results will be saved to '{profile_dir}')...")
        run_to_completion(profile_dir=profile_dir)
        return

    # Benchmark.
    latencies = []
    for _ in tqdm(range(args.num_iters), desc="Profiling iterations"):
        latencies.append(run_to_completion(profile_dir=None))
    latencies = np.array(latencies)
    percentages = [10, 25, 50, 75, 90, 99]
    percentiles = np.percentile(latencies, percentages)
    print(f"Avg latency: {np.mean(latencies)} seconds")
    for percentage, percentile in zip(percentages, percentiles):
        print(f"{percentage}% percentile latency: {percentile} seconds")

    # Output JSON results if specified
    if args.output_json:
        results = {
            "avg_latency": np.mean(latencies),
            "latencies": latencies.tolist(),
            "percentiles": dict(zip(percentages, percentiles.tolist())),
        }
        with open(args.output_json, "w") as f:
            json.dump(results, f, indent=4)
        save_to_pytorch_benchmark_format(args, results)


def create_argument_parser():
    parser = FlexibleArgumentParser(
        description="Benchmark the latency of processing a single batch of "
        "requests till completion."
    )
    parser.add_argument("--input-len", type=int, default=32)
    parser.add_argument("--output-len", type=int, default=128)
    parser.add_argument("--batch-size", type=int, default=8)
    parser.add_argument(
        "--n",
        type=int,
        default=1,
        help="Number of generated sequences per prompt.",
    )
    parser.add_argument("--use-beam-search", action="store_true")
    parser.add_argument(
        "--num-iters-warmup",
        type=int,
        default=10,
        help="Number of iterations to run for warmup.",
    )
    parser.add_argument(
        "--num-iters", type=int, default=30, help="Number of iterations to run."
    )
    parser.add_argument(
        "--profile",
        action="store_true",
        help="profile the generation process of a single batch",
    )
    parser.add_argument(
        "--output-json",
        type=str,
        default=None,
        help="Path to save the latency results in JSON format.",
    )
    parser.add_argument(
        "--disable-detokenize",
        action="store_true",
        help=(
            "Do not detokenize responses (i.e. do not include "
            "detokenization time in the latency measurement)"
        ),
    )

    parser = EngineArgs.add_cli_args(parser)
    # V1 enables prefix caching by default which skews the latency
    # numbers. We need to disable prefix caching by default.
    parser.set_defaults(enable_prefix_caching=False)

    return parser


if __name__ == "__main__":
    parser = create_argument_parser()
    args = parser.parse_args()
    if args.profile and not envs.VLLM_TORCH_PROFILER_DIR:
        raise OSError(
            "The environment variable 'VLLM_TORCH_PROFILER_DIR' is not set. "
            "Please set it to a valid path to use torch profiler."
        )
    main(args)
```

Replacement stub:

```python
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Benchmark the latency of processing a single batch of requests."""
import sys

if __name__ == "__main__":
    print("""DEPRECATED: This script has been moved to the vLLM CLI.

Please use the following command instead:
    vllm bench latency

For help with the new command, run:
    vllm bench latency --help

Alternatively, you can run the new command directly with:
    python -m vllm.entrypoints.cli.main bench latency --help
""")
    sys.exit(1)
```
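For readers who don't want to wade through the removed file, the functionality that moved into `vllm bench latency` boils down to a warm-up-then-measure loop over a fixed dummy batch, followed by mean and percentile reporting. The sketch below distills that pattern from the removed code; it is not the vLLM implementation, and `run_once` is a hypothetical stand-in for a single `llm.generate()` call.

```python
# Minimal sketch of the removed script's measurement loop (not the vLLM code).
import time

import numpy as np


def run_once() -> None:
    """Hypothetical stand-in for one llm.generate() call on a fixed dummy batch."""
    time.sleep(0.01)


def benchmark(num_iters_warmup: int = 10, num_iters: int = 30) -> dict:
    # Warm up first so one-time setup costs do not skew the timed iterations.
    for _ in range(num_iters_warmup):
        run_once()

    # Time each iteration end to end with a monotonic clock.
    latencies = []
    for _ in range(num_iters):
        start = time.perf_counter()
        run_once()
        latencies.append(time.perf_counter() - start)

    latencies = np.array(latencies)
    percentages = [10, 25, 50, 75, 90, 99]
    percentiles = np.percentile(latencies, percentages)
    return {
        "avg_latency": float(np.mean(latencies)),
        "latencies": latencies.tolist(),
        "percentiles": dict(zip(percentages, percentiles.tolist())),
    }


if __name__ == "__main__":
    print(benchmark())
```

The returned dictionary mirrors the JSON structure the old script wrote when `--output-json` was passed.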
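As background for the "fully deprecate" wording in the title: the removed `main()` was already soft-deprecated with `typing_extensions.deprecated` (PEP 702), which emits a `DeprecationWarning` each time the function is called, whereas this PR replaces the script body with a stub that prints the migration message and exits with status 1. The snippet below illustrates the decorator's runtime behaviour; `legacy_main` is an illustrative name, not a vLLM function.

```python
# Demonstrates the runtime warning emitted by typing_extensions.deprecated,
# the soft-deprecation mechanism the old benchmark_latency.py relied on.
import warnings

from typing_extensions import deprecated


@deprecated(
    "benchmark_latency.py is deprecated and will be removed in a "
    "future version. Please use 'vllm bench latency' instead.",
)
def legacy_main() -> None:
    # Stand-in for the old benchmark entry point; the real one ran the benchmark.
    pass


if __name__ == "__main__":
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        legacy_main()
    for w in caught:
        # Expect one DeprecationWarning carrying the message passed to @deprecated.
        print(f"{w.category.__name__}: {w.message}")
```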