18 changes: 18 additions & 0 deletions benchmarks/README.md
@@ -204,6 +204,24 @@ python3 vllm/benchmarks/benchmark_serving.py \
--seed 42
```

### Running With Sampling Parameters

When using OpenAI-compatible backends such as `vllm`, optional sampling
parameters (`--top-p`, `--top-k`, `--min-p`, and `--temperature`) can be
specified and are forwarded to the server with every request. Example client
command:

```bash
python3 vllm/benchmarks/benchmark_serving.py \
--backend vllm \
--model NousResearch/Hermes-3-Llama-3.1-8B \
--endpoint /v1/completions \
--dataset-name sharegpt \
--dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
--top-k 10 \
--top-p 0.9 \
--temperature 0.5 \
--num-prompts 10
```
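
For illustration, these flags simply become sampling fields in the body of each `/v1/completions` request the benchmark sends. A rough equivalent of one such request is sketched below; it assumes a vLLM server is already listening on `localhost:8000`, the prompt is made up, and `top_k` is a vLLM-specific extension to the OpenAI completions schema:

```python
import requests  # assumes the `requests` package is installed

payload = {
    "model": "NousResearch/Hermes-3-Llama-3.1-8B",
    "prompt": "San Francisco is a",
    "max_tokens": 32,
    "temperature": 0.5,
    "top_p": 0.9,
    "top_k": 10,  # vLLM-specific extension to the OpenAI completions schema
}
resp = requests.post("http://localhost:8000/v1/completions", json=payload)
print(resp.json()["choices"][0]["text"])
```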

---
## Example - Offline Throughput Benchmark

6 changes: 6 additions & 0 deletions benchmarks/backend_request_func.py
@@ -497,3 +497,9 @@ def get_tokenizer(
"scalellm": async_request_openai_completions,
"sglang": async_request_openai_completions,
}

OPENAI_COMPATIBLE_BACKENDS = [
k for k, v in ASYNC_REQUEST_FUNCS.items()
if v in (async_request_openai_completions,
async_request_openai_chat_completions)
]
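
The diff does not show how the OpenAI-compatible request functions in this file consume `extra_body`; conceptually, they merge it into the JSON payload sent to the server. Below is a minimal, self-contained sketch of that idea — the class, field, and function names are illustrative stand-ins, not this file's actual code:

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class FakeRequestInput:  # illustrative stand-in for RequestFuncInput
    model: str
    prompt: str
    output_len: int
    extra_body: Optional[dict] = None


def build_payload(req: FakeRequestInput) -> dict:
    payload = {
        "model": req.model,
        "prompt": req.prompt,
        "max_tokens": req.output_len,
    }
    if req.extra_body:
        # The CLI sampling parameters (top_k / top_p / min_p / temperature)
        # ride along here and reach the server as ordinary request fields.
        payload.update(req.extra_body)
    return payload


print(build_payload(
    FakeRequestInput("my-model", "Hello", 16, {"top_k": 10, "temperature": 0.5})))
```
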
59 changes: 56 additions & 3 deletions benchmarks/benchmark_serving.py
@@ -34,7 +34,8 @@
from typing import Any, Optional

import numpy as np
from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
from backend_request_func import (ASYNC_REQUEST_FUNCS,
                                  OPENAI_COMPATIBLE_BACKENDS, RequestFuncInput,
                                  RequestFuncOutput)
from tqdm.asyncio import tqdm
from transformers import PreTrainedTokenizerBase
@@ -260,6 +261,7 @@ async def benchmark(
    goodput_config_dict: dict[str, float],
    max_concurrency: Optional[int],
    lora_modules: Optional[Iterable[str]],
    extra_body: Optional[dict],
):
    if backend in ASYNC_REQUEST_FUNCS:
        request_func = ASYNC_REQUEST_FUNCS[backend]
@@ -287,6 +289,7 @@ logprobs=logprobs,
logprobs=logprobs,
multi_modal_content=test_mm_content,
ignore_eos=ignore_eos,
extra_body=extra_body,
)

test_output = await request_func(request_func_input=test_input)
@@ -313,7 +316,8 @@
output_len=test_output_len,
logprobs=logprobs,
multi_modal_content=test_mm_content,
ignore_eos=ignore_eos)
ignore_eos=ignore_eos,
extra_body=extra_body)
profile_output = await request_func(request_func_input=profile_input)
if profile_output.success:
print("Profiler started")
@@ -363,7 +367,8 @@ async def limited_request_func(request_func_input, pbar):
output_len=output_len,
logprobs=logprobs,
multi_modal_content=mm_content,
ignore_eos=ignore_eos)
ignore_eos=ignore_eos,
extra_body=extra_body)
tasks.append(
asyncio.create_task(
limited_request_func(request_func_input=request_func_input,
@@ -652,6 +657,26 @@ def main(args: argparse.Namespace):
        raise ValueError(f"Unknown dataset: {args.dataset_name}") from err
    goodput_config_dict = check_goodput_args(args)

    # Collect the sampling parameters.
    sampling_params = {
        k: v
        for k, v in {
            "top_p": args.top_p,
            "top_k": args.top_k,
            "min_p": args.min_p,
            "temperature": args.temperature
        }.items() if v is not None
    }

    # Sampling parameters are only supported by openai-compatible backends.
    if sampling_params and args.backend not in OPENAI_COMPATIBLE_BACKENDS:
        raise ValueError(
            "Sampling parameters are only supported by openai-compatible "
            "backends.")

    if "temperature" not in sampling_params:
        sampling_params["temperature"] = 0.0  # Default to greedy decoding.

    # Avoid GC processing "static" data - reduce pause times.
    gc.collect()
    gc.freeze()
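
A small, self-contained illustration of the collection logic above: flags left at their `None` default are dropped, and `temperature` falls back to `0.0` (greedy decoding) when not supplied. The `SimpleNamespace` stands in for the parsed `args`:

```python
from types import SimpleNamespace

# Pretend the user only passed --top-k 10 on the command line.
args = SimpleNamespace(top_p=None, top_k=10, min_p=None, temperature=None)

sampling_params = {
    k: v
    for k, v in {
        "top_p": args.top_p,
        "top_k": args.top_k,
        "min_p": args.min_p,
        "temperature": args.temperature,
    }.items() if v is not None
}
if "temperature" not in sampling_params:
    sampling_params["temperature"] = 0.0  # greedy decoding by default

print(sampling_params)  # {'top_k': 10, 'temperature': 0.0}
```
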
@@ -678,6 +703,7 @@ def main(args: argparse.Namespace):
goodput_config_dict=goodput_config_dict,
max_concurrency=args.max_concurrency,
lora_modules=args.lora_modules,
extra_body=sampling_params,
))

# Save config and results to json
@@ -1000,6 +1026,33 @@ def main(args: argparse.Namespace):
"from the sampled HF dataset.",
)

    sampling_group = parser.add_argument_group("sampling parameters")
    sampling_group.add_argument(
        "--top-p",
        type=float,
        default=None,
        help="Top-p sampling parameter. Only has effect on openai-compatible "
        "backends.")
    sampling_group.add_argument(
        "--top-k",
        type=int,
        default=None,
        help="Top-k sampling parameter. Only has effect on openai-compatible "
        "backends.")
    sampling_group.add_argument(
        "--min-p",
        type=float,
        default=None,
        help="Min-p sampling parameter. Only has effect on openai-compatible "
        "backends.")
    sampling_group.add_argument(
        "--temperature",
        type=float,
        default=None,
        help="Temperature sampling parameter. Only has effect on "
        "openai-compatible backends. If not specified, defaults to greedy "
        "decoding (i.e. temperature==0.0).")

    parser.add_argument(
        '--tokenizer-mode',
        type=str,