
Commit 9ae401c

Update benchmark scripts (sgl-project#8)
1 parent 8cc6f3a commit 9ae401c

File tree

28 files changed, +183 -50 lines


benchmark/dspy/README.md

Lines changed: 1 addition & 1 deletion
@@ -23,7 +23,7 @@ python3 bench_dspy_intro.py --backend sglang
 ```
 docker run --name tgi --rm -ti --gpus all --network host \
   -v /home/ubuntu/model_weights/Llama-2-7b-chat-hf:/Llama-2-7b-chat-hf \
-  ghcr.io/huggingface/text-generation-inference:1.1.0 \
+  ghcr.io/huggingface/text-generation-inference:1.3.0 \
   --model-id /Llama-2-7b-chat-hf --num-shard 1 --trust-remote-code \
   --max-input-length 2048 --max-total-tokens 4096 \
   --port 24000
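
Once the TGI container above is up, it can be smoke-tested from Python before running the benchmark. A minimal sketch (not part of the repo), assuming TGI's standard `/generate` endpoint on the mapped port 24000 and an illustrative prompt:

```
# Minimal smoke test against the TGI container launched above.
# Assumes TGI's standard /generate endpoint on the mapped port 24000.
import requests

resp = requests.post(
    "http://localhost:24000/generate",
    json={
        "inputs": "The capital of France is",
        "parameters": {"max_new_tokens": 16, "temperature": 0.7},
    },
    timeout=60,
)
resp.raise_for_status()
print(resp.json()["generated_text"])
```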

benchmark/hellaswag/bench_other.py

Lines changed: 3 additions & 1 deletion
@@ -57,6 +57,8 @@ def call_select(context, choices):
             out = model + context + select(choices, name="answer")
             return choices.index(out["answer"])
 
+        call_select("Hello,", ["world", "earth"])
+
     elif args.backend == "lmql":
         import lmql
         model = lmql.model("meta-llama/Llama-2-7b-chat-hf",
@@ -135,6 +137,6 @@ async def batched_call(batch_size):
     parser = argparse.ArgumentParser()
     parser.add_argument("--num-shot", type=int, default=20)
     parser.add_argument("--data-path", type=str, default="hellaswag_val.jsonl")
-    parser.add_argument("--num-questions", type=int, default=100)
+    parser.add_argument("--num-questions", type=int, default=200)
     args = add_common_other_args_and_parse(parser)
     main(args)

benchmark/hellaswag/bench_sglang.py

Lines changed: 1 addition & 1 deletion
@@ -91,6 +91,6 @@ def few_shot_hellaswag(s, question, choices):
     parser = argparse.ArgumentParser()
     parser.add_argument("--num-shot", type=int, default=20)
     parser.add_argument("--data-path", type=str, default="hellaswag_val.jsonl")
-    parser.add_argument("--num-questions", type=int, default=100)
+    parser.add_argument("--num-questions", type=int, default=200)
     args = add_common_sglang_args_and_parse(parser)
     main(args)

benchmark/json_regex_decode/README.md renamed to benchmark/json_decode_regex/README.md

Lines changed: 5 additions & 6 deletions
@@ -17,14 +17,13 @@ outlines 0.0.22
 
 ### Benchmark sglang
 
-Run llama-7b
+Run Llama-7B
 
 ```
 python3 -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
 ```
 
-Run mixtral-8x7b
-(When there is a CUDA out-of-memory error, try to reduce the `--mem-fraction-static`)
+Run Mixtral-8x7B
 
 ```
 python3 -m sglang.launch_server --model-path mistralai/Mixtral-8x7B-Instruct-v0.1 --port 30000 --tp-size 8
@@ -39,7 +38,7 @@ python3 bench_sglang.py --num-questions 10
 
 ### Benchmark vllm
 
-Run llama-7b
+Run Llama-7B
 
 ```
 python3 -m outlines.serve.serve --tokenizer-mode auto --model meta-llama/Llama-2-7b-chat-hf --disable-log-requests --port 21000
@@ -54,8 +53,8 @@ python3 bench_other.py --backend vllm --num-questions 10
 
 ### Benchmark guidance
 
-Run llama-7b and benchmark
+Run Llama-7B and benchmark
 
 ```
 python3 bench_other.py --backend guidance --num-questions 10 --parallel 1
-```
+```
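
For context, the task benchmarked in this directory is regex-constrained JSON decoding. Below is a minimal sketch of that pattern with SGLang's frontend; it is not the benchmark script itself, it assumes a server already running on port 30000 as launched above, and the prompt and regex are illustrative.

```
# Minimal sketch of regex-constrained JSON decoding with SGLang.
# Assumes a server started as above on port 30000; prompt and regex are illustrative.
import sglang as sgl

@sgl.function
def city_info(s, name):
    s += f"Give a JSON summary of {name}.\n"
    s += sgl.gen(
        "json_output",
        max_tokens=64,
        regex=r'\{"name": "[\w ]+", "population": [0-9]+\}',
    )

sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))
state = city_info.run(name="Paris", temperature=0)
print(state["json_output"])
```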

benchmark/json_regex_decode/bench_other.py renamed to benchmark/json_decode_regex/bench_other.py

Lines changed: 1 addition & 1 deletion
@@ -105,7 +105,7 @@ def get_one_answer(i):
 
     with open(args.result_file, "a") as fout:
         value = {
-            "task": "json_regex_decode",
+            "task": "json_decode_regex",
             "backend": args.backend,
             "num_gpus": 1,
             "latency": round(latency, 3),

benchmark/json_regex_decode/bench_sglang.py renamed to benchmark/json_decode_regex/bench_sglang.py

Lines changed: 1 addition & 3 deletions
@@ -64,8 +64,6 @@ def main(args):
     # Run requests
     tic = time.time()
     states = json_decode.run_batch(arguments, temperature=0, num_threads=args.parallel)
-    for state in states:
-        state.sync()
     latency = time.time() - tic
 
     # Compute accuracy
@@ -80,7 +78,7 @@ def main(args):
 
     with open(args.result_file, "a") as fout:
         value = {
-            "task": "json_regex_decode",
+            "task": "json_decode_regex",
             "backend": args.backend,
             "num_gpus": 1,
             "latency": round(latency, 3),

benchmark/latency_throughput/README.md

Lines changed: 1 addition & 14 deletions
@@ -3,19 +3,6 @@
 wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
 ```
 
-### Performance
-
-- Model: Llama-2-7b-chat-hf
-- `--num-prompts 2000 --request-rate 200`
-- On 4 A10 (24G) GPUs
-
-| Backend     | Throughput      | Latency  |
-| ----------- | --------------- | -------- |
-| srt         | 5.82 requests/s | 343.54 s |
-| vllm==0.2.6 | 3.93 requests/s | 509.08 s |
-| vllm==0.2.7 | 5.02 requests/s | 398.25 s |
-
-
 ### SGLang
 ```
 python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
@@ -28,7 +15,7 @@ python3 bench_throughput.py --backend srt --tokenizer meta-llama/Llama-2-7b-chat
 
 ### vLLM
 ```
-python3 -m vllm.entrypoints.api_server --model meta-llama/Llama-2-7b-chat-hf --disable-log-requests --swap-space 16
+python3 -m vllm.entrypoints.api_server --model meta-llama/Llama-2-7b-chat-hf --disable-log-requests --swap-space 16 --port 21000
 ```
 
 ```
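
`bench_throughput.py` drives the chosen server with a stream of requests. A common way such scripts pace traffic at a target requests-per-second rate is to sleep for exponentially distributed gaps between sends (a Poisson arrival process). The sketch below is a generic illustration with placeholder names, not the actual code of `bench_throughput.py`.

```
# Generic sketch of pacing requests at a target rate (requests per second)
# using exponential inter-arrival times; send_request is a placeholder,
# not the real client used by bench_throughput.py.
import asyncio
import random

async def send_request(prompt: str) -> None:
    # Placeholder for an HTTP call to the serving backend (e.g. port 21000).
    await asyncio.sleep(0)

async def run_benchmark(prompts, request_rate: float) -> None:
    tasks = []
    for prompt in prompts:
        tasks.append(asyncio.create_task(send_request(prompt)))
        # Exponential gaps between sends approximate a Poisson request stream.
        await asyncio.sleep(random.expovariate(request_rate))
    await asyncio.gather(*tasks)

asyncio.run(run_benchmark(["hello"] * 4, request_rate=200.0))
```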

benchmark/llm_judge/articles.jsonl

Lines changed: 25 additions & 0 deletions
Large diffs are not rendered by default.

benchmark/mmlu/bench_other.py

Lines changed: 3 additions & 0 deletions
@@ -95,6 +95,9 @@ def call_generate(prompt, temperature, max_tokens):
                                        max_tokens=max_tokens, temperature=0)
             return out["answer"]
 
+        # warmup
+        call_generate("Hello,", temperature=1.0, max_tokens=8)
+
     elif args.backend == "lmql":
         import lmql
         model = lmql.model("meta-llama/Llama-2-7b-chat-hf",
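
The warmup call added above (and the similar one in `benchmark/hellaswag/bench_other.py`) issues one untimed request so that one-time costs such as connection setup or model compilation are not charged to the measured run. A generic sketch of that pattern, with `call_generate` as a placeholder for the backend-specific call:

```
# Generic sketch of the warmup-before-timing pattern used in these scripts.
# call_generate here is a placeholder, not the benchmark's real backend call.
import time

def call_generate(prompt, temperature, max_tokens):
    return prompt  # placeholder: a real backend request goes here

# Warmup: absorb one-time setup costs outside the timed region.
call_generate("Hello,", temperature=1.0, max_tokens=8)

tic = time.time()
for question in ["q1", "q2", "q3"]:
    call_generate(question, temperature=0, max_tokens=64)
latency = time.time() - tic
print(f"latency: {latency:.3f} s")
```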
