Merged
2 changes: 1 addition & 1 deletion benchmark/dspy/README.md
@@ -23,7 +23,7 @@ python3 bench_dspy_intro.py --backend sglang
```
docker run --name tgi --rm -ti --gpus all --network host \
-v /home/ubuntu/model_weights/Llama-2-7b-chat-hf:/Llama-2-7b-chat-hf \
-ghcr.io/huggingface/text-generation-inference:1.1.0 \
+ghcr.io/huggingface/text-generation-inference:1.3.0 \
--model-id /Llama-2-7b-chat-hf --num-shard 1 --trust-remote-code \
--max-input-length 2048 --max-total-tokens 4096 \
--port 24000
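For reference, a minimal reachability check for the TGI container started above, assuming TGI's standard `/generate` route on the configured port 24000 (a sketch, not code from this repository):

```python
# Minimal smoke test for the TGI server launched above.
# Assumption: TGI exposes POST /generate on port 24000 (the --port passed to the container).
import requests

resp = requests.post(
    "http://127.0.0.1:24000/generate",
    json={
        "inputs": "Hello, my name is",
        "parameters": {"max_new_tokens": 16},
    },
    timeout=60,
)
resp.raise_for_status()
print(resp.json().get("generated_text"))  # generated text field per TGI's generate API
```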
4 changes: 3 additions & 1 deletion benchmark/hellaswag/bench_other.py
@@ -57,6 +57,8 @@ def call_select(context, choices):
out = model + context + select(choices, name="answer")
return choices.index(out["answer"])

+call_select("Hello,", ["world", "earth"])
+
elif args.backend == "lmql":
import lmql
model = lmql.model("meta-llama/Llama-2-7b-chat-hf",
@@ -135,6 +137,6 @@ async def batched_call(batch_size):
parser = argparse.ArgumentParser()
parser.add_argument("--num-shot", type=int, default=20)
parser.add_argument("--data-path", type=str, default="hellaswag_val.jsonl")
-parser.add_argument("--num-questions", type=int, default=100)
+parser.add_argument("--num-questions", type=int, default=200)
args = add_common_other_args_and_parse(parser)
main(args)
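The untimed `call_select` warmup added above keeps one-time backend setup (model load, compilation, cache priming) out of the measured latency. A generic sketch of the pattern, with a stand-in backend call (hypothetical harness, not code from this benchmark):

```python
import time

def call_select(context, choices):
    # Stand-in for the real backend call being benchmarked.
    return 0

# Untimed warmup: absorbs one-time setup cost so it does not skew the timing below.
call_select("Hello,", ["world", "earth"])

tic = time.time()
for _ in range(100):
    call_select("Hello,", ["world", "earth"])
latency = time.time() - tic
print(f"Latency: {latency:.3f} s")
```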
2 changes: 1 addition & 1 deletion benchmark/hellaswag/bench_sglang.py
@@ -91,6 +91,6 @@ def few_shot_hellaswag(s, question, choices):
parser = argparse.ArgumentParser()
parser.add_argument("--num-shot", type=int, default=20)
parser.add_argument("--data-path", type=str, default="hellaswag_val.jsonl")
-parser.add_argument("--num-questions", type=int, default=100)
+parser.add_argument("--num-questions", type=int, default=200)
args = add_common_sglang_args_and_parse(parser)
main(args)
(another changed file; file path not shown)
@@ -17,14 +17,13 @@ outlines 0.0.22

### Benchmark sglang

-Run llama-7b
+Run Llama-7B

```
python3 -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
```

-Run mixtral-8x7b
-(When there is a CUDA out-of-memory error, try to reduce the `--mem-fraction-static`)
+Run Mixtral-8x7B

```
python3 -m sglang.launch_server --model-path mistralai/Mixtral-8x7B-Instruct-v0.1 --port 30000 --tp-size 8
@@ -39,7 +38,7 @@ python3 bench_sglang.py --num-questions 10

### Benchmark vllm

-Run llama-7b
+Run Llama-7B

```
python3 -m outlines.serve.serve --tokenizer-mode auto --model meta-llama/Llama-2-7b-chat-hf --disable-log-requests --port 21000
@@ -54,8 +53,8 @@ python3 bench_other.py --backend vllm --num-questions 10

### Benchmark guidance

-Run llama-7b and benchmark
+Run Llama-7B and benchmark

```
python3 bench_other.py --backend guidance --num-questions 10 --parallel 1
```
```
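For context, this benchmark constrains generation to a JSON shape. A minimal sketch of regex-constrained decoding with sglang's frontend, assuming a server is already running on port 30000 as launched above; the regex here is illustrative, not the benchmark's actual pattern:

```python
import sglang as sgl

# Illustrative regex for a tiny JSON object; the benchmark's real pattern lives in bench_sglang.py.
CITY_REGEX = r'\{"name": "[\w ]+", "population": [0-9]+\}'

@sgl.function
def city_json(s, name):
    s += "Describe " + name + " as JSON.\n"
    s += sgl.gen("json_output", max_tokens=64, regex=CITY_REGEX)

if __name__ == "__main__":
    sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))
    state = city_json.run(name="Paris", temperature=0)
    print(state["json_output"])
```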
(another changed file; file path not shown)
@@ -105,7 +105,7 @@ def get_one_answer(i):

with open(args.result_file, "a") as fout:
value = {
-"task": "json_regex_decode",
+"task": "json_decode_regex",
"backend": args.backend,
"num_gpus": 1,
"latency": round(latency, 3),
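Both scripts in this benchmark append a record like the one above to a shared result file, so the renamed `task` label presumably needs to match between them for downstream grouping. A sketch of how such a record typically lands in a JSONL file (the `json.dumps` append and the file name are assumptions; only the fields visible in this diff are shown):

```python
import json

def append_result(result_file, latency, backend):
    # Only the fields visible in this diff; the real scripts record more.
    value = {
        "task": "json_decode_regex",  # renamed label, kept identical in bench_other.py and bench_sglang.py
        "backend": backend,
        "num_gpus": 1,
        "latency": round(latency, 3),
    }
    with open(result_file, "a") as fout:
        fout.write(json.dumps(value) + "\n")

append_result("results.jsonl", 12.3456, "sglang")
```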
(another changed file; file path not shown)
@@ -64,8 +64,6 @@ def main(args):
# Run requests
tic = time.time()
states = json_decode.run_batch(arguments, temperature=0, num_threads=args.parallel)
-for state in states:
-    state.sync()
latency = time.time() - tic

# Compute accuracy
@@ -80,7 +78,7 @@

with open(args.result_file, "a") as fout:
value = {
-"task": "json_regex_decode",
+"task": "json_decode_regex",
"backend": args.backend,
"num_gpus": 1,
"latency": round(latency, 3),
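With the explicit `state.sync()` loop gone, the timing above relies on `run_batch` returning only after all requests have finished. A minimal sketch of that timing pattern (the argument list and prompt are hypothetical; the real ones are built in bench_sglang.py):

```python
import time
import sglang as sgl

@sgl.function
def json_decode(s, document):
    s += "Summarize as JSON: " + document + "\n"
    s += sgl.gen("json_output", max_tokens=64)

if __name__ == "__main__":
    sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))
    arguments = [{"document": f"document {i}"} for i in range(8)]  # hypothetical inputs

    tic = time.time()
    # Assumption: run_batch blocks until every state has finished generating,
    # so no per-state sync loop is needed before reading the clock.
    states = json_decode.run_batch(arguments, temperature=0, num_threads=4)
    latency = time.time() - tic
    print(f"Latency: {latency:.3f} s for {len(states)} requests")
```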
15 changes: 1 addition & 14 deletions benchmark/latency_throughput/README.md
@@ -3,19 +3,6 @@
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
```

-### Performance
-
-- Model: Llama-2-7b-chat-hf
-- `--num-prompts 2000 --request-rate 200`
-- On 4 A10 (24G) GPUs
-
-| Backend | Throughput | Latency |
-| ----------- | --------------- | -------- |
-| srt | 5.82 requests/s | 343.54 s |
-| vllm==0.2.6 | 3.93 requests/s | 509.08 s |
-| vllm==0.2.7 | 5.02 requests/s | 398.25 s |
-

### SGLang
```
python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
@@ -28,7 +15,7 @@ python3 bench_throughput.py --backend srt --tokenizer meta-llama/Llama-2-7b-chat

### vLLM
```
-python3 -m vllm.entrypoints.api_server --model meta-llama/Llama-2-7b-chat-hf --disable-log-requests --swap-space 16
+python3 -m vllm.entrypoints.api_server --model meta-llama/Llama-2-7b-chat-hf --disable-log-requests --swap-space 16 --port 21000
```

```
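Before running bench_throughput.py against the vLLM server above, it can help to confirm it is reachable on the non-default port. A minimal probe, assuming the demo api_server's `/generate` route and request schema (which may differ across vLLM versions):

```python
# Quick reachability check for the vLLM api_server started above on port 21000.
# Assumption: the demo server accepts POST /generate with a prompt plus sampling
# parameters; adjust the payload if your vLLM version expects something else.
import requests

resp = requests.post(
    "http://127.0.0.1:21000/generate",
    json={"prompt": "Hello, my name is", "max_tokens": 16, "temperature": 0.0},
    timeout=60,
)
resp.raise_for_status()
print(resp.json())
```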
25 changes: 25 additions & 0 deletions benchmark/llm_judge/articles.jsonl

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions benchmark/mmlu/bench_other.py
@@ -95,6 +95,9 @@ def call_generate(prompt, temperature, max_tokens):
max_tokens=max_tokens, temperature=0)
return out["answer"]

+# warmup
+call_generate("Hello,", temperature=1.0, max_tokens=8)
+
elif args.backend == "lmql":
import lmql
model = lmql.model("meta-llama/Llama-2-7b-chat-hf",
(another changed file; file path not shown)
@@ -1,12 +1,12 @@
### Benchmark sglang

-Run llama-7b
+Run Llama-7B

```
python3 -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
```

-Run mixtral-8x7b
+Run Mixtral-8x7B
(When there is a CUDA out-of-memory error, try to reduce the `--mem-fraction-static`)

```
@@ -27,13 +27,13 @@ python3 bench_sglang.py --tokenizer meta-llama/Llama-2-7b-chat-hf --long

### Benchmark vLLM

-Run llama-7b
+Run Llama-7B

```
python3 -m vllm.entrypoints.api_server --tokenizer-mode auto --model meta-llama/Llama-2-7b-chat-hf --disable-log-requests --port 21000
```

-Run mixtral-8x7b
+Run Mixtral-8x7B

```
python3 -m vllm.entrypoints.api_server --tokenizer-mode auto --model mistralai/Mixtral-8x7B-Instruct-v0.1 --disable-log-requests --port 21000 --tensor-parallel-size 8
@@ -53,14 +53,14 @@ python3 bench_other.py --tokenizer meta-llama/Llama-2-7b-chat-hf --backend vllm

### Benchmark guidance

-Benchmark llama-7b(short output)
+Benchmark Llama-7B (short output)

```
python3 bench_other.py --tokenizer meta-llama/Llama-2-7b-chat-hf --backend guidance --parallel 1
```

-Benchmark llama-7b(long output)
+Benchmark Llama-7B (long output)

```
python3 bench_other.py --tokenizer meta-llama/Llama-2-7b-chat-hf --backend guidance --parallel 1 --long
```
```
(another changed file; file path not shown)
@@ -99,7 +99,7 @@ def get_one_answer(i):

with open(args.result_file, "a") as fout:
value = {
-"task": "multi_turns",
+"task": "multi_turn_chat",
"backend": args.backend,
"num_gpus": 1,
"latency": round(latency, 3),
(another changed file; file path not shown)
@@ -21,8 +21,6 @@ def multi_turns(s, qas):


def main(args):
-print(args)
-
tokenizer = get_tokenizer(args.tokenizer, trust_remote_code=args.trust_remote_code)

multi_qas = gen_arguments(args, tokenizer)
@@ -33,8 +31,6 @@ def main(args):
states = multi_turns.run_batch(
multi_qas, temperature=0, backend=backend, num_threads=args.parallel
)
-for state in states:
-    state.sync()
latency = time.time() - tic

print(f"Latency: {latency:.3f}")
@@ -43,7 +39,7 @@ def main(args):

with open(args.result_file, "a") as fout:
value = {
-"task": "multi_turns",
+"task": "multi_turn_chat",
"backend": args.backend,
"num_gpus": 1,
"latency": round(latency, 3),
@@ -74,4 +70,6 @@ def main(args):
args.min_len_a = 256
args.max_len_a = 512
args.num_qa = 20
+
+print(args)
main(args)
2 changes: 2 additions & 0 deletions benchmark/react/README.md
@@ -1,5 +1,7 @@
## Run benchmark

+NOTE: This is an implementation for replaying a given trace for throughput/latency benchmark purposes. It is not an actual ReAct agent implementation.
+
### Benchmark sglang
```
python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
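The NOTE added above separates trace replay from a live agent: no tools are called, and pre-recorded observations are fed back so every backend generates against the same prompts. A hedged sketch of the idea (names and structure are illustrative, not taken from bench_other.py):

```python
# Illustrative trace replay: "observations" come from a recorded trace, not from
# executing tools, so the workload is identical across backends.
def replay_trace(generate, question, recorded_observations):
    prompt = f"Question: {question}\n"
    for i, observation in enumerate(recorded_observations, start=1):
        prompt += f"Thought {i}:"
        # The model produces a thought/action, but the action is never executed.
        prompt += generate(prompt, max_tokens=200, stop="Observation")
        # The pre-recorded observation is appended regardless of what was generated.
        prompt += f"\nObservation {i}: {observation}\n"
    return prompt

def fake_generate(prompt, max_tokens, stop):
    # Stand-in for a real backend call.
    return " I should look this up. Action: Search[topic]"

print(replay_trace(fake_generate, "Who wrote Hamlet?", ["Shakespeare wrote Hamlet."]))
```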
3 changes: 3 additions & 0 deletions benchmark/react/bench_other.py
@@ -124,6 +124,9 @@ def call_generate(prompt, temperature, max_tokens, stop):
))
return out["result"]

+# warmup
+call_generate("Hello,", 1.0, 8, ".")
+
else:
raise ValueError(f"Invalid backend: {args.backend}")

3 changes: 2 additions & 1 deletion benchmark/react/bench_sglang.py
@@ -82,9 +82,10 @@ def webthink(s, question, triplets):
""" + question)
for i in range(1, len(triplets) + 2):
s += "Thought " + str(i) + ":"
+# NOTE: This is an implementation for replaying a given trace for benchmark purposes. It is not an actual ReAct agent implementation.
ss = s.fork(1)
ss[0] += sgl.gen(name="thought_action", max_tokens=200, stop="Observation")
-# ss.join()
+ss.join()
# to verify the correctness of output, this should be collected
# print(ss[0]["thought_action"])
if i > len(triplets):
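For readers unfamiliar with the fork/join pattern restored above: `fork` creates child branches of the current state, generation happens in a branch, and `join` waits for the branch to finish before the parent continues. A minimal sketch, assuming a running sglang server on port 30000 and an illustrative prompt:

```python
import sglang as sgl

@sgl.function
def webthink_step(s, question):
    s += "Question: " + question + "\nThought 1:"
    # fork(1) creates one child branch of the current state; generation happens
    # in the branch, and join() blocks until that generation has finished.
    ss = s.fork(1)
    ss[0] += sgl.gen("thought_action", max_tokens=200, stop="Observation")
    ss.join()
    # After join(), the branch's output is available, e.g. ss[0]["thought_action"].

if __name__ == "__main__":
    sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))
    webthink_step.run(question="What is the capital of France?")
```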