81 changes: 81 additions & 0 deletions benchmarks/inference/gpt-bench.py
@@ -0,0 +1,81 @@
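# Benchmarks per-token generation latency for a HuggingFace causal LM,
# optionally through DeepSpeed inference. Example invocation (this is how
# run_model.sh below launches it):
#   deepspeed --num_gpus 1 gpt-bench.py -m gpt2 --deepspeed --dtype fp16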
import torch
import time
import deepspeed
import argparse
from transformers import pipeline

parser = argparse.ArgumentParser()
parser.add_argument("--model", "-m", type=str, help="hf model name")
parser.add_argument("--deepspeed", action="store_true", help="use deepspeed inference")
parser.add_argument("--dtype", type=str, default="fp16", help="fp16 or fp32")
parser.add_argument("--max-tokens", type=int, default=50, help="max new tokens")
parser.add_argument("--local_rank", type=int, default=0, help="local rank")
parser.add_argument("--trials", type=int, default=30, help="number of trials")
args = parser.parse_args()


def print_latency(latency_set, title, warmup=3):
    # trim warmup queries
    latency_set = latency_set[warmup:]
    count = len(latency_set)
    if count > 0:
        latency_set.sort()
        # 1-based percentile ranks (nearest-rank with truncation)
        n50 = (count - 1) * 0.5 + 1
        n90 = (count - 1) * 0.9 + 1
        n95 = (count - 1) * 0.95 + 1
        n99 = (count - 1) * 0.99 + 1
        n999 = (count - 1) * 0.999 + 1

        avg = sum(latency_set) / count
        p50 = latency_set[int(n50) - 1]
        p90 = latency_set[int(n90) - 1]
        p95 = latency_set[int(n95) - 1]
        p99 = latency_set[int(n99) - 1]
        p999 = latency_set[int(n999) - 1]

        print(f"====== latency stats {title} ======")
        print("\tAvg Latency: {0:8.2f} ms".format(avg * 1000))
        print("\tP50 Latency: {0:8.2f} ms".format(p50 * 1000))
        print("\tP90 Latency: {0:8.2f} ms".format(p90 * 1000))
        print("\tP95 Latency: {0:8.2f} ms".format(p95 * 1000))
        print("\tP99 Latency: {0:8.2f} ms".format(p99 * 1000))
        print("\tP999 Latency: {0:8.2f} ms".format(p999 * 1000))


# set up the distributed backend expected by the deepspeed launcher
deepspeed.init_distributed("nccl")

print(args.model, args.max_tokens, args.dtype)

if args.dtype.lower() == "fp16":
    dtype = torch.float16
else:
    dtype = torch.float32

pipe = pipeline("text-generation",
                model=args.model,
                framework="pt",
                device=args.local_rank)

if dtype == torch.half:
    pipe.model.half()

if args.deepspeed:
    # replace supported modules with DeepSpeed's fused inference kernels
    pipe.model = deepspeed.init_inference(pipe.model,
                                          dtype=dtype,
                                          replace_with_kernel_inject=True,
                                          replace_method='auto')

responses = []
times = []
for i in range(args.trials):
    torch.cuda.synchronize()
    start = time.time()
    r = pipe("DeepSpeed is", max_new_tokens=args.max_tokens)
    torch.cuda.synchronize()
    end = time.time()
    responses.append(r)
    # normalize to per-token latency; the `- 3` treats a few tokens as warmup/overhead
    times.append((end - start) / (args.max_tokens - 3))

print_latency(times, "token latency")

print(responses[0:3])
1 change: 1 addition & 0 deletions benchmarks/inference/requirements.txt
@@ -0,0 +1 @@
transformers>=4.21.3
24 changes: 24 additions & 0 deletions benchmarks/inference/run_model.sh
@@ -0,0 +1,24 @@
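# Usage: bash run_model.sh <hf-model-name> <branch1> <branch2>
# Assumes it is run from benchmarks/inference inside a DeepSpeed checkout
# installed in editable mode, so `git checkout` two levels up switches the
# DeepSpeed code that gets benchmarked.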
set -ex

model=$1
branch1=$2
branch2=$3

version=0
log_path=results/${model}_v${version}
mkdir -p ${log_path}

echo "baseline $log_path"
deepspeed --num_gpus 1 gpt-bench.py -m "${model}" &> ${log_path}/baseline.log

cd ../../
git checkout ${branch1}
cd -
echo "ds ${branch1} $log_path"
deepspeed --num_gpus 1 gpt-bench.py --deepspeed -m "${model}" &> ${log_path}/ds-${branch1}.log

cd ../../
git checkout ${branch2}
cd -
echo "ds ${branch2} $log_path"
deepspeed --num_gpus 1 gpt-bench.py --deepspeed -m "${model}" &> ${log_path}/ds-${branch2}.log
26 changes: 26 additions & 0 deletions benchmarks/inference/sweep.sh
@@ -0,0 +1,26 @@
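# Usage: bash sweep.sh <branch1> <branch2>
# Sweeps a set of GPT-style model families, benchmarking each model on both
# branches via run_model.sh.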
set -ex

export TRANSFORMERS_CACHE=/tmp/hf-cache

branch1=$1
branch2=$2

for m in EleutherAI/gpt-neo-2.7B EleutherAI/gpt-neo-1.3B EleutherAI/gpt-neo-125M; do
    bash run_model.sh $m $branch1 $branch2
done

for m in gpt2 gpt2-large gpt2-xl; do
    bash run_model.sh $m $branch1 $branch2
done

for m in EleutherAI/gpt-j-6B; do
    bash run_model.sh $m $branch1 $branch2
done

for m in facebook/opt-125m facebook/opt-1.3b facebook/opt-2.7b facebook/opt-6.7b facebook/opt-13b; do
    bash run_model.sh $m $branch1 $branch2
done

for m in bigscience/bloom-560m bigscience/bloom-1b7 bigscience/bloom-3b bigscience/bloom-7b1; do
    bash run_model.sh $m $branch1 $branch2
done