diff --git a/benchmarks/inference/gpt-bench.py b/benchmarks/inference/gpt-bench.py
new file mode 100644
index 000000000000..fde708f63c4c
--- /dev/null
+++ b/benchmarks/inference/gpt-bench.py
@@ -0,0 +1,86 @@
+import torch
+import time
+import deepspeed
+import argparse
+from transformers import pipeline
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--model", "-m", type=str, help="hf model name")
+parser.add_argument("--deepspeed", action="store_true", help="use deepspeed inference")
+parser.add_argument("--dtype", type=str, default="fp16", help="fp16 or fp32")
+parser.add_argument("--max-tokens", type=int, default=50, help="max new tokens")
+parser.add_argument("--local_rank", type=int, default=0, help="local rank")
+parser.add_argument("--trials", type=int, default=30, help="number of trials")
+args = parser.parse_args()
+
+
+def print_latency(latency_set, title, warmup=3):
+    # trim warmup queries
+    latency_set = latency_set[warmup:]
+    count = len(latency_set)
+    if count > 0:
+        latency_set.sort()
+        # nearest-rank (1-based) indices for each percentile
+        n50 = (count - 1) * 0.5 + 1
+        n90 = (count - 1) * 0.9 + 1
+        n95 = (count - 1) * 0.95 + 1
+        n99 = (count - 1) * 0.99 + 1
+        n999 = (count - 1) * 0.999 + 1
+
+        avg = sum(latency_set) / count
+        p50 = latency_set[int(n50) - 1]
+        p90 = latency_set[int(n90) - 1]
+        p95 = latency_set[int(n95) - 1]
+        p99 = latency_set[int(n99) - 1]
+        p999 = latency_set[int(n999) - 1]
+
+        print(f"====== latency stats {title} ======")
+        print("\tAvg Latency: {0:8.2f} ms".format(avg * 1000))
+        print("\tP50 Latency: {0:8.2f} ms".format(p50 * 1000))
+        print("\tP90 Latency: {0:8.2f} ms".format(p90 * 1000))
+        print("\tP95 Latency: {0:8.2f} ms".format(p95 * 1000))
+        print("\tP99 Latency: {0:8.2f} ms".format(p99 * 1000))
+        print("\tP999 Latency: {0:8.2f} ms".format(p999 * 1000))
+
+
+# set up the distributed (NCCL) environment that DeepSpeed inference expects
+deepspeed.init_distributed("nccl")
+
+print(args.model, args.max_tokens, args.dtype)
+
+if args.dtype.lower() == "fp16":
+    dtype = torch.float16
+else:
+    dtype = torch.float32
+
+pipe = pipeline("text-generation",
+                model=args.model,
+                framework="pt",
+                device=args.local_rank)
+
+if dtype == torch.half:
+    pipe.model.half()
+
+# optionally wrap the HF model with DeepSpeed's inference engine (kernel injection)
+if args.deepspeed:
+    pipe.model = deepspeed.init_inference(pipe.model,
+                                          dtype=dtype,
+                                          replace_with_kernel_inject=True,
+                                          replace_method='auto')
+
+# time each generation end to end; the CUDA syncs keep the wall-clock numbers honest
+responses = []
+times = []
+for i in range(args.trials):
+    torch.cuda.synchronize()
+    start = time.time()
+    r = pipe("DeepSpeed is", max_new_tokens=args.max_tokens)
+    torch.cuda.synchronize()
+    end = time.time()
+    responses.append(r)
+    # rough per-token latency: total time over (max_tokens - 3), offsetting the 3-token prompt
+    times.append((end - start) / (args.max_tokens - 3))
+
+print_latency(times, "token latency")
+
+print(responses[0:3])
diff --git a/benchmarks/inference/requirements.txt b/benchmarks/inference/requirements.txt
new file mode 100644
index 000000000000..00899dd5f485
--- /dev/null
+++ b/benchmarks/inference/requirements.txt
@@ -0,0 +1 @@
+transformers>=4.21.3
diff --git a/benchmarks/inference/run_model.sh b/benchmarks/inference/run_model.sh
new file mode 100644
index 000000000000..5237f5bceddc
--- /dev/null
+++ b/benchmarks/inference/run_model.sh
@@ -0,0 +1,26 @@
+set -ex
+
+model=$1
+branch1=$2
+branch2=$3
+
+version=0
+log_path=results/${model}_v${version}
+mkdir -p ${log_path}
+
+# baseline: plain HF pipeline on the current checkout
+echo "baseline $log_path"
+deepspeed --num_gpus 1 gpt-bench.py -m "${model}" &> ${log_path}/baseline.log
+
+# DeepSpeed inference, once per branch under comparison
+cd ../../
+git checkout ${branch1}
+cd -
+echo "ds ${branch1} $log_path"
+deepspeed --num_gpus 1 gpt-bench.py --deepspeed -m "${model}" &> ${log_path}/ds-${branch1}.log
+
+cd ../../
+git checkout ${branch2}
+cd -
+echo "ds ${branch2} $log_path"
+deepspeed --num_gpus 1 gpt-bench.py --deepspeed -m "${model}" &> ${log_path}/ds-${branch2}.log
diff --git a/benchmarks/inference/sweep.sh b/benchmarks/inference/sweep.sh
new file mode 100644
index 000000000000..8a9bb41b5335
--- /dev/null
+++ b/benchmarks/inference/sweep.sh
@@ -0,0 +1,26 @@
+set -ex
+
+export TRANSFORMERS_CACHE=/tmp/hf-cache
+
+branch1=$1
+branch2=$2
+
+for m in EleutherAI/gpt-neo-2.7B EleutherAI/gpt-neo-1.3B EleutherAI/gpt-neo-125M; do
+    bash run_model.sh "$m" "$branch1" "$branch2"
+done
+
+for m in gpt2 gpt2-large gpt2-xl; do
+    bash run_model.sh "$m" "$branch1" "$branch2"
+done
+
+for m in EleutherAI/gpt-j-6B; do
+    bash run_model.sh "$m" "$branch1" "$branch2"
+done
+
+for m in facebook/opt-125m facebook/opt-1.3b facebook/opt-2.7b facebook/opt-6.7b facebook/opt-13b; do
+    bash run_model.sh "$m" "$branch1" "$branch2"
+done
+
+for m in bigscience/bloom-560m bigscience/bloom-1b7 bigscience/bloom-3b bigscience/bloom-7b1; do
+    bash run_model.sh "$m" "$branch1" "$branch2"
+done
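
Usage (a minimal sketch: "master" and "my-branch" are placeholder branch names, and the commands assume they are run from benchmarks/inference/ on a machine with at least one GPU):

    pip install -r requirements.txt
    bash run_model.sh gpt2 master my-branch    # one model: HF baseline + DeepSpeed on each branch
    bash sweep.sh master my-branch             # full sweep over the GPT-Neo/GPT-2/GPT-J/OPT/BLOOM set

Each run writes baseline.log, ds-master.log, and ds-my-branch.log under results/<model>_v0/, containing the per-token latency stats (avg and P50 through P999) printed by gpt-bench.py.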