diff --git a/benchmarks/inference/gpt-bench.py b/benchmarks/inference/gpt-bench.py
new file mode 100644
index 000000000000..fde708f63c4c
--- /dev/null
+++ b/benchmarks/inference/gpt-bench.py
@@ -0,0 +1,86 @@
+import torch
+import time
+import deepspeed
+import argparse
+from transformers import pipeline
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--model", "-m", type=str, help="hf model name")
+parser.add_argument("--deepspeed", action="store_true", help="use deepspeed inference")
+parser.add_argument("--dtype", type=str, default="fp16", help="fp16 or fp32")
+parser.add_argument("--max-tokens", type=int, default=50, help="max new tokens")
+parser.add_argument("--local_rank", type=int, default=0, help="local rank")
+parser.add_argument("--trials", type=int, default=30, help="number of trials")
+args = parser.parse_args()
+
+
+def print_latency(latency_set, title, warmup=3):
+    # trim warmup queries
+    latency_set = latency_set[warmup:]
+    count = len(latency_set)
+    if count > 0:
+        latency_set.sort()
+        # nearest-rank (1-based) indices for each percentile
+        n50 = (count - 1) * 0.5 + 1
+        n90 = (count - 1) * 0.9 + 1
+        n95 = (count - 1) * 0.95 + 1
+        n99 = (count - 1) * 0.99 + 1
+        n999 = (count - 1) * 0.999 + 1
+
+        avg = sum(latency_set) / count
+        p50 = latency_set[int(n50) - 1]
+        p90 = latency_set[int(n90) - 1]
+        p95 = latency_set[int(n95) - 1]
+        p99 = latency_set[int(n99) - 1]
+        p999 = latency_set[int(n999) - 1]
+
+        print(f"====== latency stats {title} ======")
+        print("\tAvg Latency: {0:8.2f} ms".format(avg * 1000))
+        print("\tP50 Latency: {0:8.2f} ms".format(p50 * 1000))
+        print("\tP90 Latency: {0:8.2f} ms".format(p90 * 1000))
+        print("\tP95 Latency: {0:8.2f} ms".format(p95 * 1000))
+        print("\tP99 Latency: {0:8.2f} ms".format(p99 * 1000))
+        print("\tP999 Latency: {0:8.2f} ms".format(p999 * 1000))
+
+
+# set up the distributed (NCCL) environment that DeepSpeed inference expects
+deepspeed.init_distributed("nccl")
+
+print(args.model, args.max_tokens, args.dtype)
+
+if args.dtype.lower() == "fp16":
+    dtype = torch.float16
+else:
+    dtype = torch.float32
+
+pipe = pipeline("text-generation",
+                model=args.model,
+                framework="pt",
+                device=args.local_rank)
+
+if dtype == torch.half:
+    pipe.model.half()
+
+# optionally wrap the HF model with DeepSpeed's inference engine (kernel injection)
+if args.deepspeed:
+    pipe.model = deepspeed.init_inference(pipe.model,
+                                          dtype=dtype,
+                                          replace_with_kernel_inject=True,
+                                          replace_method='auto')
+
+# time each generation end to end; the CUDA syncs keep the wall-clock numbers honest
+responses = []
+times = []
+for i in range(args.trials):
+    torch.cuda.synchronize()
+    start = time.time()
+    r = pipe("DeepSpeed is", max_new_tokens=args.max_tokens)
+    torch.cuda.synchronize()
+    end = time.time()
+    responses.append(r)
+    # rough per-token latency: total time over (max_tokens - 3), offsetting the 3-token prompt
+    times.append((end - start) / (args.max_tokens - 3))
+
+print_latency(times, "token latency")
+
+print(responses[0:3])
diff --git a/benchmarks/inference/requirements.txt b/benchmarks/inference/requirements.txt
new file mode 100644
index 000000000000..00899dd5f485
--- /dev/null
+++ b/benchmarks/inference/requirements.txt
@@ -0,0 +1 @@
+transformers>=4.21.3
diff --git a/benchmarks/inference/run_model.sh b/benchmarks/inference/run_model.sh
new file mode 100644
index 000000000000..5237f5bceddc
--- /dev/null
+++ b/benchmarks/inference/run_model.sh
@@ -0,0 +1,26 @@
+set -ex
+
+model=$1
+branch1=$2
+branch2=$3
+
+version=0
+log_path=results/${model}_v${version}
+mkdir -p ${log_path}
+
+# baseline: plain HF pipeline on the current checkout
+echo "baseline $log_path"
+deepspeed --num_gpus 1 gpt-bench.py -m "${model}" &> ${log_path}/baseline.log
+
+# DeepSpeed inference, once per branch under comparison
+cd ../../
+git checkout ${branch1}
+cd -
+echo "ds ${branch1} $log_path"
+deepspeed --num_gpus 1 gpt-bench.py --deepspeed -m "${model}" &> ${log_path}/ds-${branch1}.log
+
+cd ../../
+git checkout ${branch2}
+cd -
+echo "ds ${branch2} $log_path"
+deepspeed --num_gpus 1 gpt-bench.py --deepspeed -m "${model}" &> ${log_path}/ds-${branch2}.log
diff --git a/benchmarks/inference/sweep.sh b/benchmarks/inference/sweep.sh
new file mode 100644
index 000000000000..8a9bb41b5335
--- /dev/null
+++ b/benchmarks/inference/sweep.sh
@@ -0,0 +1,26 @@
+set -ex
+
+export TRANSFORMERS_CACHE=/tmp/hf-cache
+
+branch1=$1
+branch2=$2
+
+for m in EleutherAI/gpt-neo-2.7B EleutherAI/gpt-neo-1.3B EleutherAI/gpt-neo-125M; do
+    bash run_model.sh "$m" "$branch1" "$branch2"
+done
+
+for m in gpt2 gpt2-large gpt2-xl; do
+    bash run_model.sh "$m" "$branch1" "$branch2"
+done
+
+for m in EleutherAI/gpt-j-6B; do
+    bash run_model.sh "$m" "$branch1" "$branch2"
+done
+
+for m in facebook/opt-125m facebook/opt-1.3b facebook/opt-2.7b facebook/opt-6.7b facebook/opt-13b; do
+    bash run_model.sh "$m" "$branch1" "$branch2"
+done
+
+for m in bigscience/bloom-560m bigscience/bloom-1b7 bigscience/bloom-3b bigscience/bloom-7b1; do
+    bash run_model.sh "$m" "$branch1" "$branch2"
+done
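
Usage (a minimal sketch: "master" and "my-branch" are placeholder branch names, and the commands assume they are run from benchmarks/inference/ on a machine with at least one GPU):

    pip install -r requirements.txt
    bash run_model.sh gpt2 master my-branch    # one model: HF baseline + DeepSpeed on each branch
    bash sweep.sh master my-branch             # full sweep over the GPT-Neo/GPT-2/GPT-J/OPT/BLOOM set

Each run writes baseline.log, ds-master.log, and ds-my-branch.log under results/<model>_v0/, containing the per-token latency stats (avg and P50 through P999) printed by gpt-bench.py.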