Commit 30fb9b4

Mini testbench for various cases (service, model, num_device)
1 parent baf656f commit 30fb9b4

4 files changed: +117 −0 lines changed

examples/lpu_inference.py (+23)
@@ -0,0 +1,23 @@
from vllm import LLM, SamplingParams

# Sample prompts.
prompts = [
    "Hello, my name is"
]
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, top_k=1, min_tokens=30, max_tokens=30)

# Create an LLM.
#llm = LLM(model="facebook/opt-1.3b", device="fpga", pipeline_parallel_size=2)
llm = LLM(model="meta-llama/Meta-Llama-3-8B", device="fpga", tensor_parallel_size=1)
#llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", device="fpga", tensor_parallel_size=1)

# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)

# Print the outputs.
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

examples/lpu_inference_arg.py (+24)
@@ -0,0 +1,24 @@
from vllm import LLM, SamplingParams
import argparse

# Parse command-line arguments.
parser = argparse.ArgumentParser(description='vLLM Inference Test Script')
parser.add_argument("-m", "--model", default="facebook/opt-1.3b", type=str, help="name of the language model")
parser.add_argument("-n", "--ncore", default=1, type=int, help="number of LPU devices to use (tensor parallel size)")
parser.add_argument("-i", "--i_token", default="Hello, my name is", type=str, help="input prompt")
parser.add_argument("-o", "--o_token", default=32, type=int, help="number of output tokens")
args = parser.parse_args()

# Sample prompts.
prompts = [args.i_token]

# Create a sampling params object and an LLM.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, top_k=1, max_tokens=args.o_token)
llm = LLM(model=args.model, device="fpga", tensor_parallel_size=args.ncore)

# Run and print the outputs.
outputs = llm.generate(prompts, sampling_params)
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

examples/mini_testbench.sh (+69)
@@ -0,0 +1,69 @@
#!/bin/bash
# Mini testbench for various cases (service, model, num_device).
# Each run's log goes to log/, and its last line is appended to a summary file.

log_sum="log/service_model_device.txt"
mkdir -p log

model_ids=("TinyLlama/TinyLlama-1.1B-Chat-v1.0") # "facebook/opt-1.3b" "huggyllama/llama-7b")
num_devices=(1 2 4)

current_datetime=$(date "+%Y-%m-%d %H:%M:%S")
echo "$current_datetime"
echo "$current_datetime" >> ${log_sum}

# Offline-inference sweep (disabled); remove the heredoc guard to re-enable it.
: <<'DISABLED_INFERENCE_SWEEP'
for model_id in "${model_ids[@]}"; do
    for num_device in "${num_devices[@]}"; do
        #IFS='\' read -ra parts <<< "$model_id"
        #model_name="${parts[-1]}"
        model_name=$(echo "$model_id" | awk -F'/' '{print $NF}')
        echo "*********************************"
        echo "**** Start inference_${model_name}_${num_device}"
        echo "*********************************"
        python lpu_inference_arg.py -m ${model_id} -n ${num_device} > log/inference_${model_name}_${num_device}.txt
        echo "*********************************" >> ${log_sum}
        echo "The Result of log/inference_${model_name}_${num_device}.txt" >> ${log_sum}
        tail -n 1 "log/inference_${model_name}_${num_device}.txt" >> ${log_sum}
        echo "" >> ${log_sum}
    done
done
DISABLED_INFERENCE_SWEEP

# API-server (serving) sweep.
for model_id in "${model_ids[@]}"; do
    for num_device in "${num_devices[@]}"; do
        model_name=$(echo "$model_id" | awk -F'/' '{print $NF}')
        echo "*********************************"
        echo "**** Start serving_${model_name}_${num_device}"
        echo "*********************************"
        python -m vllm.entrypoints.api_server --model ${model_id} --device fpga --tensor-parallel-size ${num_device} &

        # Wait until the server accepts connections on port 8000.
        while ! nc -z localhost "8000"; do
            echo "Waiting for server..."
            sleep 3
        done
        echo "The server is ready!"

        python lpu_client.py > log/vllm_serve_${model_name}_${num_device}.txt

        # Stop the server and wait until the process has exited.
        PID=$(jobs -p | tail -n 1)
        if [ -n "$PID" ]; then
            kill -SIGINT "$PID"
            while true; do
                if ps -p "$PID" > /dev/null; then
                    echo "Killing the process..."
                    sleep 3
                else
                    echo "Process (PID: $PID) is killed."
                    break
                fi
            done
        fi

        # Append the last line of the run log to the summary file.
        echo "*********************************" >> ${log_sum}
        echo "The Result of log/vllm_serve_${model_name}_${num_device}.txt" >> ${log_sum}
        tail -n 1 "log/vllm_serve_${model_name}_${num_device}.txt" >> ${log_sum}
        echo "" >> ${log_sum}
    done
done
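For reference, a sketch of how the testbench might be driven. The lpu_client.py helper it calls is not part of this commit, so the curl request below is only an assumed equivalent of what such a client would send to the /generate endpoint of vllm.entrypoints.api_server; the payload fields are assumptions, not taken from this repository:

# run from the examples/ directory; results are written under examples/log/
cd examples
bash mini_testbench.sh

# assumed stand-in for lpu_client.py: a single completion request
curl -s http://localhost:8000/generate \
     -H "Content-Type: application/json" \
     -d '{"prompt": "Hello, my name is", "max_tokens": 32, "temperature": 0.8}'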

scripts/install_script.sh (+1)
@@ -1,6 +1,7 @@
 
 site_packages=$(python3 -c "import site; print(site.getsitepackages()[0])")
 
+pip install numpy==1.26.0
 pip install vllm==0.5.5
 pip install mistral_common
 echo "Start move vllm to ${site_packages}"
