1 file changed: +3 −9 lines changed
@@ -1,7 +1,8 @@
-from time import time
-
 from vllm import LLM, SamplingParams
 
+# NOTE: This is just a running example. For benchmarking purposes,
+# please see benchmarks/benchmark_prefix_caching.py
+
 # Common prefix.
 prefix = (
     "You are an expert school principal, skilled in effectively managing "
@@ -38,8 +39,6 @@
 # Generate texts from the prompts. The output is a list of RequestOutput objects
 # that contain the prompt, generated text, and other information.
-start_time_regular = time()
 outputs = regular_llm.generate(generating_prompts, sampling_params)
-duration_regular = time() - start_time_regular
 
 regular_generated_texts = []
 # Print the outputs.
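The loop that fills `regular_generated_texts` falls outside this hunk's context window. Based on the identifiers shown and vLLM's `RequestOutput` API, it plausibly looks like the following sketch (not the file's verbatim code):

```python
# Collect the baseline generations and print them.
regular_generated_texts = []
for output in outputs:
    generated_text = output.outputs[0].text
    regular_generated_texts.append(generated_text)
    print(f"Prompt: {output.prompt!r}, Generated text: {generated_text!r}")
```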
@@ -55,8 +54,6 @@
 prefix_cached_llm.generate(generating_prompts[0], sampling_params)
 
 # Generate with prefix caching.
-start_time_cached = time()
 outputs = prefix_cached_llm.generate(generating_prompts, sampling_params)
-duration_cached = time() - start_time_cached
 
 print("Results with `enable_prefix_caching`")
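Note the lone `prefix_cached_llm.generate(generating_prompts[0], sampling_params)` call at the top of this hunk: it appears to serve as a warmup, running a single prompt first so the shared prefix's KV blocks are computed and cached before the full batch is generated.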
@@ -77,6 +74,3 @@
     for i in range(len(prompts))
 ])
 print(f"Generated answers are the same: {generated_same}")
-
-speedup = round(duration_regular / duration_cached, 2)
-print(f"Speed up of cached generation compared to the regular is: {speedup}")