Skip to content

Commit 05c531b

Browse files
authored
[Misc] Improved prefix cache example (#9077)
1 parent fbb7442 commit 05c531b

File tree

1 file changed: +3 additions, −9 deletions

examples/offline_inference_with_prefix.py

+3-9
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
1-
from time import time
2-
31
from vllm import LLM, SamplingParams
42

3+
# NOTE: This is just a running example. For benchmarking purpose,
4+
# please see benchmarks/benchmark_prefix_caching.py
5+
56
# Common prefix.
67
prefix = (
78
"You are an expert school principal, skilled in effectively managing "
@@ -37,9 +38,7 @@
3738

3839
# Generate texts from the prompts. The output is a list of RequestOutput objects
3940
# that contain the prompt, generated text, and other information.
40-
start_time_regular = time()
4141
outputs = regular_llm.generate(generating_prompts, sampling_params)
42-
duration_regular = time() - start_time_regular
4342

4443
regular_generated_texts = []
4544
# Print the outputs.
@@ -55,9 +54,7 @@
5554
prefix_cached_llm.generate(generating_prompts[0], sampling_params)
5655

5756
# Generate with prefix caching.
58-
start_time_cached = time()
5957
outputs = prefix_cached_llm.generate(generating_prompts, sampling_params)
60-
duration_cached = time() - start_time_cached
6158

6259
print("Results with `enable_prefix_caching`")
6360

@@ -77,6 +74,3 @@
7774
for i in range(len(prompts))
7875
])
7976
print(f"Generated answers are the same: {generated_same}")
80-
81-
speedup = round(duration_regular / duration_cached, 2)
82-
print(f"Speed up of cached generation compared to the regular is: {speedup}")

0 commit comments

Comments (0)