
Commit 12dd715

[misc] [doc] [frontend] LLM torch profiler support (vllm-project#7943)
1 parent 29f49cd commit 12dd715

File tree

6 files changed: +74 −3 lines


docs/source/dev/profiling/profiling_index.rst

+17 −3
@@ -17,14 +17,28 @@ Traces can be visualized using https://ui.perfetto.dev/.
 .. tip::

     Only send a few requests through vLLM when profiling, as the traces can get quite large. Also, no need to untar the traces, they can be viewed directly.
-
-Example commands:
+
+.. tip::
+
+    When you stop the profiler, it flushes all of the profile trace files to the directory. This takes time: for example, about 100 requests' worth of data for a Llama 70B takes roughly 10 minutes to flush out on an H100.
+    Set the environment variable VLLM_RPC_GET_DATA_TIMEOUT_MS to a large value before you start the server, e.g. 30 minutes:
+    ``export VLLM_RPC_GET_DATA_TIMEOUT_MS=1800000``
+
+Example commands and usage:
+===========================
+
+Offline Inference:
+------------------
+
+Refer to `examples/offline_inference_with_profiler.py <https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_with_profiler.py>`_ for an example.
+

 OpenAI Server:
+--------------

 .. code-block:: bash

-    VLLM_TORCH_PROFILER_DIR=/mnt/traces/ python -m vllm.entrypoints.openai.api_server --model meta-llama/Meta-Llama-3-70B
+    VLLM_TORCH_PROFILER_DIR=./vllm_profile python -m vllm.entrypoints.openai.api_server --model meta-llama/Meta-Llama-3-70B

 benchmark_serving.py:
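For completeness, here is a minimal sketch (not part of this commit) of driving the same OpenAI-server setup from Python rather than the shell. The environment variable names and the api_server entry point are taken from the docs above; everything else is illustrative.

import os
import subprocess

# Same settings as the bash example: where traces are written, and a generous
# RPC timeout so stopping the profiler has time to flush large traces.
env = os.environ.copy()
env["VLLM_TORCH_PROFILER_DIR"] = "./vllm_profile"
env["VLLM_RPC_GET_DATA_TIMEOUT_MS"] = str(30 * 60 * 1000)  # 30 minutes = 1800000 ms

server = subprocess.Popen(
    [
        "python", "-m", "vllm.entrypoints.openai.api_server",
        "--model", "meta-llama/Meta-Llama-3-70B",
    ],
    env=env,
)
# ... send a few requests, profile them, then shut the server down:
# server.terminate()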

examples/offline_inference_with_profiler.py

+33

@@ -0,0 +1,33 @@
import os

from vllm import LLM, SamplingParams

# enable torch profiler, can also be set on cmd line
os.environ["VLLM_TORCH_PROFILER_DIR"] = "./vllm_profile"

# Sample prompts.
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# Create an LLM.
llm = LLM(model="facebook/opt-125m")

llm.start_profile()

# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)

llm.stop_profile()

# Print the outputs.
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

vllm/engine/llm_engine.py

+6
@@ -1914,6 +1914,12 @@ def check_health(self) -> None:
         self.tokenizer.check_health()
         self.model_executor.check_health()

+    def start_profile(self) -> None:
+        self.model_executor.start_profile()
+
+    def stop_profile(self) -> None:
+        self.model_executor.stop_profile()
+
     def is_tracing_enabled(self) -> bool:
         return self.tracer is not None

vllm/entrypoints/llm.py

+6
@@ -560,6 +560,12 @@ def encode(
         outputs = self._run_engine(use_tqdm=use_tqdm)
         return LLMEngine.validate_outputs(outputs, EmbeddingRequestOutput)

+    def start_profile(self) -> None:
+        self.llm_engine.start_profile()
+
+    def stop_profile(self) -> None:
+        self.llm_engine.stop_profile()
+
     # LEGACY
     def _convert_v1_inputs(
         self,
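Together with the executor changes below, these methods form a simple delegation chain: LLM.start_profile() calls LLMEngine.start_profile(), which calls start_profile() on the model executor, which forwards it to the driver worker (and likewise for stop_profile()). No profiling state is kept at the entrypoint or engine level; the profiler itself presumably lives in the worker, whose changes are not shown in this diff.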

vllm/executor/cpu_executor.py

+6
@@ -296,6 +296,12 @@ def _wait_for_tasks_completion(self, parallel_worker_tasks: Any) -> None:
         for result in parallel_worker_tasks:
             result.get()

+    def start_profile(self) -> None:
+        self.driver_method_invoker(self.driver_worker, "start_profile")
+
+    def stop_profile(self) -> None:
+        self.driver_method_invoker(self.driver_worker, "stop_profile")
+

 class CPUExecutorAsync(CPUExecutor, ExecutorAsyncBase):

vllm/executor/gpu_executor.py

+6
@@ -169,6 +169,12 @@ def check_health(self) -> None:
         # it's running.
         return

+    def start_profile(self) -> None:
+        self.driver_worker.start_profile()
+
+    def stop_profile(self) -> None:
+        self.driver_worker.stop_profile()
+

 class GPUExecutorAsync(GPUExecutor, ExecutorAsyncBase):
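The worker-side implementation is not part of this diff, so the following is only a sketch of what the start_profile/stop_profile hooks presumably wrap: a torch.profiler session that writes its traces to VLLM_TORCH_PROFILER_DIR. The names and arguments below are assumptions for illustration, not the actual vLLM worker code.

import os
import torch

# Assumption: the worker holds a torch.profiler.profile object configured from
# VLLM_TORCH_PROFILER_DIR and starts/stops it when the executor asks it to.
profiler = torch.profiler.profile(
    activities=[
        torch.profiler.ProfilerActivity.CPU,
        torch.profiler.ProfilerActivity.CUDA,
    ],
    # tensorboard_trace_handler writes chrome-trace files that ui.perfetto.dev can open.
    on_trace_ready=torch.profiler.tensorboard_trace_handler(
        os.getenv("VLLM_TORCH_PROFILER_DIR", "./vllm_profile")),
)

profiler.start()   # what Worker.start_profile() would trigger
# ... model execution steps run here ...
profiler.stop()    # what Worker.stop_profile() would trigger; flushing can be slow for big traces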

0 commit comments