Commit 98c00a2

Fix torch profiler bugs for bench_offline_throughput.py (#6557)
1 parent 451ffe7 commit 98c00a2

File tree

5 files changed: +49 −5 lines changed

docs/references/benchmark_and_profiling.md
docs/references/environment_variables.md
python/sglang/bench_offline_throughput.py
python/sglang/lang/backend/runtime_endpoint.py
python/sglang/srt/managers/tokenizer_manager.py

docs/references/benchmark_and_profiling.md

Lines changed: 11 additions & 0 deletions

@@ -52,6 +52,17 @@
 python -m sglang.bench_offline_throughput --model-path meta-llama/Llama-3.1-8B-Instruct --dataset-name random --num-prompts 10 --profile --mem-frac=0.8
 ```
 
+- Possible PyTorch Bug
+If you encounter the following error (for example, when using Qwen 2.5 VL):
+```bash
+RuntimeError: !stack.empty() INTERNAL ASSERT FAILED at "/pytorch/torch/csrc/autograd/profiler_python.cpp":983, please report a bug to PyTorch. Python replay stack is empty.
+```
+This is likely a PyTorch bug reported in [Bug: vLLM Profiler](https://github.com/vllm-project/vllm/issues/18240) and [Bug: torch.profiler.profile](https://github.com/pytorch/pytorch/issues/101632). As a workaround, you can disable `with_stack` with an environment variable as follows:
+```bash
+export SGLANG_PROFILE_WITH_STACK=False
+python -m sglang.bench_offline_throughput --model-path meta-llama/Llama-3.1-8B-Instruct --dataset-name random --num-prompts 10 --profile --mem-frac=0.8
+```
+
 - View Traces
 
 Trace files can be loaded and visualized from:
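For context, here is a minimal sketch of where `with_stack` enters the picture: PyTorch's profiler takes a `with_stack` flag, and the workaround simply forces it to `False` when the environment variable is set. The env-var parsing and output path below are illustrative, not SGLang's exact plumbing (that lives in the `TokenizerManager` diff further down).

```python
# Illustrative only: a with_stack flag gated by SGLANG_PROFILE_WITH_STACK and
# passed to torch.profiler.profile. Output path is a hypothetical example.
import os
import torch
from torch.profiler import ProfilerActivity, profile

with_stack = os.getenv("SGLANG_PROFILE_WITH_STACK", "true").lower() not in ("false", "0")

with profile(
    activities=[ProfilerActivity.CPU],
    with_stack=with_stack,  # disabling this avoids the assertion failure above
) as prof:
    torch.matmul(torch.randn(64, 64), torch.randn(64, 64))

prof.export_chrome_trace("/tmp/with_stack_demo.json")
```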

docs/references/environment_variables.md

Lines changed: 1 addition & 0 deletions

@@ -88,6 +88,7 @@ SGLang supports various environment variables that can be used to configure its
 | Environment Variable | Description | Default Value |
 | --- | --- | --- |
 | `SGLANG_TORCH_PROFILER_DIR` | Directory for PyTorch profiler output | `/tmp` |
+| `SGLANG_PROFILE_WITH_STACK` | Set `with_stack` option (bool) for PyTorch profiler (capture stack trace) | `true` |
 
 ## Storage & Caching
 
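The new row documents a boolean-style variable. As a rough sketch of how such a variable is typically read (the real parser is `get_bool_env_var` in `sglang.srt.utils`; only the call pattern `get_bool_env_var("SGLANG_PROFILE_WITH_STACK", "true")` is taken from this commit, the body here is a guess):

```python
# Hypothetical re-implementation of a boolean env-var lookup; not SGLang's
# actual helper, just the conventional behavior for a "true"/"false" variable.
import os

def get_bool_env_var(name: str, default: str = "false") -> bool:
    """Treat 'true'/'1'/'yes' (case-insensitive) as True, everything else as False."""
    return os.getenv(name, default).strip().lower() in ("true", "1", "yes")

print(get_bool_env_var("SGLANG_PROFILE_WITH_STACK", "true"))  # True unless exported as False
```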

python/sglang/bench_offline_throughput.py

Lines changed: 10 additions & 4 deletions

@@ -11,7 +11,9 @@
 """
 
 import argparse
+import asyncio
 import dataclasses
+import inspect
 import json
 import logging
 import os
@@ -235,8 +237,10 @@ def throughput_test_once(
     latency = time.perf_counter() - st
 
     if profile:
+        dir = os.getenv("SGLANG_TORCH_PROFILER_DIR")
+        known_files = set(os.listdir(dir))
         backend.stop_profile()
-        monitor_trace_file(os.getenv("SGLANG_TORCH_PROFILER_DIR"))
+        monitor_trace_file(known_files, dir)
 
     if backend_name == "runtime":
         gen_out = json.loads(gen_out)
@@ -260,18 +264,20 @@ def throughput_test_once(
         measurement_results["total_input_tokens"]
         + measurement_results["total_output_tokens"]
     ) / latency
+
+    if inspect.isawaitable(server_info):
+        server_info = asyncio.run(server_info)
+
     measurement_results["last_gen_throughput"] = server_info["internal_states"][0][
         "last_gen_throughput"
    ]
 
     return measurement_results
 
 
-def monitor_trace_file(directory, interval=1):
+def monitor_trace_file(known_files, directory, interval=1):
     print(f"Monitoring {directory} for new trace files...")
 
-    known_files = set(os.listdir(directory))
-
     while True:
         flag = False
         time.sleep(interval)
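The key fix here is ordering: the directory snapshot must be taken before `stop_profile()` so that the trace file written while the profiler stops is detected as new. A self-contained sketch of that pattern (the timeout and return value are additions for illustration; the real function keeps polling and prints the new file names):

```python
# Sketch of the snapshot-then-poll pattern this commit moves to; not the exact
# SGLang implementation.
import os
import time

def monitor_trace_file(known_files: set, directory: str, interval: int = 1, timeout: int = 120):
    """Poll `directory` until a file not in `known_files` shows up, or time out."""
    print(f"Monitoring {directory} for new trace files...")
    deadline = time.time() + timeout
    while time.time() < deadline:
        time.sleep(interval)
        new_files = set(os.listdir(directory)) - known_files
        if new_files:
            return sorted(new_files)
    return []

# Usage mirrors the diff above: snapshot first, then stop the profiler.
# known = set(os.listdir(profiler_dir))
# backend.stop_profile()
# monitor_trace_file(known, profiler_dir)
```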

python/sglang/lang/backend/runtime_endpoint.py

Lines changed: 24 additions & 1 deletion

@@ -85,6 +85,22 @@ def cache_prefix(self, prefix_str: str):
         )
         self._assert_success(res)
 
+    def start_profile(self):
+        res = http_request(
+            self.base_url + "/start_profile",
+            api_key=self.api_key,
+            verify=self.verify,
+        )
+        self._assert_success(res)
+
+    def stop_profile(self):
+        res = http_request(
+            self.base_url + "/stop_profile",
+            api_key=self.api_key,
+            verify=self.verify,
+        )
+        self._assert_success(res)
+
     def commit_lazy_operations(self, s: StreamExecutor):
         data = {"text": s.text_, "sampling_params": {"max_new_tokens": 0}}
         self._add_images(s, data)
@@ -374,7 +390,8 @@ def __init__(
         self.pid = None
         pipe_reader, pipe_writer = multiprocessing.Pipe(duplex=False)
 
-        proc = multiprocessing.Process(
+        ctx = multiprocessing.get_context("spawn")
+        proc = ctx.Process(
             target=launch_server,
             args=(self.server_args, pipe_writer),
         )
@@ -406,6 +423,12 @@ def shutdown(self):
             kill_process_tree(self.pid)
             self.pid = None
 
+    def start_profile(self):
+        self.endpoint.start_profile()
+
+    def stop_profile(self):
+        self.endpoint.stop_profile()
+
     def cache_prefix(self, prefix: str):
         self.endpoint.cache_prefix(prefix)
 
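With these methods in place, the offline `Runtime` used by `bench_offline_throughput.py` can bracket a workload with profiling. A hedged usage sketch (constructor arguments and the workload are illustrative, not part of this commit):

```python
# Usage sketch against the Runtime API shown in the diff above.
import sglang as sgl

runtime = sgl.Runtime(model_path="meta-llama/Llama-3.1-8B-Instruct")

runtime.start_profile()   # forwarded to the server's /start_profile endpoint
# ... run the benchmark / generation workload here ...
runtime.stop_profile()    # forwarded to /stop_profile; trace goes to SGLANG_TORCH_PROFILER_DIR

runtime.shutdown()
```

The switch to `multiprocessing.get_context("spawn")` is presumably to avoid fork-related issues (for example, CUDA state already initialized in the parent) when launching the server process.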

python/sglang/srt/managers/tokenizer_manager.py

Lines changed: 3 additions & 0 deletions

@@ -116,6 +116,7 @@
 from sglang.srt.server_args import PortArgs, ServerArgs
 from sglang.srt.utils import (
     dataclass_to_string_truncated,
+    get_bool_env_var,
     get_zmq_socket,
     kill_process_tree,
 )
@@ -805,6 +806,8 @@ async def start_profile(
         profile_by_stage: bool = False,
     ):
         self.auto_create_handle_loop()
+        env_with_stack: bool = get_bool_env_var("SGLANG_PROFILE_WITH_STACK", "true")
+        with_stack = False if with_stack is False or env_with_stack is False else True
         req = ProfileReq(
             type=ProfileReqType.START_PROFILE,
             output_dir=output_dir,
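The new two-line gate means `with_stack` ends up `False` only if the caller explicitly passed `False` or the environment variable is falsy; any other combination (including the default `None`) resolves to `True`. A small check of that behavior, reproducing the expression from the diff:

```python
# Reproduces the gating expression added in this commit so its truth table is explicit.
def resolve_with_stack(with_stack, env_with_stack: bool) -> bool:
    return False if with_stack is False or env_with_stack is False else True

assert resolve_with_stack(None, True) is True    # default: stack capture stays on
assert resolve_with_stack(True, True) is True
assert resolve_with_stack(True, False) is False  # SGLANG_PROFILE_WITH_STACK=False wins
assert resolve_with_stack(False, True) is False  # explicit request wins
```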
