Merged · Changes from 7 commits
11 changes: 11 additions & 0 deletions docs/references/benchmark_and_profiling.md
@@ -52,6 +52,17 @@
python -m sglang.bench_offline_throughput --model-path meta-llama/Llama-3.1-8B-Instruct --dataset-name random --num-prompts 10 --profile --mem-frac=0.8
```

+- Possible PyTorch Bug
+If you encounter the following error (for example, when profiling Qwen 2.5 VL):
+```bash
+RuntimeError: !stack.empty() INTERNAL ASSERT FAILED at "/pytorch/torch/csrc/autograd/profiler_python.cpp":983, please report a bug to PyTorch. Python replay stack is empty.
+```
+This is likely a PyTorch bug, reported in [Bug: vLLM Profiler](https://github.com/vllm-project/vllm/issues/18240) and [Bug: torch.profiler.profile](https://github.com/pytorch/pytorch/issues/101632). As a workaround, you can disable `with_stack` via an environment variable:
+```bash
+export SGLANG_PROFILE_WITH_STACK=False
+python -m sglang.bench_offline_throughput --model-path meta-llama/Llama-3.1-8B-Instruct --dataset-name random --num-prompts 10 --profile --mem-frac=0.8
+```
+
- View Traces

Trace files can be loaded and visualized from:
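Under the hood, this workaround amounts to turning off stack capture when the profiler session is constructed. A minimal standalone sketch of the equivalent direct `torch.profiler` call, honoring the same env-var convention (this is not SGLang's actual profiler setup):

```python
import os
import torch
from torch.profiler import ProfilerActivity, profile

# Unset env var means "keep the default" (stack capture on, in this sketch).
with_stack = os.getenv("SGLANG_PROFILE_WITH_STACK", "True").lower() in ["true", "1"]

with profile(
    activities=[ProfilerActivity.CPU],
    with_stack=with_stack,  # False sidesteps the "Python replay stack is empty" assert
) as prof:
    torch.matmul(torch.randn(64, 64), torch.randn(64, 64))  # stand-in workload

prof.export_chrome_trace("/tmp/trace.json")
```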
1 change: 1 addition & 0 deletions docs/references/environment_variables.md
@@ -88,6 +88,7 @@ SGLang supports various environment variables that can be used to configure its
| Environment Variable | Description | Default Value |
| --- | --- | --- |
| `SGLANG_TORCH_PROFILER_DIR` | Directory for PyTorch profiler output | `/tmp` |
+| `SGLANG_PROFILE_WITH_STACK` | Override the PyTorch profiler's `with_stack` option (bool; controls stack trace capture) | Not set |

## Storage & Caching

14 changes: 10 additions & 4 deletions python/sglang/bench_offline_throughput.py
@@ -11,7 +11,9 @@
"""

import argparse
+import asyncio
import dataclasses
+import inspect
import json
import logging
import os
@@ -235,8 +237,10 @@ def throughput_test_once(
    latency = time.perf_counter() - st

    if profile:
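+        # Snapshot the profiler output directory before stopping, so trace
+        # files written while the profiler flushes are detected as new below.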
+        dir = os.getenv("SGLANG_TORCH_PROFILER_DIR")
+        known_files = set(os.listdir(dir))
        backend.stop_profile()
-        monitor_trace_file(os.getenv("SGLANG_TORCH_PROFILER_DIR"))
+        monitor_trace_file(known_files, dir)

    if backend_name == "runtime":
        gen_out = json.loads(gen_out)
@@ -260,18 +264,20 @@ def throughput_test_once(
measurement_results["total_input_tokens"]
+ measurement_results["total_output_tokens"]
) / latency

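+    # get_server_info() may return a coroutine on some backends; resolve it
+    # before indexing into the result.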
+    if inspect.isawaitable(server_info):
+        server_info = asyncio.run(server_info)

    measurement_results["last_gen_throughput"] = server_info["internal_states"][0][
        "last_gen_throughput"
    ]

    return measurement_results


-def monitor_trace_file(directory, interval=1):
+def monitor_trace_file(known_files, directory, interval=1):
    print(f"Monitoring {directory} for new trace files...")

-    known_files = set(os.listdir(directory))
-
    while True:
        flag = False
        time.sleep(interval)
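The diff truncates the body of the polling loop; for orientation, here is a self-contained sketch of what such a monitor plausibly does with the `known_files` snapshot (details beyond the shown lines are our assumption):

```python
import os
import time

def monitor_trace_file_sketch(known_files, directory, interval=1):
    """Poll `directory` until a file not in the `known_files` snapshot appears."""
    print(f"Monitoring {directory} for new trace files...")
    while True:
        time.sleep(interval)
        new_files = set(os.listdir(directory)) - known_files
        if new_files:
            for name in sorted(new_files):
                print(f"New trace file: {os.path.join(directory, name)}")
            return
```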
25 changes: 24 additions & 1 deletion python/sglang/lang/backend/runtime_endpoint.py
@@ -85,6 +85,22 @@ def cache_prefix(self, prefix_str: str):
        )
        self._assert_success(res)

+    def start_profile(self):
+        res = http_request(
+            self.base_url + "/start_profile",
+            api_key=self.api_key,
+            verify=self.verify,
+        )
+        self._assert_success(res)
+
+    def stop_profile(self):
+        res = http_request(
+            self.base_url + "/stop_profile",
+            api_key=self.api_key,
+            verify=self.verify,
+        )
+        self._assert_success(res)
+
    def commit_lazy_operations(self, s: StreamExecutor):
        data = {"text": s.text_, "sampling_params": {"max_new_tokens": 0}}
        self._add_images(s, data)
@@ -374,7 +390,8 @@ def __init__(
        self.pid = None
        pipe_reader, pipe_writer = multiprocessing.Pipe(duplex=False)

-        proc = multiprocessing.Process(
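+        # A "spawn" context starts the server in a fresh interpreter rather
+        # than forking a parent that may already hold CUDA or threading state.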
+        ctx = multiprocessing.get_context("spawn")
+        proc = ctx.Process(
            target=launch_server,
            args=(self.server_args, pipe_writer),
        )
@@ -406,6 +423,12 @@ def shutdown(self):
            kill_process_tree(self.pid)
            self.pid = None

+    def start_profile(self):
+        self.endpoint.start_profile()
+
+    def stop_profile(self):
+        self.endpoint.stop_profile()
+
    def cache_prefix(self, prefix: str):
        self.endpoint.cache_prefix(prefix)

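With both the endpoint routes and the `Runtime` wrappers in place, a benchmark can bracket its workload with profiling calls. A hypothetical usage sketch (the model path and prompt are placeholders, and `generate` stands in for whatever workload is driven through the runtime):

```python
import sglang as sgl

runtime = sgl.Runtime(model_path="meta-llama/Llama-3.1-8B-Instruct")
runtime.start_profile()   # POST /start_profile on the server
print(runtime.generate("The capital of France is"))  # profiled workload
runtime.stop_profile()    # POST /stop_profile; traces land in SGLANG_TORCH_PROFILER_DIR
runtime.shutdown()
```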
11 changes: 11 additions & 0 deletions python/sglang/srt/managers/tokenizer_manager.py
@@ -803,6 +803,17 @@ async def start_profile(
        profile_by_stage: bool = False,
    ):
        self.auto_create_handle_loop()
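+        # Resolve with_stack: an explicit False (argument or env var) wins,
+        # then an explicit True; otherwise None keeps the profiler default.
+        # Note: any env value other than "true"/"1" parses as False.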
+        env_with_stack_str = os.getenv("SGLANG_PROFILE_WITH_STACK")
+        env_with_stack: Optional[bool] = (
+            None
+            if env_with_stack_str is None
+            else env_with_stack_str.lower() in ["true", "1"]
+        )
+        with_stack = (
+            False
+            if with_stack is False or env_with_stack is False
+            else (True if with_stack is True or env_with_stack is True else None)
+        )
        req = ProfileReq(
            type=ProfileReqType.START_PROFILE,
            output_dir=output_dir,
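The resolution above reads as a small truth table: explicit `False` (argument or env var) wins, then explicit `True`, otherwise `None` leaves the profiler default in place. A standalone sketch of the same logic (the helper name is ours, not the PR's):

```python
import os
from typing import Optional

def resolve_with_stack(with_stack: Optional[bool]) -> Optional[bool]:
    # Same precedence as the tokenizer_manager code: False beats True beats None.
    env_str = os.getenv("SGLANG_PROFILE_WITH_STACK")
    env_flag: Optional[bool] = None if env_str is None else env_str.lower() in ["true", "1"]
    if with_stack is False or env_flag is False:
        return False
    if with_stack is True or env_flag is True:
        return True
    return None

os.environ["SGLANG_PROFILE_WITH_STACK"] = "False"
assert resolve_with_stack(None) is False  # env var alone disables stack capture
assert resolve_with_stack(True) is False  # an explicit False (here from the env) wins
del os.environ["SGLANG_PROFILE_WITH_STACK"]
assert resolve_with_stack(None) is None   # nothing set: keep the profiler default
```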