Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 24 additions & 9 deletions python/sglang/srt/managers/configure_logging.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,15 +33,30 @@
"--dump-requests-folder", type=str, default="/tmp/sglang_request_dump"
)
parser.add_argument("--dump-requests-threshold", type=int, default=1000)
parser.add_argument(
"--dump-requests-exclude-meta-keys",
type=str,
default=None,
help=(
"Comma-separated meta_info keys to strip from each dumped request "
"(e.g. 'routed_experts,hidden_states'). Pass an empty string to "
"keep all keys. If not set, the server default is used."
),
)
args = parser.parse_args()

response = requests.post(
args.url + "/configure_logging",
json={
"log_requests": args.log_requests,
"log_requests_level": args.log_requests_level, # Log full requests
"dump_requests_folder": args.dump_requests_folder,
"dump_requests_threshold": args.dump_requests_threshold,
},
)
payload = {
"log_requests": args.log_requests,
"log_requests_level": args.log_requests_level, # Log full requests
"dump_requests_folder": args.dump_requests_folder,
"dump_requests_threshold": args.dump_requests_threshold,
}
if args.dump_requests_exclude_meta_keys is not None:
payload["dump_requests_exclude_meta_keys"] = [
k.strip()
for k in args.dump_requests_exclude_meta_keys.split(",")
if k.strip()
]

response = requests.post(args.url + "/configure_logging", json=payload)
assert response.status_code == 200
1 change: 1 addition & 0 deletions python/sglang/srt/managers/io_struct.py
Original file line number Diff line number Diff line change
Expand Up @@ -1758,6 +1758,7 @@ class ConfigureLoggingReq(BaseReq):
dump_requests_folder: Optional[str] = None
dump_requests_threshold: Optional[int] = None
crash_dump_folder: Optional[str] = None
dump_requests_exclude_meta_keys: Optional[List[str]] = None


@dataclass
Expand Down
48 changes: 46 additions & 2 deletions python/sglang/srt/managers/tokenizer_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -391,6 +391,10 @@ def init_request_logging_and_dumping(self):
# Dumping
self.dump_requests_folder = "" # By default do not dump
self.dump_requests_threshold = 1000
self.dump_requests_exclude_meta_keys: List[str] = [
"routed_experts",
"hidden_states",
]
self.dump_request_list: List[Tuple] = []
self.crash_dump_request_list: deque[Tuple] = deque()
self.crash_dump_performed = False # Flag to ensure dump is only called once
Expand Down Expand Up @@ -1577,6 +1581,10 @@ def configure_logging(self, obj: ConfigureLoggingReq):
self.dump_requests_folder = obj.dump_requests_folder
if obj.dump_requests_threshold is not None:
self.dump_requests_threshold = obj.dump_requests_threshold
if obj.dump_requests_exclude_meta_keys is not None:
self.dump_requests_exclude_meta_keys = list(
obj.dump_requests_exclude_meta_keys
)
if obj.crash_dump_folder is not None:
self.crash_dump_folder = obj.crash_dump_folder
logging.info(f"Config logging: {obj=}")
Expand Down Expand Up @@ -2193,6 +2201,16 @@ def collect_metrics(self, state: ReqState, recv_obj: BatchStrOutput, i: int):
)

def dump_requests(self, state: ReqState, out_dict: dict):
if self.dump_requests_exclude_meta_keys and isinstance(
out_dict.get("meta_info"), dict
):
exclude = self.dump_requests_exclude_meta_keys
if any(k in out_dict["meta_info"] for k in exclude):
filtered_meta = {
k: v for k, v in out_dict["meta_info"].items() if k not in exclude
}
out_dict = {**out_dict, "meta_info": filtered_meta}

self.dump_request_list.append(
(
state.obj,
Expand Down Expand Up @@ -2243,7 +2261,20 @@ def _dump_data_to_file(
def background_task():
os.makedirs(os.path.dirname(filename), exist_ok=True)
with open(filename, "wb") as f:
pickle.dump(to_dump_with_server_args, f)
try:
pickle.dump(to_dump_with_server_args, f)
except Exception as e:
# When the server is launched with --trust-remote-code,
# server_args sometimes fails to pickle. Retry without
# server_args so the request data still gets persisted.
logger.error(
f"Failed to pickle dump with server_args: {e!r}; "
"retrying without server_args"
)
f.seek(0)
f.truncate()
to_dump_with_server_args["server_args"] = None
pickle.dump(to_dump_with_server_args, f)

asyncio.create_task(asyncio.to_thread(background_task))

Expand Down Expand Up @@ -2306,7 +2337,20 @@ def dump_requests_before_crash(
"launch_command": " ".join(sys.argv),
}
with open(filename, "wb") as f:
pickle.dump(data_to_dump_with_server_args, f)
try:
pickle.dump(data_to_dump_with_server_args, f)
except Exception as e:
# When the server is launched with --trust-remote-code,
# server_args sometimes fails to pickle. Retry without
# server_args so the request data still gets persisted.
logger.error(
f"Failed to pickle dump with server_args: {e!r}; "
"retrying without server_args"
)
f.seek(0)
f.truncate()
data_to_dump_with_server_args["server_args"] = None
pickle.dump(data_to_dump_with_server_args, f)
logger.error(
    f"Dumped {len(self.crash_dump_request_list)} finished and {len(unfinished_requests)} unfinished requests before crash to {filename}"
)
Expand Down
Loading