From 6bdb144c843921c508d78fe2d9e28e9440ec397f Mon Sep 17 00:00:00 2001 From: Byron Hsu Date: Sat, 9 May 2026 05:28:29 +0000 Subject: [PATCH 1/2] Fix request dump pickling and add meta_info key filtering Three related improvements to the /configure_logging request dump pipeline that surfaced when running with --trust-remote-code and MoE models that emit large per-request meta_info blobs: 1. Pickle safety. ServerArgs.get_model_config() lazily attaches the resolved ModelConfig back onto the ServerArgs instance. With --trust-remote-code, that ModelConfig holds an hf_config whose class lives under the dynamic transformers_modules. namespace, which is not safely picklable (pickle's class identity round-trip fails when the dynamic module is re-exec'd). Wrap the pickle.dump in try/except in both _dump_data_to_file and dump_requests_before_crash; on failure, retry with server_args=None so the request data still gets persisted instead of leaving an empty/corrupt file. 2. meta_info key filtering. Request dumps grow rapidly when the server runs MoE models with --enable-routing-replay (each finished request stashes a base64-encoded routed_experts tensor in meta_info). hidden_states is similarly bulky when --return-hidden-states is on. Add a configurable list dump_requests_exclude_meta_keys on the tokenizer manager and ConfigureLoggingReq, defaulting to ["routed_experts", "hidden_states"]. Filter those keys out of meta_info via a shallow copy in dump_requests so the original out_dict (still referenced by the response path / observers) is not mutated. 3. CLI surface. Surface the new option in the configure_logging CLI as --dump-requests-exclude-meta-keys 'a,b,c' (empty string keeps all). Existing callers that don't pass the flag get the smaller dumps for free. Pass an empty list to /configure_logging to restore the previous behavior. 
Co-authored-by: Cursor --- .../sglang/srt/managers/configure_logging.py | 33 ++++++++--- python/sglang/srt/managers/io_struct.py | 5 ++ .../sglang/srt/managers/tokenizer_manager.py | 58 ++++++++++++++++++- 3 files changed, 85 insertions(+), 11 deletions(-) diff --git a/python/sglang/srt/managers/configure_logging.py b/python/sglang/srt/managers/configure_logging.py index 0dc78edfa075..7f268e31b7b0 100644 --- a/python/sglang/srt/managers/configure_logging.py +++ b/python/sglang/srt/managers/configure_logging.py @@ -33,15 +33,30 @@ "--dump-requests-folder", type=str, default="/tmp/sglang_request_dump" ) parser.add_argument("--dump-requests-threshold", type=int, default=1000) + parser.add_argument( + "--dump-requests-exclude-meta-keys", + type=str, + default=None, + help=( + "Comma-separated meta_info keys to strip from each dumped request " + "(e.g. 'routed_experts,hidden_states'). Pass an empty string to " + "keep all keys. If not set, the server default is used." + ), + ) args = parser.parse_args() - response = requests.post( - args.url + "/configure_logging", - json={ - "log_requests": args.log_requests, - "log_requests_level": args.log_requests_level, # Log full requests - "dump_requests_folder": args.dump_requests_folder, - "dump_requests_threshold": args.dump_requests_threshold, - }, - ) + payload = { + "log_requests": args.log_requests, + "log_requests_level": args.log_requests_level, # Log full requests + "dump_requests_folder": args.dump_requests_folder, + "dump_requests_threshold": args.dump_requests_threshold, + } + if args.dump_requests_exclude_meta_keys is not None: + payload["dump_requests_exclude_meta_keys"] = [ + k.strip() + for k in args.dump_requests_exclude_meta_keys.split(",") + if k.strip() + ] + + response = requests.post(args.url + "/configure_logging", json=payload) assert response.status_code == 200 diff --git a/python/sglang/srt/managers/io_struct.py b/python/sglang/srt/managers/io_struct.py index 005079801f70..a7dd1d033c0e 100644 --- 
a/python/sglang/srt/managers/io_struct.py +++ b/python/sglang/srt/managers/io_struct.py @@ -1758,6 +1758,11 @@ class ConfigureLoggingReq(BaseReq): dump_requests_folder: Optional[str] = None dump_requests_threshold: Optional[int] = None crash_dump_folder: Optional[str] = None + # Keys to strip from `meta_info` of every dumped request. Useful for + # dropping heavy blobs that bloat the dump file (e.g. "routed_experts" + # captured by --enable-routing-replay, "hidden_states" captured by + # --return-hidden-states). Pass an empty list to keep everything. + dump_requests_exclude_meta_keys: Optional[List[str]] = None @dataclass diff --git a/python/sglang/srt/managers/tokenizer_manager.py b/python/sglang/srt/managers/tokenizer_manager.py index c3b4005abe90..4b660521934d 100644 --- a/python/sglang/srt/managers/tokenizer_manager.py +++ b/python/sglang/srt/managers/tokenizer_manager.py @@ -391,6 +391,14 @@ def init_request_logging_and_dumping(self): # Dumping self.dump_requests_folder = "" # By default do not dump self.dump_requests_threshold = 1000 + # Drop heavy meta_info entries from the dump payload by default. Both + # `routed_experts` (base64'd MoE routing tensor, ~KB-MB per request) + # and `hidden_states` blow up the pkl file size and are not used by + # the replay tooling. Override via /configure_logging if needed. 
+ self.dump_requests_exclude_meta_keys: List[str] = [ + "routed_experts", + "hidden_states", + ] self.dump_request_list: List[Tuple] = [] self.crash_dump_request_list: deque[Tuple] = deque() self.crash_dump_performed = False # Flag to ensure dump is only called once @@ -1577,6 +1585,10 @@ def configure_logging(self, obj: ConfigureLoggingReq): self.dump_requests_folder = obj.dump_requests_folder if obj.dump_requests_threshold is not None: self.dump_requests_threshold = obj.dump_requests_threshold + if obj.dump_requests_exclude_meta_keys is not None: + self.dump_requests_exclude_meta_keys = list( + obj.dump_requests_exclude_meta_keys + ) if obj.crash_dump_folder is not None: self.crash_dump_folder = obj.crash_dump_folder logging.info(f"Config logging: {obj=}") @@ -2193,6 +2205,19 @@ def collect_metrics(self, state: ReqState, recv_obj: BatchStrOutput, i: int): ) def dump_requests(self, state: ReqState, out_dict: dict): + # Strip heavy keys from `meta_info` (e.g. routed_experts, hidden_states) + # to keep the on-disk pkl small. Don't mutate the original dict — it + # may still be referenced by the response path or other observers. 
+ if self.dump_requests_exclude_meta_keys and isinstance( + out_dict.get("meta_info"), dict + ): + exclude = self.dump_requests_exclude_meta_keys + if any(k in out_dict["meta_info"] for k in exclude): + filtered_meta = { + k: v for k, v in out_dict["meta_info"].items() if k not in exclude + } + out_dict = {**out_dict, "meta_info": filtered_meta} + self.dump_request_list.append( ( state.obj, @@ -2243,7 +2268,23 @@ def _dump_data_to_file( def background_task(): os.makedirs(os.path.dirname(filename), exist_ok=True) with open(filename, "wb") as f: - pickle.dump(to_dump_with_server_args, f) + try: + pickle.dump(to_dump_with_server_args, f) + except Exception as e: + # When the server is launched with --trust-remote-code, + # server_args sometimes fails to pickle because the + # lazily-attached ModelConfig holds an hf_config whose + # class lives under the dynamic transformers_modules.* + # namespace. Retry without server_args so the request + # data still gets persisted. + logger.error( + f"Failed to pickle dump with server_args: {e!r}; " + "retrying without server_args" + ) + f.seek(0) + f.truncate() + to_dump_with_server_args["server_args"] = None + pickle.dump(to_dump_with_server_args, f) asyncio.create_task(asyncio.to_thread(background_task)) @@ -2306,7 +2347,20 @@ def dump_requests_before_crash( "launch_command": " ".join(sys.argv), } with open(filename, "wb") as f: - pickle.dump(data_to_dump_with_server_args, f) + try: + pickle.dump(data_to_dump_with_server_args, f) + except Exception as e: + # When the server is launched with --trust-remote-code, + # server_args sometimes fails to pickle. Retry without + # server_args so the request data still gets persisted. 
+ logger.error( f"Failed to pickle dump with server_args: {e!r}; " "retrying without server_args" ) + f.seek(0) + f.truncate() + data_to_dump_with_server_args["server_args"] = None + pickle.dump(data_to_dump_with_server_args, f) logger.error( f"Dumped {len(self.crash_dump_request_list)} finished and {len(unfinished_requests)} unfinished requests before crash to {filename}" ) From 0cf24e4c041872e96f22eaecb79322d07a612d49 Mon Sep 17 00:00:00 2001 From: Byron Hsu Date: Sun, 10 May 2026 04:25:58 +0000 Subject: [PATCH 2/2] Trim verbose comments per review feedback Remove descriptive comments that restate what the code does; keep only concise "why" notes on the non-obvious pickle fallback. Co-authored-by: Cursor --- python/sglang/srt/managers/io_struct.py | 4 ---- python/sglang/srt/managers/tokenizer_manager.py | 14 ++------------ 2 files changed, 2 insertions(+), 16 deletions(-) diff --git a/python/sglang/srt/managers/io_struct.py b/python/sglang/srt/managers/io_struct.py index a7dd1d033c0e..2d61a4a49d48 100644 --- a/python/sglang/srt/managers/io_struct.py +++ b/python/sglang/srt/managers/io_struct.py @@ -1758,10 +1758,6 @@ class ConfigureLoggingReq(BaseReq): dump_requests_folder: Optional[str] = None dump_requests_threshold: Optional[int] = None crash_dump_folder: Optional[str] = None - # Keys to strip from `meta_info` of every dumped request. Useful for - # dropping heavy blobs that bloat the dump file (e.g. "routed_experts" - # captured by --enable-routing-replay, "hidden_states" captured by - # --return-hidden-states). Pass an empty list to keep everything.
dump_requests_exclude_meta_keys: Optional[List[str]] = None diff --git a/python/sglang/srt/managers/tokenizer_manager.py b/python/sglang/srt/managers/tokenizer_manager.py index 4b660521934d..1eea93d2eb65 100644 --- a/python/sglang/srt/managers/tokenizer_manager.py +++ b/python/sglang/srt/managers/tokenizer_manager.py @@ -391,10 +391,6 @@ def init_request_logging_and_dumping(self): # Dumping self.dump_requests_folder = "" # By default do not dump self.dump_requests_threshold = 1000 - # Drop heavy meta_info entries from the dump payload by default. Both - # `routed_experts` (base64'd MoE routing tensor, ~KB-MB per request) - # and `hidden_states` blow up the pkl file size and are not used by - # the replay tooling. Override via /configure_logging if needed. self.dump_requests_exclude_meta_keys: List[str] = [ "routed_experts", "hidden_states", @@ -2205,9 +2201,6 @@ def collect_metrics(self, state: ReqState, recv_obj: BatchStrOutput, i: int): ) def dump_requests(self, state: ReqState, out_dict: dict): - # Strip heavy keys from `meta_info` (e.g. routed_experts, hidden_states) - # to keep the on-disk pkl small. Don't mutate the original dict — it - # may still be referenced by the response path or other observers. if self.dump_requests_exclude_meta_keys and isinstance( out_dict.get("meta_info"), dict ): @@ -2272,11 +2265,8 @@ def background_task(): pickle.dump(to_dump_with_server_args, f) except Exception as e: # When the server is launched with --trust-remote-code, - # server_args sometimes fails to pickle because the - # lazily-attached ModelConfig holds an hf_config whose - # class lives under the dynamic transformers_modules.* - # namespace. Retry without server_args so the request - # data still gets persisted. + # server_args sometimes fails to pickle. Retry without + # server_args so the request data still gets persisted. logger.error( f"Failed to pickle dump with server_args: {e!r}; " "retrying without server_args"