Skip to content

Commit 2e1f507

Browse files
authored
feat: add flag for dumping dynamo engine config and environment (#3286)
Signed-off-by: William Arnold <[email protected]>
1 parent 04aafa9 commit 2e1f507

File tree

14 files changed

+586
-2
lines changed

14 files changed

+586
-2
lines changed

.gitignore

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,4 +100,7 @@ TensorRT-LLM
100100
/CLAUDE.md.bak
101101

102102
# Benchmarks
103-
benchmarks/results
103+
benchmarks/results
104+
105+
# Direnv
106+
.envrc
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
"""
5+
Dynamo Common Module
6+
7+
This module contains shared utilities and components used across multiple
8+
Dynamo backends and components.
9+
10+
Main submodules:
11+
- config_dump: Configuration dumping and system diagnostics utilities
12+
"""
13+
14+
from dynamo.common import config_dump
15+
from dynamo.common._version import __version__
16+
17+
__all__ = ["__version__", "config_dump"]
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
"""
5+
Configuration Dumping Utilities
6+
7+
This module provides utilities for dumping configuration and system information
8+
for debugging and diagnostics purposes.
9+
"""
10+
11+
from dynamo.common.config_dump.config_dumper import (
12+
add_config_dump_args,
13+
dump_config,
14+
get_config_dump,
15+
register_encoder,
16+
)
17+
from dynamo.common.config_dump.environment import get_environment_vars
18+
from dynamo.common.config_dump.system_info import (
19+
get_gpu_info,
20+
get_runtime_info,
21+
get_system_info,
22+
)
23+
24+
__all__ = [
25+
"add_config_dump_args",
26+
"dump_config",
27+
"get_config_dump",
28+
"get_environment_vars",
29+
"get_gpu_info",
30+
"get_runtime_info",
31+
"get_system_info",
32+
"register_encoder",
33+
]
Lines changed: 229 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,229 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
import argparse
5+
import dataclasses
6+
import functools
7+
import json
8+
import logging
9+
import pathlib
10+
from enum import Enum
11+
from typing import Any, Dict, Optional
12+
13+
from dynamo.common._version import __version__
14+
15+
from .environment import get_environment_vars
16+
from .system_info import (
17+
get_gpu_info,
18+
get_package_info,
19+
get_runtime_info,
20+
get_system_info,
21+
)
22+
23+
logger = logging.getLogger(__name__)
24+
25+
26+
def _get_sglang_version() -> Optional[str]:
    """Return the installed SGLang version, or None if it cannot be determined.

    This is a best-effort probe used for diagnostics; it never raises.

    Returns:
        The value of ``sglang.__version__`` when the package imports cleanly
        and exposes that attribute, None otherwise.
    """
    try:
        import sglang as sgl

        return sgl.__version__
    except ImportError:
        logger.debug("SGLang not available")
    except AttributeError:
        logger.warning("SGLang installed but version not available")
    except Exception:
        # Importing runs the package's top-level code, which may raise
        # something other than ImportError. A failed optional probe must not
        # abort the caller that is assembling the diagnostic dump.
        logger.warning("SGLang import failed; version not available", exc_info=True)
    return None
42+
43+
44+
def _get_trtllm_version() -> Optional[str]:
    """Return the installed TensorRT-LLM version, or None if it cannot be determined.

    This is a best-effort probe used for diagnostics; it never raises.

    Returns:
        The value of ``tensorrt_llm.__version__`` when the package imports
        cleanly and exposes that attribute, None otherwise.
    """
    try:
        import tensorrt_llm

        return tensorrt_llm.__version__
    except ImportError:
        logger.debug("TensorRT-LLM not available")
    except AttributeError:
        logger.warning("TensorRT-LLM installed but version not available")
    except Exception:
        # Importing runs the package's top-level code, which may raise
        # something other than ImportError. A failed optional probe must not
        # abort the caller that is assembling the diagnostic dump.
        logger.warning(
            "TensorRT-LLM import failed; version not available", exc_info=True
        )
    return None
60+
61+
62+
def _get_vllm_version() -> Optional[str]:
    """Return the installed vLLM version, or None if it cannot be determined.

    This is a best-effort probe used for diagnostics; it never raises.

    Returns:
        The value of ``vllm.__version__`` when the package imports cleanly and
        exposes that attribute, None otherwise.
    """
    try:
        import vllm

        return vllm.__version__
    except ImportError:
        logger.debug("vLLM not available")
    except AttributeError:
        logger.warning("vLLM installed but version not available")
    except Exception:
        # Importing runs the package's top-level code, which may raise
        # something other than ImportError. A failed optional probe must not
        # abort the caller that is assembling the diagnostic dump.
        logger.warning("vLLM import failed; version not available", exc_info=True)
    return None
78+
79+
80+
def dump_config(dump_config_to: Optional[str], config: Any) -> None:
    """
    Write the config dump to a file, or log it when no path is given.

    The payload is the string returned by get_config_dump(config). When
    dump_config_to is None or empty, the payload is logged at INFO level with a
    "CONFIG_DUMP: " prefix. When writing the file fails, the failure is logged
    with its traceback and the payload is logged at INFO level instead, so the
    dump is not lost.

    Args:
        dump_config_to: Path of the file to write. Missing parent directories
            are created. None or "" means "log only".
        config: The configuration object to dump; passed unchanged to
            get_config_dump.

    Returns:
        None. Errors raised while writing the file are logged, not propagated.
    """
    config_dump_payload = get_config_dump(config)

    if not dump_config_to:
        logger.info(f"CONFIG_DUMP: {config_dump_payload}")
        return

    try:
        dump_path = pathlib.Path(dump_config_to)
        dump_path.parent.mkdir(parents=True, exist_ok=True)
        # Resolve once so the file that is opened and the path that is
        # reported are guaranteed to be the same.
        resolved_path = dump_path.resolve()
        with open(resolved_path, "w", encoding="utf-8") as f:
            f.write(config_dump_payload)
        logger.info(f"Dumped config to {resolved_path}")
    except OSError:
        # IOError is an alias of OSError in Python 3, so one clause covers both.
        logger.exception(f"Failed to dump config to {dump_config_to}")
        logger.info(f"CONFIG_DUMP: {config_dump_payload}")
    except Exception:
        logger.exception("Unexpected error dumping config")
        logger.info(f"CONFIG_DUMP: {config_dump_payload}")
110+
111+
112+
def get_config_dump(config: Any, extra_info: Optional[Dict[str, Any]] = None) -> str:
    """
    Collect comprehensive config information about a backend instance.

    The dump combines system, environment, runtime, GPU and package
    information with the caller's config and the Dynamo version. Versions of
    SGLang, TensorRT-LLM and vLLM are added when their probes return a value.
    The result is encoded with canonical_json_encoder.

    Args:
        config: The backend configuration. Values the JSON encoder does not
            handle natively go through the encoder's ``default`` hook.
        extra_info: Optional dict of additional entries. It is merged last, so
            its keys override built-in keys of the same name.

    Returns:
        JSON string containing the collected information. If collection or
        encoding fails, a JSON object with an "error" message is returned
        instead, plus "system_info" when that can still be gathered and encoded.

    Note:
        This function does not propagate collection errors; they are logged
        with a traceback and reported through the "error" field.
    """
    try:
        config_dump = {
            "system_info": get_system_info(),
            "environment": get_environment_vars(),
            "config": config,
            "runtime_info": get_runtime_info(),
            "dynamo_version": __version__,
            "gpu_info": get_gpu_info(),
            "installed_packages": get_package_info(),
        }

        # Add common versions
        if ver := _get_sglang_version():
            config_dump["sglang_version"] = ver
        if ver := _get_trtllm_version():
            config_dump["trtllm_version"] = ver
        if ver := _get_vllm_version():
            config_dump["vllm_version"] = ver

        # Add any extra information provided by the caller
        if extra_info:
            config_dump.update(extra_info)

        return canonical_json_encoder.encode(config_dump)

    except Exception as e:
        logger.exception(f"Error collecting config dump: {e}")
        error_message = f"Failed to collect config dump: {str(e)}"
        try:
            # Try to include basic system info alongside the error.
            return canonical_json_encoder.encode(
                {"error": error_message, "system_info": get_system_info()}
            )
        except Exception:
            # get_system_info() (or encoding its result) may be what failed in
            # the first place; fall back to the bare error message so the
            # caller still receives valid JSON.
            logger.exception("Error collecting system info for fallback config dump")
            return canonical_json_encoder.encode({"error": error_message})
161+
162+
163+
def add_config_dump_args(parser: argparse.ArgumentParser):
    """
    Register the config-dump command line option on an existing parser.

    Adds ``--dump-config-to``, a string option whose default is None.

    Args:
        parser: The parser that receives the new option.
    """
    help_text = "Dump config to the specified file path. If not specified, the config will be dumped to stdout at INFO level."
    option_settings = {
        "type": str,
        "default": None,
        "help": help_text,
    }
    parser.add_argument("--dump-config-to", **option_settings)
176+
177+
178+
@functools.singledispatch
def _preprocess_for_encode(obj: object) -> object:
    """
    Fallback conversion for objects the JSON encoder cannot handle natively.

    This is the base implementation of a single-dispatch function; encoders
    for specific types are added with the @register_encoder decorator.

    Dataclass instances become dicts via dataclasses.asdict. Anything else is
    reported with a warning and converted to its ``__dict__`` when it has one,
    or to ``str(obj)`` otherwise.
    """
    # Dataclass *types* are excluded: asdict() only accepts instances.
    if not isinstance(obj, type) and dataclasses.is_dataclass(obj):
        return dataclasses.asdict(obj)

    logger.warning(f"Unknown type {type(obj)}, using __dict__ or str(obj)")
    fallback = obj.__dict__ if hasattr(obj, "__dict__") else str(obj)
    return fallback
192+
193+
194+
def register_encoder(type_class):
    """
    Return a decorator that registers an encoder for ``type_class``.

    The decorated function is registered on _preprocess_for_encode, so it is
    used whenever an object of that type is preprocessed for JSON encoding.

    Usage:
        @register_encoder(MyClass)
        def encode_my_class(obj: MyClass):
            return {"field": obj.field}
    """
    logger.debug(f"Registering encoder for {type_class}")
    registrar = _preprocess_for_encode.register(type_class)
    return registrar
205+
206+
207+
@register_encoder(set)
def _preprocess_for_encode_set(
    obj: set,
) -> list:  # pyright: ignore[reportUnusedFunction]
    """Encode a set as a sorted list.

    Elements are sorted by their natural ordering. When they cannot be
    compared with each other (for example a mix of str and int), they are
    ordered by repr() instead so that encoding does not fail.
    """
    try:
        return sorted(obj)
    except TypeError:
        # repr() always yields a str, so this ordering cannot raise TypeError.
        return sorted(obj, key=repr)
212+
213+
214+
@register_encoder(Enum)
def _preprocess_for_encode_enum(
    obj: Enum,
) -> str:  # pyright: ignore[reportUnusedFunction]
    """Encode an Enum member as the text produced by str(obj)."""
    rendered = str(obj)
    return rendered
219+
220+
221+
# Shared encoder used for every config dump: compact separators, sorted keys,
# no indentation, non-ASCII characters emitted as-is, NaN/Infinity rejected,
# and types json cannot serialize routed through _preprocess_for_encode.
canonical_json_encoder = json.JSONEncoder(
    default=_preprocess_for_encode,
    sort_keys=True,
    indent=None,
    separators=(",", ":"),
    ensure_ascii=False,
    allow_nan=False,
)
Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
import os
5+
from typing import Dict, List, Optional, Set
6+
7+
# Prefixes of the environment variables captured by default.
# Together they cover common ML/GPU/Dynamo-related configuration.
DEFAULT_ENV_PREFIXES = [
    # Dynamo-specific variables
    "DYN_",
    # CUDA configuration
    "CUDA_",
    # NVIDIA Collective Communications Library
    "NCCL_",
    # HuggingFace
    "HF_",
    # Transformers library
    "TRANSFORMERS_",
    # SGLang (long and short prefix)
    "SGLANG_",
    "SGL_",
    # Mooncake
    "MC_",
    # vLLM
    "VLLM_",
    # TensorRT
    "TENSORRT_",
    # PyTorch
    "TORCH_",
    # UCX
    "UCX_",
    # NIXL
    "NIXL_",
    # OpenMPI
    "OMPI_",
]
25+
26+
# Substrings that mark an environment variable name as sensitive.
# Matching is case-insensitive; values of matching variables are redacted.
SENSITIVE_PATTERNS = [
    "TOKEN",
    "API_KEY",
    "SECRET",
    "PASSWORD",
    "CREDENTIAL",
    "AUTH",
]
35+
36+
37+
def get_environment_vars(
    prefixes: Optional[List[str]] = None,
    include_sensitive: bool = False,
    additional_vars: Optional[Set[str]] = None,
) -> Dict[str, str]:
    """
    Collect the environment variables selected by prefix or by exact name.

    Args:
        prefixes: Prefixes of the variable names to capture.
            DEFAULT_ENV_PREFIXES is used when this is None.
        include_sensitive: When False (the default), the value of any variable
            whose name looks sensitive is replaced instead of returned.
        additional_vars: Exact variable names to capture even if no prefix
            matches.

    Returns:
        Mapping of variable name to value. Values of sensitive-looking
        variables are "<REDACTED>" unless include_sensitive is True.

    Examples:
        >>> get_environment_vars()  # Uses default prefixes
        >>> get_environment_vars(prefixes=["MY_APP_"])  # Custom prefixes only
        >>> get_environment_vars(additional_vars={"PATH", "HOME"})  # Include specific vars
    """
    wanted_prefixes = tuple(DEFAULT_ENV_PREFIXES if prefixes is None else prefixes)
    explicit_names = set() if additional_vars is None else additional_vars

    captured: Dict[str, str] = {}
    for name, raw_value in os.environ.items():
        selected = name.startswith(wanted_prefixes) or name in explicit_names
        if not selected:
            continue
        # Keep the real value only when the caller asked for it or the name
        # does not look sensitive.
        if include_sensitive or not _is_sensitive(name):
            captured[name] = raw_value
        else:
            captured[name] = "<REDACTED>"

    return captured
79+
80+
81+
def _is_sensitive(var_name: str) -> bool:
    """
    Tell whether a variable name suggests that its value is sensitive.

    The name is upper-cased and checked for any entry of SENSITIVE_PATTERNS
    as a substring.

    Args:
        var_name: The environment variable name to check.

    Returns:
        True if the name contains one of the sensitive patterns, else False.
    """
    normalized = var_name.upper()
    for marker in SENSITIVE_PATTERNS:
        if marker in normalized:
            return True
    return False

0 commit comments

Comments
 (0)