diff --git a/docs/advanced_features/observability.md b/docs/advanced_features/observability.md index 9c5d2e175340..ee24bcf17df4 100644 --- a/docs/advanced_features/observability.md +++ b/docs/advanced_features/observability.md @@ -2,6 +2,9 @@ ## Production Metrics SGLang exposes the following metrics via Prometheus. You can enable them by adding `--enable-metrics` when launching the server. + +By default, metrics are served on the main server's `/metrics` endpoint. You can optionally serve metrics on a separate port by specifying `--metrics-port <port>`. + You can query them by: ``` curl http://localhost:30000/metrics diff --git a/docs/references/production_metrics.md b/docs/references/production_metrics.md index 85a6ff8a64a6..e2cdf5ddf95c 100644 --- a/docs/references/production_metrics.md +++ b/docs/references/production_metrics.md @@ -2,6 +2,8 @@ SGLang exposes the following metrics via Prometheus. You can enable it by adding `--enable-metrics` when you launch the server. +By default, metrics are served on the main server's `/metrics` endpoint. You can optionally serve metrics on a separate port by specifying `--metrics-port <port>`, which is useful when you want to isolate metrics traffic from the main API server. + An example of the monitoring dashboard is available in [examples/monitoring/grafana.json](https://github.com/sgl-project/sglang/blob/main/examples/monitoring/grafana/dashboards/json/sglang-dashboard.json). 
Here is an example of the metrics: diff --git a/python/sglang/srt/entrypoints/grpc_server.py b/python/sglang/srt/entrypoints/grpc_server.py index 090f650075aa..be4d7779e04d 100644 --- a/python/sglang/srt/entrypoints/grpc_server.py +++ b/python/sglang/srt/entrypoints/grpc_server.py @@ -35,7 +35,11 @@ ) from sglang.srt.sampling.sampling_params import SamplingParams as SGLSamplingParams from sglang.srt.server_args import ServerArgs -from sglang.srt.utils import kill_process_tree +from sglang.srt.utils import ( + kill_process_tree, + launch_metrics_server, + set_prometheus_multiproc_dir, +) from sglang.utils import get_exception_traceback logger = logging.getLogger(__name__) @@ -704,6 +708,14 @@ async def serve_grpc( ): """Start the standalone gRPC server with integrated scheduler.""" + # Set prometheus multiproc dir BEFORE launching scheduler processes + # This ensures the environment variable is inherited by child processes + if server_args.enable_metrics: + set_prometheus_multiproc_dir() + # Launch metrics server on separate port if specified + if server_args.metrics_port is not None: + launch_metrics_server(server_args.host, server_args.metrics_port) + # Start bootstrap server BEFORE launching scheduler processes (only in PREFILL mode) # This ensures the bootstrap server is ready when prefill schedulers try to register bootstrap_server = None diff --git a/python/sglang/srt/entrypoints/http_server.py b/python/sglang/srt/entrypoints/http_server.py index fa14836dcd59..ce57255c8218 100644 --- a/python/sglang/srt/entrypoints/http_server.py +++ b/python/sglang/srt/entrypoints/http_server.py @@ -133,6 +133,7 @@ delete_directory, get_bool_env_var, kill_process_tree, + launch_metrics_server, set_uvicorn_logging_configs, ) from sglang.utils import get_exception_traceback @@ -229,9 +230,14 @@ async def lifespan(fast_api_app: FastAPI): warmup_thread_kwargs = dict(server_args=server_args) thread_label = f"MultiTokenizer-{_global_state.tokenizer_manager.worker_id}" - # Add 
def launch_metrics_server(host: str, port: int) -> None:
    """Launch a separate metrics server on the specified port.

    Starts a lightweight FastAPI/uvicorn HTTP server that serves only the
    ``/metrics`` endpoint for Prometheus scraping on a dedicated port. The
    server runs in a background daemon thread with its own event loop, so
    this function returns as soon as the thread has been started; it does not
    wait for the socket to be bound.

    Args:
        host: Interface the metrics server binds to.
        port: TCP port the metrics server listens on.
    """
    import asyncio

    import uvicorn
    from fastapi import FastAPI

    # Ensure PROMETHEUS_MULTIPROC_DIR is set BEFORE importing prometheus_client
    # The multiprocess module checks for this env var at import time
    if "PROMETHEUS_MULTIPROC_DIR" not in os.environ:
        set_prometheus_multiproc_dir()

    from prometheus_client import CollectorRegistry, make_asgi_app, multiprocess

    metrics_app = FastAPI()

    # Create prometheus metrics endpoint backed by the multiprocess collector
    registry = CollectorRegistry()
    multiprocess.MultiProcessCollector(registry)
    metrics_route = Mount("/metrics", make_asgi_app(registry=registry))

    # Workaround for 307 Redirect for /metrics: match the bare "/metrics" path
    # as well as anything below it. The group must be a valid named group
    # ("path"); a nameless "(?P.*)" is rejected by re.compile.
    metrics_route.path_regex = re.compile(r"^/metrics(?P<path>.*)$")
    metrics_app.routes.append(metrics_route)

    config = uvicorn.Config(
        metrics_app,
        host=host,
        port=port,
        timeout_keep_alive=5,
        loop="auto",
        log_config=None,
        log_level="warning",
    )
    server = uvicorn.Server(config=config)

    # Run server in a background daemon thread with its own event loop
    def run_server() -> None:
        try:
            asyncio.run(server.serve())
        except SystemExit as exit_signal:
            # NOTE(review): uvicorn appears to report startup failures (e.g.
            # the port is already in use) via sys.exit(); SystemExit is not an
            # Exception subclass and is silently dropped when it escapes a
            # non-main thread, so log it here instead of losing it.
            logger.error(
                f"Metrics server exited during startup at {host}:{port} "
                f"(exit code {exit_signal.code})"
            )
        except Exception as e:
            logger.error(f"Metrics server failed to start: {e}")
            raise
        finally:
            logger.info(f"Metrics server stopped at {host}:{port}")

    thread = threading.Thread(target=run_server, daemon=True, name="metrics-server")
    thread.start()
    logger.info(f"Metrics server started in background thread at {host}:{port}")