Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions docs/advanced_features/observability.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@

## Production Metrics
SGLang exposes the following metrics via Prometheus. You can enable them by adding `--enable-metrics` when launching the server.

By default, metrics are served on the main server's `/metrics` endpoint. You can optionally serve metrics on a separate port by specifying `--metrics-port <port>`.

You can query them by:
```
curl http://localhost:30000/metrics
Expand Down
2 changes: 2 additions & 0 deletions docs/references/production_metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

SGLang exposes the following metrics via Prometheus. You can enable them by adding `--enable-metrics` when you launch the server.

By default, metrics are served on the main server's `/metrics` endpoint. You can optionally serve metrics on a separate port by specifying `--metrics-port <port>`, which is useful when you want to isolate metrics traffic from the main API server.

An example of the monitoring dashboard is available in [examples/monitoring/grafana.json](https://github.com/sgl-project/sglang/blob/main/examples/monitoring/grafana/dashboards/json/sglang-dashboard.json).

Here is an example of the metrics:
Expand Down
14 changes: 13 additions & 1 deletion python/sglang/srt/entrypoints/grpc_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,11 @@
)
from sglang.srt.sampling.sampling_params import SamplingParams as SGLSamplingParams
from sglang.srt.server_args import ServerArgs
from sglang.srt.utils import kill_process_tree
from sglang.srt.utils import (
kill_process_tree,
launch_metrics_server,
set_prometheus_multiproc_dir,
)
from sglang.utils import get_exception_traceback

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -704,6 +708,14 @@ async def serve_grpc(
):
"""Start the standalone gRPC server with integrated scheduler."""

# Set prometheus multiproc dir BEFORE launching scheduler processes
# This ensures the environment variable is inherited by child processes
if server_args.enable_metrics:
set_prometheus_multiproc_dir()
# Launch metrics server on separate port if specified
if server_args.metrics_port is not None:
launch_metrics_server(server_args.host, server_args.metrics_port)

# Start bootstrap server BEFORE launching scheduler processes (only in PREFILL mode)
# This ensures the bootstrap server is ready when prefill schedulers try to register
bootstrap_server = None
Expand Down
10 changes: 8 additions & 2 deletions python/sglang/srt/entrypoints/http_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,7 @@
delete_directory,
get_bool_env_var,
kill_process_tree,
launch_metrics_server,
set_uvicorn_logging_configs,
)
from sglang.utils import get_exception_traceback
Expand Down Expand Up @@ -229,9 +230,14 @@ async def lifespan(fast_api_app: FastAPI):
warmup_thread_kwargs = dict(server_args=server_args)
thread_label = f"MultiTokenizer-{_global_state.tokenizer_manager.worker_id}"

# Add prometheus middleware
# Add prometheus middleware or launch separate metrics server
if server_args.enable_metrics:
add_prometheus_middleware(app)
if server_args.metrics_port is not None:
# Launch metrics on a separate port
launch_metrics_server(server_args.host, server_args.metrics_port)
else:
# Add metrics endpoint to the main server
add_prometheus_middleware(app)
enable_func_timer()

# Init tracing
Expand Down
7 changes: 7 additions & 0 deletions python/sglang/srt/server_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -331,6 +331,7 @@ class ServerArgs:
crash_dump_folder: Optional[str] = None
show_time_cost: bool = False
enable_metrics: bool = False
metrics_port: Optional[int] = None
enable_metrics_for_all_schedulers: bool = False
tokenizer_metrics_custom_labels_header: str = "x-custom-labels"
tokenizer_metrics_allowed_custom_labels: Optional[List[str]] = None
Expand Down Expand Up @@ -2758,6 +2759,12 @@ def add_cli_args(parser: argparse.ArgumentParser):
action="store_true",
help="Enable log prometheus metrics.",
)
parser.add_argument(
"--metrics-port",
type=int,
default=ServerArgs.metrics_port,
help="Port to serve metrics on a separate server. If not specified, metrics will be served on the main server's /metrics endpoint.",
)
parser.add_argument(
"--enable-metrics-for-all-schedulers",
action="store_true",
Expand Down
55 changes: 55 additions & 0 deletions python/sglang/srt/utils/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -1536,6 +1536,61 @@ async def track_http_status_code(request, call_next):
return response


def launch_metrics_server(host: str, port: int):
    """Serve Prometheus metrics on a dedicated port in a background thread.

    Spins up a minimal FastAPI app that exposes only the ``/metrics``
    endpoint via uvicorn, so Prometheus scrape traffic can be isolated
    from the main API server.

    Args:
        host: Interface to bind the metrics server to.
        port: Port to bind the metrics server to.
    """
    import asyncio

    import uvicorn
    from fastapi import FastAPI

    # prometheus_client's multiprocess module reads PROMETHEUS_MULTIPROC_DIR
    # at import time, so the env var must exist before the import below.
    if "PROMETHEUS_MULTIPROC_DIR" not in os.environ:
        set_prometheus_multiproc_dir()

    from prometheus_client import CollectorRegistry, make_asgi_app, multiprocess

    app = FastAPI()

    # Aggregate metrics from all worker processes into one registry.
    registry = CollectorRegistry()
    multiprocess.MultiProcessCollector(registry)
    route = Mount("/metrics", make_asgi_app(registry=registry))

    # Override the mount's path regex so /metrics (no trailing slash) is
    # served directly instead of answering with a 307 redirect.
    route.path_regex = re.compile("^/metrics(?P<path>.*)$")
    app.routes.append(route)

    server = uvicorn.Server(
        config=uvicorn.Config(
            app,
            host=host,
            port=port,
            timeout_keep_alive=5,
            loop="auto",
            log_config=None,
            log_level="warning",
        )
    )

    def _serve_forever():
        # The daemon thread owns its own event loop; a failure is logged
        # (and re-raised to the thread's excepthook) without taking down
        # the main server.
        try:
            asyncio.run(server.serve())
        except Exception as e:
            logger.error(f"Metrics server failed to start: {e}")
            raise
        finally:
            logger.info(f"Metrics server stopped at {host}:{port}")

    worker = threading.Thread(target=_serve_forever, daemon=True, name="metrics-server")
    worker.start()
    logger.info(f"Metrics server started in background thread at {host}:{port}")


def bind_port(port):
"""Bind to a specific port, assuming it's available."""
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
Expand Down
Loading