Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions docs/advanced_features/observability.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@

## Production Metrics
SGLang exposes the following metrics via Prometheus. You can enable them by adding `--enable-metrics` when launching the server.

By default, metrics are served on the main server's `/metrics` endpoint. You can optionally serve metrics on a separate port by specifying `--metrics-port <port>`.

You can query them by:
```
curl http://localhost:30000/metrics
Expand Down
2 changes: 2 additions & 0 deletions docs/references/production_metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

SGLang exposes the following metrics via Prometheus. You can enable them by adding `--enable-metrics` when you launch the server.

By default, metrics are served on the main server's `/metrics` endpoint. You can optionally serve metrics on a separate port by specifying `--metrics-port <port>`, which is useful when you want to isolate metrics traffic from the main API server.

An example of the monitoring dashboard is available in [examples/monitoring/grafana.json](https://github.com/sgl-project/sglang/blob/main/examples/monitoring/grafana/dashboards/json/sglang-dashboard.json).

Here is an example of the metrics:
Expand Down
14 changes: 13 additions & 1 deletion python/sglang/srt/entrypoints/grpc_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,11 @@
)
from sglang.srt.sampling.sampling_params import SamplingParams as SGLSamplingParams
from sglang.srt.server_args import ServerArgs
from sglang.srt.utils import kill_process_tree
from sglang.srt.utils import (
kill_process_tree,
launch_metrics_server,
set_prometheus_multiproc_dir,
)
from sglang.utils import get_exception_traceback

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -704,6 +708,14 @@ async def serve_grpc(
):
"""Start the standalone gRPC server with integrated scheduler."""

# Set prometheus multiproc dir BEFORE launching scheduler processes
# This ensures the environment variable is inherited by child processes
if server_args.enable_metrics:
set_prometheus_multiproc_dir()
# Launch metrics server on separate port if specified
if server_args.metrics_port is not None:
launch_metrics_server(server_args.host, server_args.metrics_port)

# Start bootstrap server BEFORE launching scheduler processes (only in PREFILL mode)
# This ensures the bootstrap server is ready when prefill schedulers try to register
bootstrap_server = None
Expand Down
10 changes: 8 additions & 2 deletions python/sglang/srt/entrypoints/http_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,7 @@
delete_directory,
get_bool_env_var,
kill_process_tree,
launch_metrics_server,
set_uvicorn_logging_configs,
)
from sglang.utils import get_exception_traceback
Expand Down Expand Up @@ -229,9 +230,14 @@ async def lifespan(fast_api_app: FastAPI):
warmup_thread_kwargs = dict(server_args=server_args)
thread_label = f"MultiTokenizer-{_global_state.tokenizer_manager.worker_id}"

# Add prometheus middleware
# Add prometheus middleware or launch separate metrics server
if server_args.enable_metrics:
add_prometheus_middleware(app)
if server_args.metrics_port is not None:
# Launch metrics on a separate port
launch_metrics_server(server_args.host, server_args.metrics_port)
else:
# Add metrics endpoint to the main server
add_prometheus_middleware(app)
enable_func_timer()

# Init tracing
Expand Down
7 changes: 7 additions & 0 deletions python/sglang/srt/server_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -331,6 +331,7 @@ class ServerArgs:
crash_dump_folder: Optional[str] = None
show_time_cost: bool = False
enable_metrics: bool = False
metrics_port: Optional[int] = None
enable_metrics_for_all_schedulers: bool = False
tokenizer_metrics_custom_labels_header: str = "x-custom-labels"
tokenizer_metrics_allowed_custom_labels: Optional[List[str]] = None
Expand Down Expand Up @@ -2758,6 +2759,12 @@ def add_cli_args(parser: argparse.ArgumentParser):
action="store_true",
help="Enable log prometheus metrics.",
)
parser.add_argument(
"--metrics-port",
type=int,
default=ServerArgs.metrics_port,
help="Port to serve metrics on a separate server. If not specified, metrics will be served on the main server's /metrics endpoint.",
)
parser.add_argument(
"--enable-metrics-for-all-schedulers",
action="store_true",
Expand Down
55 changes: 55 additions & 0 deletions python/sglang/srt/utils/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -1536,6 +1536,61 @@ async def track_http_status_code(request, call_next):
return response


def launch_metrics_server(host: str, port: int):
    """Serve Prometheus metrics on a dedicated port in a background thread.

    Spins up a minimal FastAPI app that exposes only the ``/metrics``
    endpoint via uvicorn, so Prometheus scrape traffic can be isolated
    from the main API server.

    Args:
        host: Interface to bind the metrics server to.
        port: Port to bind the metrics server to.
    """
    import asyncio

    import uvicorn
    from fastapi import FastAPI

    # prometheus_client's multiprocess module reads PROMETHEUS_MULTIPROC_DIR
    # at import time, so the env var must exist before the import below.
    if "PROMETHEUS_MULTIPROC_DIR" not in os.environ:
        set_prometheus_multiproc_dir()

    from prometheus_client import CollectorRegistry, make_asgi_app, multiprocess

    app = FastAPI()

    # Aggregate metrics from all worker processes into one registry.
    registry = CollectorRegistry()
    multiprocess.MultiProcessCollector(registry)
    route = Mount("/metrics", make_asgi_app(registry=registry))

    # Override the mount's path regex so /metrics (no trailing slash) is
    # served directly instead of answering with a 307 redirect.
    route.path_regex = re.compile("^/metrics(?P<path>.*)$")
    app.routes.append(route)

    server = uvicorn.Server(
        config=uvicorn.Config(
            app,
            host=host,
            port=port,
            timeout_keep_alive=5,
            loop="auto",
            log_config=None,
            log_level="warning",
        )
    )

    def _serve_forever():
        # The daemon thread owns its own event loop; a failure is logged
        # (and re-raised to the thread's excepthook) without taking down
        # the main server.
        try:
            asyncio.run(server.serve())
        except Exception as e:
            logger.error(f"Metrics server failed to start: {e}")
            raise
        finally:
            logger.info(f"Metrics server stopped at {host}:{port}")

    worker = threading.Thread(target=_serve_forever, daemon=True, name="metrics-server")
    worker.start()
    logger.info(f"Metrics server started in background thread at {host}:{port}")


def bind_port(port):
"""Bind to a specific port, assuming it's available."""
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
Expand Down
Loading