Skip to content
5 changes: 4 additions & 1 deletion vllm/entrypoints/cli/serve.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
from vllm.v1.engine.core import EngineCoreProc
from vllm.v1.engine.core_client import CoreEngineProcManager
from vllm.v1.executor.abstract import Executor
from vllm.v1.metrics.prometheus import setup_multiprocess_prometheus
from vllm.v1.utils import (CoreEngine, get_engine_client_zmq_addr,
wait_for_engine_startup)

Expand Down Expand Up @@ -162,7 +163,9 @@ def run_multi_api_server(args: argparse.Namespace):

assert not args.headless
num_api_servers = args.api_server_count
# assert num_api_servers > 1
assert num_api_servers > 1

setup_multiprocess_prometheus()

listen_address, sock = setup_server(args)

Expand Down
22 changes: 6 additions & 16 deletions vllm/entrypoints/openai/api_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@
from fastapi.exceptions import RequestValidationError
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse, Response, StreamingResponse
from prometheus_client import make_asgi_app
from prometheus_fastapi_instrumentator import Instrumentator
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

remove?

Copy link
Copy Markdown

@kouroshHakha kouroshHakha May 16, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can't remove. mount_metrics is added back here, so it's needed. That's why I wanted to keep mount_metrics entirely in prometheus.

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Doh, I misread, sorry. That's fine, this stuff is in the "api server" category not the "disgusting prometheus multi proc hackery" category 😃

from starlette.concurrency import iterate_in_threadpool
from starlette.datastructures import State
from starlette.routing import Mount
Expand Down Expand Up @@ -97,6 +99,7 @@
from vllm.usage.usage_lib import UsageContext
from vllm.utils import (Device, FlexibleArgumentParser, get_open_zmq_ipc_path,
is_valid_ipv6_address, set_ulimit)
from vllm.v1.metrics.prometheus import get_prometheus_registry
from vllm.version import __version__ as VLLM_VERSION

TIMEOUT_KEEP_ALIVE = 5 # seconds
Expand Down Expand Up @@ -323,22 +326,9 @@ async def validate_json_request(raw_request: Request):


def mount_metrics(app: FastAPI):
# Lazy import for prometheus multiprocessing.
# We need to set PROMETHEUS_MULTIPROC_DIR environment variable
# before prometheus_client is imported.
# See https://prometheus.github.io/client_python/multiprocess/
from prometheus_client import (REGISTRY, CollectorRegistry, make_asgi_app,
multiprocess)
from prometheus_fastapi_instrumentator import Instrumentator

registry = REGISTRY

prometheus_multiproc_dir_path = os.getenv("PROMETHEUS_MULTIPROC_DIR", None)
if prometheus_multiproc_dir_path is not None:
logger.debug("vLLM to use %s as PROMETHEUS_MULTIPROC_DIR",
prometheus_multiproc_dir_path)
registry = CollectorRegistry()
multiprocess.MultiProcessCollector(registry)
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Everything above should be in the prometheus module, returning a registry

"""Mount prometheus metrics to a FastAPI app."""

registry = get_prometheus_registry()

Instrumentator(
excluded_handlers=[
Expand Down
3 changes: 3 additions & 0 deletions vllm/v1/engine/async_llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
from vllm.v1.executor.abstract import Executor
from vllm.v1.metrics.loggers import (StatLoggerBase, StatLoggerFactory,
setup_default_loggers)
from vllm.v1.metrics.prometheus import shutdown_prometheus
from vllm.v1.metrics.stats import IterationStats, SchedulerStats

logger = init_logger(__name__)
Expand Down Expand Up @@ -198,6 +199,8 @@ def __del__(self):
def shutdown(self):
"""Shutdown, cleaning up the background proc and IPC."""

shutdown_prometheus()

if engine_core := getattr(self, "engine_core", None):
engine_core.shutdown()

Expand Down
31 changes: 19 additions & 12 deletions vllm/v1/metrics/loggers.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,12 @@
from vllm.logger import init_logger
from vllm.v1.core.kv_cache_utils import PrefixCachingMetrics
from vllm.v1.engine import FinishReason
from vllm.v1.metrics.prometheus import unregister_vllm_metrics
from vllm.v1.metrics.stats import IterationStats, SchedulerStats
from vllm.v1.spec_decode.metrics import SpecDecodingLogging, SpecDecodingProm

logger = init_logger(__name__)

_LOCAL_LOGGING_INTERVAL_SEC = 5.0

StatLoggerFactory = Callable[[VllmConfig, int], "StatLoggerBase"]


Expand Down Expand Up @@ -143,7 +142,8 @@ def log_engine_initialized(self):
class PrometheusStatLogger(StatLoggerBase):

def __init__(self, vllm_config: VllmConfig, engine_index: int = 0):
self._unregister_vllm_metrics()

unregister_vllm_metrics()
self.vllm_config = vllm_config
self.engine_index = engine_index
# Use this flag to hide metrics that were deprecated in
Expand All @@ -168,11 +168,13 @@ def __init__(self, vllm_config: VllmConfig, engine_index: int = 0):
self.gauge_scheduler_running = prometheus_client.Gauge(
name="vllm:num_requests_running",
documentation="Number of requests in model execution batches.",
multiprocess_mode="mostrecent",
labelnames=labelnames).labels(*labelvalues)
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you're re-spinning again, then a very minor stylistic request ...

In the original version this line is all "label stuff"

In the new version, it becomes "label stuff, new line, multiproc stuff, label stuff"

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sorry did not quite understand the desired style?

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

oh you mean the order? put multiprocess_mode before label stuff?

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok done.


self.gauge_scheduler_waiting = prometheus_client.Gauge(
name="vllm:num_requests_waiting",
documentation="Number of requests waiting to be processed.",
multiprocess_mode="mostrecent",
labelnames=labelnames).labels(*labelvalues)

#
Expand All @@ -181,6 +183,7 @@ def __init__(self, vllm_config: VllmConfig, engine_index: int = 0):
self.gauge_gpu_cache_usage = prometheus_client.Gauge(
name="vllm:gpu_cache_usage_perc",
documentation="GPU KV-cache usage. 1 means 100 percent usage.",
multiprocess_mode="mostrecent",
labelnames=labelnames).labels(*labelvalues)

self.counter_gpu_prefix_cache_queries = prometheus_client.Counter(
Expand Down Expand Up @@ -241,6 +244,9 @@ def __init__(self, vllm_config: VllmConfig, engine_index: int = 0):
buckets=build_1_2_5_buckets(max_model_len),
labelnames=labelnames).labels(*labelvalues)

# TODO: This metric might be incorrect when multiple API server
# processes are used (api_server_count > 1), which relies on
# prometheus multiprocess mode.
# See: https://github.com/vllm-project/vllm/pull/18053
self.histogram_iteration_tokens = \
prometheus_client.Histogram(
name="vllm:iteration_tokens_total",
Expand Down Expand Up @@ -339,6 +345,9 @@ def __init__(self, vllm_config: VllmConfig, engine_index: int = 0):
#
# LoRA metrics
#

# TODO: This metric might be incorrect when multiple API server
# processes are used (api_server_count > 1), which relies on
# prometheus multiprocess mode.
self.gauge_lora_info: Optional[prometheus_client.Gauge] = None
if vllm_config.lora_config is not None:
self.labelname_max_lora = "max_lora"
Expand All @@ -349,13 +358,16 @@ def __init__(self, vllm_config: VllmConfig, engine_index: int = 0):
prometheus_client.Gauge(
name="vllm:lora_requests_info",
documentation="Running stats on lora requests.",
multiprocess_mode="sum",
labelnames=[
self.labelname_max_lora,
self.labelname_waiting_lora_adapters,
self.labelname_running_lora_adapters,
])
],
)

def log_metrics_info(self, type: str, config_obj: SupportsMetricsInfo):

metrics_info = config_obj.metrics_info()
metrics_info["engine"] = self.engine_index

Expand All @@ -371,7 +383,9 @@ def log_metrics_info(self, type: str, config_obj: SupportsMetricsInfo):
info_gauge = prometheus_client.Gauge(
name=name,
documentation=documentation,
labelnames=metrics_info.keys()).labels(**metrics_info)
multiprocess_mode="mostrecent",
labelnames=metrics_info.keys(),
).labels(**metrics_info)
info_gauge.set(1)

def record(self, scheduler_stats: Optional[SchedulerStats],
Expand Down Expand Up @@ -445,13 +459,6 @@ def record(self, scheduler_stats: Optional[SchedulerStats],
self.gauge_lora_info.labels(**lora_info_labels)\
.set_to_current_time()

@staticmethod
def _unregister_vllm_metrics():
# Unregister any existing vLLM collectors (for CI/CD
for collector in list(prometheus_client.REGISTRY._collector_to_names):
if hasattr(collector, "_name") and "vllm" in collector._name:
prometheus_client.REGISTRY.unregister(collector)

def log_engine_initialized(self):
self.log_metrics_info("cache_config", self.vllm_config.cache_config)

Expand Down
74 changes: 74 additions & 0 deletions vllm/v1/metrics/prometheus.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
# SPDX-License-Identifier: Apache-2.0

import os
import tempfile
from typing import Optional

from prometheus_client import REGISTRY, CollectorRegistry, multiprocess

from vllm.logger import init_logger

logger = init_logger(__name__)

# Global temporary directory for prometheus multiprocessing
_prometheus_multiproc_dir: Optional[tempfile.TemporaryDirectory] = None


def setup_multiprocess_prometheus():
    """Set up a directory for prometheus multiprocess metrics aggregation.

    Creates a temporary directory and exports it via the
    PROMETHEUS_MULTIPROC_DIR environment variable so that prometheus_client
    aggregates metrics across processes.
    See https://prometheus.github.io/client_python/multiprocess/

    Idempotent: calling this more than once is a no-op after the first
    call (previously a second call would incorrectly emit the
    "set by user" warning for the directory we created ourselves).
    """
    global _prometheus_multiproc_dir

    if _prometheus_multiproc_dir is not None:
        # Already configured by an earlier call in this process.
        return

    if "PROMETHEUS_MULTIPROC_DIR" not in os.environ:
        # Make TemporaryDirectory for prometheus multiprocessing.
        # Note: the global TemporaryDirectory keeps it alive for the
        # process lifetime and it is automatically cleaned up upon exit.
        _prometheus_multiproc_dir = tempfile.TemporaryDirectory()
        os.environ["PROMETHEUS_MULTIPROC_DIR"] = _prometheus_multiproc_dir.name
        logger.debug("Created PROMETHEUS_MULTIPROC_DIR at %s",
                     _prometheus_multiproc_dir.name)
    else:
        logger.warning("Found PROMETHEUS_MULTIPROC_DIR was set by user. "
                       "This directory must be wiped between vLLM runs or "
                       "you will find inaccurate metrics. Unset the variable "
                       "and vLLM will properly handle cleanup.")


def get_prometheus_registry():
    """Get the appropriate prometheus registry based on multiprocessing
    configuration.

    When PROMETHEUS_MULTIPROC_DIR is configured, a fresh CollectorRegistry
    wrapping a MultiProcessCollector is returned so that metrics written by
    all processes are aggregated; otherwise the default process-local
    REGISTRY is returned.

    Returns:
        Registry: A prometheus registry
    """
    if os.getenv("PROMETHEUS_MULTIPROC_DIR") is None:
        # Single-process serving: the default global registry suffices.
        return REGISTRY

    logger.debug("Using multiprocess registry for prometheus metrics")
    multiproc_registry = CollectorRegistry()
    multiprocess.MultiProcessCollector(multiproc_registry)
    return multiproc_registry


def unregister_vllm_metrics():
    """Unregister any existing vLLM collectors from the prometheus registry.

    This is useful for testing and CI/CD where metrics may be registered
    multiple times across test runs.
    """
    registry = get_prometheus_registry()
    # NOTE: _collector_to_names is a private prometheus_client attribute;
    # there is no public API for enumerating registered collectors.
    vllm_collectors = [
        collector for collector in registry._collector_to_names
        if "vllm" in getattr(collector, "_name", "")
    ]
    for collector in vllm_collectors:
        registry.unregister(collector)


def shutdown_prometheus():
    """Shutdown prometheus metrics.

    Marks the current process as dead for prometheus multiprocess mode so
    its per-process metric files are handled appropriately. Failures are
    logged and swallowed: shutdown must never raise.
    """
    pid = os.getpid()
    try:
        multiprocess.mark_process_dead(pid)
    except Exception as exc:
        logger.error("Error during metrics cleanup: %s", str(exc))
    else:
        logger.debug("Marked Prometheus metrics for process %d as dead", pid)