-
Notifications
You must be signed in to change notification settings - Fork 1
[Misc][DP] Fix AsyncLLM metrics for multi-API server deployments #6
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
ee7cc58
fe507f6
70e2513
b376674
0c0a1a5
0fa7aac
106265d
aace3a0
b876684
4cadec3
e3b0dd2
aad9b52
f3f65bd
7260f4f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -24,6 +24,8 @@ | |
| from fastapi.exceptions import RequestValidationError | ||
| from fastapi.middleware.cors import CORSMiddleware | ||
| from fastapi.responses import JSONResponse, Response, StreamingResponse | ||
| from prometheus_client import make_asgi_app | ||
| from prometheus_fastapi_instrumentator import Instrumentator | ||
| from starlette.concurrency import iterate_in_threadpool | ||
| from starlette.datastructures import State | ||
| from starlette.routing import Mount | ||
|
|
@@ -97,6 +99,7 @@ | |
| from vllm.usage.usage_lib import UsageContext | ||
| from vllm.utils import (Device, FlexibleArgumentParser, get_open_zmq_ipc_path, | ||
| is_valid_ipv6_address, set_ulimit) | ||
| from vllm.v1.metrics.prometheus import get_prometheus_registry | ||
| from vllm.version import __version__ as VLLM_VERSION | ||
|
|
||
| TIMEOUT_KEEP_ALIVE = 5 # seconds | ||
|
|
@@ -323,22 +326,9 @@ async def validate_json_request(raw_request: Request): | |
|
|
||
|
|
||
| def mount_metrics(app: FastAPI): | ||
| # Lazy import for prometheus multiprocessing. | ||
| # We need to set PROMETHEUS_MULTIPROC_DIR environment variable | ||
| # before prometheus_client is imported. | ||
| # See https://prometheus.github.io/client_python/multiprocess/ | ||
| from prometheus_client import (REGISTRY, CollectorRegistry, make_asgi_app, | ||
| multiprocess) | ||
| from prometheus_fastapi_instrumentator import Instrumentator | ||
|
|
||
| registry = REGISTRY | ||
|
|
||
| prometheus_multiproc_dir_path = os.getenv("PROMETHEUS_MULTIPROC_DIR", None) | ||
| if prometheus_multiproc_dir_path is not None: | ||
| logger.debug("vLLM to use %s as PROMETHEUS_MULTIPROC_DIR", | ||
| prometheus_multiproc_dir_path) | ||
| registry = CollectorRegistry() | ||
| multiprocess.MultiProcessCollector(registry) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Everything above should be in the prometheus module, returning a registry. |
||
| """Mount prometheus metrics to a FastAPI app.""" | ||
|
|
||
| registry = get_prometheus_registry() | ||
|
|
||
| Instrumentator( | ||
| excluded_handlers=[ | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -12,13 +12,12 @@ | |
| from vllm.logger import init_logger | ||
| from vllm.v1.core.kv_cache_utils import PrefixCachingMetrics | ||
| from vllm.v1.engine import FinishReason | ||
| from vllm.v1.metrics.prometheus import unregister_vllm_metrics | ||
| from vllm.v1.metrics.stats import IterationStats, SchedulerStats | ||
| from vllm.v1.spec_decode.metrics import SpecDecodingLogging, SpecDecodingProm | ||
|
|
||
| logger = init_logger(__name__) | ||
|
|
||
| _LOCAL_LOGGING_INTERVAL_SEC = 5.0 | ||
|
|
||
| StatLoggerFactory = Callable[[VllmConfig, int], "StatLoggerBase"] | ||
|
|
||
|
|
||
|
|
@@ -143,7 +142,8 @@ def log_engine_initialized(self): | |
| class PrometheusStatLogger(StatLoggerBase): | ||
|
|
||
| def __init__(self, vllm_config: VllmConfig, engine_index: int = 0): | ||
| self._unregister_vllm_metrics() | ||
|
|
||
| unregister_vllm_metrics() | ||
| self.vllm_config = vllm_config | ||
| self.engine_index = engine_index | ||
| # Use this flag to hide metrics that were deprecated in | ||
|
|
@@ -168,11 +168,13 @@ def __init__(self, vllm_config: VllmConfig, engine_index: int = 0): | |
| self.gauge_scheduler_running = prometheus_client.Gauge( | ||
| name="vllm:num_requests_running", | ||
| documentation="Number of requests in model execution batches.", | ||
| multiprocess_mode="mostrecent", | ||
| labelnames=labelnames).labels(*labelvalues) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. If you're re-spinning again, then a very minor stylistic request ... In the original version this line is all "label stuff". In the new version, it becomes "label stuff, new line, multiproc stuff, label stuff". There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Sorry, did not quite understand the desired style? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Oh, you mean the order? Put multiprocess_mode before label stuff? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. OK, done. |
||
|
|
||
| self.gauge_scheduler_waiting = prometheus_client.Gauge( | ||
| name="vllm:num_requests_waiting", | ||
| documentation="Number of requests waiting to be processed.", | ||
| multiprocess_mode="mostrecent", | ||
| labelnames=labelnames).labels(*labelvalues) | ||
|
|
||
| # | ||
|
|
@@ -181,6 +183,7 @@ def __init__(self, vllm_config: VllmConfig, engine_index: int = 0): | |
| self.gauge_gpu_cache_usage = prometheus_client.Gauge( | ||
| name="vllm:gpu_cache_usage_perc", | ||
| documentation="GPU KV-cache usage. 1 means 100 percent usage.", | ||
| multiprocess_mode="mostrecent", | ||
| labelnames=labelnames).labels(*labelvalues) | ||
|
|
||
| self.counter_gpu_prefix_cache_queries = prometheus_client.Counter( | ||
|
|
@@ -241,6 +244,9 @@ def __init__(self, vllm_config: VllmConfig, engine_index: int = 0): | |
| buckets=build_1_2_5_buckets(max_model_len), | ||
| labelnames=labelnames).labels(*labelvalues) | ||
|
|
||
| # TODO: This metric might be incorrect in case of using multiple | ||
| # api_server counts which uses prometheus mp. | ||
| # See: https://github.com/vllm-project/vllm/pull/18053 | ||
| self.histogram_iteration_tokens = \ | ||
| prometheus_client.Histogram( | ||
| name="vllm:iteration_tokens_total", | ||
|
|
@@ -339,6 +345,9 @@ def __init__(self, vllm_config: VllmConfig, engine_index: int = 0): | |
| # | ||
| # LoRA metrics | ||
| # | ||
|
|
||
| # TODO: This metric might be incorrect in case of using multiple | ||
| # api_server counts which uses prometheus mp. | ||
| self.gauge_lora_info: Optional[prometheus_client.Gauge] = None | ||
| if vllm_config.lora_config is not None: | ||
| self.labelname_max_lora = "max_lora" | ||
|
|
@@ -349,13 +358,16 @@ def __init__(self, vllm_config: VllmConfig, engine_index: int = 0): | |
| prometheus_client.Gauge( | ||
| name="vllm:lora_requests_info", | ||
| documentation="Running stats on lora requests.", | ||
| multiprocess_mode="sum", | ||
| labelnames=[ | ||
| self.labelname_max_lora, | ||
| self.labelname_waiting_lora_adapters, | ||
| self.labelname_running_lora_adapters, | ||
| ]) | ||
| ], | ||
| ) | ||
|
|
||
| def log_metrics_info(self, type: str, config_obj: SupportsMetricsInfo): | ||
|
|
||
| metrics_info = config_obj.metrics_info() | ||
| metrics_info["engine"] = self.engine_index | ||
|
|
||
|
|
@@ -371,7 +383,9 @@ def log_metrics_info(self, type: str, config_obj: SupportsMetricsInfo): | |
| info_gauge = prometheus_client.Gauge( | ||
| name=name, | ||
| documentation=documentation, | ||
| labelnames=metrics_info.keys()).labels(**metrics_info) | ||
| multiprocess_mode="mostrecent", | ||
| labelnames=metrics_info.keys(), | ||
| ).labels(**metrics_info) | ||
| info_gauge.set(1) | ||
|
|
||
| def record(self, scheduler_stats: Optional[SchedulerStats], | ||
|
|
@@ -445,13 +459,6 @@ def record(self, scheduler_stats: Optional[SchedulerStats], | |
| self.gauge_lora_info.labels(**lora_info_labels)\ | ||
| .set_to_current_time() | ||
|
|
||
| @staticmethod | ||
| def _unregister_vllm_metrics(): | ||
| # Unregister any existing vLLM collectors (for CI/CD | ||
| for collector in list(prometheus_client.REGISTRY._collector_to_names): | ||
| if hasattr(collector, "_name") and "vllm" in collector._name: | ||
| prometheus_client.REGISTRY.unregister(collector) | ||
|
|
||
| def log_engine_initialized(self): | ||
| self.log_metrics_info("cache_config", self.vllm_config.cache_config) | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,74 @@ | ||
| # SPDX-License-Identifier: Apache-2.0 | ||
|
|
||
| import os | ||
| import tempfile | ||
| from typing import Optional | ||
|
|
||
| from prometheus_client import REGISTRY, CollectorRegistry, multiprocess | ||
|
|
||
| from vllm.logger import init_logger | ||
|
|
||
| logger = init_logger(__name__) | ||
|
|
||
| # Global temporary directory for prometheus multiprocessing | ||
| _prometheus_multiproc_dir: Optional[tempfile.TemporaryDirectory] = None | ||
|
|
||
|
|
||
| def setup_multiprocess_prometheus(): | ||
| """Set up prometheus multiprocessing directory if not already configured. | ||
|
|
||
| """ | ||
| global _prometheus_multiproc_dir | ||
|
|
||
| if "PROMETHEUS_MULTIPROC_DIR" not in os.environ: | ||
| # Make TemporaryDirectory for prometheus multiprocessing | ||
| # Note: global TemporaryDirectory will be automatically | ||
| # cleaned up upon exit. | ||
| _prometheus_multiproc_dir = tempfile.TemporaryDirectory() | ||
| os.environ["PROMETHEUS_MULTIPROC_DIR"] = _prometheus_multiproc_dir.name | ||
| logger.debug("Created PROMETHEUS_MULTIPROC_DIR at %s", | ||
| _prometheus_multiproc_dir.name) | ||
| else: | ||
| logger.warning("Found PROMETHEUS_MULTIPROC_DIR was set by user. " | ||
| "This directory must be wiped between vLLM runs or " | ||
| "you will find inaccurate metrics. Unset the variable " | ||
| "and vLLM will properly handle cleanup.") | ||
|
|
||
|
|
||
| def get_prometheus_registry(): | ||
| """Get the appropriate prometheus registry based on multiprocessing | ||
| configuration. | ||
|
|
||
| Returns: | ||
| Registry: A prometheus registry | ||
| """ | ||
| if os.getenv("PROMETHEUS_MULTIPROC_DIR") is not None: | ||
| logger.debug("Using multiprocess registry for prometheus metrics") | ||
| registry = CollectorRegistry() | ||
| multiprocess.MultiProcessCollector(registry) | ||
| return registry | ||
|
|
||
| return REGISTRY | ||
|
|
||
|
|
||
| def unregister_vllm_metrics(): | ||
| """Unregister any existing vLLM collectors from the prometheus registry. | ||
|
|
||
| This is useful for testing and CI/CD where metrics may be registered | ||
| multiple times across test runs. | ||
| """ | ||
| registry = get_prometheus_registry() | ||
| # Unregister any existing vLLM collectors | ||
| for collector in list(registry._collector_to_names): | ||
| if hasattr(collector, "_name") and "vllm" in collector._name: | ||
| registry.unregister(collector) | ||
|
|
||
|
|
||
| def shutdown_prometheus(): | ||
| """Shutdown prometheus metrics.""" | ||
| try: | ||
| pid = os.getpid() | ||
| multiprocess.mark_process_dead(pid) | ||
| logger.debug("Marked Prometheus metrics for process %d as dead", pid) | ||
| except Exception as e: | ||
| logger.error("Error during metrics cleanup: %s", str(e)) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
remove?
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can't remove.
`mount_metrics` is added back here, so it's needed. That's why I wanted to keep `mount_metrics` entirely in prometheus. There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Doh, I misread, sorry. That's fine, this stuff is in the "api server" category not the "disgusting prometheus multi proc hackery" category 😃