diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py
index f0b61902eb56..1a47d245b0b1 100644
--- a/tests/entrypoints/openai/test_metrics.py
+++ b/tests/entrypoints/openai/test_metrics.py
@@ -232,9 +232,6 @@ async def test_metrics_counts(server: RemoteOpenAIServer,
     "vllm:gpu_cache_usage_perc",
     "vllm:gpu_prefix_cache_queries",
     "vllm:gpu_prefix_cache_hits",
-    "vllm:kv_cache_usage_perc",
-    "vllm:prefix_cache_queries",
-    "vllm:prefix_cache_hits",
     "vllm:num_preemptions_total",
     "vllm:prompt_tokens_total",
     "vllm:generation_tokens_total",
@@ -280,9 +277,6 @@ async def test_metrics_counts(server: RemoteOpenAIServer,
 ]
 
 HIDDEN_DEPRECATED_METRICS: list[str] = [
-    "vllm:gpu_cache_usage_perc",
-    "vllm:gpu_prefix_cache_queries",
-    "vllm:gpu_prefix_cache_hits",
     "vllm:time_per_output_token_seconds_sum",
     "vllm:time_per_output_token_seconds_bucket",
     "vllm:time_per_output_token_seconds_count",
@@ -313,7 +307,7 @@ async def test_abort_metrics_reset(server: RemoteOpenAIServer,
                                    client: openai.AsyncClient, use_v1: bool):
 
     running_requests, waiting_requests, kv_cache_usage = (
-        _get_running_metrics_from_api(server, use_v1))
+        _get_running_metrics_from_api(server))
 
     # Expect no running requests or kvcache usage
     assert running_requests == 0
@@ -336,7 +330,7 @@ async def test_abort_metrics_reset(server: RemoteOpenAIServer,
 
     # Check that we have running requests
     running_requests, waiting_requests, kv_cache_usage = (
-        _get_running_metrics_from_api(server, use_v1))
+        _get_running_metrics_from_api(server))
 
     # Expect running requests and kvcache usage
     assert running_requests > 0
@@ -355,7 +349,7 @@ async def test_abort_metrics_reset(server: RemoteOpenAIServer,
 
     # Verify running and waiting requests counts and KV cache usage are zero
     running_requests_after, waiting_requests_after, kv_cache_usage_after = (
-        _get_running_metrics_from_api(server, use_v1))
+        _get_running_metrics_from_api(server))
 
     assert running_requests_after == 0,\
         (f"Expected 0 running requests after abort, got "
@@ -368,7 +362,7 @@ async def test_abort_metrics_reset(server: RemoteOpenAIServer,
             f"{kv_cache_usage_after}")
 
 
-def _get_running_metrics_from_api(server: RemoteOpenAIServer, use_v1: bool):
+def _get_running_metrics_from_api(server: RemoteOpenAIServer):
     """Return (running_count, waiting_count, kv_cache_usage)"""
 
     response = requests.get(server.url_for("metrics"))
@@ -377,9 +371,6 @@ def _get_running_metrics_from_api(server: RemoteOpenAIServer, use_v1: bool):
 
     # Verify running and waiting requests counts and KV cache usage are zero
     running_requests, waiting_requests, kv_cache_usage = None, None, None
-    kv_cache_usage_metric = ("vllm:kv_cache_usage_perc"
-                             if use_v1 else "vllm:gpu_cache_usage_perc")
-
     for family in text_string_to_metric_families(response.text):
         if family.name == "vllm:num_requests_running":
             for sample in family.samples:
@@ -391,9 +382,9 @@ def _get_running_metrics_from_api(server: RemoteOpenAIServer, use_v1: bool):
                 if sample.name == "vllm:num_requests_waiting":
                     waiting_requests = sample.value
                     break
-        elif family.name == kv_cache_usage_metric:
+        elif family.name == "vllm:gpu_cache_usage_perc":
             for sample in family.samples:
-                if sample.name == kv_cache_usage_metric:
+                if sample.name == "vllm:gpu_cache_usage_perc":
                     kv_cache_usage = sample.value
                     break
diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py
index f0076b2d81db..99bd05d02a71 100644
--- a/vllm/v1/metrics/loggers.py
+++ b/vllm/v1/metrics/loggers.py
@@ -208,46 +208,40 @@ def __init__(self,
         #
         # GPU cache
         #
-        # Deprecated in 0.9.2 - Renamed as vllm:kv_cache_usage_perc
-        # With 0.11.x you can enable with --show-hidden-metrics-for-version=0.10
-        # TODO: remove in 0.12.0
-        if self.show_hidden_metrics:
-            gauge_gpu_cache_usage = self._gauge_cls(
-                name="vllm:gpu_cache_usage_perc",
-                documentation=(
-                    "GPU KV-cache usage. 1 means 100 percent usage."
-                    "DEPRECATED: Use vllm:kv_cache_usage_perc instead."),
-                multiprocess_mode="mostrecent",
-                labelnames=labelnames)
-            self.gauge_gpu_cache_usage = make_per_engine(
-                gauge_gpu_cache_usage, engine_indexes, model_name)
-
-        # Deprecated in 0.9.2 - Renamed as vllm:prefix_cache_queries
-        # With 0.11.x you can enable with --show-hidden-metrics-for-version=0.10
-        # TODO: remove in 0.12.0
-        if self.show_hidden_metrics:
-            counter_gpu_prefix_cache_queries = self._counter_cls(
-                name="vllm:gpu_prefix_cache_queries",
-                documentation=(
-                    "GPU prefix cache queries, in terms of number of queried"
-                    "tokens. DEPRECATED: Use vllm:prefix_cache_queries instead."
-                ),
-                labelnames=labelnames)
-            self.counter_gpu_prefix_cache_queries = make_per_engine(
-                counter_gpu_prefix_cache_queries, engine_indexes, model_name)
-
-        # Deprecated in 0.9.2 - Renamed as vllm:prefix_cache_hits
-        # With 0.11.x you can enable with --show-hidden-metrics-for-version=0.10
-        # TODO: remove in 0.12.0
-        if self.show_hidden_metrics:
-            counter_gpu_prefix_cache_hits = self._counter_cls(
-                name="vllm:gpu_prefix_cache_hits",
-                documentation=(
-                    "GPU prefix cache hits, in terms of number of cached "
-                    "tokens. DEPRECATED: Use vllm:prefix_cache_hits instead."),
-                labelnames=labelnames)
-            self.counter_gpu_prefix_cache_hits = make_per_engine(
-                counter_gpu_prefix_cache_hits, engine_indexes, model_name)
+        # Deprecated in 0.9 - Renamed as vllm:kv_cache_usage_perc
+        # TODO: in 0.10, only enable if show_hidden_metrics=True
+        gauge_gpu_cache_usage = self._gauge_cls(
+            name="vllm:gpu_cache_usage_perc",
+            documentation=(
+                "GPU KV-cache usage. 1 means 100 percent usage. "
+                "DEPRECATED: Use vllm:kv_cache_usage_perc instead."),
+            multiprocess_mode="mostrecent",
+            labelnames=labelnames)
+        self.gauge_gpu_cache_usage = make_per_engine(gauge_gpu_cache_usage,
+                                                     engine_indexes,
+                                                     model_name)
+
+        # Deprecated in 0.9 - Renamed as vllm:prefix_cache_queries
+        # TODO: in 0.10, only enable if show_hidden_metrics=True
+        counter_gpu_prefix_cache_queries = self._counter_cls(
+            name="vllm:gpu_prefix_cache_queries",
+            documentation=(
+                "GPU prefix cache queries, in terms of number of queried "
+                "tokens. DEPRECATED: Use vllm:prefix_cache_queries instead."),
+            labelnames=labelnames)
+        self.counter_gpu_prefix_cache_queries = make_per_engine(
+            counter_gpu_prefix_cache_queries, engine_indexes, model_name)
+
+        # Deprecated in 0.9 - Renamed as vllm:prefix_cache_hits
+        # TODO: in 0.10, only enable if show_hidden_metrics=True
+        counter_gpu_prefix_cache_hits = self._counter_cls(
+            name="vllm:gpu_prefix_cache_hits",
+            documentation=(
+                "GPU prefix cache hits, in terms of number of cached "
+                "tokens. DEPRECATED: Use vllm:prefix_cache_hits instead."),
+            labelnames=labelnames)
+        self.counter_gpu_prefix_cache_hits = make_per_engine(
+            counter_gpu_prefix_cache_hits, engine_indexes, model_name)
 
         gauge_kv_cache_usage = self._gauge_cls(
             name="vllm:kv_cache_usage_perc",
@@ -521,17 +515,15 @@ def record(self,
         self.gauge_scheduler_waiting[engine_idx].set(
             scheduler_stats.num_waiting_reqs)
 
-        if self.show_hidden_metrics:
-            self.gauge_gpu_cache_usage[engine_idx].set(
-                scheduler_stats.kv_cache_usage)
+        self.gauge_gpu_cache_usage[engine_idx].set(
+            scheduler_stats.kv_cache_usage)
         self.gauge_kv_cache_usage[engine_idx].set(
             scheduler_stats.kv_cache_usage)
 
-        if self.show_hidden_metrics:
-            self.counter_gpu_prefix_cache_queries[engine_idx].inc(
-                scheduler_stats.prefix_cache_stats.queries)
-            self.counter_gpu_prefix_cache_hits[engine_idx].inc(
-                scheduler_stats.prefix_cache_stats.hits)
+        self.counter_gpu_prefix_cache_queries[engine_idx].inc(
+            scheduler_stats.prefix_cache_stats.queries)
+        self.counter_gpu_prefix_cache_hits[engine_idx].inc(
+            scheduler_stats.prefix_cache_stats.hits)
 
         self.counter_prefix_cache_queries[engine_idx].inc(
             scheduler_stats.prefix_cache_stats.queries)
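For reviewers who want to verify the metric names by hand, the scrape-and-parse pattern used by `_get_running_metrics_from_api` can be run standalone against a live server. The sketch below is illustrative only: the `localhost:8000` address and the `get_running_metrics` name are assumptions, not part of this patch; the calls it makes (`requests.get`, `prometheus_client.parser.text_string_to_metric_families`) are the same ones the test file already imports.

```python
# Standalone sketch of the test helper's scrape-and-parse loop.
# ASSUMPTION: a vLLM OpenAI-compatible server is listening on localhost:8000.
import requests
from prometheus_client.parser import text_string_to_metric_families

METRICS_URL = "http://localhost:8000/metrics"  # assumed address, adjust as needed


def get_running_metrics(url: str = METRICS_URL):
    """Return (running_requests, waiting_requests, kv_cache_usage)."""
    response = requests.get(url)
    assert response.status_code == requests.codes.ok

    running, waiting, kv_cache_usage = None, None, None
    for family in text_string_to_metric_families(response.text):
        if family.name == "vllm:num_requests_running":
            for sample in family.samples:
                if sample.name == "vllm:num_requests_running":
                    running = sample.value
                    break
        elif family.name == "vllm:num_requests_waiting":
            for sample in family.samples:
                if sample.name == "vllm:num_requests_waiting":
                    waiting = sample.value
                    break
        elif family.name == "vllm:gpu_cache_usage_perc":
            for sample in family.samples:
                if sample.name == "vllm:gpu_cache_usage_perc":
                    kv_cache_usage = sample.value
                    break
    return running, waiting, kv_cache_usage


if __name__ == "__main__":
    print(get_running_metrics())
```

Note that `text_string_to_metric_families` yields one family per metric, and for gauges such as `vllm:gpu_cache_usage_perc` the sample name matches the family name, which is why the inner loops filter on the same string the outer `elif` matched.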