tests/entrypoints/openai/test_metrics.py: 21 changes (15 additions, 6 deletions)
@@ -232,6 +232,9 @@ async def test_metrics_counts(server: RemoteOpenAIServer,
"vllm:gpu_cache_usage_perc",
"vllm:gpu_prefix_cache_queries",
"vllm:gpu_prefix_cache_hits",
"vllm:kv_cache_usage_perc",
"vllm:prefix_cache_queries",
"vllm:prefix_cache_hits",
"vllm:num_preemptions_total",
"vllm:prompt_tokens_total",
"vllm:generation_tokens_total",
@@ -277,6 +280,9 @@ async def test_metrics_counts(server: RemoteOpenAIServer,
]

HIDDEN_DEPRECATED_METRICS: list[str] = [
"vllm:gpu_cache_usage_perc",
"vllm:gpu_prefix_cache_queries",
"vllm:gpu_prefix_cache_hits",
"vllm:time_per_output_token_seconds_sum",
"vllm:time_per_output_token_seconds_bucket",
"vllm:time_per_output_token_seconds_count",
@@ -307,7 +313,7 @@ async def test_abort_metrics_reset(server: RemoteOpenAIServer,
client: openai.AsyncClient, use_v1: bool):

running_requests, waiting_requests, kv_cache_usage = (
_get_running_metrics_from_api(server))
_get_running_metrics_from_api(server, use_v1))

# Expect no running requests or kvcache usage
assert running_requests == 0
@@ -330,7 +336,7 @@ async def test_abort_metrics_reset(server: RemoteOpenAIServer,

# Check that we have running requests
running_requests, waiting_requests, kv_cache_usage = (
_get_running_metrics_from_api(server))
_get_running_metrics_from_api(server, use_v1))

# Expect running requests and kvcache usage
assert running_requests > 0
@@ -349,7 +355,7 @@ async def test_abort_metrics_reset(server: RemoteOpenAIServer,

# Verify running and waiting requests counts and KV cache usage are zero
running_requests_after, waiting_requests_after, kv_cache_usage_after = (
_get_running_metrics_from_api(server))
_get_running_metrics_from_api(server, use_v1))

assert running_requests_after == 0,\
(f"Expected 0 running requests after abort, got "
@@ -362,7 +368,7 @@ async def test_abort_metrics_reset(server: RemoteOpenAIServer,
f"{kv_cache_usage_after}")


def _get_running_metrics_from_api(server: RemoteOpenAIServer):
def _get_running_metrics_from_api(server: RemoteOpenAIServer, use_v1: bool):
"""Return (running_count, waiting_count, kv_cache_usage)"""

response = requests.get(server.url_for("metrics"))
@@ -371,6 +377,9 @@ def _get_running_metrics_from_api(server: RemoteOpenAIServer):
# Verify running and waiting requests counts and KV cache usage are zero
running_requests, waiting_requests, kv_cache_usage = None, None, None

kv_cache_usage_metric = ("vllm:kv_cache_usage_perc"
if use_v1 else "vllm:gpu_cache_usage_perc")

for family in text_string_to_metric_families(response.text):
if family.name == "vllm:num_requests_running":
for sample in family.samples:
@@ -382,9 +391,9 @@ def _get_running_metrics_from_api(server: RemoteOpenAIServer):
if sample.name == "vllm:num_requests_waiting":
waiting_requests = sample.value
break
elif family.name == "vllm:gpu_cache_usage_perc":
elif family.name == kv_cache_usage_metric:
for sample in family.samples:
if sample.name == "vllm:gpu_cache_usage_perc":
if sample.name == kv_cache_usage_metric:
kv_cache_usage = sample.value
break

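The test helper above now picks the KV-cache gauge name based on whether the V1 engine is in use. A monitoring script scraping `/metrics` across the rename can apply the same fallback. This is a minimal sketch, assuming a server at http://localhost:8000; the function name and default URL are illustrative and not part of the PR:

```python
import requests
from prometheus_client.parser import text_string_to_metric_families


def read_kv_cache_usage(metrics_url: str = "http://localhost:8000/metrics"):
    """Return KV-cache usage, preferring the new metric name.

    vllm:gpu_cache_usage_perc is deprecated and only exported when hidden
    metrics are re-enabled, so treat it purely as a fallback.
    """
    text = requests.get(metrics_url).text
    values: dict[str, float] = {}
    for family in text_string_to_metric_families(text):
        for sample in family.samples:
            if sample.name in ("vllm:kv_cache_usage_perc",
                               "vllm:gpu_cache_usage_perc"):
                values[sample.name] = sample.value
    return values.get("vllm:kv_cache_usage_perc",
                      values.get("vllm:gpu_cache_usage_perc"))
```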
vllm/v1/metrics/loggers.py: 88 changes (48 additions, 40 deletions)
@@ -206,40 +206,46 @@ def __init__(self,
#
# GPU cache
#
# Deprecated in 0.9 - Renamed as vllm:kv_cache_usage_perc
# TODO: in 0.10, only enable if show_hidden_metrics=True
gauge_gpu_cache_usage = self._gauge_cls(
name="vllm:gpu_cache_usage_perc",
documentation=(
"GPU KV-cache usage. 1 means 100 percent usage."
"DEPRECATED: Use vllm:kv_cache_usage_perc instead."),
multiprocess_mode="mostrecent",
labelnames=labelnames)
self.gauge_gpu_cache_usage = make_per_engine(gauge_gpu_cache_usage,
engine_indexes,
model_name)

# Deprecated in 0.9 - Renamed as vllm:prefix_cache_queries
# TODO: in 0.10, only enable if show_hidden_metrics=True
counter_gpu_prefix_cache_queries = self._counter_cls(
name="vllm:gpu_prefix_cache_queries",
documentation=(
"GPU prefix cache queries, in terms of number of queried"
"tokens. DEPRECATED: Use vllm:prefix_cache_queries instead."),
labelnames=labelnames)
self.counter_gpu_prefix_cache_queries = make_per_engine(
counter_gpu_prefix_cache_queries, engine_indexes, model_name)

# Deprecated in 0.9 - Renamed as vllm:prefix_cache_hits
# TODO: in 0.10, only enable if show_hidden_metrics=True
counter_gpu_prefix_cache_hits = self._counter_cls(
name="vllm:gpu_prefix_cache_hits",
documentation=(
"GPU prefix cache hits, in terms of number of cached "
"tokens. DEPRECATED: Use vllm:prefix_cache_hits instead."),
labelnames=labelnames)
self.counter_gpu_prefix_cache_hits = make_per_engine(
counter_gpu_prefix_cache_hits, engine_indexes, model_name)
# Deprecated in 0.9.2 - Renamed as vllm:kv_cache_usage_perc
# With 0.11.x you can enable with --show-hidden-metrics-for-version=0.10
# TODO: remove in 0.12.0
if self.show_hidden_metrics:
gauge_gpu_cache_usage = self._gauge_cls(
name="vllm:gpu_cache_usage_perc",
documentation=(
"GPU KV-cache usage. 1 means 100 percent usage."
"DEPRECATED: Use vllm:kv_cache_usage_perc instead."),
multiprocess_mode="mostrecent",
labelnames=labelnames)
self.gauge_gpu_cache_usage = make_per_engine(
gauge_gpu_cache_usage, engine_indexes, model_name)

# Deprecated in 0.9.2 - Renamed as vllm:prefix_cache_queries
# With 0.11.x you can enable with --show-hidden-metrics-for-version=0.10
# TODO: remove in 0.12.0
if self.show_hidden_metrics:
counter_gpu_prefix_cache_queries = self._counter_cls(
name="vllm:gpu_prefix_cache_queries",
documentation=(
"GPU prefix cache queries, in terms of number of queried"
"tokens. DEPRECATED: Use vllm:prefix_cache_queries instead."
),
labelnames=labelnames)
self.counter_gpu_prefix_cache_queries = make_per_engine(
counter_gpu_prefix_cache_queries, engine_indexes, model_name)

# Deprecated in 0.9.2 - Renamed as vllm:prefix_cache_hits
# With 0.11.x you can enable with --show-hidden-metrics-for-version=0.10
# TODO: remove in 0.12.0
if self.show_hidden_metrics:
counter_gpu_prefix_cache_hits = self._counter_cls(
name="vllm:gpu_prefix_cache_hits",
documentation=(
"GPU prefix cache hits, in terms of number of cached "
"tokens. DEPRECATED: Use vllm:prefix_cache_hits instead."),
labelnames=labelnames)
self.counter_gpu_prefix_cache_hits = make_per_engine(
counter_gpu_prefix_cache_hits, engine_indexes, model_name)

gauge_kv_cache_usage = self._gauge_cls(
name="vllm:kv_cache_usage_perc",
@@ -513,15 +519,17 @@ def record(self,
self.gauge_scheduler_waiting[engine_idx].set(
scheduler_stats.num_waiting_reqs)

self.gauge_gpu_cache_usage[engine_idx].set(
scheduler_stats.kv_cache_usage)
if self.show_hidden_metrics:
self.gauge_gpu_cache_usage[engine_idx].set(
scheduler_stats.kv_cache_usage)
self.gauge_kv_cache_usage[engine_idx].set(
scheduler_stats.kv_cache_usage)

self.counter_gpu_prefix_cache_queries[engine_idx].inc(
scheduler_stats.prefix_cache_stats.queries)
self.counter_gpu_prefix_cache_hits[engine_idx].inc(
scheduler_stats.prefix_cache_stats.hits)
if self.show_hidden_metrics:
self.counter_gpu_prefix_cache_queries[engine_idx].inc(
scheduler_stats.prefix_cache_stats.queries)
self.counter_gpu_prefix_cache_hits[engine_idx].inc(
scheduler_stats.prefix_cache_stats.hits)

self.counter_prefix_cache_queries[engine_idx].inc(
scheduler_stats.prefix_cache_stats.queries)
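Taken together, the loggers.py changes mean the deprecated GPU-prefixed series are neither registered nor updated unless the operator opts back in via the hidden-metrics flag. Below is a simplified, self-contained sketch of that guard pattern using plain prometheus_client rather than vLLM's internal wrapper classes; the label names, flag plumbing, and record function are assumptions for illustration only:

```python
from prometheus_client import Counter, Gauge

# Normally derived from --show-hidden-metrics-for-version; hard-coded here.
show_hidden_metrics = False

labelnames = ["model_name", "engine"]

# The replacement metrics are always registered.
gauge_kv_cache_usage = Gauge(
    "vllm:kv_cache_usage_perc",
    "KV-cache usage. 1 means 100 percent usage.",
    labelnames)
counter_prefix_cache_queries = Counter(
    "vllm:prefix_cache_queries",
    "Prefix cache queries, in terms of number of queried tokens.",
    labelnames)

# The deprecated alias exists only while hidden metrics are re-enabled.
gauge_gpu_cache_usage = Gauge(
    "vllm:gpu_cache_usage_perc",
    "DEPRECATED: Use vllm:kv_cache_usage_perc instead.",
    labelnames) if show_hidden_metrics else None


def record(kv_cache_usage: float, prefix_queries: int,
           model_name: str = "demo-model", engine: str = "0") -> None:
    labels = {"model_name": model_name, "engine": engine}
    gauge_kv_cache_usage.labels(**labels).set(kv_cache_usage)
    counter_prefix_cache_queries.labels(**labels).inc(prefix_queries)
    if gauge_gpu_cache_usage is not None:
        # Same value, exported under the old name for dashboards mid-migration.
        gauge_gpu_cache_usage.labels(**labels).set(kv_cache_usage)
```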