diff --git a/docs/proposals/003-model-server-protocol/README.md b/docs/proposals/003-model-server-protocol/README.md index 41e120406a..e9ce50a303 100644 --- a/docs/proposals/003-model-server-protocol/README.md +++ b/docs/proposals/003-model-server-protocol/README.md @@ -29,7 +29,7 @@ effort. | ----- | ---- | ------------ | ---- | ---- | ---- | | TotalQueuedRequests | Gauge | The current total number of requests in the queue.| `vllm:num_requests_waiting`| `nv_trt_llm_request_metrics{request_type=waiting}`| `sglang:num_queue_reqs` | TotalRunningRequests | Gauge | The current total number of requests actively being served on the model server.| `vllm:num_requests_running`| `nv_trt_llm_request_metrics{request_type=scheduled}`| `sglang:num_running_reqs` -| KVCacheUtilization| Gauge | The current KV cache utilization in percentage.| `vllm:gpu_cache_usage_perc`| `nv_trt_llm_kv_cache_block_metrics{kv_cache_block_type=fraction}`| `sglang:token_usage` +| KVCacheUtilization| Gauge | The current KV cache utilization in percentage.| `vllm:kv_cache_usage_perc`| `nv_trt_llm_kv_cache_block_metrics{kv_cache_block_type=fraction}`| `sglang:token_usage` | [Optional] BlockSize | Labeled | The block size in tokens to allocate memory, used by the prefix cache scorer. If this metric is not available, the BlockSize will be derived from the [prefix plugin config](https://gateway-api-inference-extension.sigs.k8s.io/guides/epp-configuration/prefix-aware/#customize-the-prefix-cache-plugin).| name: `vllm:cache_config_info`, label name: `block_size`| | | [Optional] NumGPUBlocks| Labeled | The total number of blocks in the HBM KV cache, used by the prefix cache scorer. If this metric is not available, the NumGPUBlocks will be derived from the [prefix plugin config](https://gateway-api-inference-extension.sigs.k8s.io/guides/epp-configuration/prefix-aware/#customize-the-prefix-cache-plugin).| name: `vllm:cache_config_info`, label name: `num_gpu_blocks`| | diff --git a/pkg/epp/datalayer/metrics/extractor_test.go b/pkg/epp/datalayer/metrics/extractor_test.go index bb408f6db1..bf1a133dec 100644 --- a/pkg/epp/datalayer/metrics/extractor_test.go +++ b/pkg/epp/datalayer/metrics/extractor_test.go @@ -32,7 +32,7 @@ const ( // use hardcoded values - importing causes cycle defaultTotalQueuedRequestsMetric = "vllm:num_requests_waiting" defaultTotalRunningRequestsMetric = "vllm:num_requests_running" - defaultKvCacheUsagePercentageMetric = "vllm:gpu_cache_usage_perc" + defaultKvCacheUsagePercentageMetric = "vllm:kv_cache_usage_perc" defaultLoraInfoMetric = "vllm:lora_requests_info" defaultCacheInfoMetric = "vllm:cache_config_info" ) diff --git a/pkg/epp/server/runserver.go b/pkg/epp/server/runserver.go index 74c032b95a..5864166bcb 100644 --- a/pkg/epp/server/runserver.go +++ b/pkg/epp/server/runserver.go @@ -79,7 +79,7 @@ const ( DefaultEnablePprof = true // default for --enable-pprof DefaultTotalQueuedRequestsMetric = "vllm:num_requests_waiting" // default for --total-queued-requests-metric DefaultTotalRunningRequestsMetric = "vllm:num_requests_running" // default for --total-running-requests-metric - DefaultKvCacheUsagePercentageMetric = "vllm:gpu_cache_usage_perc" // default for --kv-cache-usage-percentage-metric + DefaultKvCacheUsagePercentageMetric = "vllm:kv_cache_usage_perc" // default for --kv-cache-usage-percentage-metric DefaultLoraInfoMetric = "vllm:lora_requests_info" // default for --lora-info-metric DefaultCacheInfoMetric = "vllm:cache_config_info" // default for --cache-info-metric DefaultCertPath = "" // default for --cert-path