diff --git a/tests/entrypoints/instrumentator/test_metrics.py b/tests/entrypoints/instrumentator/test_metrics.py index 9f2ad105a380..3216fd5eba9a 100644 --- a/tests/entrypoints/instrumentator/test_metrics.py +++ b/tests/entrypoints/instrumentator/test_metrics.py @@ -77,9 +77,6 @@ def _get_expected_values(num_requests: int, prompt_ids: list[int], max_tokens: i # {metric_family: [(suffix, expected_value)]} return { "vllm:time_to_first_token_seconds": [("_count", num_requests)], - "vllm:time_per_output_token_seconds": [ - ("_count", num_requests * (max_tokens - 1)) - ], "vllm:e2e_request_latency_seconds": [("_count", num_requests)], "vllm:request_queue_time_seconds": [("_count", num_requests)], "vllm:request_inference_time_seconds": [("_count", num_requests)], @@ -203,9 +200,6 @@ async def test_metrics_counts( "vllm:request_params_max_tokens_sum", "vllm:request_params_max_tokens_bucket", "vllm:request_params_max_tokens_count", - "vllm:time_per_output_token_seconds_sum", - "vllm:time_per_output_token_seconds_bucket", - "vllm:time_per_output_token_seconds_count", "vllm:time_to_first_token_seconds_sum", "vllm:time_to_first_token_seconds_bucket", "vllm:time_to_first_token_seconds_count", @@ -238,9 +232,6 @@ async def test_metrics_counts( "vllm:gpu_cache_usage_perc", "vllm:gpu_prefix_cache_queries", "vllm:gpu_prefix_cache_hits", - "vllm:time_per_output_token_seconds_sum", - "vllm:time_per_output_token_seconds_bucket", - "vllm:time_per_output_token_seconds_count", ] diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index 2213b952c7a8..3a080f01a4d2 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -715,43 +715,6 @@ def __init__( histogram_time_to_first_token, engine_indexes, model_name ) - # Deprecated in 0.11 - Renamed as vllm:inter_token_latency_seconds - # With 0.12.x you can enable with --show-hidden-metrics-for-version=0.11 - # TODO: remove in 0.13.0 - if self.show_hidden_metrics: - histogram_time_per_output_token = self._histogram_cls( - name="vllm:time_per_output_token_seconds", - documentation=( - "Histogram of time per output token in seconds." - "DEPRECATED: Use vllm:inter_token_latency_seconds instead." - ), - buckets=[ - 0.01, - 0.025, - 0.05, - 0.075, - 0.1, - 0.15, - 0.2, - 0.3, - 0.4, - 0.5, - 0.75, - 1.0, - 2.5, - 5.0, - 7.5, - 10.0, - 20.0, - 40.0, - 80.0, - ], - labelnames=labelnames, - ) - self.histogram_time_per_output_token = make_per_engine( - histogram_time_per_output_token, engine_indexes, model_name - ) - histogram_inter_token_latency = self._histogram_cls( name="vllm:inter_token_latency_seconds", documentation="Histogram of inter-token latency in seconds.", @@ -1124,8 +1087,6 @@ def record( self.histogram_time_to_first_token[engine_idx].observe(ttft) for itl in iteration_stats.inter_token_latencies_iter: self.histogram_inter_token_latency[engine_idx].observe(itl) - if self.show_hidden_metrics: - self.histogram_time_per_output_token[engine_idx].observe(itl) for finished_request in iteration_stats.finished_requests: self.counter_request_success[finished_request.finish_reason][