vllm-project · majiayu000 · Jan 4, 2026
@@ -77,9 +77,6 @@ def _get_expected_values(num_requests: int, prompt_ids: list[int], max_tokens: i
     # {metric_family: [(suffix, expected_value)]}
     return {
         "vllm:time_to_first_token_seconds": [("_count", num_requests)],
-        "vllm:time_per_output_token_seconds": [
-            ("_count", num_requests * (max_tokens - 1))
-        ],
         "vllm:e2e_request_latency_seconds": [("_count", num_requests)],
         "vllm:request_queue_time_seconds": [("_count", num_requests)],
         "vllm:request_inference_time_seconds": [("_count", num_requests)],
@@ -203,9 +200,6 @@ async def test_metrics_counts(
     "vllm:request_params_max_tokens_sum",
     "vllm:request_params_max_tokens_bucket",
     "vllm:request_params_max_tokens_count",
-    "vllm:time_per_output_token_seconds_sum",
-    "vllm:time_per_output_token_seconds_bucket",
-    "vllm:time_per_output_token_seconds_count",
     "vllm:time_to_first_token_seconds_sum",
     "vllm:time_to_first_token_seconds_bucket",
     "vllm:time_to_first_token_seconds_count",
@@ -238,9 +232,6 @@ async def test_metrics_counts(
     "vllm:gpu_cache_usage_perc",
     "vllm:gpu_prefix_cache_queries",
     "vllm:gpu_prefix_cache_hits",
-    "vllm:time_per_output_token_seconds_sum",
-    "vllm:time_per_output_token_seconds_bucket",
-    "vllm:time_per_output_token_seconds_count",
 ]
 
 

@@ -715,43 +715,6 @@ def __init__(
             histogram_time_to_first_token, engine_indexes, model_name
         )
 
-        # Deprecated in 0.11 - Renamed as vllm:inter_token_latency_seconds
-        # With 0.12.x you can enable with --show-hidden-metrics-for-version=0.11
-        # TODO: remove in 0.13.0
-        if self.show_hidden_metrics:
-            histogram_time_per_output_token = self._histogram_cls(
-                name="vllm:time_per_output_token_seconds",
-                documentation=(
-                    "Histogram of time per output token in seconds."
-                    "DEPRECATED: Use vllm:inter_token_latency_seconds instead."
-                ),
-                buckets=[
-                    0.01,
-                    0.025,
-                    0.05,
-                    0.075,
-                    0.1,
-                    0.15,
-                    0.2,
-                    0.3,
-                    0.4,
-                    0.5,
-                    0.75,
-                    1.0,
-                    2.5,
-                    5.0,
-                    7.5,
-                    10.0,
-                    20.0,
-                    40.0,
-                    80.0,
-                ],
-                labelnames=labelnames,
-            )
-            self.histogram_time_per_output_token = make_per_engine(
-                histogram_time_per_output_token, engine_indexes, model_name
-            )
-
         histogram_inter_token_latency = self._histogram_cls(
             name="vllm:inter_token_latency_seconds",
             documentation="Histogram of inter-token latency in seconds.",
@@ -1124,8 +1087,6 @@ def record(
             self.histogram_time_to_first_token[engine_idx].observe(ttft)
         for itl in iteration_stats.inter_token_latencies_iter:
             self.histogram_inter_token_latency[engine_idx].observe(itl)
-            if self.show_hidden_metrics:
-                self.histogram_time_per_output_token[engine_idx].observe(itl)
 
         for finished_request in iteration_stats.finished_requests:
             self.counter_request_success[finished_request.finish_reason][