diff --git a/src/chat/tests/chat.rs b/src/chat/tests/chat.rs index 7f2f7d68..c9079c55 100644 --- a/src/chat/tests/chat.rs +++ b/src/chat/tests/chat.rs @@ -46,8 +46,7 @@ fn request_output( events: None, kv_transfer_params: None, trace_headers: None, - num_cached_tokens: 0, - num_external_computed_tokens: 0, + prefill_stats: None, routed_experts: None, num_nans_in_logits: 0, } @@ -72,8 +71,7 @@ fn request_output_with_logprobs( events: None, kv_transfer_params: None, trace_headers: None, - num_cached_tokens: 0, - num_external_computed_tokens: 0, + prefill_stats: None, routed_experts: None, num_nans_in_logits: 0, } diff --git a/src/engine-core-client/src/protocol/classfied_outputs.rs b/src/engine-core-client/src/protocol/classfied_outputs.rs index 8afd01f0..4e7b88d3 100644 --- a/src/engine-core-client/src/protocol/classfied_outputs.rs +++ b/src/engine-core-client/src/protocol/classfied_outputs.rs @@ -128,8 +128,7 @@ mod tests { events: None, kv_transfer_params: None, trace_headers: None, - num_cached_tokens: 0, - num_external_computed_tokens: 0, + prefill_stats: None, routed_experts: None, num_nans_in_logits: 0, }, @@ -227,8 +226,7 @@ mod tests { events: None, kv_transfer_params: None, trace_headers: None, - num_cached_tokens: 0, - num_external_computed_tokens: 0, + prefill_stats: None, routed_experts: None, num_nans_in_logits: 0, }, diff --git a/src/engine-core-client/src/protocol/mod.rs b/src/engine-core-client/src/protocol/mod.rs index 71dc8f11..58abef71 100644 --- a/src/engine-core-client/src/protocol/mod.rs +++ b/src/engine-core-client/src/protocol/mod.rs @@ -11,7 +11,7 @@ use serde_tuple::{Deserialize_tuple, Serialize_tuple}; use thiserror_ext::AsReport; use crate::error::{Error, Result, value_encode_ext}; -use crate::protocol::stats::SchedulerStats; +use crate::protocol::stats::{PrefillStats, SchedulerStats}; // TODO: This module currently mixes reusable frontend-facing semantic types // (for example `FinishReason`, `StopReason`, `RequestOutputKind`, and future @@ -401,7 +401,7 @@ impl EngineCoreUtilityRequest { /// Engine-core output for a single request. /// /// Original Python definition: -/// +/// #[derive(Debug, Clone, PartialEq, Serialize_tuple, Deserialize_tuple, DefaultFromSerde)] pub struct EngineCoreOutput { pub request_id: String, @@ -424,12 +424,10 @@ pub struct EngineCoreOutput { pub kv_transfer_params: Option, #[serde(default)] pub trace_headers: Option, - /// Number of tokens with prefix-cache hits, local plus external. + /// Breakdown of the scheduled prefill computation, set on the first output + /// of a newly scheduled prefill and elided for subsequent decode outputs. #[serde(default)] - pub num_cached_tokens: u32, - /// Number of tokens computed remotely, preserving the original connector count. - #[serde(default)] - pub num_external_computed_tokens: u32, + pub prefill_stats: Option, #[serde(default)] pub routed_experts: Option, /// Number of NaNs seen in logits. Values above zero indicate corruption. @@ -631,8 +629,7 @@ mod tests { events: None, kv_transfer_params: None, trace_headers: None, - num_cached_tokens: 0, - num_external_computed_tokens: 0, + prefill_stats: None, routed_experts: None, num_nans_in_logits: 0, }], diff --git a/src/engine-core-client/src/protocol/stats.rs b/src/engine-core-client/src/protocol/stats.rs index d4bfabc2..1bb3d5a2 100644 --- a/src/engine-core-client/src/protocol/stats.rs +++ b/src/engine-core-client/src/protocol/stats.rs @@ -75,6 +75,33 @@ pub struct SpecDecodingStats { pub num_accepted_tokens_per_pos: Vec, } +/// Breakdown of a scheduled prefill computation. +/// +/// Python models this as a plain `@dataclass`, so it is serialized by msgspec +/// as a map (named fields) rather than in the array-like form used by +/// `EngineCoreOutput` itself. +/// +/// Original Python definition: +/// +#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)] +pub struct PrefillStats { + /// Total number of tokens to be prefilled. + #[serde(default)] + pub num_prompt_tokens: u32, + /// Tokens to be prefilled locally (actual compute work). + #[serde(default)] + pub num_computed_tokens: u32, + /// Tokens to be prefilled without actual compute work. + #[serde(default)] + pub num_cached_tokens: u32, + /// Tokens to be prefilled from local prefix cache. + #[serde(default)] + pub num_local_cached_tokens: u32, + /// Tokens to be prefilled from external KV transfer. + #[serde(default)] + pub num_external_cached_tokens: u32, +} + /// Stats for debugging the metrics calculation. /// /// Original Python definition: diff --git a/src/engine-core-client/src/tests/client.rs b/src/engine-core-client/src/tests/client.rs index a6e30200..dbd8e3f9 100644 --- a/src/engine-core-client/src/tests/client.rs +++ b/src/engine-core-client/src/tests/client.rs @@ -191,8 +191,7 @@ fn request_output( events: None, kv_transfer_params: None, trace_headers: None, - num_cached_tokens: 0, - num_external_computed_tokens: 0, + prefill_stats: None, routed_experts: None, num_nans_in_logits: 0, } @@ -2262,8 +2261,7 @@ fn python_msgpack_fixtures_match_rust_encoding() { events: None, kv_transfer_params: None, trace_headers: None, - num_cached_tokens: 0, - num_external_computed_tokens: 0, + prefill_stats: None, routed_experts: None, num_nans_in_logits: 0, }, diff --git a/src/engine-core-client/src/tests/python_compat.py b/src/engine-core-client/src/tests/python_compat.py index 5f55d8d5..a4c12ce0 100755 --- a/src/engine-core-client/src/tests/python_compat.py +++ b/src/engine-core-client/src/tests/python_compat.py @@ -84,8 +84,7 @@ class EngineCoreOutput( events: object | None = None kv_transfer_params: object | None = None trace_headers: object | None = None - num_cached_tokens: int = 0 - num_external_computed_tokens: int = 0 + prefill_stats: object | None = None routed_experts: object | None = None num_nans_in_logits: int = 0 diff --git a/src/llm/src/request_metrics.rs b/src/llm/src/request_metrics.rs index afbfa3fd..f62a8c26 100644 --- a/src/llm/src/request_metrics.rs +++ b/src/llm/src/request_metrics.rs @@ -1,5 +1,6 @@ use std::time::{SystemTime, UNIX_EPOCH}; +use vllm_engine_core_client::protocol::stats::PrefillStats; use vllm_engine_core_client::protocol::{EngineCoreEvent, EngineCoreEventType, EngineCoreOutput}; use vllm_metrics::{ EngineLabels, FinishedReasonLabels, METRICS, PromptTokenSourceLabels, RequestMetrics, @@ -82,7 +83,9 @@ impl RequestMetricsTracker { output: &EngineCoreOutput, ) { self.last_seen_engine_index = engine_index; - self.latest_num_cached_tokens = output.num_cached_tokens; + if let Some(prefill_stats) = &output.prefill_stats { + self.latest_num_cached_tokens = prefill_stats.num_cached_tokens; + } self.num_generation_tokens += output.new_token_ids.len() as u32; metrics() .generation_tokens @@ -94,13 +97,9 @@ impl RequestMetricsTracker { } if self.is_prefilling { - record_prompt_tokens( - &self.model_name, - engine_index, - self.prompt_len, - output.num_cached_tokens, - output.num_external_computed_tokens, - ); + if let Some(prefill_stats) = &output.prefill_stats { + record_prompt_tokens(&self.model_name, engine_index, prefill_stats); + } self.first_token_latency = received_at - self.arrival_time; observe_time_to_first_token_seconds( &self.model_name, @@ -260,24 +259,15 @@ fn prompt_token_source_labels( } } -fn record_prompt_tokens( - model_name: &str, - engine: u32, - prompt_len: u32, - num_cached_tokens: u32, - num_external_computed_tokens: u32, -) { - let recomputed = u64::from(num_cached_tokens + 1 == prompt_len); - let computed = prompt_len.saturating_sub(num_cached_tokens) as u64; - let external_kv_transfer = num_external_computed_tokens as u64; - let local_cache_hit = (num_cached_tokens as u64) - .saturating_add(recomputed) - .saturating_sub(external_kv_transfer); +fn record_prompt_tokens(model_name: &str, engine: u32, prefill_stats: &PrefillStats) { + let computed = prefill_stats.num_computed_tokens as u64; + let local_cache_hit = prefill_stats.num_local_cached_tokens as u64; + let external_kv_transfer = prefill_stats.num_external_cached_tokens as u64; metrics() .prompt_tokens .get_or_create(&engine_labels(model_name, engine)) - .inc_by(prompt_len as u64); + .inc_by(prefill_stats.num_prompt_tokens as u64); metrics() .prompt_tokens_by_source .get_or_create(&prompt_token_source_labels( @@ -305,11 +295,7 @@ fn record_prompt_tokens( metrics() .prompt_tokens_cached .get_or_create(&engine_labels(model_name, engine)) - .inc_by(num_cached_tokens as u64); - metrics() - .prompt_tokens_recomputed - .get_or_create(&engine_labels(model_name, engine)) - .inc_by(recomputed); + .inc_by(prefill_stats.num_cached_tokens as u64); } fn diff_or_zero(end: f64, start: f64) -> f64 { @@ -337,6 +323,7 @@ pub(crate) fn current_unix_timestamp_secs() -> f64 { #[cfg(test)] mod tests { + use vllm_engine_core_client::protocol::stats::PrefillStats; use vllm_engine_core_client::protocol::{EngineCoreEvent, EngineCoreEventType}; use super::{RequestMetricsTracker, diff_or_zero}; @@ -363,7 +350,13 @@ mod tests { timestamp: 9.0, }, ]), - num_cached_tokens: 4, + prefill_stats: Some(PrefillStats { + num_prompt_tokens: 64, + num_computed_tokens: 60, + num_cached_tokens: 4, + num_local_cached_tokens: 4, + num_external_cached_tokens: 0, + }), ..Default::default() }, ); @@ -379,7 +372,6 @@ mod tests { r#type: EngineCoreEventType::Preempted, timestamp: 10.5, }]), - num_cached_tokens: 4, ..Default::default() }, ); diff --git a/src/llm/tests/generate.rs b/src/llm/tests/generate.rs index 9b73be68..611ff41f 100644 --- a/src/llm/tests/generate.rs +++ b/src/llm/tests/generate.rs @@ -6,6 +6,7 @@ use futures::StreamExt as _; use tokio::time::timeout; use tracing_subscriber::EnvFilter; use uuid::Uuid; +use vllm_engine_core_client::protocol::stats::PrefillStats; use vllm_engine_core_client::protocol::{ EngineCoreEvent, EngineCoreEventType, EngineCoreFinishReason, EngineCoreOutput, EngineCoreOutputs, EngineCoreRequest, EngineCoreSamplingParams, Logprobs, MaybeWireLogprobs, @@ -47,8 +48,7 @@ fn request_output_with_events( events, kv_transfer_params: None, trace_headers: None, - num_cached_tokens: 0, - num_external_computed_tokens: 0, + prefill_stats: None, routed_experts: None, num_nans_in_logits: 0, } @@ -72,8 +72,7 @@ fn request_output_with_logprobs( events: None, kv_transfer_params: None, trace_headers: None, - num_cached_tokens: 0, - num_external_computed_tokens: 0, + prefill_stats: None, routed_experts: None, num_nans_in_logits: 0, } @@ -98,8 +97,7 @@ fn request_output_with_logprobs_and_kv( events: None, kv_transfer_params, trace_headers: None, - num_cached_tokens: 0, - num_external_computed_tokens: 0, + prefill_stats: None, routed_experts: None, num_nans_in_logits: 0, } @@ -585,21 +583,28 @@ async fn generate_records_request_metrics_in_prometheus_output() { EngineCoreOutputs { engine_index: 4, timestamp: 10.0, - outputs: vec![request_output_with_events( - &request.request_id, - vec![1], - None, - Some(vec![ - EngineCoreEvent { - r#type: EngineCoreEventType::Queued, - timestamp: 8.0, - }, - EngineCoreEvent { - r#type: EngineCoreEventType::Scheduled, - timestamp: 9.0, - }, - ]), - )], + outputs: vec![EngineCoreOutput { + prefill_stats: Some(PrefillStats { + num_prompt_tokens: 2, + num_computed_tokens: 2, + ..Default::default() + }), + ..request_output_with_events( + &request.request_id, + vec![1], + None, + Some(vec![ + EngineCoreEvent { + r#type: EngineCoreEventType::Queued, + timestamp: 8.0, + }, + EngineCoreEvent { + r#type: EngineCoreEventType::Scheduled, + timestamp: 9.0, + }, + ]), + ) + }], ..Default::default() }, ) @@ -658,9 +663,6 @@ async fn generate_records_request_metrics_in_prometheus_output() { assert!(rendered.contains(&format!( "vllm:prompt_tokens_cached_total{{model_name=\"{model_name}\",engine=\"4\"}} 0" ))); - assert!(rendered.contains(&format!( - "vllm:prompt_tokens_recomputed_total{{model_name=\"{model_name}\",engine=\"4\"}} 0" - ))); assert!(rendered.contains(&format!( "vllm:generation_tokens_total{{model_name=\"{model_name}\",engine=\"4\"}} 3" ))); diff --git a/src/metrics/src/request.rs b/src/metrics/src/request.rs index 295e862a..421ff303 100644 --- a/src/metrics/src/request.rs +++ b/src/metrics/src/request.rs @@ -87,7 +87,6 @@ pub struct RequestMetrics { pub prompt_tokens: Family, pub prompt_tokens_by_source: PromptTokenSourceCounterFamily, pub prompt_tokens_cached: Family, - pub prompt_tokens_recomputed: Family, pub generation_tokens: Family, // We intentionally don't support iteration-level histograms for now, since it seems to make @@ -145,13 +144,6 @@ impl RequestMetrics { prompt_tokens_cached.clone(), ); - let prompt_tokens_recomputed = Family::default(); - registry.register( - "vllm:prompt_tokens_recomputed", - "Number of cached prompt tokens recomputed during prefill.", - prompt_tokens_recomputed.clone(), - ); - let generation_tokens = Family::default(); registry.register( "vllm:generation_tokens", @@ -285,7 +277,6 @@ impl RequestMetrics { prompt_tokens, prompt_tokens_by_source, prompt_tokens_cached, - prompt_tokens_recomputed, generation_tokens, request_success, request_prompt_tokens, diff --git a/src/server/src/grpc/tests.rs b/src/server/src/grpc/tests.rs index 460b186b..06862029 100644 --- a/src/server/src/grpc/tests.rs +++ b/src/server/src/grpc/tests.rs @@ -92,8 +92,7 @@ fn request_output( events: None, kv_transfer_params: None, trace_headers: None, - num_cached_tokens: 0, - num_external_computed_tokens: 0, + prefill_stats: None, routed_experts: None, num_nans_in_logits: 0, } diff --git a/src/server/src/routes/http_client_tests.rs b/src/server/src/routes/http_client_tests.rs index 0a3f095c..f6449398 100644 --- a/src/server/src/routes/http_client_tests.rs +++ b/src/server/src/routes/http_client_tests.rs @@ -98,8 +98,7 @@ fn request_output( events: None, kv_transfer_params: None, trace_headers: None, - num_cached_tokens: 0, - num_external_computed_tokens: 0, + prefill_stats: None, routed_experts: None, num_nans_in_logits: 0, } diff --git a/src/server/src/routes/tests.rs b/src/server/src/routes/tests.rs index d7a0e023..d561bca3 100644 --- a/src/server/src/routes/tests.rs +++ b/src/server/src/routes/tests.rs @@ -68,8 +68,7 @@ fn request_output_with_stop_reason( events: None, kv_transfer_params: None, trace_headers: None, - num_cached_tokens: 0, - num_external_computed_tokens: 0, + prefill_stats: None, routed_experts: None, num_nans_in_logits: 0, } @@ -94,8 +93,7 @@ fn request_output_with_logprobs( events: None, kv_transfer_params: None, trace_headers: None, - num_cached_tokens: 0, - num_external_computed_tokens: 0, + prefill_stats: None, routed_experts: None, num_nans_in_logits: 0, } @@ -121,8 +119,7 @@ fn request_output_with_logprobs_and_kv( events: None, kv_transfer_params, trace_headers: None, - num_cached_tokens: 0, - num_external_computed_tokens: 0, + prefill_stats: None, routed_experts: None, num_nans_in_logits: 0, }