Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 2 additions & 4 deletions src/chat/tests/chat.rs
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,7 @@ fn request_output(
events: None,
kv_transfer_params: None,
trace_headers: None,
num_cached_tokens: 0,
num_external_computed_tokens: 0,
prefill_stats: None,
routed_experts: None,
num_nans_in_logits: 0,
}
Expand All @@ -72,8 +71,7 @@ fn request_output_with_logprobs(
events: None,
kv_transfer_params: None,
trace_headers: None,
num_cached_tokens: 0,
num_external_computed_tokens: 0,
prefill_stats: None,
routed_experts: None,
num_nans_in_logits: 0,
}
Expand Down
6 changes: 2 additions & 4 deletions src/engine-core-client/src/protocol/classfied_outputs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -128,8 +128,7 @@ mod tests {
events: None,
kv_transfer_params: None,
trace_headers: None,
num_cached_tokens: 0,
num_external_computed_tokens: 0,
prefill_stats: None,
routed_experts: None,
num_nans_in_logits: 0,
},
Expand Down Expand Up @@ -227,8 +226,7 @@ mod tests {
events: None,
kv_transfer_params: None,
trace_headers: None,
num_cached_tokens: 0,
num_external_computed_tokens: 0,
prefill_stats: None,
routed_experts: None,
num_nans_in_logits: 0,
},
Expand Down
15 changes: 6 additions & 9 deletions src/engine-core-client/src/protocol/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ use serde_tuple::{Deserialize_tuple, Serialize_tuple};
use thiserror_ext::AsReport;

use crate::error::{Error, Result, value_encode_ext};
use crate::protocol::stats::SchedulerStats;
use crate::protocol::stats::{PrefillStats, SchedulerStats};

// TODO: This module currently mixes reusable frontend-facing semantic types
// (for example `FinishReason`, `StopReason`, `RequestOutputKind`, and future
Expand Down Expand Up @@ -401,7 +401,7 @@ impl EngineCoreUtilityRequest {
/// Engine-core output for a single request.
///
/// Original Python definition:
/// <https://github.com/vllm-project/vllm/blob/f22d6e026798a74e6542a52ef776c054f2de572a/vllm/v1/engine/__init__.py#L140-L171>
/// <https://github.com/vllm-project/vllm/blob/d3af8c18317c0dc008d42e4367fbb9045cfb7bf6/vllm/v1/engine/__init__.py#L154-L184>
#[derive(Debug, Clone, PartialEq, Serialize_tuple, Deserialize_tuple, DefaultFromSerde)]
pub struct EngineCoreOutput {
pub request_id: String,
Expand All @@ -424,12 +424,10 @@ pub struct EngineCoreOutput {
pub kv_transfer_params: Option<serde_json::Value>,
#[serde(default)]
pub trace_headers: Option<OpaqueValue>,
/// Number of tokens with prefix-cache hits, local plus external.
/// Breakdown of the scheduled prefill computation, set on the first output
/// of a newly scheduled prefill and elided for subsequent decode outputs.
#[serde(default)]
pub num_cached_tokens: u32,
/// Number of tokens computed remotely, preserving the original connector count.
#[serde(default)]
pub num_external_computed_tokens: u32,
pub prefill_stats: Option<PrefillStats>,
#[serde(default)]
pub routed_experts: Option<OpaqueValue>,
/// Number of NaNs seen in logits. Values above zero indicate corruption.
Expand Down Expand Up @@ -631,8 +629,7 @@ mod tests {
events: None,
kv_transfer_params: None,
trace_headers: None,
num_cached_tokens: 0,
num_external_computed_tokens: 0,
prefill_stats: None,
routed_experts: None,
num_nans_in_logits: 0,
}],
Expand Down
27 changes: 27 additions & 0 deletions src/engine-core-client/src/protocol/stats.rs
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,33 @@ pub struct SpecDecodingStats {
pub num_accepted_tokens_per_pos: Vec<u64>,
}

/// Breakdown of a scheduled prefill computation.
///
/// Python models this as a plain `@dataclass`, so it is serialized by msgspec
/// as a map (named fields) rather than in the array-like form used by
/// `EngineCoreOutput` itself.
///
/// Original Python definition:
/// <https://github.com/vllm-project/vllm/blob/d3af8c18317c0dc008d42e4367fbb9045cfb7bf6/vllm/v1/metrics/stats.py#L242-L273>
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
pub struct PrefillStats {
    /// Total number of tokens to be prefilled.
    ///
    /// NOTE(review): presumably equals `num_computed_tokens + num_cached_tokens`,
    /// and `num_cached_tokens` equals the sum of the local/external breakdown
    /// below — verify against the linked Python definition.
    #[serde(default)]
    pub num_prompt_tokens: u32,
    /// Tokens to be prefilled locally (actual compute work).
    #[serde(default)]
    pub num_computed_tokens: u32,
    /// Tokens to be prefilled without actual compute work
    /// (i.e. satisfied from a cache of some kind rather than recomputed).
    #[serde(default)]
    pub num_cached_tokens: u32,
    /// Tokens to be prefilled from the local prefix cache.
    #[serde(default)]
    pub num_local_cached_tokens: u32,
    /// Tokens to be prefilled from external KV transfer.
    #[serde(default)]
    pub num_external_cached_tokens: u32,
}

/// Stats for debugging the metrics calculation.
///
/// Original Python definition:
Expand Down
6 changes: 2 additions & 4 deletions src/engine-core-client/src/tests/client.rs
Original file line number Diff line number Diff line change
Expand Up @@ -191,8 +191,7 @@ fn request_output(
events: None,
kv_transfer_params: None,
trace_headers: None,
num_cached_tokens: 0,
num_external_computed_tokens: 0,
prefill_stats: None,
routed_experts: None,
num_nans_in_logits: 0,
}
Expand Down Expand Up @@ -2262,8 +2261,7 @@ fn python_msgpack_fixtures_match_rust_encoding() {
events: None,
kv_transfer_params: None,
trace_headers: None,
num_cached_tokens: 0,
num_external_computed_tokens: 0,
prefill_stats: None,
routed_experts: None,
num_nans_in_logits: 0,
},
Expand Down
3 changes: 1 addition & 2 deletions src/engine-core-client/src/tests/python_compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,8 +84,7 @@ class EngineCoreOutput(
events: object | None = None
kv_transfer_params: object | None = None
trace_headers: object | None = None
num_cached_tokens: int = 0
num_external_computed_tokens: int = 0
prefill_stats: object | None = None
routed_experts: object | None = None
num_nans_in_logits: int = 0

Expand Down
50 changes: 21 additions & 29 deletions src/llm/src/request_metrics.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
use std::time::{SystemTime, UNIX_EPOCH};

use vllm_engine_core_client::protocol::stats::PrefillStats;
use vllm_engine_core_client::protocol::{EngineCoreEvent, EngineCoreEventType, EngineCoreOutput};
use vllm_metrics::{
EngineLabels, FinishedReasonLabels, METRICS, PromptTokenSourceLabels, RequestMetrics,
Expand Down Expand Up @@ -82,7 +83,9 @@ impl RequestMetricsTracker {
output: &EngineCoreOutput,
) {
self.last_seen_engine_index = engine_index;
self.latest_num_cached_tokens = output.num_cached_tokens;
if let Some(prefill_stats) = &output.prefill_stats {
self.latest_num_cached_tokens = prefill_stats.num_cached_tokens;
}
self.num_generation_tokens += output.new_token_ids.len() as u32;
metrics()
.generation_tokens
Expand All @@ -94,13 +97,9 @@ impl RequestMetricsTracker {
}

if self.is_prefilling {
record_prompt_tokens(
&self.model_name,
engine_index,
self.prompt_len,
output.num_cached_tokens,
output.num_external_computed_tokens,
);
if let Some(prefill_stats) = &output.prefill_stats {
record_prompt_tokens(&self.model_name, engine_index, prefill_stats);
}
Comment on lines +100 to +102
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

If prefill_stats is missing from the first output (e.g., due to an unexpected backend response or protocol mismatch), the vllm:prompt_tokens counter will not be incremented at all, while the vllm:request_prompt_tokens histogram in record_finished will still be updated using self.prompt_len. This creates a discrepancy between cumulative counters and request histograms. Consider providing a fallback that records the total prompt tokens even if the breakdown is missing.

Suggested change
if let Some(prefill_stats) = &output.prefill_stats {
record_prompt_tokens(&self.model_name, engine_index, prefill_stats);
}
if let Some(prefill_stats) = &output.prefill_stats {
record_prompt_tokens(&self.model_name, engine_index, prefill_stats);
} else {
metrics()
.prompt_tokens
.get_or_create(&engine_labels(&self.model_name, engine_index))
.inc_by(self.prompt_len as u64);
}

Comment thread
njhill marked this conversation as resolved.
self.first_token_latency = received_at - self.arrival_time;
observe_time_to_first_token_seconds(
&self.model_name,
Expand Down Expand Up @@ -260,24 +259,15 @@ fn prompt_token_source_labels(
}
}

fn record_prompt_tokens(
model_name: &str,
engine: u32,
prompt_len: u32,
num_cached_tokens: u32,
num_external_computed_tokens: u32,
) {
let recomputed = u64::from(num_cached_tokens + 1 == prompt_len);
let computed = prompt_len.saturating_sub(num_cached_tokens) as u64;
let external_kv_transfer = num_external_computed_tokens as u64;
let local_cache_hit = (num_cached_tokens as u64)
.saturating_add(recomputed)
.saturating_sub(external_kv_transfer);
fn record_prompt_tokens(model_name: &str, engine: u32, prefill_stats: &PrefillStats) {
let computed = prefill_stats.num_computed_tokens as u64;
let local_cache_hit = prefill_stats.num_local_cached_tokens as u64;
let external_kv_transfer = prefill_stats.num_external_cached_tokens as u64;

metrics()
.prompt_tokens
.get_or_create(&engine_labels(model_name, engine))
.inc_by(prompt_len as u64);
.inc_by(prefill_stats.num_prompt_tokens as u64);
metrics()
.prompt_tokens_by_source
.get_or_create(&prompt_token_source_labels(
Expand Down Expand Up @@ -305,11 +295,7 @@ fn record_prompt_tokens(
metrics()
.prompt_tokens_cached
.get_or_create(&engine_labels(model_name, engine))
.inc_by(num_cached_tokens as u64);
metrics()
.prompt_tokens_recomputed
.get_or_create(&engine_labels(model_name, engine))
.inc_by(recomputed);
.inc_by(prefill_stats.num_cached_tokens as u64);
}

fn diff_or_zero(end: f64, start: f64) -> f64 {
Expand Down Expand Up @@ -337,6 +323,7 @@ pub(crate) fn current_unix_timestamp_secs() -> f64 {

#[cfg(test)]
mod tests {
use vllm_engine_core_client::protocol::stats::PrefillStats;
use vllm_engine_core_client::protocol::{EngineCoreEvent, EngineCoreEventType};

use super::{RequestMetricsTracker, diff_or_zero};
Expand All @@ -363,7 +350,13 @@ mod tests {
timestamp: 9.0,
},
]),
num_cached_tokens: 4,
prefill_stats: Some(PrefillStats {
num_prompt_tokens: 64,
num_computed_tokens: 60,
num_cached_tokens: 4,
num_local_cached_tokens: 4,
num_external_cached_tokens: 0,
}),
..Default::default()
},
);
Expand All @@ -379,7 +372,6 @@ mod tests {
r#type: EngineCoreEventType::Preempted,
timestamp: 10.5,
}]),
num_cached_tokens: 4,
..Default::default()
},
);
Expand Down
50 changes: 26 additions & 24 deletions src/llm/tests/generate.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ use futures::StreamExt as _;
use tokio::time::timeout;
use tracing_subscriber::EnvFilter;
use uuid::Uuid;
use vllm_engine_core_client::protocol::stats::PrefillStats;
use vllm_engine_core_client::protocol::{
EngineCoreEvent, EngineCoreEventType, EngineCoreFinishReason, EngineCoreOutput,
EngineCoreOutputs, EngineCoreRequest, EngineCoreSamplingParams, Logprobs, MaybeWireLogprobs,
Expand Down Expand Up @@ -47,8 +48,7 @@ fn request_output_with_events(
events,
kv_transfer_params: None,
trace_headers: None,
num_cached_tokens: 0,
num_external_computed_tokens: 0,
prefill_stats: None,
routed_experts: None,
num_nans_in_logits: 0,
}
Expand All @@ -72,8 +72,7 @@ fn request_output_with_logprobs(
events: None,
kv_transfer_params: None,
trace_headers: None,
num_cached_tokens: 0,
num_external_computed_tokens: 0,
prefill_stats: None,
routed_experts: None,
num_nans_in_logits: 0,
}
Expand All @@ -98,8 +97,7 @@ fn request_output_with_logprobs_and_kv(
events: None,
kv_transfer_params,
trace_headers: None,
num_cached_tokens: 0,
num_external_computed_tokens: 0,
prefill_stats: None,
routed_experts: None,
num_nans_in_logits: 0,
}
Expand Down Expand Up @@ -585,21 +583,28 @@ async fn generate_records_request_metrics_in_prometheus_output() {
EngineCoreOutputs {
engine_index: 4,
timestamp: 10.0,
outputs: vec![request_output_with_events(
&request.request_id,
vec![1],
None,
Some(vec![
EngineCoreEvent {
r#type: EngineCoreEventType::Queued,
timestamp: 8.0,
},
EngineCoreEvent {
r#type: EngineCoreEventType::Scheduled,
timestamp: 9.0,
},
]),
)],
outputs: vec![EngineCoreOutput {
prefill_stats: Some(PrefillStats {
num_prompt_tokens: 2,
num_computed_tokens: 2,
..Default::default()
}),
..request_output_with_events(
&request.request_id,
vec![1],
None,
Some(vec![
EngineCoreEvent {
r#type: EngineCoreEventType::Queued,
timestamp: 8.0,
},
EngineCoreEvent {
r#type: EngineCoreEventType::Scheduled,
timestamp: 9.0,
},
]),
)
}],
..Default::default()
},
)
Expand Down Expand Up @@ -658,9 +663,6 @@ async fn generate_records_request_metrics_in_prometheus_output() {
assert!(rendered.contains(&format!(
"vllm:prompt_tokens_cached_total{{model_name=\"{model_name}\",engine=\"4\"}} 0"
)));
assert!(rendered.contains(&format!(
"vllm:prompt_tokens_recomputed_total{{model_name=\"{model_name}\",engine=\"4\"}} 0"
)));
assert!(rendered.contains(&format!(
"vllm:generation_tokens_total{{model_name=\"{model_name}\",engine=\"4\"}} 3"
)));
Expand Down
9 changes: 0 additions & 9 deletions src/metrics/src/request.rs
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,6 @@ pub struct RequestMetrics {
pub prompt_tokens: Family<EngineLabels, U64Counter>,
pub prompt_tokens_by_source: PromptTokenSourceCounterFamily,
pub prompt_tokens_cached: Family<EngineLabels, U64Counter>,
pub prompt_tokens_recomputed: Family<EngineLabels, U64Counter>,
pub generation_tokens: Family<EngineLabels, U64Counter>,

// We intentionally don't support iteration-level histograms for now, since it seems to make
Expand Down Expand Up @@ -145,13 +144,6 @@ impl RequestMetrics {
prompt_tokens_cached.clone(),
);

let prompt_tokens_recomputed = Family::default();
registry.register(
"vllm:prompt_tokens_recomputed",
"Number of cached prompt tokens recomputed during prefill.",
prompt_tokens_recomputed.clone(),
);

let generation_tokens = Family::default();
registry.register(
"vllm:generation_tokens",
Expand Down Expand Up @@ -285,7 +277,6 @@ impl RequestMetrics {
prompt_tokens,
prompt_tokens_by_source,
prompt_tokens_cached,
prompt_tokens_recomputed,
generation_tokens,
request_success,
request_prompt_tokens,
Expand Down
3 changes: 1 addition & 2 deletions src/server/src/grpc/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -92,8 +92,7 @@ fn request_output(
events: None,
kv_transfer_params: None,
trace_headers: None,
num_cached_tokens: 0,
num_external_computed_tokens: 0,
prefill_stats: None,
routed_experts: None,
num_nans_in_logits: 0,
}
Expand Down
3 changes: 1 addition & 2 deletions src/server/src/routes/http_client_tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -98,8 +98,7 @@ fn request_output(
events: None,
kv_transfer_params: None,
trace_headers: None,
num_cached_tokens: 0,
num_external_computed_tokens: 0,
prefill_stats: None,
routed_experts: None,
num_nans_in_logits: 0,
}
Expand Down
Loading
Loading