Skip to content
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 10 additions & 7 deletions src/routers/http/vllm_pd_router.rs
Original file line number Diff line number Diff line change
Expand Up @@ -636,26 +636,29 @@ impl VllmPDRouter {

Ok(response)
} else {
// No logprobs merging needed - return decode response as-is
// No logprobs merging needed - stream decode response as-is (handles both streaming
// and non-streaming). Using bytes_stream() avoids buffering the entire SSE stream
// into memory, which would cause streaming requests to return an empty reply.
debug!(
"No logprobs merging needed (streaming={}, needs_logprobs={})",
is_streaming, needs_logprobs
);

let status = decode_response.status();
let headers = decode_response.headers().clone();
let body = decode_response
.bytes()
.await
.map_err(|e| format!("Failed to read decode response: {}", e))?;

let mut response_builder = axum::http::Response::builder().status(status);
for (name, value) in headers.iter() {
response_builder = response_builder.header(name, value);
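// Skip message-framing headers that must not be forwarded as-is: transfer-encoding is
// hop-by-hop, and content-length describes the upstream body rather than the re-streamed
// one. axum will set transfer-encoding and content-length correctly for the streamed body.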
if name != "transfer-encoding" && name != "content-length" {
response_builder = response_builder.header(name, value);
}
}

let body = axum::body::Body::from_stream(decode_response.bytes_stream());
let response = response_builder
.body(axum::body::Body::from(body))
.body(body)
.map_err(|e| format!("Failed to build response: {}", e))?;

Ok(response)
Expand Down