diff --git a/lib/bindings/kvbm/src/block_manager/vllm/connector/leader.rs b/lib/bindings/kvbm/src/block_manager/vllm/connector/leader.rs index 848f9d2cdb4..3d295a4733b 100644 --- a/lib/bindings/kvbm/src/block_manager/vllm/connector/leader.rs +++ b/lib/bindings/kvbm/src/block_manager/vllm/connector/leader.rs @@ -549,7 +549,21 @@ impl Leader for KvConnectorLeader { "request_finished called for request_id: {request_id} but slot is not found" ); self.inflight_requests.remove(&request_id); - return Ok(false); + // We must return `true` here even though the leader slot is gone. + // + // Within a single call to vLLM's `update_from_output()`, two things + // happen in sequence: + // 1. Stopped requests call _free_request() → request_finished() (here) + // 2. Worker's finished_sending is processed by _update_from_kv_xfer_finished() + // + // The worker's get_finished() ran during the forward pass and may have + // reported this request in finished_sending for this same step. If we + // return `false`, vLLM deletes the request from self.requests at step 1. + // Then step 2 hits `assert req_id in self.requests` and crashes. + // + // Returning `true` keeps the request in self.requests so step 2 can + // process finished_sending and call _free_blocks() properly. + return Ok(true); } // grab the slot diff --git a/lib/bindings/kvbm/src/block_manager/vllm/connector/worker.rs b/lib/bindings/kvbm/src/block_manager/vllm/connector/worker.rs index 2744c4511ed..0a32e04ac40 100644 --- a/lib/bindings/kvbm/src/block_manager/vllm/connector/worker.rs +++ b/lib/bindings/kvbm/src/block_manager/vllm/connector/worker.rs @@ -409,11 +409,15 @@ impl Worker for KvConnectorWorker { tracing::debug!(request_id, "request slot is not finished"); } } else { - // made this condition more strict slot existence checks were added as a prerequesite - // to be added to the maybe_finished_offloading set. - panic!( - "request slot missing for {request_id}; however, it was present when added to the maybe finished offloading set" + // Slot was removed between when we added it to + // maybe_finished_offloading and now. Signal completion so + // vLLM can free the request via _free_blocks(). + tracing::warn!( + request_id, + "request slot missing from maybe_finished_offloading set; \ + signaling completion" ); + is_finished_offloading.insert(request_id.clone()); } } @@ -443,9 +447,15 @@ impl Worker for KvConnectorWorker { tracing::debug!(request_id, "request slot is not finished"); } } else { - panic!( - "request slot missing for {request_id}; however, it was present when added to the maybe finished onboarding set" + // Slot was removed between when we added it to + // maybe_finished_onboarding and now. Signal completion so + // vLLM can free the request via _free_blocks(). + tracing::warn!( + request_id, + "request slot missing from maybe_finished_onboarding set; \ + signaling completion" ); + is_finished_onboarding.insert(request_id.clone()); } }