fix: DIS-706 skip G1-matched blocks during offloading (#3299)

richardhuo-nv · nv-tusharma · commit 606b7f31d520 · 2025-10-20T14:17:58.000-07:00
Signed-off-by: richardhuo-nv <rihuo@nvidia.com>
diff --git a/lib/bindings/python/rust/llm/block_manager/vllm/connector/leader/slot.rs b/lib/bindings/python/rust/llm/block_manager/vllm/connector/leader/slot.rs
@@ -115,6 +115,7 @@ pub trait Slot: std::fmt::Debug {
         tokens: &[u32],
         block_ids: &[usize],
         computed_position: usize,
+        is_new_request: bool,
     ) -> Result<(), SlotError>;
 
     fn record_start_iteration(&mut self, iteration: u64) -> Result<(), SlotError>;
@@ -592,6 +593,7 @@ impl Slot for VllmConnectorSlot {
         tokens: &[u32],
         block_ids: &[usize],
         computed_position: usize,
+        is_new_request: bool,
     ) -> Result<(), SlotError> {
         // TRTLLM's KV Connector Manager will have (computed_position - external matches)
         // in onborading case
@@ -630,10 +632,21 @@ impl Slot for VllmConnectorSlot {
             self.device_blocks.extend(block_ids);
         }
 
+        // This approach is fragile, but it is currently the only way to skip evaluating
+        // the device-matched blocks and avoid offloading them again.
+        // TODO: Consider adding an indicator to the scheduler output that distinguishes
+        // matched from unmatched device blocks/tokens.
+        let maybe_have_device_matched_blocks =
+            is_new_request && computed_position > 0 && self.evaluated_blocks == 0;
+
+        if maybe_have_device_matched_blocks {
+            self.evaluated_blocks = (computed_position + 1) / self.block_size;
+        }
+
         let num_candidate_blocks =
-            ((computed_position + 1) / self.block_size) - self.evaluated_blocks;
+            ((computed_position + 1) / self.block_size).saturating_sub(self.evaluated_blocks);
 
-        if num_candidate_blocks != 0 {
+        if num_candidate_blocks > 0 {
             // do we have a mechanism for skipping gpu cache hit blocks?  not sure yet.
             // for now, offload all the blocks to the host
             let offload_block_ids: Vec<usize> = self
diff --git a/lib/bindings/python/rust/llm/block_manager/vllm/connector/trtllm_leader.rs b/lib/bindings/python/rust/llm/block_manager/vllm/connector/trtllm_leader.rs
@@ -334,6 +334,7 @@ impl Leader for KvConnectorLeader {
                 &new_req.prompt_token_ids,
                 &new_req.block_ids,
                 new_req.num_computed_tokens,
+                true,
             )?;
 
             if let Some(pending_ops) = slot.take_pending_operations() {
@@ -364,6 +365,7 @@ impl Leader for KvConnectorLeader {
                 &cached_req.new_token_ids,
                 &cached_req.new_block_ids,
                 cached_req.num_computed_tokens,
+                false,
             )?;
 
             if let Some(pending_ops) = slot.take_pending_operations() {
diff --git a/lib/llm/src/block_manager/offload.rs b/lib/llm/src/block_manager/offload.rs
@@ -739,7 +739,7 @@ mod tests {
         let disk_pool = if let Some(disk_blocks) = disk_blocks {
             config.num_blocks = disk_blocks;
             Some(build_layout(
-                config,
+                config.clone(),
                 layout_type,
                 agent,
                 &DiskAllocator,