OffloadingConnector: Fix GPU block tracking bug

orozery · orozery · commit 00f13472d45d · 2025-09-30T09:12:03.000+03:00
This commit fixes a bug in the offloading connector that may result
in incorrect GPU block tracking per request.
The bug occurs when blocks cannot be allocated on the offloading medium (prepare_store returns None)
and the scheduler output contains multiple requests, some of which have new GPU block IDs.
Before this commit, the connector stopped iterating at the first such failure,
so the remaining requests and their new GPU block IDs were never processed.

Signed-off-by: Or Ozeri <oro@il.ibm.com>
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py
@@ -278,8 +278,9 @@ def _get_reqs_to_store(self, scheduler_output: SchedulerOutput):
                 req, start_idx=start_block_idx, end_idx=num_blocks)
             store_output = self.manager.prepare_store(new_block_hashes)
             if store_output is None:
-                logger.warning("Cannot store %s blocks", num_new_blocks)
-                break
+                logger.warning("Request %s: cannot store %s blocks", req_id,
+                               num_new_blocks)
+                continue
 
             self._next_stored_block_idx[req_id] = num_blocks