Skip to content

Commit 606b7f3

Browse files
richardhuo-nv and nv-tusharma
authored and committed
fix: DIS-706 skip offloading the G1 matched blocks during offloading (#3299)
Signed-off-by: richardhuo-nv <[email protected]>
1 parent 28c1ad4 commit 606b7f3

File tree

3 files changed

+18
-3
lines changed

3 files changed

+18
-3
lines changed

lib/bindings/python/rust/llm/block_manager/vllm/connector/leader/slot.rs

Lines changed: 15 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -115,6 +115,7 @@ pub trait Slot: std::fmt::Debug {
115115
tokens: &[u32],
116116
block_ids: &[usize],
117117
computed_position: usize,
118+
is_new_request: bool,
118119
) -> Result<(), SlotError>;
119120

120121
fn record_start_iteration(&mut self, iteration: u64) -> Result<(), SlotError>;
@@ -592,6 +593,7 @@ impl Slot for VllmConnectorSlot {
592593
tokens: &[u32],
593594
block_ids: &[usize],
594595
computed_position: usize,
596+
is_new_request: bool,
595597
) -> Result<(), SlotError> {
596598
// TRTLLM's KV Connector Manager will have (computed_position - external matches)
597599
// in the onboarding case
@@ -630,10 +632,21 @@ impl Slot for VllmConnectorSlot {
630632
self.device_blocks.extend(block_ids);
631633
}
632634

635+
// This approach is fragile, but it is currently the only way to skip evaluating
636+
// the device matched blocks and to avoid offloading them again.
637+
// TODO: Consider adding an indicator in the scheduler output to distinguish between
638+
// matched and unmatched device blocks/tokens from the scheduler.
639+
let maybe_have_device_matched_blocks =
640+
is_new_request && computed_position > 0 && self.evaluated_blocks == 0;
641+
642+
if maybe_have_device_matched_blocks {
643+
self.evaluated_blocks = (computed_position + 1) / self.block_size;
644+
}
645+
633646
let num_candidate_blocks =
634-
((computed_position + 1) / self.block_size) - self.evaluated_blocks;
647+
((computed_position + 1) / self.block_size).saturating_sub(self.evaluated_blocks);
635648

636-
if num_candidate_blocks != 0 {
649+
if num_candidate_blocks > 0 {
637650
// do we have a mechanism for skipping gpu cache hit blocks? not sure yet.
638651
// for now, offload all the blocks to the host
639652
let offload_block_ids: Vec<usize> = self

lib/bindings/python/rust/llm/block_manager/vllm/connector/trtllm_leader.rs

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -334,6 +334,7 @@ impl Leader for KvConnectorLeader {
334334
&new_req.prompt_token_ids,
335335
&new_req.block_ids,
336336
new_req.num_computed_tokens,
337+
true,
337338
)?;
338339

339340
if let Some(pending_ops) = slot.take_pending_operations() {
@@ -364,6 +365,7 @@ impl Leader for KvConnectorLeader {
364365
&cached_req.new_token_ids,
365366
&cached_req.new_block_ids,
366367
cached_req.num_computed_tokens,
368+
false,
367369
)?;
368370

369371
if let Some(pending_ops) = slot.take_pending_operations() {

lib/llm/src/block_manager/offload.rs

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -739,7 +739,7 @@ mod tests {
739739
let disk_pool = if let Some(disk_blocks) = disk_blocks {
740740
config.num_blocks = disk_blocks;
741741
Some(build_layout(
742-
config,
742+
config.clone(),
743743
layout_type,
744744
agent,
745745
&DiskAllocator,

0 commit comments

Comments
 (0)