14 changes: 4 additions & 10 deletions tensorrt_llm/_torch/pyexecutor/resource_manager.py
@@ -536,16 +536,8 @@ def get_num_kv_blocks(self, num_tokens: int) -> int:
         return (num_tokens + self.tokens_per_block - 1) // self.tokens_per_block
 
     def get_num_available_tokens(self, max_num_draft_tokens: int = 0) -> int:
-        if self.max_attention_window_vec and len(
-                self.max_attention_window_vec) > 1:
-            # VSWA case, the available tokens should the the minimum of the available tokens for each window size
-            min_free_blocks = min(self.impl.get_kv_cache_stats().
-                                  num_free_blocks_per_window_size.values())
-            res = min_free_blocks * self.tokens_per_block - self.num_extra_kv_tokens - max_num_draft_tokens
-        else:
-            res = (self.get_num_free_blocks() * self.tokens_per_block -
-                   self.num_extra_kv_tokens - max_num_draft_tokens)
-        return res
+        return (self.get_num_free_blocks() * self.tokens_per_block -
+                self.num_extra_kv_tokens - max_num_draft_tokens)
 
     def get_buffers(self, layer_idx: int) -> Optional[torch.Tensor]:
         layer_offset = self.layer_offsets[layer_idx]
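For context, here is a minimal standalone sketch of the arithmetic in this hunk. The names (`KVCacheStats`, `tokens_per_block`, and so on) are hypothetical stand-ins that mirror the diff, not the real TensorRT-LLM classes: available tokens are free blocks converted to tokens minus reserved extra-KV and draft tokens, and the deleted VSWA branch bounded this by the scarcest per-window-size block pool.

```python
# Illustrative sketch only; the names below are hypothetical stand-ins, not the
# actual TensorRT-LLM classes or fields.
from dataclasses import dataclass, field
from typing import Dict


@dataclass
class KVCacheStats:
    # free blocks per attention-window size (VSWA keeps one block pool per window)
    num_free_blocks_per_window_size: Dict[int, int] = field(default_factory=dict)


def available_tokens(num_free_blocks: int,
                     tokens_per_block: int,
                     num_extra_kv_tokens: int = 0,
                     max_num_draft_tokens: int = 0) -> int:
    # Unified formula kept by this change: blocks -> tokens, minus reserved tokens.
    return (num_free_blocks * tokens_per_block - num_extra_kv_tokens -
            max_num_draft_tokens)


def available_tokens_vswa(stats: KVCacheStats, tokens_per_block: int) -> int:
    # The deleted branch: bound availability by the scarcest window-size pool.
    min_free_blocks = min(stats.num_free_blocks_per_window_size.values())
    return available_tokens(min_free_blocks, tokens_per_block)


if __name__ == "__main__":
    # Two window sizes with 12 and 8 free blocks, 32 tokens per block:
    stats = KVCacheStats({1024: 12, 4096: 8})
    assert available_tokens_vswa(stats, 32) == 8 * 32  # limited by the 8-block pool
```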
@@ -732,6 +724,8 @@ def calculate_max_num_blocks_from_cpp(

         # VSWA on Torch backend has not supported the cross attention.
         is_cross_attention = False
+        # check model config
+        assert model_config.layer_types is not None, "layer_types have to be set correctly for VSWA"
 
         # Construct WorldConfig from self.mapping
         world_config_cpp = WorldConfig(
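The assertion added in this hunk guards that `layer_types` is populated before the VSWA path proceeds, since per-layer attention types are what let the manager group layers into per-window-size block pools. A hypothetical standalone sketch of that idea, with an illustrative `ModelConfig` stand-in and grouping helper rather than the real TensorRT-LLM code:

```python
# Illustrative sketch only: ModelConfig and the grouping helper are hypothetical
# stand-ins, not the actual TensorRT-LLM classes.
from dataclasses import dataclass
from typing import Dict, List, Optional


@dataclass
class ModelConfig:
    # e.g. ["sliding_attention", "full_attention", ...], one entry per layer
    layer_types: Optional[List[str]] = None


def group_layers_by_window(model_config: ModelConfig, sliding_window: int,
                           max_seq_len: int) -> Dict[int, List[int]]:
    # The added guard: VSWA cannot size per-window block pools without
    # knowing each layer's attention type.
    assert model_config.layer_types is not None, \
        "layer_types have to be set correctly for VSWA"
    windows: Dict[int, List[int]] = {}
    for idx, layer_type in enumerate(model_config.layer_types):
        window = sliding_window if layer_type == "sliding_attention" else max_seq_len
        windows.setdefault(window, []).append(idx)
    return windows


# Layers 0 and 2 share the sliding window; layer 1 attends over the full sequence.
cfg = ModelConfig(
    layer_types=["sliding_attention", "full_attention", "sliding_attention"])
print(group_layers_by_window(cfg, sliding_window=1024, max_seq_len=8192))
# -> {1024: [0, 2], 8192: [1]}
```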