
Commit 973da05

refresh block_Size
Signed-off-by: NickLucche <[email protected]>
1 parent 5d5eda7 commit 973da05

File tree

1 file changed: +5, -5 lines


vllm/attention/layer.py

Lines changed: 5 additions & 5 deletions
@@ -172,7 +172,6 @@ def __init__(
             kv_cache_dtype = "auto"
             block_size = 16
             calculate_kv_scales = False
-        self.block_size = block_size
         self.kv_cache_torch_dtype = kv_cache_dtype_str_to_dtype(
             kv_cache_dtype, vllm_config.model_config
         )
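
For context, a minimal sketch of the staleness problem this hunk removes (hypothetical CacheConfig/StaleLayer names, not the vLLM API): copying the block size into an instance attribute at construction time freezes the value, so a config update made during model loading is never observed.

from dataclasses import dataclass

@dataclass
class CacheConfig:
    block_size: int = 16

class StaleLayer:
    def __init__(self, cache_config: CacheConfig):
        # The pattern this commit deletes: snapshot the value at init time.
        self.block_size = cache_config.block_size

cfg = CacheConfig()
layer = StaleLayer(cfg)
cfg.block_size = 32       # e.g. adjusted during model loading
print(layer.block_size)   # prints 16 -- the snapshot went stale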
@@ -424,22 +423,24 @@ def get_attn_backend(self) -> type[AttentionBackend]:
         return self.attn_backend

     def get_kv_cache_spec(self, vllm_config: VllmConfig) -> KVCacheSpec:
+        # Block size may get updated after model loading, refresh it
+        block_size = vllm_config.cache_config.block_size
         # Should not be called for enc-dec or encoder-only attention.
         assert self.attn_type == AttentionType.DECODER
         if self.sliding_window is not None:
             assert not vllm_config.model_config.use_mla, (
                 "MLA is not supported for slidingwindow"
             )
             return SlidingWindowSpec(
-                block_size=self.block_size,
+                block_size=block_size,
                 num_kv_heads=self.num_kv_heads,
                 head_size=self.head_size,
                 dtype=self.kv_cache_torch_dtype,
                 sliding_window=self.sliding_window,
             )
         else:
             return FullAttentionSpec(
-                block_size=self.block_size,
+                block_size=block_size,
                 num_kv_heads=self.num_kv_heads,
                 head_size=self.head_size,
                 dtype=self.kv_cache_torch_dtype,
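
The hunk above is the corresponding fix: read vllm_config.cache_config.block_size at call time instead of relying on the snapshot. Continuing the hypothetical sketch (reusing CacheConfig from above):

class FreshLayer:
    def get_kv_cache_spec(self, cache_config: CacheConfig) -> int:
        # Block size may get updated after model loading, refresh it
        return cache_config.block_size

cfg = CacheConfig()
layer = FreshLayer()
cfg.block_size = 32
print(layer.get_kv_cache_spec(cfg))   # prints 32 -- the live value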
@@ -639,7 +640,6 @@ def __init__(
             block_size = 16
             calculate_kv_scales = False
         self.kv_cache_dtype = kv_cache_dtype
-        self.block_size = block_size

         dtype = torch.get_default_dtype()
         self.attn_backend = get_attn_backend(
@@ -810,7 +810,7 @@ def get_kv_cache_spec(self, vllm_config: VllmConfig) -> KVCacheSpec:
             self.kv_cache_dtype, vllm_config.model_config.dtype
         )
         return MLAAttentionSpec(
-            block_size=self.block_size,
+            block_size=vllm_config.cache_config.block_size,
             num_kv_heads=1,
             head_size=self.head_size,
             dtype=kv_cache_dtype,
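
The MLA path gets the same treatment, reading the live config value inline. A regression-style check for this behavior, still under the hypothetical names from the sketches above, might look like:

def test_block_size_is_refreshed():
    cfg = CacheConfig(block_size=16)
    layer = FreshLayer()
    cfg.block_size = 32                        # simulate a post-load update
    assert layer.get_kv_cache_spec(cfg) == 32  # fresh read sees the update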
