@@ -172,7 +172,6 @@ def __init__(
         kv_cache_dtype = "auto"
         block_size = 16
         calculate_kv_scales = False
-        self.block_size = block_size
         self.kv_cache_torch_dtype = kv_cache_dtype_str_to_dtype(
             kv_cache_dtype, vllm_config.model_config
         )
@@ -424,22 +423,24 @@ def get_attn_backend(self) -> type[AttentionBackend]:
         return self.attn_backend

     def get_kv_cache_spec(self, vllm_config: VllmConfig) -> KVCacheSpec:
+        # Block size may get updated after model loading, refresh it
+        block_size = vllm_config.cache_config.block_size
         # Should not be called for enc-dec or encoder-only attention.
         assert self.attn_type == AttentionType.DECODER
         if self.sliding_window is not None:
             assert not vllm_config.model_config.use_mla, (
                 "MLA is not supported for sliding window"
             )
             return SlidingWindowSpec(
-                block_size=self.block_size,
+                block_size=block_size,
                 num_kv_heads=self.num_kv_heads,
                 head_size=self.head_size,
                 dtype=self.kv_cache_torch_dtype,
                 sliding_window=self.sliding_window,
             )
         else:
             return FullAttentionSpec(
-                block_size=self.block_size,
+                block_size=block_size,
                 num_kv_heads=self.num_kv_heads,
                 head_size=self.head_size,
                 dtype=self.kv_cache_torch_dtype,
@@ -639,7 +640,6 @@ def __init__(
         block_size = 16
         calculate_kv_scales = False
         self.kv_cache_dtype = kv_cache_dtype
-        self.block_size = block_size

         dtype = torch.get_default_dtype()
         self.attn_backend = get_attn_backend(
@@ -810,7 +810,7 @@ def get_kv_cache_spec(self, vllm_config: VllmConfig) -> KVCacheSpec:
             self.kv_cache_dtype, vllm_config.model_config.dtype
         )
         return MLAAttentionSpec(
-            block_size=self.block_size,
+            block_size=vllm_config.cache_config.block_size,
             num_kv_heads=1,
             head_size=self.head_size,
             dtype=kv_cache_dtype,
0 commit comments