From b00f29bdf78a30982a04345388f89a2b064acfb7 Mon Sep 17 00:00:00 2001
From: Iryna Boiko
Date: Mon, 18 May 2026 11:23:03 +0300
Subject: [PATCH] fix kernel block size, #1439

Signed-off-by: Iryna Boiko
---
 vllm_gaudi/v1/attention/backends/hpu_attn.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/vllm_gaudi/v1/attention/backends/hpu_attn.py b/vllm_gaudi/v1/attention/backends/hpu_attn.py
index 1a36a7d56c..5f6f10acf7 100644
--- a/vllm_gaudi/v1/attention/backends/hpu_attn.py
+++ b/vllm_gaudi/v1/attention/backends/hpu_attn.py
@@ -35,10 +35,11 @@ def get_metadata_cls() -> type["AttentionMetadata"]:
 
     @staticmethod
     def get_supported_kernel_block_sizes() -> list[Union[int, MultipleOf]]:
-        # 128 is the standard HPU kernel block size; 528 is required for
-        # Granite 4.0-H (granitemoehybrid) without prefix caching (16-token
-        # FA alignment), 768 with prefix caching (chunk-aligned).
-        return [128, 528, 768]
+        # 16 is supported for testing/smaller models; 128 is the standard HPU
+        # kernel block size; 528 is required for Granite 4.0-H
+        # (granitemoehybrid) without prefix caching (16-token FA alignment),
+        # 768 with prefix caching (chunk-aligned).
+        return [16, 128, 528, 768]
 
     @classmethod
     def get_preferred_block_size(cls, default_block_size: int) -> int: