From b00f29bdf78a30982a04345388f89a2b064acfb7 Mon Sep 17 00:00:00 2001
From: Iryna Boiko <iboiko@habana.ai>
Date: Mon, 18 May 2026 11:23:03 +0300
Subject: [PATCH] fix kernel block size, #1439

Signed-off-by: Iryna Boiko <iboiko@habana.ai>
---
 vllm_gaudi/v1/attention/backends/hpu_attn.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/vllm_gaudi/v1/attention/backends/hpu_attn.py b/vllm_gaudi/v1/attention/backends/hpu_attn.py
index 1a36a7d56c..5f6f10acf7 100644
--- a/vllm_gaudi/v1/attention/backends/hpu_attn.py
+++ b/vllm_gaudi/v1/attention/backends/hpu_attn.py
@@ -35,10 +35,11 @@ def get_metadata_cls() -> type["AttentionMetadata"]:
 
     @staticmethod
     def get_supported_kernel_block_sizes() -> list[Union[int, MultipleOf]]:
-        # 128 is the standard HPU kernel block size; 528 is required for
-        # Granite 4.0-H (granitemoehybrid) without prefix caching (16-token
-        # FA alignment), 768 with prefix caching (chunk-aligned).
-        return [128, 528, 768]
+        # 16 is supported for testing/smaller models; 128 is the standard HPU
+        # kernel block size. Granite 4.0-H (granitemoehybrid) requires 528
+        # without prefix caching (16-token FA alignment) and 768 with prefix
+        # caching (chunk-aligned).
+        return [16, 128, 528, 768]
 
     @classmethod
     def get_preferred_block_size(cls, default_block_size: int) -> int: