From 90e602f2be1f701962815ea8825ce635efb39ec7 Mon Sep 17 00:00:00 2001
From: JartX <sagformas@epdcenter.es>
Date: Tue, 3 Mar 2026 22:55:39 +0100
Subject: [PATCH 1/9] add 1056 block_size to triton fallback

Signed-off-by: JartX <sagformas@epdcenter.es>
---
 vllm/v1/attention/backends/rocm_attn.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/vllm/v1/attention/backends/rocm_attn.py b/vllm/v1/attention/backends/rocm_attn.py
index b53170c98976..64b5f629e65c 100644
--- a/vllm/v1/attention/backends/rocm_attn.py
+++ b/vllm/v1/attention/backends/rocm_attn.py
@@ -174,11 +174,12 @@ def get_supported_kernel_block_sizes() -> list[int | MultipleOf]:
         # but vLLM should allow support for non-standard sizes via the Triton path,
         # as addressed in this PR: https://github.com/vllm-project/vllm/pull/31380,
         # where the Triton kernel under rocm_atten does not support inference
-        # for a non-standard qwen3-next model with a block_size of 544.
+        # for a non-standard qwen3-next model with a block_size or qwen3_5
+	# with a block_size of 1056
         # We have fixed the Triton kernel so that the standard model uses the original
         # bit-addressing logic, while the non-standard model
         # uses our optimized kernel logic.
-        return [16, 32, 544]
+        return [16, 32, 544, 1056]
 
     @classmethod
     def get_supported_head_sizes(cls) -> list[int]:

From 2dbcd4b4aea5cb5ef0b3d462e4aac5cb7413339d Mon Sep 17 00:00:00 2001
From: JartX <sagformas@epdcenter.es>
Date: Tue, 3 Mar 2026 23:30:12 +0100
Subject: [PATCH 2/9] precommit

Signed-off-by: JartX <sagformas@epdcenter.es>
---
 vllm/v1/attention/backends/rocm_attn.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/v1/attention/backends/rocm_attn.py b/vllm/v1/attention/backends/rocm_attn.py
index 64b5f629e65c..d92421ac6dd6 100644
--- a/vllm/v1/attention/backends/rocm_attn.py
+++ b/vllm/v1/attention/backends/rocm_attn.py
@@ -175,7 +175,7 @@ def get_supported_kernel_block_sizes() -> list[int | MultipleOf]:
         # as addressed in this PR: https://github.com/vllm-project/vllm/pull/31380,
         # where the Triton kernel under rocm_atten does not support inference
         # for a non-standard qwen3-next model with a block_size or qwen3_5
-	# with a block_size of 1056
+        # with a block_size of 1056
         # We have fixed the Triton kernel so that the standard model uses the original
         # bit-addressing logic, while the non-standard model
         # uses our optimized kernel logic.

From bd4b501ffe2438dbfec65290cede0d748d3bae21 Mon Sep 17 00:00:00 2001
From: JartX <sagformas@epdcenter.es>
Date: Tue, 3 Mar 2026 23:40:19 +0100
Subject: [PATCH 3/9] precommit doc

Signed-off-by: JartX <sagformas@epdcenter.es>
---
 docs/design/attention_backends.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/design/attention_backends.md b/docs/design/attention_backends.md
index e726d99256f5..5479ac3af9b0 100644
--- a/docs/design/attention_backends.md
+++ b/docs/design/attention_backends.md
@@ -173,7 +173,7 @@ Priority is **1 = highest** (tried first).
 | `FLEX_ATTENTION` |  | fp16, bf16, fp32 | `auto`, `bfloat16` | Any | Any | ❌ | ✅ | ❌ | Decoder, Encoder Only | Any |
 | `ROCM_AITER_FA` |  | fp16, bf16 | `auto` | 16, 32 | 64, 128, 256 | ❌ | ❌ | ❌ | Decoder | N/A |
 | `ROCM_AITER_UNIFIED_ATTN` |  | fp16, bf16 | `auto` | Any | Any | ❌ | ❌ | ❌ | All | N/A |
-| `ROCM_ATTN` |  | fp16, bf16, fp32 | `auto` | 16, 32, 544 | 32, 64, 80, 96, 128, 160, 192, 224, 256 | ❌ | ❌ | ❌ | All | N/A |
+| `ROCM_ATTN` |  | fp16, bf16, fp32 | `auto` | 16, 32, 544, 1056 | 32, 64, 80, 96, 128, 160, 192, 224, 256 | ❌ | ❌ | ❌ | All | N/A |
 | `TREE_ATTN` |  | fp16, bf16 | `auto` | %16 | 32, 64, 96, 128, 160, 192, 224, 256 | ❌ | ❌ | ❌ | Decoder | Any |
 | `TRITON_ATTN` |  | fp16, bf16, fp32 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3`, `fp8_e5m2` | %16 | Any | ✅ | ✅ | ❌ | All | Any |
 

From 6212617ea77c42d2f7fade9a4ba118d4fb12e1b0 Mon Sep 17 00:00:00 2001
From: Jartx <sagformas@epdcenter.es>
Date: Tue, 3 Mar 2026 23:49:05 +0100
Subject: [PATCH 4/9] Clarify comment on non-standard model block sizes

Signed-off-by: JartX <sagformas@epdcenter.es>
---
 vllm/v1/attention/backends/rocm_attn.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/v1/attention/backends/rocm_attn.py b/vllm/v1/attention/backends/rocm_attn.py
index d92421ac6dd6..bdb9d1f51e52 100644
--- a/vllm/v1/attention/backends/rocm_attn.py
+++ b/vllm/v1/attention/backends/rocm_attn.py
@@ -174,7 +174,7 @@ def get_supported_kernel_block_sizes() -> list[int | MultipleOf]:
         # but vLLM should allow support for non-standard sizes via the Triton path,
         # as addressed in this PR: https://github.com/vllm-project/vllm/pull/31380,
         # where the Triton kernel under rocm_atten does not support inference
-        # for a non-standard qwen3-next model with a block_size or qwen3_5
+        # for a non-standard qwen3-next model with a block_size of 544 or qwen3_5
         # with a block_size of 1056
         # We have fixed the Triton kernel so that the standard model uses the original
         # bit-addressing logic, while the non-standard model

From 9c161aeeee2fab4cb39f62f17337d10a27853941 Mon Sep 17 00:00:00 2001
From: JartX <sagformas@epdcenter.es>
Date: Wed, 4 Mar 2026 12:09:35 +0100
Subject: [PATCH 5/9] add 784 block_size for qwen3.5 27b

Signed-off-by: JartX <sagformas@epdcenter.es>
---
 docs/design/attention_backends.md       | 2 +-
 vllm/v1/attention/backends/rocm_attn.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/design/attention_backends.md b/docs/design/attention_backends.md
index 5479ac3af9b0..a33a0df0b46a 100644
--- a/docs/design/attention_backends.md
+++ b/docs/design/attention_backends.md
@@ -173,7 +173,7 @@ Priority is **1 = highest** (tried first).
 | `FLEX_ATTENTION` |  | fp16, bf16, fp32 | `auto`, `bfloat16` | Any | Any | ❌ | ✅ | ❌ | Decoder, Encoder Only | Any |
 | `ROCM_AITER_FA` |  | fp16, bf16 | `auto` | 16, 32 | 64, 128, 256 | ❌ | ❌ | ❌ | Decoder | N/A |
 | `ROCM_AITER_UNIFIED_ATTN` |  | fp16, bf16 | `auto` | Any | Any | ❌ | ❌ | ❌ | All | N/A |
-| `ROCM_ATTN` |  | fp16, bf16, fp32 | `auto` | 16, 32, 544, 1056 | 32, 64, 80, 96, 128, 160, 192, 224, 256 | ❌ | ❌ | ❌ | All | N/A |
+| `ROCM_ATTN` |  | fp16, bf16, fp32 | `auto` | 16, 32, 544, 784, 1056 | 32, 64, 80, 96, 128, 160, 192, 224, 256 | ❌ | ❌ | ❌ | All | N/A |
 | `TREE_ATTN` |  | fp16, bf16 | `auto` | %16 | 32, 64, 96, 128, 160, 192, 224, 256 | ❌ | ❌ | ❌ | Decoder | Any |
 | `TRITON_ATTN` |  | fp16, bf16, fp32 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3`, `fp8_e5m2` | %16 | Any | ✅ | ✅ | ❌ | All | Any |
 
diff --git a/vllm/v1/attention/backends/rocm_attn.py b/vllm/v1/attention/backends/rocm_attn.py
index bdb9d1f51e52..074a9f2a7f90 100644
--- a/vllm/v1/attention/backends/rocm_attn.py
+++ b/vllm/v1/attention/backends/rocm_attn.py
@@ -175,11 +175,11 @@ def get_supported_kernel_block_sizes() -> list[int | MultipleOf]:
         # as addressed in this PR: https://github.com/vllm-project/vllm/pull/31380,
         # where the Triton kernel under rocm_atten does not support inference
         # for a non-standard qwen3-next model with a block_size of 544 or qwen3_5
-        # with a block_size of 1056
+        # with a block_size of 1056 and 784
         # We have fixed the Triton kernel so that the standard model uses the original
         # bit-addressing logic, while the non-standard model
         # uses our optimized kernel logic.
-        return [16, 32, 544, 1056]
+        return [16, 32, 544, 784, 1056]
 
     @classmethod
     def get_supported_head_sizes(cls) -> list[int]:

From 5848396c25751d1671371470e41cae1f9c709a9e Mon Sep 17 00:00:00 2001
From: JartX <sagformas@epdcenter.es>
Date: Wed, 4 Mar 2026 12:10:10 +0100
Subject: [PATCH 6/9] qwen3.5 27b: list block sizes in ascending order in comment

Signed-off-by: JartX <sagformas@epdcenter.es>
---
 vllm/v1/attention/backends/rocm_attn.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/v1/attention/backends/rocm_attn.py b/vllm/v1/attention/backends/rocm_attn.py
index 074a9f2a7f90..849003dbb445 100644
--- a/vllm/v1/attention/backends/rocm_attn.py
+++ b/vllm/v1/attention/backends/rocm_attn.py
@@ -175,7 +175,7 @@ def get_supported_kernel_block_sizes() -> list[int | MultipleOf]:
         # as addressed in this PR: https://github.com/vllm-project/vllm/pull/31380,
         # where the Triton kernel under rocm_atten does not support inference
         # for a non-standard qwen3-next model with a block_size of 544 or qwen3_5
-        # with a block_size of 1056 and 784
+        # with a block_size of 784 and 1056
         # We have fixed the Triton kernel so that the standard model uses the original
         # bit-addressing logic, while the non-standard model
         # uses our optimized kernel logic.

From 1bbc515b08285f846cc8d0d8fefe61212029b9ac Mon Sep 17 00:00:00 2001
From: JartX <sagformas@epdcenter.es>
Date: Thu, 5 Mar 2026 11:30:22 +0100
Subject: [PATCH 7/9] allow multiple of 16 via triton path

Signed-off-by: JartX <sagformas@epdcenter.es>
---
 docs/design/attention_backends.md       |  2 +-
 vllm/v1/attention/backends/rocm_attn.py | 19 +++++++------------
 2 files changed, 8 insertions(+), 13 deletions(-)

diff --git a/docs/design/attention_backends.md b/docs/design/attention_backends.md
index a33a0df0b46a..4c2ab4a1c27a 100644
--- a/docs/design/attention_backends.md
+++ b/docs/design/attention_backends.md
@@ -173,7 +173,7 @@ Priority is **1 = highest** (tried first).
 | `FLEX_ATTENTION` |  | fp16, bf16, fp32 | `auto`, `bfloat16` | Any | Any | ❌ | ✅ | ❌ | Decoder, Encoder Only | Any |
 | `ROCM_AITER_FA` |  | fp16, bf16 | `auto` | 16, 32 | 64, 128, 256 | ❌ | ❌ | ❌ | Decoder | N/A |
 | `ROCM_AITER_UNIFIED_ATTN` |  | fp16, bf16 | `auto` | Any | Any | ❌ | ❌ | ❌ | All | N/A |
-| `ROCM_ATTN` |  | fp16, bf16, fp32 | `auto` | 16, 32, 544, 784, 1056 | 32, 64, 80, 96, 128, 160, 192, 224, 256 | ❌ | ❌ | ❌ | All | N/A |
+| `ROCM_ATTN` |  | fp16, bf16, fp32 | `auto` | %16 | 32, 64, 80, 96, 128, 160, 192, 224, 256 | ❌ | ❌ | ❌ | All | N/A |
 | `TREE_ATTN` |  | fp16, bf16 | `auto` | %16 | 32, 64, 96, 128, 160, 192, 224, 256 | ❌ | ❌ | ❌ | Decoder | Any |
 | `TRITON_ATTN` |  | fp16, bf16, fp32 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3`, `fp8_e5m2` | %16 | Any | ✅ | ✅ | ❌ | All | Any |
 
diff --git a/vllm/v1/attention/backends/rocm_attn.py b/vllm/v1/attention/backends/rocm_attn.py
index 849003dbb445..f39aac6290aa 100644
--- a/vllm/v1/attention/backends/rocm_attn.py
+++ b/vllm/v1/attention/backends/rocm_attn.py
@@ -166,20 +166,15 @@ class RocmAttentionBackend(AttentionBackend):
 
     @staticmethod
     def get_supported_kernel_block_sizes() -> list[int | MultipleOf]:
-        # ROCM paged attention kernel only supports block sizes 16 and 32
+        # ROCM paged attention native C++ kernel only supports block sizes 16 and 32
         # due to shared memory (LDS) constraints on AMD GPUs.
         # See csrc/rocm/attention.cu CALL_CUSTOM_LAUNCHER_BLK macro.
-
-        # However, The limitations in [16, 32] are reasonable for a native C++ kernel,
-        # but vLLM should allow support for non-standard sizes via the Triton path,
-        # as addressed in this PR: https://github.com/vllm-project/vllm/pull/31380,
-        # where the Triton kernel under rocm_atten does not support inference
-        # for a non-standard qwen3-next model with a block_size of 544 or qwen3_5
-        # with a block_size of 784 and 1056
-        # We have fixed the Triton kernel so that the standard model uses the original
-        # bit-addressing logic, while the non-standard model
-        # uses our optimized kernel logic.
-        return [16, 32, 544, 784, 1056]
+        # However, vLLM allows support for any multiple of 16 via the Triton path.
+        # As addressed in PR: https://github.com/vllm-project/vllm/pull/31380,
+        # non-standard block sizes (such as 544 for qwen3-next, or 784 and 1056
+        # for qwen3_5) are dynamically routed to our optimized Triton kernel
+        # in `do_kv_cache_update`.
+        return [MultipleOf(16)]
 
     @classmethod
     def get_supported_head_sizes(cls) -> list[int]:

From cd8be20410aac0cf7da5893b7f80f8ec0a7acaf1 Mon Sep 17 00:00:00 2001
From: JartX <sagformas@epdcenter.es>
Date: Fri, 6 Mar 2026 11:57:06 +0100
Subject: [PATCH 8/9] control blocks

Signed-off-by: JartX <sagformas@epdcenter.es>

Co-authored-by: akaratza <akaratza@amd.com>
---
 vllm/v1/attention/backends/rocm_attn.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/vllm/v1/attention/backends/rocm_attn.py b/vllm/v1/attention/backends/rocm_attn.py
index 9ed9f296212d..1d0dc81dc2c5 100644
--- a/vllm/v1/attention/backends/rocm_attn.py
+++ b/vllm/v1/attention/backends/rocm_attn.py
@@ -453,11 +453,9 @@ def do_kv_cache_update(
         # Get the actual block_size from value_cache
         # value_cache shape: [num_blocks, num_heads, head_size, block_size]
         block_size = value_cache.shape[3]
-        # Determine if it is a power of 2
-        is_pow2 = block_size > 0 and (block_size & (block_size - 1) == 0)
 
-        if is_pow2:
-            # Normal 16, 32, 64, etc., use vLLM native HIP C++ logic
+        if block_size in (16, 32):
+            # Standard block sizes (16, 32): use vLLM's native HIP C++ logic
             PagedAttention.write_to_paged_cache(
                 key,
                 value,
@@ -469,7 +467,7 @@ def do_kv_cache_update(
                 layer._v_scale,
             )
         else:
-            # Case B: Non-standard blocks (e.g., 544 in Qwen3),
+            # Case B: Non-standard blocks (e.g., 64, 128, 544 in Qwen3Next or Qwen3.5),
             # force using our modified Triton logic
             triton_reshape_and_cache_flash(
                 key,

From cc9db3852471e8a40a042d5a15fafb416ccc9409 Mon Sep 17 00:00:00 2001
From: JartX <sagformas@epdcenter.es>
Date: Tue, 10 Mar 2026 23:02:32 +0100
Subject: [PATCH 9/9] remove redundant code

Signed-off-by: JartX <sagformas@epdcenter.es>
---
 vllm/v1/attention/backends/rocm_attn.py | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/vllm/v1/attention/backends/rocm_attn.py b/vllm/v1/attention/backends/rocm_attn.py
index f9946e3e86cf..1d0dc81dc2c5 100644
--- a/vllm/v1/attention/backends/rocm_attn.py
+++ b/vllm/v1/attention/backends/rocm_attn.py
@@ -184,12 +184,6 @@ def get_supported_kernel_block_sizes() -> list[int | MultipleOf]:
         # in `do_kv_cache_update`.
         return [MultipleOf(16)]
 
-    @classmethod
-    def supports_block_size(cls, block_size: int | None) -> bool:
-        if block_size is None:
-            return True
-        return block_size in (16, 32, 544)
-
     @classmethod
     def get_supported_head_sizes(cls) -> list[int]:
         return [32, 64, 80, 96, 128, 160, 192, 224, 256]