Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions csrc/cpu/cpu_attn.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

#ifdef __aarch64__
#include "cpu_attn_neon.hpp"
// NEON requires head_dim to be a multiple of 32
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok... now we have created too many redundant template instantiations; this requires reorganizing the dispatch procedure.

Copy link
Copy Markdown
Contributor Author

@R3hankhan123 R3hankhan123 Jan 9, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe a future PR?

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes

#define NEON_DISPATCH(...) \
case cpu_attention::ISA::NEON: { \
using attn_impl = cpu_attention::AttentionImpl<cpu_attention::ISA::NEON, \
Expand All @@ -36,7 +37,9 @@
switch (HEAD_DIM) { \
CPU_ATTN_DISPATCH_CASE(32, __VA_ARGS__) \
CPU_ATTN_DISPATCH_CASE(64, __VA_ARGS__) \
CPU_ATTN_DISPATCH_CASE(80, __VA_ARGS__) \
CPU_ATTN_DISPATCH_CASE(96, __VA_ARGS__) \
CPU_ATTN_DISPATCH_CASE(112, __VA_ARGS__) \
CPU_ATTN_DISPATCH_CASE(128, __VA_ARGS__) \
CPU_ATTN_DISPATCH_CASE(160, __VA_ARGS__) \
CPU_ATTN_DISPATCH_CASE(192, __VA_ARGS__) \
Expand Down
2 changes: 1 addition & 1 deletion csrc/cpu/cpu_attn_amx.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -377,7 +377,7 @@ class AttentionImpl<ISA::AMX, scalar_t, head_dim> {
const int32_t q_heads_per_kv, const int64_t q_num_stride,
const int64_t q_head_stride, const float scale) {
constexpr int64_t bytes_per_head = head_dim * sizeof(scalar_t);
static_assert(bytes_per_head % AMX_TILE_ROW_BYTES == 0);
// static_assert(bytes_per_head % AMX_TILE_ROW_BYTES == 0);
constexpr int64_t head_size_block_num = bytes_per_head / AMX_TILE_ROW_BYTES;
constexpr int64_t head_elem_num_pre_block =
AMX_TILE_ROW_BYTES / sizeof(scalar_t);
Expand Down
2 changes: 1 addition & 1 deletion csrc/cpu/cpu_attn_neon.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -264,7 +264,7 @@ class AttentionImpl<ISA::NEON, scalar_t, head_dim> {
constexpr static ISA ISAType = ISA::NEON;
constexpr static bool scale_on_logits = false; // apply scale on q_buffer

static_assert(HeadDim % HeadDimAlignment == 0);
// static_assert(HeadDim % HeadDimAlignment == 0);
// the gemm micro kernel is Mx8
static_assert(HeadDimAlignment % 8 == 0);
static_assert(BlockSizeAlignment % 8 == 0);
Expand Down
10 changes: 7 additions & 3 deletions vllm/v1/attention/backends/cpu_attn.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def get_supported_dtypes(cls) -> list[torch.dtype]:

@classmethod
def get_supported_head_sizes(cls) -> list[int]:
return [32, 64, 96, 128, 160, 192, 224, 256]
return [32, 64, 80, 96, 112, 128, 160, 192, 224, 256]
Comment on lines 44 to +45
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

While support for head sizes 80 and 112 has been added, the automated tests in tests/kernels/attention/test_cpu_attn.py have not been updated to include these new sizes. Please update the HEAD_SIZES list in the test file to ensure the new functionality is covered.


@staticmethod
def get_name() -> str:
Expand Down Expand Up @@ -137,7 +137,7 @@ def __init__(
if self.window_size is None:
self.window_size = -1
self.block_size = vllm_config.cache_config.block_size
self.isa = _get_attn_isa(self.dtype, self.block_size)
self.isa = _get_attn_isa(self.dtype, self.block_size, self.head_dim)
self.is_cross_attention = isinstance(kv_cache_spec, CrossAttentionSpec)

def build(
Expand Down Expand Up @@ -484,7 +484,11 @@ def _make_sliding_window_bias(
return attn_biases


def _get_attn_isa(dtype: torch.dtype, block_size: int) -> str:
def _get_attn_isa(
dtype: torch.dtype, block_size: int, head_size: int | None = None
) -> str:
if head_size is not None and head_size % 32 != 0 and head_size % 16 == 0:
return "vec16"
Comment on lines +487 to +491
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

The logic to force vec16 for specific head sizes is duplicated here and in the C++ backend. The C++ code in csrc/cpu/cpu_attn.cpp already handles this fallback in get_scheduler_metadata, cpu_attn_reshape_and_cache, and cpu_attention_with_kv_cache using the requires_vec16_fallback helper.

To maintain a single source of truth and avoid redundancy, this logic should only reside in the C++ backend. Please remove the head_size check from this function and revert its signature.

Also, the call to this function in CPUAttentionMetadataBuilder.__init__ (line 140) should be reverted to self.isa = _get_attn_isa(self.dtype, self.block_size).

def _get_attn_isa(dtype: torch.dtype, block_size: int) -> str:

supports_amx = torch._C._cpu._is_amx_tile_supported()
if supports_amx and dtype in (torch.bfloat16,) and block_size % 32 == 0:
return "amx"
Expand Down