@@ -1581,6 +1581,7 @@ __device__ __forceinline__ _B16x8 from_floatx8(const floatx8& inp) {
   }
 }
 
+// clang-format off
 template <typename scalar_t, typename cache_t,
           vllm::Fp8KVCacheDataType KV_DTYPE, typename OUTT, int BLOCK_SIZE,
           int HEAD_SIZE, int NUM_THREADS, bool ALIBI_ENABLED, int GQA_RATIO>
@@ -1594,6 +1595,7 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel(
     const int num_kv_heads, const float scale,
     const int* __restrict__ block_tables,  // [num_seqs, max_num_blocks_per_seq]
     const int* __restrict__ context_lens,  // [num_seqs]
+    const int* __restrict__ query_start_loc_ptr,  // [num_seqs]
     const int max_num_blocks_per_seq,
     const float* __restrict__ alibi_slopes,  // [num_heads]
     const int q_stride, const int kv_block_stride, const int kv_head_stride,
@@ -1604,6 +1606,7 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel(
                                    // head_size]
     OUTT* __restrict__ final_out,  // [num_seqs, num_heads, head_size]
     int max_ctx_blocks, const float* k_scale, const float* v_scale) {
+  // clang-format on
   constexpr int NWARPS = NUM_THREADS / WARP_SIZE;  // 8 warps on gfx11
   const int warpid = threadIdx.x / WARP_SIZE;
   const int laneid = threadIdx.x % WARP_SIZE;
@@ -1613,6 +1616,13 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel(
   const int rowid = laneid / 16;
 
   const int seq_idx = blockIdx.x;
+  // NOTE: queries with sequence length > 1 are prefills and are handled by
+  // another kernel.
+  if (query_start_loc_ptr != nullptr &&
+      (query_start_loc_ptr[seq_idx + 1] - query_start_loc_ptr[seq_idx]) != 1) {
+    return;
+  }
+
   const int partition_idx = blockIdx.y;
 
   constexpr int T_PAR_SIZE = 256;  // token partition size set to 256
@@ -1671,12 +1681,14 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel(
   // output layout from QKwmma : QH16xT8x2 16 qheads across 16 lanes, 16 tokens
   // across 2 rows x 8 tokens per lane
 
+  const int64_t query_start_off = static_cast<int64_t>(
+      query_start_loc_ptr ? query_start_loc_ptr[seq_idx] : seq_idx);
+
   if (GQA_RATIO == 1) {
     const int local_qhead_idx = lane16id % GQA_RATIO;
     const int global_qhead_idx = wg_start_head_idx + local_qhead_idx;
-    const int64_t seq_idx64 = static_cast<int64_t>(seq_idx);
     const scalar_t* q_ptr =
-        q + seq_idx64 * q_stride + global_qhead_idx * HEAD_SIZE;
+        q + query_start_off * q_stride + global_qhead_idx * HEAD_SIZE;
     if (lane16id < GQA_RATIO) {
 #pragma unroll
       for (int qkhe_depth = 0; qkhe_depth < QKHELOOP / 2; qkhe_depth++) {
@@ -1690,9 +1702,8 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel(
     // fetch Q in shared across warps and then write to registers
     const int local_qhead_idx = 2 * warpid + rowid;
     const int global_qhead_idx = wg_start_head_idx + local_qhead_idx;
-    const int64_t seq_idx64 = static_cast<int64_t>(seq_idx);
     const scalar_t* q_ptr =
-        q + seq_idx64 * q_stride + global_qhead_idx * HEAD_SIZE;
+        q + query_start_off * q_stride + global_qhead_idx * HEAD_SIZE;
 
     const int qhead_element = lane16id * CONTIGUOUS_SCALAR_ELEMS_16B;
     if ((local_qhead_idx < GQA_RATIO) && (qhead_element < HEAD_SIZE)) {
@@ -2024,6 +2035,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel(
     const int num_kv_heads, const float scale,
     const int* __restrict__ block_tables,  // [num_seqs, max_num_blocks_per_seq]
     const int* __restrict__ context_lens,  // [num_seqs]
+    const int* __restrict__ query_start_loc_ptr,  // [num_seqs]
     const int max_num_blocks_per_seq,
     const float* __restrict__ alibi_slopes,  // [num_heads]
     const int q_stride, const int kv_block_stride, const int kv_head_stride,
@@ -2050,15 +2062,24 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
     const scalar_t* __restrict__ tmp_out,  // [num_seqs, num_heads,
                                            // max_num_partitions, head_size]
     const int* __restrict__ context_lens,  // [num_seqs]
+    const int* __restrict__ query_start_loc_ptr,  // [num_seqs]
     const int max_num_partitions) {
-  const int num_heads = gridDim.x;
-  const int head_idx = blockIdx.x;
-  const int seq_idx = blockIdx.y;
+  const auto num_heads = gridDim.x;
+  const auto head_idx = blockIdx.x;
+  const auto seq_idx = blockIdx.y;
+
+  // NOTE: queries with sequence length > 1 are prefills and are handled by
+  // another kernel.
+  if (query_start_loc_ptr != nullptr &&
+      (query_start_loc_ptr[seq_idx + 1] - query_start_loc_ptr[seq_idx] != 1)) {
+    return;
+  }
+
   const int context_len = context_lens[seq_idx];
   const int num_partitions = DIVIDE_ROUND_UP(context_len, PARTITION_SIZE);
-  constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
+  [[maybe_unused]] constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
   const int warpid = threadIdx.x / WARP_SIZE;
-  const int laneid = threadIdx.x % WARP_SIZE;
+  [[maybe_unused]] const int laneid = threadIdx.x % WARP_SIZE;
 
   __shared__ float shared_global_exp_sum;
   // max num partitions supported is warp_size * NPAR_LOOPS
@@ -2221,7 +2242,11 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
   const float inv_global_exp_sum =
       __fdividef(1.0f, shared_global_exp_sum + 1e-6f);
   acc *= inv_global_exp_sum;
-  OUTT* out_ptr = out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE;
+
+  const int64_t query_start_off = static_cast<int64_t>(
+      query_start_loc_ptr ? query_start_loc_ptr[seq_idx] : seq_idx);
+  OUTT* out_ptr = out + query_start_off * num_heads * HEAD_SIZE +
+                  static_cast<int64_t>(head_idx) * HEAD_SIZE;
   out_ptr[threadIdx.x] = from_float<scalar_t>(acc);
 }
 
@@ -2328,6 +2353,7 @@ __device__ __forceinline__ _B16x8 from_floatx8(const floatx8& inp) {
   }
 }
 
+// clang-format off
 template <typename scalar_t, typename cache_t,
           vllm::Fp8KVCacheDataType KV_DTYPE, typename OUTT, int BLOCK_SIZE,
           int HEAD_SIZE, int NUM_THREADS, bool ALIBI_ENABLED, int GQA_RATIO>
@@ -2341,6 +2367,7 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel(
     const int num_kv_heads, const float scale,
     const int* __restrict__ block_tables,  // [num_seqs, max_num_blocks_per_seq]
     const int* __restrict__ context_lens,  // [num_seqs]
+    const int* __restrict__ query_start_loc_ptr,  // [num_seqs]
     const int max_num_blocks_per_seq,
     const float* __restrict__ alibi_slopes,  // [num_heads]
     const int q_stride, const int kv_block_stride, const int kv_head_stride,
@@ -2351,6 +2378,7 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel(
                                    // head_size]
     OUTT* __restrict__ final_out,  // [num_seqs, num_heads, head_size]
     int max_ctx_blocks, const float* k_scale, const float* v_scale) {
+  // clang-format on
   constexpr int NWARPS = NUM_THREADS / WARP_SIZE;  // 8 warps on gfx11
   const int warpid = threadIdx.x / WARP_SIZE;
   const int laneid = threadIdx.x % WARP_SIZE;
@@ -2360,6 +2388,12 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel(
   const int rowid = laneid / 16;
 
   const int seq_idx = blockIdx.x;
+  // NOTE: queries with sequence length > 1 are prefills and are handled by
+  // another kernel.
+  if (query_start_loc_ptr != nullptr &&
+      (query_start_loc_ptr[seq_idx + 1] - query_start_loc_ptr[seq_idx] != 1)) {
+    return;
+  }
   const int partition_idx = blockIdx.y;
 
   constexpr int T_PAR_SIZE = 256;  // token partition size set to 256
@@ -2419,11 +2453,13 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel(
   // output layout from QKwmma : QH16xT8x2 16 qheads across 16 lanes, 16 tokens
   // across 2 rows x 8 tokens per lane
 
+  const int64_t query_start_off = static_cast<int64_t>(
+      query_start_loc_ptr ? query_start_loc_ptr[seq_idx] : seq_idx);
+
   if (GQA_RATIO == 1) {
     const int local_qhead_idx = lane16id % GQA_RATIO;
     const int global_qhead_idx = wg_start_head_idx + local_qhead_idx;
-    const int64_t seq_idx64 = static_cast<int64_t>(seq_idx);
-    const scalar_t* q_ptr = q + seq_idx64 * q_stride +
+    const scalar_t* q_ptr = q + query_start_off * q_stride +
                             global_qhead_idx * HEAD_SIZE +
                             rowid * CONTIGUOUS_KV_ELEMS_16B_LOAD;
     if (lane16id < GQA_RATIO) {
@@ -2439,9 +2475,8 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel(
     // fetch Q in shared across warps and then write to registers
     const int local_qhead_idx = 2 * warpid + rowid;
     const int global_qhead_idx = wg_start_head_idx + local_qhead_idx;
-    const int64_t seq_idx64 = static_cast<int64_t>(seq_idx);
     const scalar_t* q_ptr =
-        q + seq_idx64 * q_stride + global_qhead_idx * HEAD_SIZE;
+        q + query_start_off * q_stride + global_qhead_idx * HEAD_SIZE;
 
     const int qhead_element = lane16id * CONTIGUOUS_SCALAR_ELEMS_16B;
     if ((local_qhead_idx < GQA_RATIO) && (qhead_element < HEAD_SIZE)) {
@@ -2736,6 +2771,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel(
     const int num_kv_heads, const float scale,
     const int* __restrict__ block_tables,  // [num_seqs, max_num_blocks_per_seq]
     const int* __restrict__ context_lens,  // [num_seqs]
+    const int* __restrict__ query_start_loc_ptr,  // [num_seqs]
     const int max_num_blocks_per_seq,
     const float* __restrict__ alibi_slopes,  // [num_heads]
     const int q_stride, const int kv_block_stride, const int kv_head_stride,
@@ -2762,15 +2798,24 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
     const scalar_t* __restrict__ tmp_out,  // [num_seqs, num_heads,
                                            // max_num_partitions, head_size]
     const int* __restrict__ context_lens,  // [num_seqs]
+    const int* __restrict__ query_start_loc_ptr,  // [num_seqs]
     const int max_num_partitions) {
-  const int num_heads = gridDim.x;
-  const int head_idx = blockIdx.x;
-  const int seq_idx = blockIdx.y;
+  const auto num_heads = gridDim.x;
+  const auto head_idx = blockIdx.x;
+  const auto seq_idx = blockIdx.y;
+
+  // NOTE: queries with sequence length > 1 are prefills and are handled by
+  // another kernel.
+  if (query_start_loc_ptr != nullptr &&
+      (query_start_loc_ptr[seq_idx + 1] - query_start_loc_ptr[seq_idx] != 1)) {
+    return;
+  }
+
   const int context_len = context_lens[seq_idx];
   const int num_partitions = DIVIDE_ROUND_UP(context_len, PARTITION_SIZE);
-  constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
+  [[maybe_unused]] constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
   const int warpid = threadIdx.x / WARP_SIZE;
-  const int laneid = threadIdx.x % WARP_SIZE;
+  [[maybe_unused]] const int laneid = threadIdx.x % WARP_SIZE;
 
   __shared__ float shared_global_exp_sum;
   // max num partitions supported is warp_size * NPAR_LOOPS
@@ -2933,7 +2978,11 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
   const float inv_global_exp_sum =
       __fdividef(1.0f, shared_global_exp_sum + 1e-6f);
   acc *= inv_global_exp_sum;
-  OUTT* out_ptr = out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE;
+
+  const int64_t query_start_off = static_cast<int64_t>(
+      query_start_loc_ptr ? query_start_loc_ptr[seq_idx] : seq_idx);
+  OUTT* out_ptr = out + query_start_off * num_heads * HEAD_SIZE +
+                  static_cast<int64_t>(head_idx) * HEAD_SIZE;
   out_ptr[threadIdx.x] = from_float<scalar_t>(acc);
 }
 
@@ -3201,16 +3250,24 @@ void paged_attention_custom_launcher_navi(
     torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
     torch::Tensor& value_cache, const int num_kv_heads, float scale,
     torch::Tensor& block_tables, torch::Tensor& context_lens,
-    int max_context_len, const std::optional<torch::Tensor>& alibi_slopes,
-    torch::Tensor& k_scale, torch::Tensor& v_scale) {
-  int num_seqs = query.size(0);
+    const std::optional<torch::Tensor>& query_start_loc, int max_context_len,
+    const std::optional<torch::Tensor>& alibi_slopes, torch::Tensor& k_scale,
+    torch::Tensor& v_scale) {
+  int num_seqs = block_tables.size(0);
   int num_heads = query.size(1);
   int head_size = query.size(2);
   int max_num_blocks_per_seq = block_tables.size(1);
   int q_stride = query.stride(0);
   int kv_block_stride = key_cache.stride(0);
   int kv_head_stride = key_cache.stride(1);
 
+  // NOTE: query_start_loc is optional and is not used for V0 decode. If the
+  // batch contains a mix of prefills and decodes, prefills should be skipped.
+  const int* query_start_loc_ptr =
+      query_start_loc
+          ? reinterpret_cast<const int*>(query_start_loc.value().data_ptr())
+          : nullptr;
+
   // NOTE: Navi does not support alibi_slopes.
   const float* alibi_slopes_ptr = nullptr;
 
@@ -3363,14 +3420,14 @@ void paged_attention_custom_launcher_navi(
     paged_attention_custom_launcher<T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE, T, \
                                     PSIZE, ALIBI_ENABLED>(                    \
         out, exp_sums, max_logits, tmp_out, query, key_cache, value_cache,    \
-        num_kv_heads, scale, block_tables, context_lens, max_context_len,     \
-        alibi_slopes, k_scale, v_scale);                                      \
+        num_kv_heads, scale, block_tables, context_lens, query_start_loc,     \
+        max_context_len, alibi_slopes, k_scale, v_scale);                     \
   } else {                                                                    \
     paged_attention_custom_launcher_navi<T, KVT, KV_DTYPE, BLK_SIZE,          \
                                          HEAD_SIZE, T, PSIZE, ALIBI_ENABLED>( \
         out, exp_sums, max_logits, tmp_out, query, key_cache, value_cache,    \
-        num_kv_heads, scale, block_tables, context_lens, max_context_len,     \
-        alibi_slopes, k_scale, v_scale);                                      \
+        num_kv_heads, scale, block_tables, context_lens, query_start_loc,     \
+        max_context_len, alibi_slopes, k_scale, v_scale);                     \
   }
 
 #define CALL_CUSTOM_LAUNCHER_ALIBI(T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE, \
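Taken together, these changes thread an optional `query_start_loc_ptr` through the decode kernels: each kernel computes the per-sequence query length as `query_start_loc_ptr[seq_idx + 1] - query_start_loc_ptr[seq_idx]`, returns early when it is not 1 (prefills are handled by another kernel), and otherwise uses `query_start_loc_ptr[seq_idx]` as the row offset into `q` and `out`, falling back to `seq_idx` when no pointer is passed. The host-side sketch below illustrates that convention under the assumption that `query_start_loc` is the usual cumulative sum of per-sequence query lengths with `num_seqs + 1` entries (the `seq_idx + 1` indexing in the diff implies this); the helper `build_query_start_loc` and the printed output are illustrative only and not part of the patch.

```cpp
// Minimal host-side sketch of the query_start_loc convention assumed above.
// build_query_start_loc is a hypothetical helper, not part of the patch.
#include <cstdint>
#include <cstdio>
#include <vector>

// query_start_loc has num_seqs + 1 entries: the cumulative sum of per-sequence
// query lengths, so sequence i owns query rows
// [query_start_loc[i], query_start_loc[i + 1]).
std::vector<int> build_query_start_loc(const std::vector<int>& query_lens) {
  std::vector<int> starts(query_lens.size() + 1, 0);
  for (size_t i = 0; i < query_lens.size(); ++i) {
    starts[i + 1] = starts[i] + query_lens[i];
  }
  return starts;
}

int main() {
  // A mixed batch: two decodes (1 query token each) and one prefill (5 tokens).
  const std::vector<int> query_lens = {1, 5, 1};
  const std::vector<int> starts = build_query_start_loc(query_lens);  // {0, 1, 6, 7}

  for (size_t seq_idx = 0; seq_idx < query_lens.size(); ++seq_idx) {
    const int qlen = starts[seq_idx + 1] - starts[seq_idx];
    if (qlen != 1) {
      // Same test the kernels perform: sequences with more than one query
      // token are prefills and are skipped by the decode path.
      std::printf("seq %zu: prefill (qlen=%d), skipped\n", seq_idx, qlen);
      continue;
    }
    // Decode path: starts[seq_idx] is the row offset used for q (scaled by
    // q_stride) and for out (scaled by num_heads * HEAD_SIZE). When no
    // query_start_loc is given, the kernels fall back to seq_idx itself.
    const std::int64_t query_start_off =
        static_cast<std::int64_t>(starts[seq_idx]);
    std::printf("seq %zu: decode, query row %lld\n", seq_idx,
                static_cast<long long>(query_start_off));
  }
  return 0;
}
```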