Commit e21845e

optimize
optimize with shared memory; better number of threads; update test; temporarily disable test; update
1 parent c267b1a commit e21845e

4 files changed: +433 -60 lines changed

benchmark/benchmark_attention.py

Lines changed: 41 additions & 9 deletions
@@ -6,7 +6,7 @@
 from flash_attn.flash_attn_interface import _flash_attn_forward
 import torch
 
-from cacheflow import attention_ops
+from cacheflow import attention_ops, cache_ops
 
 
 def benchmark(name, f, num_warmup = 10, num_iters = 100):
@@ -43,7 +43,7 @@ def benchmark_multi_query_cached_kv_attention(
     num_total_tokens = cu_query_lens[-1]
     qkv = torch.randn(
         num_total_tokens, 3, num_heads, head_size, dtype=dtype, device='cuda')
-    query, _, _ = qkv.unbind(dim=1)
+    query, key, value = qkv.unbind(dim=1)  # NOTE: this will not make a copy.
 
     # Create key and value cache.
     x = 16 // torch.tensor([], dtype=dtype).element_size()
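A note on the `# NOTE: this will not make a copy.` comment: torch.unbind returns views that share storage with qkv, so when gather_cached_kv (in the next hunk) writes gathered keys and values into qkv, the key and value tensors passed to _flash_attn_forward observe those writes without any extra copy. A minimal standalone sketch of that behavior (shapes here are illustrative):

import torch

qkv = torch.zeros(4, 3, 2, 8)            # [num_tokens, 3(QKV), num_heads, head_size]
query, key, value = qkv.unbind(dim=1)    # views into qkv; no copy is made
qkv[:, 1].fill_(1.0)                     # simulate a kernel writing into the K slice
assert bool(key.eq(1.0).all())           # the 'key' view observes the in-place write
assert not bool(query.eq(1.0).any())     # the Q slice is untouched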
@@ -72,21 +72,53 @@ def benchmark_multi_query_cached_kv_attention(
     scale = float(1.0 / (head_size ** 0.5))
     output = torch.empty(
         num_total_tokens, num_heads, head_size, dtype=dtype, device='cuda')
+
+    num_kv_tokens = sum(context_lens)
+    cu_context_lens = [0]
+    for context_len in context_lens:
+        cu_context_lens.append(cu_context_lens[-1] + context_len)
+    cpu_context_lens = torch.tensor(cu_context_lens, dtype=torch.int, device='cpu')
+    cu_context_lens = cpu_context_lens.cuda()
+    ref_output = torch.empty_like(output)
 
     # Run our implementation.
     def run_ours():
-        attention_ops.multi_query_cached_kv_attention(
-            cu_query_lens,
-            output,
-            query,
+        cache_ops.gather_cached_kv(
+            qkv,
             key_cache,
             value_cache,
-            scale,
+            cu_context_lens,
+            cpu_context_lens,
             block_tables,
-            context_len_tensor,
-            block_size,
+        )
+
+        _flash_attn_forward(
+            query,
+            key,
+            value,
+            ref_output,
+            cu_query_lens,
+            cu_context_lens,
+            max(query_lens),
             max_context_len,
+            dropout_p=0.0,
+            softmax_scale=scale,
+            causal=True,
+            return_softmax=False,
         )
+
+        # attention_ops.multi_query_cached_kv_attention(
+        #     cu_query_lens,
+        #     output,
+        #     query,
+        #     key_cache,
+        #     value_cache,
+        #     scale,
+        #     block_tables,
+        #     context_len_tensor,
+        #     block_size,
+        #     max_context_len,
+        # )
     benchmark('Ours', run_ours)
 
     # Upper bound: Flash attention.
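The cu_context_lens construction above follows the varlen convention used by _flash_attn_forward: an int32 tensor of length num_seqs + 1 holding cumulative sequence lengths, kept on the CPU (to size the kernel launch) and on the GPU (for the kernels to read). The Python loop is just a prefix sum; an equivalent sketch with illustrative lengths:

import torch

context_lens = [7, 128, 33]                        # per-sequence KV lengths
cu_context_lens = torch.cumsum(
    torch.tensor([0] + context_lens, dtype=torch.int32),
    dim=0, dtype=torch.int32)                      # tensor([0, 7, 135, 168])
# cu_context_lens[i + 1] - cu_context_lens[i] recovers context_lens[i], which is
# how both _flash_attn_forward and the gather kernels derive each sequence length.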

csrc/cache.cpp

Lines changed: 12 additions & 0 deletions
@@ -20,6 +20,14 @@ void reshape_and_cache(
   torch::Tensor& value_cache,
   torch::Tensor& slot_mapping);
 
+void gather_cached_kv(
+  torch::Tensor& qkv_out,
+  torch::Tensor& key_cache,
+  torch::Tensor& value_cache,
+  torch::Tensor& cu_seqlens_k,
+  torch::Tensor& seqlens_k,
+  torch::Tensor& block_tables);
+
 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
   m.def(
     "swap_blocks",
@@ -33,4 +41,8 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
     "reshape_and_cache",
     &reshape_and_cache,
     "Reshape the key and value tensors and cache them");
+  m.def(
+    "gather_cached_kv",
+    &gather_cached_kv,
+    "Gather key and value from the cache into contiguous QKV tensors");
 }
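After rebuilding the extension, the new binding is reachable from Python as cache_ops.gather_cached_kv, taking the six tensors in the order declared above. A small usage sketch; the op and argument order come from this diff, while the shapes and sizes below are purely illustrative:

import torch
from cacheflow import cache_ops

num_seqs, num_heads, head_size, block_size, num_blocks = 2, 4, 64, 8, 16
dtype = torch.half
x = 16 // torch.tensor([], dtype=dtype).element_size()        # 16-byte fetches, as in the benchmark

context_lens = [5, 11]                                         # 16 KV tokens in total
cpu_context_lens = torch.tensor([0, 5, 16], dtype=torch.int)   # cumulative, length num_seqs + 1
cu_context_lens = cpu_context_lens.cuda()

key_cache = torch.randn(num_blocks, num_heads, head_size // x, block_size, x,
                        dtype=dtype, device='cuda')
value_cache = torch.randn(num_blocks, num_heads, head_size, block_size,
                          dtype=dtype, device='cuda')
block_tables = torch.randint(0, num_blocks, (num_seqs, 2), dtype=torch.int, device='cuda')
qkv = torch.empty(16, 3, num_heads, head_size, dtype=dtype, device='cuda')

# Gathers K and V for every cached token into the K/V slices of qkv.
cache_ops.gather_cached_kv(
    qkv, key_cache, value_cache, cu_context_lens, cpu_context_lens, block_tables)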

csrc/cache_kernels.cu

Lines changed: 270 additions & 0 deletions
@@ -176,6 +176,183 @@ __global__ void reshape_and_cache_kernel(
   }
 }
 
+// Grid: (num_blocks, num_heads).
+template<typename scalar_t>
+__global__ void gather_cached_kv_kernel(
+  scalar_t* __restrict__ out,           // [cu_seqlens_k[-1], 3(QKV), num_heads, head_size]
+  const scalar_t* __restrict__ k_cache, // [num_blocks, num_heads, head_size/x, block_size, x]
+  const scalar_t* __restrict__ v_cache, // [num_blocks, num_heads, head_size, block_size]
+  const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq]
+  const int* __restrict__ cu_seqlens_k, // aka 'cu_seqlens_k' in '_flash_attn_forward', or 'context_lens' in cacheflow
+  const int num_seqs,
+  const int max_num_blocks_per_seq,
+  const int head_size,
+  const int block_size) {
+  // Each CUDA grid is mapped to (num_blocks, num_heads).
+  const int block_idx = blockIdx.x;
+  const int num_blocks = gridDim.x;
+  const int head_idx = blockIdx.y;
+  const int num_heads = gridDim.y;
+  // Each CUDA block is responsible for (head_size, block_size).
+  const int thread_idx = threadIdx.x;
+  const int num_threads = blockDim.x;
+  // In the original attention kernel, each thread group fetches x elements at a time.
+  constexpr int x = 16 / sizeof(scalar_t);
+
+  // The index of the sequence this thread is working on.
+  int seq_idx;
+  // The index of the block in the sequence this thread is working on.
+  int local_block_idx;
+  // Calculate the sequence index and the block index within the sequence.
+  int num_total_blocks = 0;
+  #pragma unroll
+  for (int i = 0; i < num_seqs; ++i) {
+    int context_len = cu_seqlens_k[i + 1] - cu_seqlens_k[i];
+    int num_blocks = (context_len + block_size - 1) / block_size;
+    num_total_blocks += num_blocks;
+    if (num_total_blocks > block_idx) {
+      seq_idx = i;
+      local_block_idx = block_idx - (num_total_blocks - num_blocks);
+      break;
+    }
+  }
+  // const int context_len = cu_seqlens_k[seq_idx];
+  // const int num_blocks = (context_len + block_size - 1) / block_size;
+  const int* block_table = block_tables + seq_idx * max_num_blocks_per_seq;
+  const int physical_block_number = block_table[local_block_idx];
+
+  // Number of chunks handled by a CUDA block.
+  const int n_chunks = (head_size * block_size + (num_threads - 1)) / num_threads;
+  const int physical_cache_offset = (physical_block_number * num_heads + head_idx) * head_size * block_size;
+
+  // The common output pointer base used by both key and value:
+  scalar_t* common_out = out + (block_idx * block_size) * 3 * num_heads * head_size
+                             + head_idx * head_size;
+  // Key is the second tensor in QKV, so qkv_offset = 1.
+  scalar_t* key_out = common_out + 1 * num_heads * head_size;
+  // Value is the third tensor in QKV, so qkv_offset = 2.
+  scalar_t* value_out = common_out + 2 * num_heads * head_size;
+
+  // Process the key in chunks.
+  #pragma unroll
+  for (int chunk_idx = 0; chunk_idx < n_chunks; ++chunk_idx) {
+    const int offset = chunk_idx * num_threads + thread_idx;
+    if (offset >= head_size * block_size) {
+      break;
+    }
+    // Calculate offsets in [head_size/x, block_size, x].
+    const int head_offset = offset / x / block_size;
+    const int block_offset = offset / x % block_size;
+    const int x_offset = offset % x;
+
+    const scalar_t* k_ptr = k_cache + physical_cache_offset + offset;
+    scalar_t* out_ptr = key_out + block_offset * 3 * num_heads * head_size
+                                + head_offset * x + x_offset;
+    *out_ptr = __ldg(k_ptr);
+  }
+
+  // Process the value in chunks.
+  #pragma unroll
+  for (int chunk_idx = 0; chunk_idx < n_chunks; ++chunk_idx) {
+    const int offset = chunk_idx * num_threads + thread_idx;
+    if (offset >= head_size * block_size) {
+      break;
+    }
+    // Calculate offsets in [head_size, block_size].
+    const int head_offset = offset / block_size;
+    const int block_offset = offset % block_size;
+
+    const scalar_t* v_ptr = v_cache + physical_cache_offset + offset;
+    scalar_t* out_ptr = value_out + block_offset * 3 * num_heads * head_size + head_offset;
+    *out_ptr = __ldg(v_ptr);
+  }
+}
+
+
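Both kernels recover which sequence a grid block belongs to by walking the cumulative context lengths until the running block count exceeds blockIdx.x. A pure-Python sketch of that mapping, mirroring the loop in the kernel above (the helper name and example numbers are illustrative):

def map_block_idx(block_idx, cu_seqlens_k, block_size):
    """Return (seq_idx, local_block_idx) for a flattened grid block index."""
    num_seqs = len(cu_seqlens_k) - 1
    num_total_blocks = 0
    for i in range(num_seqs):
        context_len = cu_seqlens_k[i + 1] - cu_seqlens_k[i]
        num_blocks = (context_len + block_size - 1) // block_size  # ceil division
        num_total_blocks += num_blocks
        if num_total_blocks > block_idx:
            # block_idx falls inside sequence i; subtract the blocks of all
            # previous sequences to get the block index within the sequence.
            return i, block_idx - (num_total_blocks - num_blocks)
    raise ValueError('block_idx out of range')

# Example: two sequences of 5 and 11 tokens, block_size 8 -> 1 + 2 = 3 blocks total.
assert map_block_idx(0, [0, 5, 16], 8) == (0, 0)
assert map_block_idx(2, [0, 5, 16], 8) == (1, 1)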
+// Grid: (num_blocks, block_size).
+template<typename scalar_t>
+__global__ void gather_cached_kv_kernel_2(
+  scalar_t* __restrict__ out,           // [cu_seqlens_k[-1], 3(QKV), num_heads, head_size]
+  const scalar_t* __restrict__ k_cache, // [num_blocks, num_heads, head_size/x, block_size, x]
+  const scalar_t* __restrict__ v_cache, // [num_blocks, num_heads, head_size, block_size]
+  const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq]
+  const int* __restrict__ cu_seqlens_k, // aka 'cu_seqlens_k' in '_flash_attn_forward', or 'context_lens' in cacheflow
+  const int num_seqs,
+  const int max_num_blocks_per_seq,
+  const int num_heads,
+  const int head_size) {
+  // Each CUDA grid is mapped to (num_blocks, block_size).
+  const int block_idx = blockIdx.x;
+  const int num_blocks = gridDim.x;
+  const int block_offset = blockIdx.y;
+  const int block_size = gridDim.y;
+  // Each CUDA block is responsible for (num_heads, head_size) elements at one block offset.
+  const int thread_idx = threadIdx.x;
+  const int num_threads = blockDim.x;
+  // In the original attention kernel, each thread group fetches x elements at a time.
+  constexpr int x = 16 / sizeof(scalar_t);
+
+  // The index of the sequence this thread is working on.
+  int seq_idx;
+  // The index of the block in the sequence this thread is working on.
+  int local_block_idx;
+  // Calculate the sequence index and the block index within the sequence.
+  int num_total_blocks = 0;
+  #pragma unroll
+  for (int i = 0; i < num_seqs; ++i) {
+    int context_len = cu_seqlens_k[i + 1] - cu_seqlens_k[i];
+    int num_blocks = (context_len + block_size - 1) / block_size;
+    num_total_blocks += num_blocks;
+    if (num_total_blocks > block_idx) {
+      seq_idx = i;
+      local_block_idx = block_idx - (num_total_blocks - num_blocks);
+      break;
+    }
+  }
+
+  // const int context_len = cu_seqlens_k[seq_idx];
+  // const int num_blocks = (context_len + block_size - 1) / block_size;
+  const int* block_table = block_tables + seq_idx * max_num_blocks_per_seq;
+  const int physical_block_number = block_table[local_block_idx];
+  const int physical_cache_offset = physical_block_number * num_heads * head_size * block_size;
+
+  // The common output pointer base used by both key and value:
+  scalar_t* common_out = out + (block_idx * block_size + block_offset) * 3 * num_heads * head_size;
+  // Key is the second tensor in QKV, so qkv_offset = 1.
+  scalar_t* key_out = common_out + 1 * num_heads * head_size;
+  // Value is the third tensor in QKV, so qkv_offset = 2.
+  scalar_t* value_out = common_out + 2 * num_heads * head_size;
+
+  // Process the key in chunks.
+  #pragma unroll
+  for (int i = threadIdx.x; i < num_heads * head_size; i += blockDim.x) {
+    // Calculate offsets in [num_heads, head_size/x, x].
+    const int head_idx = i / x / (head_size / x);
+    const int head_offset = i / x % (head_size / x);
+    const int x_offset = i % x;
+
+    const scalar_t* k_ptr = k_cache + physical_cache_offset
+                                    + head_idx * (head_size / x) * block_size * x
+                                    + head_offset * block_size * x
+                                    + block_offset * x
+                                    + x_offset;
+    key_out[head_idx * head_size + head_offset * x + x_offset] = __ldg(k_ptr);
+  }
+
+  // Process the value in chunks.
+  #pragma unroll
+  for (int i = threadIdx.x; i < num_heads * head_size; i += blockDim.x) {
+    // Calculate offsets in [num_heads, head_size].
+    const int head_idx = i / head_size;
+    const int head_offset = i % head_size;
+
+    const scalar_t* v_ptr = v_cache + physical_cache_offset
+                                    + i * block_size  // equal to (head_idx * head_size + head_offset) * block_size
+                                    + block_offset;
+    value_out[i] = __ldg(v_ptr);
+  }
+}
+
 } // namespace cacheflow
 
 void reshape_and_cache(
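For reference, the result both kernels are intended to produce can be written as a (slow) pure-PyTorch gather over the paged cache layouts noted in the comments above. A sketch; the helper name is illustrative, and the layouts are taken from the kernel comments:

import torch

def gather_cached_kv_reference(key_cache, value_cache, block_tables, context_lens):
    # key_cache:   [num_blocks, num_heads, head_size // x, block_size, x]
    # value_cache: [num_blocks, num_heads, head_size, block_size]
    num_heads, head_size, block_size = value_cache.shape[1:]
    keys, values = [], []
    for seq_idx, context_len in enumerate(context_lens):
        for token_idx in range(context_len):
            block_number = int(block_tables[seq_idx][token_idx // block_size])
            block_offset = token_idx % block_size
            # [num_heads, head_size // x, x] -> [num_heads, head_size]
            k = key_cache[block_number, :, :, block_offset, :].reshape(num_heads, head_size)
            v = value_cache[block_number, :, :, block_offset]   # [num_heads, head_size]
            keys.append(k)
            values.append(v)
    # Both stacked to [num_kv_tokens, num_heads, head_size], matching the K and V
    # slices of the qkv_out tensor the kernels write into.
    return torch.stack(keys), torch.stack(values)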
@@ -215,3 +392,96 @@ void reshape_and_cache(
         x);
     });
 }
+
+/*
+// same group of threads will be working on the same block
+void gather_cached_kv(
+  torch::Tensor& qkv_out,       // [cu_seqlens_k[-1], 3(QKV), num_heads, head_size]
+  torch::Tensor& key_cache,     // [num_blocks, num_heads, head_size/x, block_size, x]
+  torch::Tensor& value_cache,   // [num_blocks, num_heads, head_size, block_size]
+  torch::Tensor& cu_seqlens_k,  // aka 'cu_seqlens_k' in '_flash_attn_forward', or 'context_lens' in cacheflow
+  torch::Tensor& seqlens_k,     // CPU version of 'cu_seqlens_k'
+  torch::Tensor& block_tables) { // [num_seqs, max_num_blocks_per_seq]
+  const int num_seqs = cu_seqlens_k.size(0) - 1;
+  const int num_heads = value_cache.size(1);
+  const int head_size = value_cache.size(2);
+  const int block_size = value_cache.size(3);
+  // const int x = key_cache.size(4);
+  const int max_num_blocks_per_seq = block_tables.size(1);
+  const int* context_lens_ptr = cu_seqlens_k.data_ptr<int>();
+  const int* cpu_context_lens_ptr = seqlens_k.data_ptr<int>();
+
+  // Calculate the total number of blocks.
+  int num_total_blocks = 0;
+  for (int i = 0; i < num_seqs; ++i) {
+    int context_len = cpu_context_lens_ptr[i + 1] - cpu_context_lens_ptr[i];
+    int num_blocks = (context_len + block_size - 1) / block_size;
+    num_total_blocks += num_blocks;
+  }
+
+  constexpr int NUM_THREADS = 256;
+  dim3 grid(num_total_blocks, num_heads);
+  dim3 block(NUM_THREADS);
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+    key_cache.scalar_type(),
+    "gather_cached_kv_kernel",
+    [&] {
+      cacheflow::gather_cached_kv_kernel<scalar_t><<<grid, block, 0, stream>>>(
+        qkv_out.data_ptr<scalar_t>(),
+        key_cache.data_ptr<scalar_t>(),
+        value_cache.data_ptr<scalar_t>(),
+        block_tables.data_ptr<int>(),
+        cu_seqlens_k.data_ptr<int>(),
+        num_seqs,
+        max_num_blocks_per_seq,
+        head_size,
+        block_size);
+    });
+}
+*/
+
+// same group of threads will be working on the same block
+void gather_cached_kv(
+  torch::Tensor& qkv_out,       // [cu_seqlens_k[-1], 3(QKV), num_heads, head_size]
+  torch::Tensor& key_cache,     // [num_blocks, num_heads, head_size/x, block_size, x]
+  torch::Tensor& value_cache,   // [num_blocks, num_heads, head_size, block_size]
+  torch::Tensor& cu_seqlens_k,  // aka 'cu_seqlens_k' in '_flash_attn_forward', or 'context_lens' in cacheflow
+  torch::Tensor& seqlens_k,     // CPU version of 'cu_seqlens_k'
+  torch::Tensor& block_tables) { // [num_seqs, max_num_blocks_per_seq]
+  const int num_seqs = cu_seqlens_k.size(0) - 1;
+  const int num_heads = value_cache.size(1);
+  const int head_size = value_cache.size(2);
+  const int block_size = value_cache.size(3);
+  // const int x = key_cache.size(4);
+  const int max_num_blocks_per_seq = block_tables.size(1);
+  const int* context_lens_ptr = cu_seqlens_k.data_ptr<int>();
+  const int* cpu_context_lens_ptr = seqlens_k.data_ptr<int>();
+
+  // Calculate the total number of blocks.
+  int num_total_blocks = 0;
+  for (int i = 0; i < num_seqs; ++i) {
+    int context_len = cpu_context_lens_ptr[i + 1] - cpu_context_lens_ptr[i];
+    int num_blocks = (context_len + block_size - 1) / block_size;
+    num_total_blocks += num_blocks;
+  }
+
+  dim3 grid(num_total_blocks, block_size);
+  dim3 block(std::min(num_heads * head_size, 512));
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+    key_cache.scalar_type(),
+    "gather_cached_kv_kernel_2",
+    [&] {
+      cacheflow::gather_cached_kv_kernel_2<scalar_t><<<grid, block, 0, stream>>>(
+        qkv_out.data_ptr<scalar_t>(),
+        key_cache.data_ptr<scalar_t>(),
+        value_cache.data_ptr<scalar_t>(),
+        block_tables.data_ptr<int>(),
+        cu_seqlens_k.data_ptr<int>(),
+        num_seqs,
+        max_num_blocks_per_seq,
+        num_heads,
+        head_size);
+    });
+}
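The active host function launches gather_cached_kv_kernel_2 with one grid entry per (cache block, slot-within-block) pair and up to 512 threads striding over the num_heads * head_size elements. A small Python sketch of that launch-geometry arithmetic, with illustrative numbers:

def launch_geometry(context_lens, block_size, num_heads, head_size):
    # One x-dimension grid entry per logical cache block across all sequences.
    num_total_blocks = sum((l + block_size - 1) // block_size for l in context_lens)
    grid = (num_total_blocks, block_size)      # mirrors dim3 grid(num_total_blocks, block_size)
    block = min(num_heads * head_size, 512)    # mirrors dim3 block(std::min(num_heads * head_size, 512))
    return grid, block

# E.g. 2 sequences of 5 and 11 tokens, block_size 8, 4 heads of size 64:
print(launch_geometry([5, 11], 8, 4, 64))      # ((3, 8), 256)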
