Commit

support blha and cache kv quant

RichardWooSJTU committed Dec 20, 2023
1 parent 92134e1 commit c43c61a

Showing 216 changed files with 15,815 additions and 2,621 deletions.
14 changes: 10 additions & 4 deletions .github/codecov.yml
@@ -1,8 +1,14 @@
 codecov:
   notify:
     require_ci_to_pass: yes
 
 coverage:
   status:
     project:
-      default:
-        informational: true
+      default:
+        target: 75%  # CI will fail if overall project coverage is below 75%
+        informational: true
     patch:
-      default:
-        informational: true
+      default:
+        target: 90%  # CI will fail if coverage on changed lines is below 90%
+        informational: true
166 changes: 166 additions & 0 deletions csrc/generation/dequant_int8.cu
@@ -0,0 +1,166 @@
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "helper.h"
#include<stdlib.h>
#include<string.h>
#include<sys/types.h>
#include<sys/stat.h>
#include<unistd.h>
#include<fcntl.h>
#include<sys/mman.h>
#include<stdio.h>
#include<algorithm>


constexpr int DequantKernelVecSize = 4;

template <typename data_t>
inline HOSTDEVICE data_t roundWithTiesToEven(data_t x) {
  data_t xLower = floor(x);
  data_t xUpper = ceil(x);
  // x is in the interval [xLower, xUpper]. Choose the closer of the two
  // bounds, breaking ties to even.
  data_t dLower = x - xLower;
  data_t dUpper = xUpper - x;
  return static_cast<data_t>(
      (dLower == dUpper ? fmod(xLower, 2.0F) == 0.0F : dLower < dUpper)
          ? xLower
          : xUpper);
}
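
// Examples with float input: roundWithTiesToEven(0.5f) == 0.0f,
// roundWithTiesToEven(1.5f) == 2.0f, and roundWithTiesToEven(2.5f) == 2.0f.
// Ties break toward the even neighbor, unlike round(), which rounds
// half away from zero.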

template <typename data_t, int VecSize>
__global__ void DequantKernel(data_t* output,
                              const int32_t* input,
                              const int m,  // batch size
                              const int n,  // hidden size
                              const float* dequant_out_scale_data) {
  int numel = m * n;
  int stride = blockDim.x * gridDim.x * VecSize;
  int idx = (blockIdx.x * blockDim.x + threadIdx.x) * VecSize;

  AlignedVector<int32_t, VecSize> in_vec;
  AlignedVector<float, VecSize> out_scale_vec;
  AlignedVector<data_t, VecSize> out_vec;

  for (; idx < numel; idx += stride) {
    // Recompute the column index on every grid-stride iteration; hoisting it
    // out of the loop would read a stale per-channel scale offset after the
    // first stride.
    const int col_id = idx % n;
    Load<int32_t, VecSize>(input + idx, &in_vec);
    Load<float, VecSize>(dequant_out_scale_data + col_id, &out_scale_vec);

#pragma unroll
    for (int i = 0; i < VecSize; ++i) {
      out_vec[i] =
          static_cast<data_t>(static_cast<float>(in_vec[i]) * out_scale_vec[i]);
    }

    Store<data_t, VecSize>(out_vec, output + idx);
  }
}

template <paddle::DataType D>
std::vector<paddle::Tensor> DispatchLaunchDequantInt8(
    const paddle::Tensor& input, const paddle::Tensor& scale) {
  typedef PDTraits<D> traits_;
  typedef typename traits_::DataType DataType_;
  typedef typename traits_::data_t data_t;

  std::vector<int64_t> input_shape = input.shape();

  auto output = paddle::full(input_shape, 0, D, input.place());
  int64_t m = input_shape[0];
  int64_t n = input_shape[1];

  int64_t numel = m * n;
  constexpr int64_t thread_per_block = 512;
  int64_t block_per_grid =
      (numel / DequantKernelVecSize + thread_per_block - 1) / thread_per_block;
  auto stream = input.stream();

  DequantKernel<DataType_, DequantKernelVecSize>
      <<<block_per_grid, thread_per_block, 0, stream>>>(
          reinterpret_cast<DataType_*>(output.data<data_t>()),
          reinterpret_cast<const int32_t*>(input.data<int32_t>()), m, n,
          reinterpret_cast<const float*>(scale.data<float>()));

  return {output};
}


std::vector<paddle::Tensor> LaunchDequantInt8(const paddle::Tensor& input,
                                              const paddle::Tensor& scale,
                                              std::string dtype) {
  paddle::DataType data_type;

  if (dtype == "float32")
    data_type = paddle::DataType::FLOAT32;
  else if (dtype == "bfloat16")
    data_type = paddle::DataType::BFLOAT16;
  else if (dtype == "float16")
    data_type = paddle::DataType::FLOAT16;
  else
    PD_THROW(
        "Unsupported data type: only bfloat16, float16 and float32 are "
        "supported.");

  switch (data_type) {
    case paddle::DataType::BFLOAT16:
      return DispatchLaunchDequantInt8<paddle::DataType::BFLOAT16>(input,
                                                                   scale);
    case paddle::DataType::FLOAT16:
      return DispatchLaunchDequantInt8<paddle::DataType::FLOAT16>(input,
                                                                  scale);
    case paddle::DataType::FLOAT32:
      return DispatchLaunchDequantInt8<paddle::DataType::FLOAT32>(input,
                                                                  scale);
    default:
      // Unreachable: dtype is validated above. Throwing here keeps every
      // control path returning or exiting, so the non-void function cannot
      // fall off its end.
      PD_THROW("Unsupported data type.");
  }
}

std::vector<paddle::Tensor> DequantInt8(const paddle::Tensor& input,
                                        const paddle::Tensor& out_scale,
                                        std::string dtype) {
  return LaunchDequantInt8(input, out_scale, dtype);
}

std::vector<std::vector<int64_t>> DequantInt8Shape(
    const std::vector<int64_t>& input_shape) {
  return {input_shape};
}

std::vector<paddle::DataType> DequantInt8Dtype(
    const paddle::DataType& input_dtype,
    const paddle::DataType& out_scale_dtype,
    std::string dtype) {
  paddle::DataType data_type;
  if (dtype == "float32")
    data_type = paddle::DataType::FLOAT32;
  else if (dtype == "bfloat16")
    data_type = paddle::DataType::BFLOAT16;
  else if (dtype == "float16")
    data_type = paddle::DataType::FLOAT16;
  else
    PD_THROW(
        "Unsupported data type: only bfloat16, float16 and float32 are "
        "supported.");

  return {data_type};
}

PD_BUILD_OP(dequant_int8)
    .Inputs({"input", "out_scale"})
    .Outputs({"output"})
    .Attrs({"dtype: std::string"})
    .SetKernelFn(PD_KERNEL(DequantInt8))
    .SetInferShapeFn(PD_INFER_SHAPE(DequantInt8Shape))
    .SetInferDtypeFn(PD_INFER_DTYPE(DequantInt8Dtype));
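
For intuition, here is a minimal host-side sketch of the per-channel dequantization the kernel performs: every column j of the row-major [m, n] INT32 tensor is scaled by its own entry of out_scale. The values below are made up for illustration.

#include <cstdint>
#include <cstdio>

int main() {
  constexpr int m = 2, n = 4;  // batch x hidden
  int32_t input[m * n] = {10, -20, 30, -40, 50, -60, 70, -80};
  float scale[n] = {0.1f, 0.2f, 0.3f, 0.4f};  // one scale per column

  float output[m * n];
  for (int idx = 0; idx < m * n; ++idx) {
    int col_id = idx % n;  // same per-channel indexing as DequantKernel
    output[idx] = static_cast<float>(input[idx]) * scale[col_id];
  }
  printf("%.1f %.1f\n", output[0], output[5]);  // prints: 1.0 -12.0
  return 0;
}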
6 changes: 4 additions & 2 deletions csrc/generation/encode_rotary_qk.cu
@@ -105,6 +105,7 @@ void LaunchRotaryQK(const paddle::Tensor& q,
 
   const int32_t batch_size = q.shape()[0];
   const int32_t head_num = q.shape()[1];
+  const int32_t kv_head_num = kv.shape()[1];
   const int32_t seq_len = q.shape()[2];
   const int32_t dim_head = q.shape()[3];
 
@@ -147,15 +148,16 @@ void LaunchRotaryQK(const paddle::Tensor& q,
         head_num,
         seq_len * rotary_emb_dims,
         last_dim);
-    RotaryKernel<<<grid, BlockSize, 0, cu_stream>>>(
+    dim3 grid_k(batch_size, kv_head_num, seq_len * rotary_emb_dims);
+    RotaryKernel<<<grid_k, BlockSize, 0, cu_stream>>>(
         k_data,
         cos_emb,
         sin_emb,
         seq_lens.data<int>() /*sequence_lengths*/,
         k_out_data,
         rotary_emb_dims,
         batch_size,
-        head_num,
+        kv_head_num,
         seq_len * rotary_emb_dims,
         last_dim);
   } else {
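
The separate grid_k launch is what lets the K rotary kernel run over fewer heads than Q when the model uses grouped-query attention. The usual mapping between query heads and shared KV heads looks like the sketch below; this mapping lives in the attention kernels rather than in this diff, so take it as an assumption about the surrounding code.

// Hypothetical GQA head mapping: each group of head_num / kv_head_num query
// heads reads the same KV head, so K/V kernels launch over kv_head_num only.
inline int KVHeadForQueryHead(int q_head, int head_num, int kv_head_num) {
  const int group_size = head_num / kv_head_num;  // assumes exact divisibility
  return q_head / group_size;
}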
95 changes: 95 additions & 0 deletions csrc/generation/get_max_len.cu
@@ -0,0 +1,95 @@
#include <cstdlib>
#include <cstdio>
#include <string>
#include "helper.h"
#include <sys/mman.h>
#include <fstream>

template <int THREADBLOCK_SIZE>
__global__ void GetMaxLenKernel(const int *seq_lens,
                                int *max_len,
                                const int batch_size) {
  const int tid = threadIdx.x;

  typedef cub::BlockReduce<int, THREADBLOCK_SIZE> BlockReduce;
  __shared__ typename BlockReduce::TempStorage temp_storage;

  // Launched with a single block: each thread strides over the batch, then
  // the block-wide maximum is reduced and written out by thread 0.
  int max_len_this_thread = 0;
  for (int i = tid; i < batch_size; i += blockDim.x) {
    max_len_this_thread = max(seq_lens[i], max_len_this_thread);
  }
  int block_max =
      BlockReduce(temp_storage).Reduce(max_len_this_thread, MaxOp<int>());
  if (tid == 0) {
    *max_len = block_max;
  }
}

// void write_mmap(const std::string& name, const std::string& value) {
//   int len = value.length();
//   // Open the file.
//   int fd = open(name.data(), O_RDWR | O_CREAT, 00777);
//   // Move the file pointer forward by len - 1 bytes.
//   lseek(fd, len - 1, SEEK_END);
//   // Write a placeholder byte first: mmap cannot grow a file, so this
//   // pre-sizes it as an empty shell of the right length.
//   write(fd, " ", 1);
//   // Create the mapping.
//   char *buffer = (char *) mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
//   // Close the file descriptor; the mapping stays valid.
//   close(fd);
//   // Copy the value into the mapped buffer.
//   memcpy(buffer, value.data(), len);
//   // Tear down the mapping.
//   munmap(buffer, len);
// }


void GetMaxLen(const paddle::Tensor& seq_lens_encoder,
               const paddle::Tensor& seq_lens_decoder) {
  constexpr int blockSize = 128;
  int batch_size = seq_lens_encoder.shape()[0];
  auto cu_stream = seq_lens_encoder.stream();

  auto max_len_encoder = paddle::empty(
      {1}, paddle::DataType::INT32, seq_lens_encoder.place());
  auto max_len_decoder = paddle::empty(
      {1}, paddle::DataType::INT32, seq_lens_encoder.place());

  GetMaxLenKernel<blockSize><<<1, blockSize, 0, cu_stream>>>(
      seq_lens_encoder.data<int>(), max_len_encoder.data<int>(), batch_size);
  GetMaxLenKernel<blockSize><<<1, blockSize, 0, cu_stream>>>(
      seq_lens_decoder.data<int>(), max_len_decoder.data<int>(), batch_size);

  int max_len_encoder_data =
      max_len_encoder.copy_to(paddle::CPUPlace(), true).data<int>()[0];
  int max_len_decoder_data =
      max_len_decoder.copy_to(paddle::CPUPlace(), true).data<int>()[0];



  std::ofstream outfile;
  outfile.open("max_len.txt", std::ios::out);
  outfile << max_len_encoder_data << "\n" << max_len_decoder_data;
  outfile.close();
}


PD_BUILD_OP(get_max_len)
    .Inputs({"seq_lens_encoder", "seq_lens_decoder"})
    .Outputs({"seq_lens_encoder_out", "seq_lens_decoder_out"})
    .SetInplaceMap({{"seq_lens_encoder", "seq_lens_encoder_out"},
                    {"seq_lens_decoder", "seq_lens_decoder_out"}})
    .SetKernelFn(PD_KERNEL(GetMaxLen));
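
get_max_len hands the two maxima to the host through max_len.txt. Below is a minimal sketch of the reader this implies; the consumer path is an assumption, not part of this commit.

#include <fstream>

// Hypothetical consumer of max_len.txt: the first line holds the encoder
// maximum, the second the decoder maximum. Returns false on a missing or
// malformed file.
bool ReadMaxLens(int* max_enc_len, int* max_dec_len) {
  std::ifstream infile("max_len.txt");
  return static_cast<bool>(infile >> *max_enc_len >> *max_dec_len);
}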
55 changes: 55 additions & 0 deletions csrc/generation/get_output.cc
@@ -0,0 +1,55 @@
#include <stdio.h>
#include <string.h>
#include <sys/ipc.h>
#include <sys/msg.h>
#include <sys/types.h>
#include "paddle/extension.h"

#define MAX_BSZ 512

struct msgdata {
  long mtype;
  int mtext[MAX_BSZ + 2];  // stop_flag, bsz, tokens
};
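
// Message layout: mtext[0] = stop flag, mtext[1] = batch size (bsz),
// mtext[2 .. bsz + 1] = one newly generated token id per sequence.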

void GetOutput(const paddle::Tensor& x,
               int64_t rank_id,
               bool wait_flag) {
  if (rank_id > 0) return;

  static struct msgdata msg_rcv;

  static key_t key = ftok("./", 1);

  static int msgid = msgget(key, IPC_CREAT | 0666);

  int64_t *out_data = const_cast<int64_t*>(x.data<int64_t>());
  int ret = -1;
  if (!wait_flag) {
    // Non-blocking: return immediately if no message is queued.
    ret = msgrcv(msgid, &msg_rcv, (MAX_BSZ + 2) * sizeof(int), 0, IPC_NOWAIT);
  } else {
    ret = msgrcv(msgid, &msg_rcv, (MAX_BSZ + 2) * sizeof(int), 0, 0);
  }
  if (ret == -1) {
    // Nothing was read; signal "no data" to the caller.
    out_data[0] = -2;
    out_data[1] = 0;
    return;
  }

  int bsz = msg_rcv.mtext[1];

  for (int64_t i = 0; i < bsz + 2; i++) {
    out_data[i] = (int64_t)msg_rcv.mtext[i];
  }
}

PD_BUILD_OP(get_output)
    .Inputs({"x"})
    .Attrs({"rank_id: int64_t",
            "wait_flag: bool"})
    .Outputs({"x_out"})
    .SetInplaceMap({{"x", "x_out"}})
    .SetKernelFn(PD_KERNEL(GetOutput));
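
This op is the consumer half of a System V message queue; the producer half is not in this file. A minimal sketch of what a matching sender could look like, reusing the msgdata struct and MAX_BSZ above, and assuming the generation loop emits one token per sequence each step (send_tokens and its callers are hypothetical):

#include <sys/ipc.h>
#include <sys/msg.h>

// Hypothetical producer paired with GetOutput: mtype must be positive, and
// the key derivation has to match the consumer's ftok("./", 1).
static void send_tokens(const int* tokens, int bsz, bool stop) {
  static key_t key = ftok("./", 1);
  static int msgid = msgget(key, IPC_CREAT | 0666);

  struct msgdata msg_snd;
  msg_snd.mtype = 1;
  msg_snd.mtext[0] = stop ? 1 : 0;  // stop flag
  msg_snd.mtext[1] = bsz;           // batch size
  for (int i = 0; i < bsz; ++i) {
    msg_snd.mtext[i + 2] = tokens[i];
  }
  msgsnd(msgid, &msg_snd, (MAX_BSZ + 2) * sizeof(int), 0);
}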