Commit

support blha and cache kv quant

RichardWooSJTU committed Dec 20, 2023
1 parent 92134e1 commit c43c61a

Showing 216 changed files with 15,815 additions and 2,621 deletions.
14 changes: 10 additions & 4 deletions .github/codecov.yml
@@ -1,8 +1,14 @@
 codecov:
   notify:
     require_ci_to_pass: yes
 
 coverage:
   status:
     project:
-      default:
-        informational: true
+      default:
+        target: 75%  # CI will fail if overall project coverage is below 75%
+        informational: true
     patch:
-      default:
-        informational: true
+      default:
+        target: 90%  # CI will fail if coverage on changed lines is below 90%
+        informational: true
166 changes: 166 additions & 0 deletions csrc/generation/dequant_int8.cu
@@ -0,0 +1,166 @@
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "helper.h"
#include<stdlib.h>
#include<string.h>
#include<sys/types.h>
#include<sys/stat.h>
#include<unistd.h>
#include<fcntl.h>
#include<sys/mman.h>
#include<stdio.h>
#include<algorithm>


constexpr int DequantKernelVecSize = 4;

template <typename data_t>
inline HOSTDEVICE data_t roundWithTiesToEven(data_t x) {
  data_t xLower = floor(x);
  data_t xUpper = ceil(x);
  // x is in the interval [xLower, xUpper]. Choose the closer of the two
  // bounds, breaking ties to even.
  data_t dLower = x - xLower;
  data_t dUpper = xUpper - x;
  return static_cast<data_t>(
      (dLower == dUpper ? fmod(xLower, 2.0F) == 0.0F : dLower < dUpper)
          ? xLower
          : xUpper);
}
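
// Examples with float input: roundWithTiesToEven(0.5f) == 0.0f,
// roundWithTiesToEven(1.5f) == 2.0f, and roundWithTiesToEven(2.5f) == 2.0f.
// Ties break toward the even neighbor, unlike round(), which rounds
// half away from zero.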

template <typename data_t, int VecSize>
__global__ void DequantKernel(data_t* output,
                              const int32_t* input,
                              const int m,  // batch size
                              const int n,  // hidden size
                              const float* dequant_out_scale_data) {
  int numel = m * n;
  int stride = blockDim.x * gridDim.x * VecSize;
  int idx = (blockIdx.x * blockDim.x + threadIdx.x) * VecSize;

  AlignedVector<int32_t, VecSize> in_vec;
  AlignedVector<float, VecSize> out_scale_vec;
  AlignedVector<data_t, VecSize> out_vec;

  for (; idx < numel; idx += stride) {
    // Recompute the column index on every grid-stride iteration; hoisting it
    // out of the loop would read a stale per-channel scale offset after the
    // first stride.
    const int col_id = idx % n;
    Load<int32_t, VecSize>(input + idx, &in_vec);
    Load<float, VecSize>(dequant_out_scale_data + col_id, &out_scale_vec);

#pragma unroll
    for (int i = 0; i < VecSize; ++i) {
      out_vec[i] =
          static_cast<data_t>(static_cast<float>(in_vec[i]) * out_scale_vec[i]);
    }

    Store<data_t, VecSize>(out_vec, output + idx);
  }
}

template <paddle::DataType D>
std::vector<paddle::Tensor> DispatchLaunchDequantInt8(
    const paddle::Tensor& input, const paddle::Tensor& scale) {
  typedef PDTraits<D> traits_;
  typedef typename traits_::DataType DataType_;
  typedef typename traits_::data_t data_t;

  std::vector<int64_t> input_shape = input.shape();

  auto output = paddle::full(input_shape, 0, D, input.place());
  int64_t m = input_shape[0];
  int64_t n = input_shape[1];

  int64_t numel = m * n;
  constexpr int64_t thread_per_block = 512;
  int64_t block_per_grid =
      (numel / DequantKernelVecSize + thread_per_block - 1) / thread_per_block;
  auto stream = input.stream();

  DequantKernel<DataType_, DequantKernelVecSize>
      <<<block_per_grid, thread_per_block, 0, stream>>>(
          reinterpret_cast<DataType_*>(output.data<data_t>()),
          reinterpret_cast<const int32_t*>(input.data<int32_t>()), m, n,
          reinterpret_cast<const float*>(scale.data<float>()));

  return {output};
}


std::vector<paddle::Tensor> LaunchDequantInt8(const paddle::Tensor& input,
                                              const paddle::Tensor& scale,
                                              std::string dtype) {
  paddle::DataType data_type;

  if (dtype == "float32")
    data_type = paddle::DataType::FLOAT32;
  else if (dtype == "bfloat16")
    data_type = paddle::DataType::BFLOAT16;
  else if (dtype == "float16")
    data_type = paddle::DataType::FLOAT16;
  else
    PD_THROW(
        "Unsupported data type: only bfloat16, float16 and float32 are "
        "supported.");

  switch (data_type) {
    case paddle::DataType::BFLOAT16:
      return DispatchLaunchDequantInt8<paddle::DataType::BFLOAT16>(input,
                                                                   scale);
    case paddle::DataType::FLOAT16:
      return DispatchLaunchDequantInt8<paddle::DataType::FLOAT16>(input,
                                                                  scale);
    case paddle::DataType::FLOAT32:
      return DispatchLaunchDequantInt8<paddle::DataType::FLOAT32>(input,
                                                                  scale);
    default:
      // Unreachable: dtype is validated above. Throwing here keeps every
      // control path returning or exiting, so the non-void function cannot
      // fall off its end.
      PD_THROW("Unsupported data type.");
  }
}

std::vector<paddle::Tensor> DequantInt8(const paddle::Tensor& input,
                                        const paddle::Tensor& out_scale,
                                        std::string dtype) {
  return LaunchDequantInt8(input, out_scale, dtype);
}

std::vector<std::vector<int64_t>> DequantInt8Shape(
    const std::vector<int64_t>& input_shape) {
  return {input_shape};
}

std::vector<paddle::DataType> DequantInt8Dtype(
    const paddle::DataType& input_dtype,
    const paddle::DataType& out_scale_dtype,
    std::string dtype) {
  paddle::DataType data_type;
  if (dtype == "float32")
    data_type = paddle::DataType::FLOAT32;
  else if (dtype == "bfloat16")
    data_type = paddle::DataType::BFLOAT16;
  else if (dtype == "float16")
    data_type = paddle::DataType::FLOAT16;
  else
    PD_THROW(
        "Unsupported data type: only bfloat16, float16 and float32 are "
        "supported.");

  return {data_type};
}

PD_BUILD_OP(dequant_int8)
    .Inputs({"input", "out_scale"})
    .Outputs({"output"})
    .Attrs({"dtype: std::string"})
    .SetKernelFn(PD_KERNEL(DequantInt8))
    .SetInferShapeFn(PD_INFER_SHAPE(DequantInt8Shape))
    .SetInferDtypeFn(PD_INFER_DTYPE(DequantInt8Dtype));
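
For intuition, here is a minimal host-side sketch of the per-channel dequantization the kernel performs: every column j of the row-major [m, n] INT32 tensor is scaled by its own entry of out_scale. The values below are made up for illustration.

#include <cstdint>
#include <cstdio>

int main() {
  constexpr int m = 2, n = 4;  // batch x hidden
  int32_t input[m * n] = {10, -20, 30, -40, 50, -60, 70, -80};
  float scale[n] = {0.1f, 0.2f, 0.3f, 0.4f};  // one scale per column

  float output[m * n];
  for (int idx = 0; idx < m * n; ++idx) {
    int col_id = idx % n;  // same per-channel indexing as DequantKernel
    output[idx] = static_cast<float>(input[idx]) * scale[col_id];
  }
  printf("%.1f %.1f\n", output[0], output[5]);  // prints: 1.0 -12.0
  return 0;
}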
6 changes: 4 additions & 2 deletions csrc/generation/encode_rotary_qk.cu
@@ -105,6 +105,7 @@ void LaunchRotaryQK(const paddle::Tensor& q,
 
   const int32_t batch_size = q.shape()[0];
   const int32_t head_num = q.shape()[1];
+  const int32_t kv_head_num = kv.shape()[1];
   const int32_t seq_len = q.shape()[2];
   const int32_t dim_head = q.shape()[3];
 
@@ -147,15 +148,16 @@ void LaunchRotaryQK(const paddle::Tensor& q,
         head_num,
         seq_len * rotary_emb_dims,
         last_dim);
-    RotaryKernel<<<grid, BlockSize, 0, cu_stream>>>(
+    dim3 grid_k(batch_size, kv_head_num, seq_len * rotary_emb_dims);
+    RotaryKernel<<<grid_k, BlockSize, 0, cu_stream>>>(
         k_data,
         cos_emb,
         sin_emb,
         seq_lens.data<int>() /*sequence_lengths*/,
         k_out_data,
         rotary_emb_dims,
         batch_size,
-        head_num,
+        kv_head_num,
         seq_len * rotary_emb_dims,
         last_dim);
   } else {
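
The separate grid_k launch is what lets the K rotary kernel run over fewer heads than Q when the model uses grouped-query attention. The usual mapping between query heads and shared KV heads looks like the sketch below; this mapping lives in the attention kernels rather than in this diff, so take it as an assumption about the surrounding code.

// Hypothetical GQA head mapping: each group of head_num / kv_head_num query
// heads reads the same KV head, so K/V kernels launch over kv_head_num only.
inline int KVHeadForQueryHead(int q_head, int head_num, int kv_head_num) {
  const int group_size = head_num / kv_head_num;  // assumes exact divisibility
  return q_head / group_size;
}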
95 changes: 95 additions & 0 deletions csrc/generation/get_max_len.cu
@@ -0,0 +1,95 @@
#include <cstdlib>
#include <cstdio>
#include <string>
#include "helper.h"
#include <sys/mman.h>
#include <fstream>

template <int THREADBLOCK_SIZE>
__global__ void GetMaxLenKernel(const int *seq_lens,
                                int *max_len,
                                const int batch_size) {
  const int tid = threadIdx.x;

  typedef cub::BlockReduce<int, THREADBLOCK_SIZE> BlockReduce;
  __shared__ typename BlockReduce::TempStorage temp_storage;

  // Launched with a single block: each thread strides over the batch, then
  // the block-wide maximum is reduced and written out by thread 0.
  int max_len_this_thread = 0;
  for (int i = tid; i < batch_size; i += blockDim.x) {
    max_len_this_thread = max(seq_lens[i], max_len_this_thread);
  }
  int block_max =
      BlockReduce(temp_storage).Reduce(max_len_this_thread, MaxOp<int>());
  if (tid == 0) {
    *max_len = block_max;
  }
}

// void write_mmap(const std::string& name, const std::string& value) {
//   int len = value.length();
//   // Open the file.
//   int fd = open(name.data(), O_RDWR | O_CREAT, 00777);
//   // Move the file pointer forward by len - 1 bytes.
//   lseek(fd, len - 1, SEEK_END);
//   // Write a placeholder byte first: mmap cannot grow a file, so this
//   // pre-sizes it as an empty shell of the right length.
//   write(fd, " ", 1);
//   // Create the mapping.
//   char *buffer = (char *) mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
//   // Close the file descriptor; the mapping stays valid.
//   close(fd);
//   // Copy the value into the mapped buffer.
//   memcpy(buffer, value.data(), len);
//   // Tear down the mapping.
//   munmap(buffer, len);
// }


void GetMaxLen(const paddle::Tensor& seq_lens_encoder,
               const paddle::Tensor& seq_lens_decoder) {
  constexpr int blockSize = 128;
  int batch_size = seq_lens_encoder.shape()[0];
  auto cu_stream = seq_lens_encoder.stream();

  auto max_len_encoder = paddle::empty(
      {1}, paddle::DataType::INT32, seq_lens_encoder.place());
  auto max_len_decoder = paddle::empty(
      {1}, paddle::DataType::INT32, seq_lens_encoder.place());

  GetMaxLenKernel<blockSize><<<1, blockSize, 0, cu_stream>>>(
      seq_lens_encoder.data<int>(), max_len_encoder.data<int>(), batch_size);
  GetMaxLenKernel<blockSize><<<1, blockSize, 0, cu_stream>>>(
      seq_lens_decoder.data<int>(), max_len_decoder.data<int>(), batch_size);

  int max_len_encoder_data =
      max_len_encoder.copy_to(paddle::CPUPlace(), true).data<int>()[0];
  int max_len_decoder_data =
      max_len_decoder.copy_to(paddle::CPUPlace(), true).data<int>()[0];



  std::ofstream outfile;
  outfile.open("max_len.txt", std::ios::out);
  outfile << max_len_encoder_data << "\n" << max_len_decoder_data;
  outfile.close();
}


PD_BUILD_OP(get_max_len)
    .Inputs({"seq_lens_encoder", "seq_lens_decoder"})
    .Outputs({"seq_lens_encoder_out", "seq_lens_decoder_out"})
    .SetInplaceMap({{"seq_lens_encoder", "seq_lens_encoder_out"},
                    {"seq_lens_decoder", "seq_lens_decoder_out"}})
    .SetKernelFn(PD_KERNEL(GetMaxLen));
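
get_max_len hands the two maxima to the host through max_len.txt. Below is a minimal sketch of the reader this implies; the consumer path is an assumption, not part of this commit.

#include <fstream>

// Hypothetical consumer of max_len.txt: the first line holds the encoder
// maximum, the second the decoder maximum. Returns false on a missing or
// malformed file.
bool ReadMaxLens(int* max_enc_len, int* max_dec_len) {
  std::ifstream infile("max_len.txt");
  return static_cast<bool>(infile >> *max_enc_len >> *max_dec_len);
}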
55 changes: 55 additions & 0 deletions csrc/generation/get_output.cc
@@ -0,0 +1,55 @@
#include <stdio.h>
#include <string.h>
#include <sys/ipc.h>
#include <sys/msg.h>
#include <sys/types.h>
#include "paddle/extension.h"

#define MAX_BSZ 512

struct msgdata {
  long mtype;
  int mtext[MAX_BSZ + 2];  // stop_flag, bsz, tokens
};
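
// Message layout: mtext[0] = stop flag, mtext[1] = batch size (bsz),
// mtext[2 .. bsz + 1] = one newly generated token id per sequence.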

void GetOutput(const paddle::Tensor& x,
               int64_t rank_id,
               bool wait_flag) {
  if (rank_id > 0) return;

  static struct msgdata msg_rcv;

  static key_t key = ftok("./", 1);

  static int msgid = msgget(key, IPC_CREAT | 0666);

  int64_t *out_data = const_cast<int64_t*>(x.data<int64_t>());
  int ret = -1;
  if (!wait_flag) {
    // Non-blocking: return immediately if no message is queued.
    ret = msgrcv(msgid, &msg_rcv, (MAX_BSZ + 2) * sizeof(int), 0, IPC_NOWAIT);
  } else {
    ret = msgrcv(msgid, &msg_rcv, (MAX_BSZ + 2) * sizeof(int), 0, 0);
  }
  if (ret == -1) {
    // Nothing was read; signal "no data" to the caller.
    out_data[0] = -2;
    out_data[1] = 0;
    return;
  }

  int bsz = msg_rcv.mtext[1];

  for (int64_t i = 0; i < bsz + 2; i++) {
    out_data[i] = (int64_t)msg_rcv.mtext[i];
  }
}

PD_BUILD_OP(get_output)
    .Inputs({"x"})
    .Attrs({"rank_id: int64_t",
            "wait_flag: bool"})
    .Outputs({"x_out"})
    .SetInplaceMap({{"x", "x_out"}})
    .SetKernelFn(PD_KERNEL(GetOutput));
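
This op is the consumer half of a System V message queue; the producer half is not in this file. A minimal sketch of what a matching sender could look like, reusing the msgdata struct and MAX_BSZ above, and assuming the generation loop emits one token per sequence each step (send_tokens and its callers are hypothetical):

#include <sys/ipc.h>
#include <sys/msg.h>

// Hypothetical producer paired with GetOutput: mtype must be positive, and
// the key derivation has to match the consumer's ftok("./", 1).
static void send_tokens(const int* tokens, int bsz, bool stop) {
  static key_t key = ftok("./", 1);
  static int msgid = msgget(key, IPC_CREAT | 0666);

  struct msgdata msg_snd;
  msg_snd.mtype = 1;
  msg_snd.mtext[0] = stop ? 1 : 0;  // stop flag
  msg_snd.mtext[1] = bsz;           // batch size
  for (int i = 0; i < bsz; ++i) {
    msg_snd.mtext[i + 2] = tokens[i];
  }
  msgsnd(msgid, &msg_snd, (MAX_BSZ + 2) * sizeof(int), 0);
}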