-
Notifications
You must be signed in to change notification settings - Fork 3k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
92134e1
commit c43c61a
Showing
216 changed files
with
15,815 additions
and
2,621 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,8 +1,14 @@ | ||
codecov:
  notify:
    require_ci_to_pass: yes

coverage:
  status:
    project:
      default:
        target: 75% # overall project Coverage < 75% CI will fail
        informational: true
    patch:
      default:
        target: 90% # lines adjusted Coverage < 90% CI will fail
        informational: true
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,166 @@ | ||
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
|
||
#include "helper.h" | ||
#include<stdlib.h> | ||
#include<string.h> | ||
#include<sys/types.h> | ||
#include<sys/stat.h> | ||
#include<unistd.h> | ||
#include<fcntl.h> | ||
#include<sys/mman.h> | ||
#include<stdio.h> | ||
#include<algorithm> | ||
|
||
|
||
constexpr int DequantKernelVecSize = 4; | ||
|
||
template <typename data_t> | ||
inline HOSTDEVICE data_t roundWithTiesToEven(data_t x) { | ||
data_t xLower = floor(x); | ||
data_t xUpper = ceil(x); | ||
// x is in interval [xl,xu]. Choose closest of two bounds, breaking ties to | ||
// even. | ||
data_t dLower = x - xLower; | ||
data_t dUpper = xUpper - x; | ||
return static_cast<data_t>( | ||
(dLower == dUpper ? fmod(xLower, 2.0F) == 0.0F : dLower < dUpper) | ||
? xLower | ||
: xUpper); | ||
} | ||
|
||
template <typename data_t, int VecSize> | ||
__global__ void DequantKernel(data_t* output, | ||
const int32_t* input, | ||
const int m, // batch size | ||
const int n, // hidden | ||
const float* dequant_out_scale_data) { | ||
int numel = m * n; | ||
int stride = blockDim.x * gridDim.x * VecSize; | ||
int idx = (blockIdx.x * blockDim.x + threadIdx.x) * VecSize; | ||
int col_id = idx % n; | ||
|
||
AlignedVector<int32_t, VecSize> in_vec; | ||
AlignedVector<float, VecSize> out_scale_vec; | ||
AlignedVector<data_t, VecSize> out_vec; | ||
|
||
for (; idx < numel; idx += stride) { | ||
Load<int32_t, VecSize>(input + idx, &in_vec); | ||
Load<float, VecSize>(dequant_out_scale_data + col_id, &out_scale_vec); | ||
|
||
#pragma unroll | ||
for (int i = 0; i < VecSize; ++i) { | ||
out_vec[i] = | ||
static_cast<data_t>(static_cast<float>(in_vec[i]) * out_scale_vec[i]); | ||
} | ||
|
||
Store<data_t, VecSize>(out_vec, output + idx); | ||
} | ||
} | ||
|
||
template <paddle::DataType D> | ||
std::vector<paddle::Tensor> DispatchLaunchDequantInt8(const paddle::Tensor& input, | ||
const paddle::Tensor& scale) { | ||
typedef PDTraits<D> traits_; | ||
typedef typename traits_::DataType DataType_; | ||
typedef typename traits_::data_t data_t; | ||
|
||
std::vector<int64_t> input_shape = input.shape(); | ||
|
||
auto output=paddle::full(input_shape, 0, D, input.place()); | ||
int64_t m = input_shape[0]; | ||
int64_t n = input_shape[1]; | ||
|
||
int64_t numel = m*n; | ||
constexpr int64_t thread_per_block = 512; | ||
int64_t block_per_grid = (numel / DequantKernelVecSize + thread_per_block - 1) / thread_per_block; | ||
auto stream = input.stream(); | ||
|
||
DequantKernel<DataType_, DequantKernelVecSize> | ||
<<<block_per_grid, thread_per_block, 0, stream>>>( | ||
reinterpret_cast<DataType_*>(output.data<data_t>()), | ||
reinterpret_cast<const int32_t*>(input.data<int32_t>()), m, n, | ||
reinterpret_cast<const float*>(scale.data<float>())); | ||
|
||
|
||
return {output}; | ||
|
||
} | ||
|
||
|
||
// Map the `dtype` attribute string to the templated launcher and run it.
// Dispatching directly from the string comparison guarantees every control
// path either returns or throws; the previous version parsed the string,
// then re-dispatched through a switch whose empty `default:` fell off the
// end of a non-void function — undefined behavior.
std::vector<paddle::Tensor> LaunchDequantInt8(const paddle::Tensor& input,
                                              const paddle::Tensor& scale,
                                              std::string dtype) {
  if (dtype == "float32") {
    return DispatchLaunchDequantInt8<paddle::DataType::FLOAT32>(input, scale);
  } else if (dtype == "bfloat16") {
    return DispatchLaunchDequantInt8<paddle::DataType::BFLOAT16>(input, scale);
  } else if (dtype == "float16") {
    return DispatchLaunchDequantInt8<paddle::DataType::FLOAT16>(input, scale);
  }
  PD_THROW(
      "NOT supported data type. "
      "Only bfloat16, float16 and float32 are supported. ");
}
|
||
// Public kernel entry point for the `dequant_int8` op: a thin pass-through
// to the dtype-dispatching launcher.
std::vector<paddle::Tensor> DequantInt8(const paddle::Tensor& input,
                                        const paddle::Tensor& out_scale,
                                        std::string dtype) {
  return LaunchDequantInt8(input, out_scale, dtype);
}
|
||
// Shape inference for `dequant_int8`: dequantization is elementwise, so the
// output shape mirrors the input shape unchanged.
std::vector<std::vector<int64_t>> DequantInt8Shape(const std::vector<int64_t>& input_shape) {
  std::vector<std::vector<int64_t>> out_shapes;
  out_shapes.push_back(input_shape);
  return out_shapes;
}
|
||
// Dtype inference for `dequant_int8`: the output dtype comes solely from the
// `dtype` attribute; the input/scale tensor dtypes do not participate.
std::vector<paddle::DataType> DequantInt8Dtype(const paddle::DataType& input_dtype, const paddle::DataType& out_scale_dtype, std::string dtype) {
  if (dtype == "float32") {
    return {paddle::DataType::FLOAT32};
  }
  if (dtype == "bfloat16") {
    return {paddle::DataType::BFLOAT16};
  }
  if (dtype == "float16") {
    return {paddle::DataType::FLOAT16};
  }
  PD_THROW(
      "NOT supported data type. "
      "Only bfloat16, float16 and float32 are supported. ");
}
|
||
// Register the `dequant_int8` custom op with Paddle.
// NOTE(review): the first input is registered as "intput" — almost certainly
// a typo for "input". Renaming it would break any existing caller that
// addresses the op input by name, so it is flagged here rather than fixed.
PD_BUILD_OP(dequant_int8)
    .Inputs({"intput","out_scale"})
    .Outputs({"output"})
    .Attrs({"dtype: std::string"})
    .SetKernelFn(PD_KERNEL(DequantInt8))
    .SetInferShapeFn(PD_INFER_SHAPE(DequantInt8Shape))
    .SetInferDtypeFn(PD_INFER_DTYPE(DequantInt8Dtype));
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,95 @@ | ||
#include <cstdlib> | ||
#include <cstdio> | ||
#include <string> | ||
#include "helper.h" | ||
#include <sys/mman.h> | ||
#include <fstream> | ||
|
||
template <int THREADBLOCK_SIZE> | ||
__global__ void GetMaxLenKernel(const int *seq_lens, | ||
int *max_len, | ||
const int batch_size) { | ||
const int tid = threadIdx.x; | ||
|
||
typedef cub::BlockReduce<int, THREADBLOCK_SIZE> BlockReduce; | ||
__shared__ typename BlockReduce::TempStorage temp_storage; | ||
|
||
int max_len_this_thread = 0; | ||
for (int i = tid; i < batch_size; i += blockDim.x) { | ||
max_len_this_thread = max(seq_lens[i], max_len_this_thread); | ||
} | ||
int total = | ||
BlockReduce(temp_storage).Reduce(max_len_this_thread, MaxOp<int>()); | ||
if (tid == 0) { | ||
*max_len = total; | ||
} | ||
} | ||
|
||
// void write_mmap(const std::string& name, const std::string& value) {
//   int len = value.length();
//   // Open (or create) the file.
//   int fd = open(name.data(), O_RDWR | O_CREAT, 00777);
//   // Seek len - 1 bytes past the current end of the file.
//   lseek(fd, len - 1, SEEK_END);
//   // Write one placeholder byte: mmap cannot grow a file, so this
//   // pre-extends it to the required length first.
//   write(fd, " ", 1);
//   // Establish the mapping.
//   char *buffer = (char *) mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
//   // Close the file descriptor.
//   close(fd);
//   // Copy the data into the mapped buffer.
//   memcpy(buffer, data, len);
//   // Tear down the mapping.
//   munmap(buffer, len)
// }
|
||
|
||
// Compute max(seq_lens_encoder) and max(seq_lens_decoder) on device and
// persist both values to "max_len.txt" (encoder max on the first line,
// decoder max on the second). Neither input tensor is modified.
// NOTE(review): a relative path depends on the process working directory —
// confirm all readers of "max_len.txt" resolve it the same way.
void GetMaxLen(const paddle::Tensor& seq_lens_encoder, const paddle::Tensor& seq_lens_decoder) {
  constexpr int blockSize = 128;
  const int batch_size = seq_lens_encoder.shape()[0];
  auto cu_stream = seq_lens_encoder.stream();

  auto max_len_encoder = paddle::empty({1}, paddle::DataType::INT32, seq_lens_encoder.place());
  auto max_len_decoder = paddle::empty({1}, paddle::DataType::INT32, seq_lens_encoder.place());

  // One single-block reduction per tensor, both on the same stream.
  GetMaxLenKernel<blockSize><<<1, blockSize, 0, cu_stream>>>(
      seq_lens_encoder.data<int>(), max_len_encoder.data<int>(), batch_size);
  GetMaxLenKernel<blockSize><<<1, blockSize, 0, cu_stream>>>(
      seq_lens_decoder.data<int>(), max_len_decoder.data<int>(), batch_size);

  // Blocking copies: the device reductions complete before the reads below.
  const int max_len_encoder_data =
      max_len_encoder.copy_to(paddle::CPUPlace(), true).data<int>()[0];
  const int max_len_decoder_data =
      max_len_decoder.copy_to(paddle::CPUPlace(), true).data<int>()[0];

  std::ofstream outfile("max_len.txt", std::ios::out);
  if (!outfile.is_open()) {
    // Previously a failed open was silently ignored and the results were
    // lost; fail loudly instead.
    PD_THROW("get_max_len: failed to open max_len.txt for writing.");
  }
  outfile << max_len_encoder_data << "\n" << max_len_decoder_data;
  outfile.close();
}
|
||
|
||
// Register `get_max_len`: computes the maxima of the two sequence-length
// tensors and writes them to "max_len.txt". Inputs are mapped in-place to
// the outputs; GetMaxLen does not write to either input tensor.
PD_BUILD_OP(get_max_len)
    .Inputs({"seq_lens_encoder", "seq_lens_decoder"})
    .Outputs({"seq_lens_encoder_out", "seq_lens_decoder_out"})
    .SetInplaceMap({{"seq_lens_encoder", "seq_lens_encoder_out"}, {"seq_lens_decoder", "seq_lens_decoder_out"}})
    .SetKernelFn(PD_KERNEL(GetMaxLen));
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
#include <stdio.h> | ||
#include <string.h> | ||
#include <sys/ipc.h> | ||
#include <sys/msg.h> | ||
#include <sys/types.h> | ||
#include "paddle/extension.h" | ||
|
||
// Maximum batch size carried in one queue message.
#define MAX_BSZ 512

// Payload layout exchanged over the System V message queue.
struct msgdata {
    long mtype;              // SysV message type; must be the first field
    int mtext[MAX_BSZ + 2];  // stop_flag, bsz, tokens
};
|
||
// Receive one message from the System V token queue and copy it into `x`
// (an in-place int64 output tensor). After return:
//   x[0] = stop_flag (-2 when no message was available)
//   x[1] = bsz
//   x[2..bsz+1] = tokens
// Only rank 0 reads the queue; all other ranks return immediately.
// `wait_flag` selects a blocking vs. non-blocking receive.
void GetOutput(const paddle::Tensor& x,
               int64_t rank_id,
               bool wait_flag) {
  if (rank_id > 0) return;

  static struct msgdata msg_rcv;
  // Queue identity derives from the current directory; created on first use.
  static key_t key = ftok("./", 1);
  static int msgid = msgget(key, IPC_CREAT | 0666);

  int64_t* out_data = const_cast<int64_t*>(x.data<int64_t>());

  // BUGFIX: the payload size previously hard-coded `4` for sizeof(int);
  // spell it with sizeof so the byte count is correct on any platform.
  constexpr size_t payload_size = (MAX_BSZ + 2) * sizeof(int);
  const int flags = wait_flag ? 0 : IPC_NOWAIT;
  const ssize_t ret = msgrcv(msgid, &msg_rcv, payload_size, 0, flags);
  if (ret == -1) {
    // Nothing read (queue empty in non-blocking mode, or receive failed).
    out_data[0] = -2;
    out_data[1] = 0;
    return;
  }

  const int bsz = msg_rcv.mtext[1];
  // Copy stop_flag, bsz and the bsz token ids, widening int -> int64.
  for (int64_t i = 0; i < bsz + 2; i++) {
    out_data[i] = (int64_t)msg_rcv.mtext[i];
  }
}
|
||
// Register `get_output`: receives one message from the token queue into `x`
// (mapped in-place to `x_out`). `rank_id` > 0 is a no-op; `wait_flag`
// selects blocking vs. non-blocking receive.
PD_BUILD_OP(get_output)
    .Inputs({"x"})
    .Attrs({"rank_id: int64_t",
            "wait_flag: bool"})
    .Outputs({"x_out"})
    .SetInplaceMap({{"x", "x_out"}})
    .SetKernelFn(PD_KERNEL(GetOutput));
Oops, something went wrong.