Skip to content

Commit 7ebb24d

Browse files
committed
Merge branch 'develop' into fix43
2 parents eb8564d + 12326b6 commit 7ebb24d

File tree

59 files changed

+2040
-342
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

59 files changed

+2040
-342
lines changed

README.md

Lines changed: 4 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -57,8 +57,9 @@ FastDeploy supports inference deployment on **NVIDIA GPUs**, **Kunlunxin XPUs**,
5757
- [Iluvatar GPU](./docs/get_started/installation/iluvatar_gpu.md)
5858
- [Enflame GCU](./docs/get_started/installation/Enflame_gcu.md)
5959
- [Hygon DCU](./docs/get_started/installation/hygon_dcu.md)
60+
- [MetaX GPU](./docs/get_started/installation/metax_gpu.md.md)
6061

61-
**Note:** We are actively working on expanding hardware support. Additional hardware platforms including Ascend NPU and MetaX GPU are currently under development and testing. Stay tuned for updates!
62+
**Note:** We are actively working on expanding hardware support. Additional hardware platforms including Ascend NPU are currently under development and testing. Stay tuned for updates!
6263

6364
## Get Started
6465

@@ -68,20 +69,12 @@ Learn how to use FastDeploy through our documentation:
6869
- [ERNIE-4.5-VL Multimodal Model Deployment](./docs/get_started/ernie-4.5-vl.md)
6970
- [Offline Inference Development](./docs/offline_inference.md)
7071
- [Online Service Deployment](./docs/online_serving/README.md)
71-
- [Full Supported Models List](./docs/supported_models.md)
7272
- [Best Practices](./docs/best_practices/README.md)
7373

7474
## Supported Models
7575

76-
| Model | Data Type | PD Disaggregation | Chunked Prefill | Prefix Caching | MTP | CUDA Graph | Maximum Context Length |
77-
|:--- | :------- | :---------- | :-------- | :-------- | :----- | :----- | :----- |
78-
|ERNIE-4.5-300B-A47B | BF16/WINT4/WINT8/W4A8C8/WINT2/FP8 ||||||128K |
79-
|ERNIE-4.5-300B-A47B-Base| BF16/WINT4/WINT8 |||||| 128K |
80-
|ERNIE-4.5-VL-424B-A47B | BF16/WINT4/WINT8 | WIP || WIP || WIP |128K |
81-
|ERNIE-4.5-VL-28B-A3B | BF16/WINT4/WINT8 ||| WIP || WIP |128K |
82-
|ERNIE-4.5-21B-A3B | BF16/WINT4/WINT8/FP8 ||||||128K |
83-
|ERNIE-4.5-21B-A3B-Base | BF16/WINT4/WINT8/FP8 ||||||128K |
84-
|ERNIE-4.5-0.3B | BF16/WINT8/FP8 |||||| 128K |
76+
Learn how to download models, enable using the torch format, and more:
77+
- [Full Supported Models List](./docs/supported_models.md)
8578

8679
## Advanced Usage
8780

README_CN.md

Lines changed: 4 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -55,8 +55,9 @@ FastDeploy 支持在**英伟达(NVIDIA)GPU**、**昆仑芯(Kunlunxin)XPU
5555
- [天数 CoreX](./docs/zh/get_started/installation/iluvatar_gpu.md)
5656
- [燧原 S60](./docs/zh/get_started/installation/Enflame_gcu.md)
5757
- [海光 DCU](./docs/zh/get_started/installation/hygon_dcu.md)
58+
- [沐曦 GPU](./docs/zh/get_started/installation/metax_gpu.md.md)
5859

59-
**注意:** 我们正在积极拓展硬件支持范围。目前,包括昇腾(Ascend)NPU 和 沐曦(MetaX)GPU 在内的其他硬件平台正在开发测试中。敬请关注更新!
60+
**注意:** 我们正在积极拓展硬件支持范围。目前,包括昇腾(Ascend)NPU 等其他硬件平台正在开发测试中。敬请关注更新!
6061

6162
## 入门指南
6263

@@ -66,20 +67,12 @@ FastDeploy 支持在**英伟达(NVIDIA)GPU**、**昆仑芯(Kunlunxin)XPU
6667
- [ERNIE-4.5-VL 部署](./docs/zh/get_started/ernie-4.5-vl.md)
6768
- [离线推理](./docs/zh/offline_inference.md)
6869
- [在线服务](./docs/zh/online_serving/README.md)
69-
- [模型支持列表](./docs/zh/supported_models.md)
7070
- [最佳实践](./docs/zh/best_practices/README.md)
7171

7272
## 支持模型列表
7373

74-
| Model | Data Type | PD Disaggregation | Chunked Prefill | Prefix Caching | MTP | CUDA Graph | Maximum Context Length |
75-
|:--- | :------- | :---------- | :-------- | :-------- | :----- | :----- | :----- |
76-
|ERNIE-4.5-300B-A47B | BF16/WINT4/WINT8/W4A8C8/WINT2/FP8 ||||||128K |
77-
|ERNIE-4.5-300B-A47B-Base| BF16/WINT4/WINT8 |||||| 128K |
78-
|ERNIE-4.5-VL-424B-A47B | BF16/WINT4/WINT8 | WIP || WIP || WIP |128K |
79-
|ERNIE-4.5-VL-28B-A3B | BF16/WINT4/WINT8 ||| WIP || WIP |128K |
80-
|ERNIE-4.5-21B-A3B | BF16/WINT4/WINT8/FP8 ||||||128K |
81-
|ERNIE-4.5-21B-A3B-Base | BF16/WINT4/WINT8/FP8 ||||||128K |
82-
|ERNIE-4.5-0.3B | BF16/WINT8/FP8 |||||| 128K |
74+
通过我们的文档了解如何下载模型,如何支持torch格式等:
75+
- [模型支持列表](./docs/zh/supported_models.md)
8376

8477
## 进阶用法
8578

custom_ops/cpu_ops/set_value_by_flags.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414

1515
#include "paddle/extension.h"
1616

17-
void set_value_by_flag_and_id(const bool *stop_flags,
17+
void set_value_by_flags_and_idx(const bool *stop_flags,
1818
int64_t *pre_ids_all,
1919
const int64_t *input_ids,
2020
const int *seq_lens_encoder,
@@ -50,7 +50,7 @@ void SetValueByFlagsAndIdx(const paddle::Tensor &pre_ids_all,
5050
int length = pre_ids_all_shape[1];
5151
int length_input_ids = input_ids.shape()[1];
5252

53-
set_value_by_flag_and_id(stop_flags.data<bool>(),
53+
set_value_by_flags_and_idx(stop_flags.data<bool>(),
5454
const_cast<int64_t *>(pre_ids_all.data<int64_t>()),
5555
input_ids.data<int64_t>(),
5656
seq_lens_encoder.data<int>(),

custom_ops/cpu_ops/update_inputs.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ void update_inputs_kernel(bool *not_need_stop,
4646
not_need_stop[0] = stop_sum < stop_nums[0];
4747
}
4848

49-
void UpdateInputes(const paddle::Tensor &stop_flags,
49+
void UpdateInputs(const paddle::Tensor &stop_flags,
5050
const paddle::Tensor &not_need_stop,
5151
const paddle::Tensor &seq_lens_this_time,
5252
const paddle::Tensor &seq_lens_encoder,
@@ -90,4 +90,4 @@ PD_BUILD_STATIC_OP(update_inputs_cpu)
9090
{"seq_lens_encoder", "seq_lens_encoder_out"},
9191
{"seq_lens_decoder", "seq_lens_decoder_out"},
9292
{"input_ids", "input_ids_out"}})
93-
.SetKernelFn(PD_KERNEL(UpdateInputes));
93+
.SetKernelFn(PD_KERNEL(UpdateInputs));

custom_ops/gpu_ops/append_attn/decoder_write_cache_with_rope_impl.cuh

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -684,7 +684,6 @@ __global__ void append_decode_cache_int8_rope_qk_norm_kernel(
684684
// block_size, head_size // 2]
685685
T* __restrict__ qkv_out,
686686
const int* __restrict__ block_tables, // [bsz, max_blocks_per_seq]
687-
const int* __restrict__ batch_id_per_token, // [num_tokens]
688687
const int* __restrict__ cu_seqlens_q,
689688
const int* __restrict__ seq_lens, // [bsz]
690689
const int* __restrict__ seq_lens_encoder, // [bsz]

custom_ops/gpu_ops/append_attn/decoder_write_cache_with_rope_kernel.cu

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -565,7 +565,6 @@ void DecoderWriteCacheWithRoPEKernel(
565565
value_cache_out->data<uint8_t>(),
566566
reinterpret_cast<DataType_*>(qkv_out->data<T>()),
567567
block_tables.data<int>(),
568-
batch_id_per_token.data<int>(),
569568
cu_seqlens_q.data<int>(),
570569
seq_lens.data<int>(),
571570
seq_lens_encoder.data<int>(),
@@ -729,7 +728,6 @@ void DecoderWriteCacheWithRoPEKernel(
729728
value_cache_out->data<uint8_t>(),
730729
reinterpret_cast<DataType_*>(qkv_out->data<T>()),
731730
block_tables.data<int>(),
732-
batch_id_per_token.data<int>(),
733731
cu_seqlens_q.data<int>(),
734732
seq_lens.data<int>(),
735733
seq_lens_encoder.data<int>(),

custom_ops/gpu_ops/append_attn/encoder_write_cache_with_rope_impl.cuh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -449,8 +449,8 @@ __global__ void GQAVariableLengthRotaryQKNormKernel(
449449
const int half_lastdim = last_dim / 2;
450450
const int offset = (q_num_head + kv_num_head) * last_dim;
451451
const int all_head_num = elem_cnt / last_dim;
452-
for (int gloabl_hi = global_warp_idx; gloabl_hi < all_head_num; gloabl_hi += all_warp_num) {
453-
int64_t linear_index = gloabl_hi * last_dim + threadIdx.x * VecSize;
452+
for (int global_hi = global_warp_idx; global_hi < all_head_num; global_hi += all_warp_num) {
453+
int64_t linear_index = global_hi * last_dim + threadIdx.x * VecSize;
454454
const int token_idx = linear_index / offset;
455455
const int ori_bi = batch_id_per_token[token_idx];
456456
if (seq_lens[ori_bi] == 0) continue;

custom_ops/gpu_ops/append_attn/gqa_rope_write_cache.cu

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -217,7 +217,7 @@ __global__ void append_cache_kv_c16(
217217

218218
// load k_smem 64 rows 128 cols
219219
for (int fz = 0; fz < 4; fz++) { // 4 rows pre warp once, 16 rows all 4 warps once, need 4 iter
220-
for (int fy = 0; fy < 2; fy++) { // 8 * 128b = 64 * bf16 noce, need 2 iter
220+
for (int fy = 0; fy < 2; fy++) { // 8 * 128b = 64 * bf16 once, need 2 iter
221221
k_smem.load_128b_async<SharedMemFillMode::kNoFill>(
222222
k_smem_offset_w, cur_cache_k + k_read_idx, end_idx > 0);
223223
k_smem_offset_w =
@@ -235,7 +235,7 @@ __global__ void append_cache_kv_c16(
235235
// deal k_smem 64 rows 128 cols
236236
for (int fz = 0; fz < 1; fz++) { // 16 rows pre warp once, 64 rows all 4 warps once, need 1 iter
237237
uint32_t row_idx = wid * 16 + tid / 4;
238-
for (int fy = 0; fy < 8; fy++) { // 2 * 128b = 16 * bf16 noce, need 8 iter
238+
for (int fy = 0; fy < 8; fy++) { // 2 * 128b = 16 * bf16 once, need 8 iter
239239
uint32_t col_idx = fy * 16 + tid % 4 * 2;
240240
k_smem.ldmatrix_m8n8x4(k_smem_offset_r, kv_frag);
241241
// layout
@@ -278,7 +278,7 @@ __global__ void append_cache_kv_c16(
278278

279279
// load v_smem 64 rows 128 cols
280280
for (int fz = 0; fz < 4; fz++) { // // 4 rows pre warp once, 16 rows all 4 warps once, need 4 iter
281-
for (int fy = 0; fy < 2; fy++) { // 8 * 128b = 64 * bf16 noce, need 2 iter
281+
for (int fy = 0; fy < 2; fy++) { // 8 * 128b = 64 * bf16 once, need 2 iter
282282
v_smem.load_128b_async<SharedMemFillMode::kNoFill>(
283283
v_smem_offset_w, cur_cache_v + v_read_idx, end_idx > 0);
284284
v_smem_offset_w =
@@ -296,7 +296,7 @@ __global__ void append_cache_kv_c16(
296296
// deal v_smem 64 rows 128 cols
297297
for (int fz = 0; fz < 1; fz++) { // 16 rows pre warp once, 64 rows all 4 warps once, need 1 iter
298298
uint32_t row_idx = wid * 16 + tid / 4;
299-
for (int fy = 0; fy < 8; fy++) { // 2 * 128b = 16 * bf16 noce, need 8 iter
299+
for (int fy = 0; fy < 8; fy++) { // 2 * 128b = 16 * bf16 once, need 8 iter
300300
uint32_t col_idx = fy * 16 + tid % 4 * 2;
301301
v_smem.ldmatrix_m8n8x4(v_smem_offset_r, kv_frag);
302302
// layout
@@ -400,7 +400,7 @@ __global__ void append_cache_kv_c8(
400400

401401
// load v_smem 64 rows, 128 cols
402402
for (int fz = 0; fz < 4; fz++) { // 4 rows pre warp once, 16 rows all 4 warps once, need 4 iter
403-
for (int fy = 0; fy < 1; fy++) { // 8 * 128b = 128 * uint8 noce, need 1 iter
403+
for (int fy = 0; fy < 1; fy++) { // 8 * 128b = 128 * uint8 once, need 1 iter
404404
k_smem.load_128b_async<SharedMemFillMode::kNoFill>(
405405
k_smem_offset_w, cur_cache_k + k_read_idx, end_idx > 0);
406406
k_smem_offset_w =
@@ -418,7 +418,7 @@ __global__ void append_cache_kv_c8(
418418
// deal k_smem 64 rows, 128 cols
419419
for (int fz = 0; fz < 1; fz++) { // 16 rows pre warp once, 64 rows all 4 warps once, need 1 iter
420420
uint32_t row_idx = wid * 16 + tid / 4;
421-
for (int fy = 0; fy < 4; fy++) { // 2 * 128b = 32 * uint8 noce, need 4 iter
421+
for (int fy = 0; fy < 4; fy++) { // 2 * 128b = 32 * uint8 once, need 4 iter
422422
uint32_t col_idx = fy * 32 + tid % 4 * 2;
423423
k_smem.ldmatrix_m8n8x4(k_smem_offset_r, k_frag);
424424
// layout
@@ -466,7 +466,7 @@ __global__ void append_cache_kv_c8(
466466
tid % 4 * num_elems_per_128b<CacheT>();
467467
// load v_smem 128 rows 64 cols
468468
for (int fy = 0; fy < 4; fy++) { // 8 rows pre warp once, 32 rows all 4 warps once, need 4 iter
469-
for (int fz = 0; fz < 1; fz++) { // 4 * 128b = 64 * uint8 noce, need 1 iter
469+
for (int fz = 0; fz < 1; fz++) { // 4 * 128b = 64 * uint8 once, need 1 iter
470470
v_smem.load_128b_async<SharedMemFillMode::kNoFill>(
471471
v_smem_offset_w, cur_cache_v + v_read_idx, end_idx > 0);
472472
v_smem_offset_w =
@@ -485,7 +485,7 @@ __global__ void append_cache_kv_c8(
485485
// deal v_smem 128 rows 64 cols
486486
for (int fy = 0; fy < 2; fy++) { // 16 rows pre warp once, 64 rows all 4 warps once, need 2 iter
487487
uint32_t dim_idx = fy * NUM_WARPS * 16 + wid * 16 + tid / 4;
488-
for (int fz = 0; fz < 2; fz++) { // 2 * 128b = 32 * uint8 noce, need 2 iter
488+
for (int fz = 0; fz < 2; fz++) { // 2 * 128b = 32 * uint8 once, need 2 iter
489489
uint32_t kv_idx = fz * 32 + tid % 4 * 2;
490490
v_smem.ldmatrix_m8n8x4(v_smem_offset_r, v_frag);
491491
// layout
@@ -614,7 +614,7 @@ __global__ void append_cache_kv_c4(
614614

615615
// load k_smem 64 rows 128 cols
616616
for (int fz = 0; fz < 2; fz++) { // 4 rows pre warp once, 16 rows all 4 warps once, need 4 iter
617-
for (int fy = 0; fy < 1; fy++) { // 4 * 128b = 128 * int4 noce, need 1 iter
617+
for (int fy = 0; fy < 1; fy++) { // 4 * 128b = 128 * int4 once, need 1 iter
618618
k_smem.load_128b_async<SharedMemFillMode::kNoFill>(
619619
k_smem_offset_w, cur_cache_k + k_read_idx, end_idx > 0);
620620
k_smem_offset_w =
@@ -632,7 +632,7 @@ __global__ void append_cache_kv_c4(
632632
// deal k_smem 64 rows 128 cols
633633
for (int fz = 0; fz < 1; fz++) { // 16 rows pre warp once, 64 rows all 4 warps once, need 1 iter
634634
uint32_t row_idx = wid * 16 + tid / 4;
635-
for (int fy = 0; fy < 2; fy++) { // 2 * 128b = 64 * int4 noce, need 2 iter
635+
for (int fy = 0; fy < 2; fy++) { // 2 * 128b = 64 * int4 once, need 2 iter
636636
uint32_t col_idx = fy * 64 + tid % 4 * 2;
637637
k_smem.ldmatrix_m8n8x4(k_smem_offset_r, k_frag);
638638

@@ -685,7 +685,7 @@ __global__ void append_cache_kv_c4(
685685
tid % 2 * num_elems_per_128b<CacheT>();
686686
// load v_smem 128 rows 64 rows
687687
for (int fy = 0; fy < 2; fy++) { // 16 rows pre warp once, 64 rows all 4 warps once, need 2 iter
688-
for (int fz = 0; fz < 1; fz++) { // 2 * 128b = 64 * int4 noce, need 1 iter
688+
for (int fz = 0; fz < 1; fz++) { // 2 * 128b = 64 * int4 once, need 1 iter
689689
v_smem.load_128b_async<SharedMemFillMode::kNoFill>(
690690
v_smem_offset_w, cur_cache_v + v_read_idx, end_idx > 0);
691691
v_smem_offset_w =
@@ -704,7 +704,7 @@ __global__ void append_cache_kv_c4(
704704
// deal v_smem 128 rows 64 cols
705705
for (int fy = 0; fy < 2; fy++) { // 16 rows pre warp once, 64 rows all 4 warps once, need 2 iter
706706
uint32_t dim_idx = fy * NUM_WARPS * 16 + wid * 16 + tid / 4;
707-
for (int fz = 0; fz < 1; fz++) { // 2 * 128b = 64 * int4 noce, need 1 iter
707+
for (int fz = 0; fz < 1; fz++) { // 2 * 128b = 64 * int4 once, need 1 iter
708708
uint32_t kv_idx = fz * 64 + tid % 4 * 2;
709709
v_smem.ldmatrix_m8n8x4(v_smem_offset_r, v_frag);
710710
// layout

custom_ops/gpu_ops/moba_attn/moba_decoder_attn/moba_decoder_attn.cu

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -383,7 +383,7 @@ __global__ __launch_bounds__(Kernel_traits::kNThreads) void moba_decoder_attenti
383383

384384

385385
template<typename Kernel_traits, typename ParamType>
386-
inline __device__ float caluate_logit_scale(const int partition_num, const int pack_max_partition_num, ParamType &params, char * shared_mem, const int seq_len, const int *qk_gate_topk_idx_ptr) {
386+
inline __device__ float calculate_logit_scale(const int partition_num, const int pack_max_partition_num, ParamType &params, char * shared_mem, const int seq_len, const int *qk_gate_topk_idx_ptr) {
387387
constexpr int32_t kNFloatPacksize = 16 / sizeof(float);
388388
constexpr int32_t kNReduceThreads = Kernel_traits::kNReduceThreads;
389389
const int32_t bi = blockIdx.z;
@@ -524,7 +524,7 @@ __global__ void __launch_bounds__(Kernel_traits::kNReduceThreads) moba_decoder_a
524524
const int kv_head_idx = head_idx / Kernel_traits::kGqaGroupSize;
525525
const int * qk_gate_topk_idx_ptr = params.qk_gate_topk_idx_ptr + (bi * params.kv_head_num + kv_head_idx) * Kernel_traits::kMaxN;
526526

527-
float inv_global_exp_sum = caluate_logit_scale<Kernel_traits>(partition_num, pack_max_partition_num, params, shared_mem, seq_len, qk_gate_topk_idx_ptr);
527+
float inv_global_exp_sum = calculate_logit_scale<Kernel_traits>(partition_num, pack_max_partition_num, params, shared_mem, seq_len, qk_gate_topk_idx_ptr);
528528

529529

530530
using T_vec = Vec<cuteType, kNReducePacksize>;

custom_ops/gpu_ops/moba_attn/moba_encoder_attn/moba_encoder_write_cache.cu

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ __global__ void write_encoder_cachekv_c16(
4040

4141
if (seq_len == 0) return;
4242

43-
const int ramian_tokens = seq_len - block_idx;
43+
const int remain_tokens = seq_len - block_idx;
4444

4545
const int32_t *block_table_now = block_tables + bidb * max_blocks_per_seq;
4646
const uint32_t physical_block_number = block_table_now[blockIdx.x + seq_len_decoder[bidb] / kBlockSize];
@@ -51,7 +51,7 @@ __global__ void write_encoder_cachekv_c16(
5151

5252
#pragma unroll
5353
for (int i = row_idx; i < kBlockSize; i += 128 / (kHeadDim / kPackSize)) {
54-
if (i < ramian_tokens) {
54+
if (i < remain_tokens) {
5555
*reinterpret_cast<float4*>(cache + i * kHeadDim) = *reinterpret_cast<const float4*>(k_input + base_load_idx + i * kv_head_num * kHeadDim);
5656
}
5757
}
@@ -62,7 +62,7 @@ __global__ void write_encoder_cachekv_c16(
6262

6363
#pragma unroll
6464
for (int i = row_idx; i < kBlockSize; i += 128 / (kHeadDim / kPackSize)) {
65-
if (i < ramian_tokens) {
65+
if (i < remain_tokens) {
6666
*reinterpret_cast<float4*>(cache + i * kHeadDim) = *reinterpret_cast<const float4*>(v_input + base_load_idx + i * kv_head_num * kHeadDim);
6767
}
6868
}

0 commit comments

Comments
 (0)