PaddlePaddle
diff --git a/‎README.md‎
Lines changed: 4 additions & 11 deletions b/‎README.md‎
Lines changed: 4 additions & 11 deletions
diff --git a/‎README_CN.md‎
Lines changed: 4 additions & 11 deletions b/‎README_CN.md‎
Lines changed: 4 additions & 11 deletions
diff --git a/‎custom_ops/cpu_ops/set_value_by_flags.cc‎
Lines changed: 2 additions & 2 deletions b/‎custom_ops/cpu_ops/set_value_by_flags.cc‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎custom_ops/cpu_ops/update_inputs.cc‎
Lines changed: 2 additions & 2 deletions b/‎custom_ops/cpu_ops/update_inputs.cc‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎custom_ops/gpu_ops/append_attn/decoder_write_cache_with_rope_impl.cuh‎
Lines changed: 0 additions & 1 deletion b/‎custom_ops/gpu_ops/append_attn/decoder_write_cache_with_rope_impl.cuh‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎custom_ops/gpu_ops/append_attn/decoder_write_cache_with_rope_kernel.cu‎
Lines changed: 0 additions & 2 deletions b/‎custom_ops/gpu_ops/append_attn/decoder_write_cache_with_rope_kernel.cu‎
Lines changed: 0 additions & 2 deletions
diff --git a/‎custom_ops/gpu_ops/append_attn/encoder_write_cache_with_rope_impl.cuh‎
Lines changed: 2 additions & 2 deletions b/‎custom_ops/gpu_ops/append_attn/encoder_write_cache_with_rope_impl.cuh‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎custom_ops/gpu_ops/append_attn/gqa_rope_write_cache.cu‎
Lines changed: 12 additions & 12 deletions b/‎custom_ops/gpu_ops/append_attn/gqa_rope_write_cache.cu‎
Lines changed: 12 additions & 12 deletions
diff --git a/‎custom_ops/gpu_ops/moba_attn/moba_decoder_attn/moba_decoder_attn.cu‎
Lines changed: 2 additions & 2 deletions b/‎custom_ops/gpu_ops/moba_attn/moba_decoder_attn/moba_decoder_attn.cu‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎custom_ops/gpu_ops/moba_attn/moba_encoder_attn/moba_encoder_write_cache.cu‎
Lines changed: 3 additions & 3 deletions b/‎custom_ops/gpu_ops/moba_attn/moba_encoder_attn/moba_encoder_write_cache.cu‎
Lines changed: 3 additions & 3 deletions
@@ -57,8 +57,9 @@ FastDeploy supports inference deployment on **NVIDIA GPUs**, **Kunlunxin XPUs**,
 - [Iluvatar GPU](./docs/get_started/installation/iluvatar_gpu.md)
 - [Enflame GCU](./docs/get_started/installation/Enflame_gcu.md)
 - [Hygon DCU](./docs/get_started/installation/hygon_dcu.md)
+- [MetaX GPU](./docs/get_started/installation/metax_gpu.md)
 
-**Note:** We are actively working on expanding hardware support. Additional hardware platforms including Ascend NPU and MetaX GPU are currently under development and testing. Stay tuned for updates!
+**Note:** We are actively working on expanding hardware support. Additional hardware platforms including Ascend NPU are currently under development and testing. Stay tuned for updates!
 
 ## Get Started
 
@@ -68,20 +69,12 @@ Learn how to use FastDeploy through our documentation:
 - [ERNIE-4.5-VL Multimodal Model Deployment](./docs/get_started/ernie-4.5-vl.md)
 - [Offline Inference Development](./docs/offline_inference.md)
 - [Online Service Deployment](./docs/online_serving/README.md)
-- [Full Supported Models List](./docs/supported_models.md)
 - [Best Practices](./docs/best_practices/README.md)
 
 ## Supported Models
 
-| Model | Data Type | PD Disaggregation | Chunked Prefill | Prefix Caching |  MTP | CUDA Graph | Maximum Context Length |
-|:--- | :------- | :---------- | :-------- | :-------- | :----- | :----- | :----- |
-|ERNIE-4.5-300B-A47B | BF16/WINT4/WINT8/W4A8C8/WINT2/FP8 | ✅| ✅ | ✅|✅| ✅ |128K |
-|ERNIE-4.5-300B-A47B-Base| BF16/WINT4/WINT8 | ✅| ✅ | ✅|❌| ✅ | 128K |
-|ERNIE-4.5-VL-424B-A47B | BF16/WINT4/WINT8 | WIP | ✅ | WIP | ❌ | WIP |128K |
-|ERNIE-4.5-VL-28B-A3B | BF16/WINT4/WINT8 | ❌ | ✅ | WIP | ❌ | WIP |128K |
-|ERNIE-4.5-21B-A3B | BF16/WINT4/WINT8/FP8  |  ❌ |  ✅ |  ✅ | ✅ | ✅|128K |
-|ERNIE-4.5-21B-A3B-Base | BF16/WINT4/WINT8/FP8  |  ✅ |  ✅ |  ✅ | ❌  | ✅|128K |
-|ERNIE-4.5-0.3B | BF16/WINT8/FP8  |  ✅ |  ✅ |  ✅ | ❌ | ✅| 128K |
+Learn how to download models, enable the torch format, and more:
+- [Full Supported Models List](./docs/supported_models.md)
 
 ## Advanced Usage
 
 
@@ -55,8 +55,9 @@ FastDeploy 支持在**英伟达（NVIDIA）GPU**、**昆仑芯（Kunlunxin）XPU
 - [天数 CoreX](./docs/zh/get_started/installation/iluvatar_gpu.md)
 - [燧原 S60](./docs/zh/get_started/installation/Enflame_gcu.md)
 - [海光 DCU](./docs/zh/get_started/installation/hygon_dcu.md)
+- [沐曦 GPU](./docs/zh/get_started/installation/metax_gpu.md)
 
-**注意:** 我们正在积极拓展硬件支持范围。目前，包括昇腾（Ascend）NPU 和 沐曦（MetaX）GPU 在内的其他硬件平台正在开发测试中。敬请关注更新！
+**注意:** 我们正在积极拓展硬件支持范围。目前，包括昇腾（Ascend）NPU 等其他硬件平台正在开发测试中。敬请关注更新！
 
 ## 入门指南
 
@@ -66,20 +67,12 @@ FastDeploy 支持在**英伟达（NVIDIA）GPU**、**昆仑芯（Kunlunxin）XPU
 - [ERNIE-4.5-VL 部署](./docs/zh/get_started/ernie-4.5-vl.md)
 - [离线推理](./docs/zh/offline_inference.md)
 - [在线服务](./docs/zh/online_serving/README.md)
-- [模型支持列表](./docs/zh/supported_models.md)
 - [最佳实践](./docs/zh/best_practices/README.md)
 
 ## 支持模型列表
 
-| Model | Data Type | PD Disaggregation | Chunked Prefill | Prefix Caching |  MTP | CUDA Graph | Maximum Context Length |
-|:--- | :------- | :---------- | :-------- | :-------- | :----- | :----- | :----- |
-|ERNIE-4.5-300B-A47B | BF16/WINT4/WINT8/W4A8C8/WINT2/FP8 | ✅| ✅ | ✅|✅| ✅ |128K |
-|ERNIE-4.5-300B-A47B-Base| BF16/WINT4/WINT8 | ✅| ✅ | ✅|❌| ✅ | 128K |
-|ERNIE-4.5-VL-424B-A47B | BF16/WINT4/WINT8 | WIP | ✅ | WIP | ❌ | WIP |128K |
-|ERNIE-4.5-VL-28B-A3B | BF16/WINT4/WINT8 | ❌ | ✅ | WIP | ❌ | WIP |128K |
-|ERNIE-4.5-21B-A3B | BF16/WINT4/WINT8/FP8  |  ❌ |  ✅ |  ✅ | ✅ | ✅|128K |
-|ERNIE-4.5-21B-A3B-Base | BF16/WINT4/WINT8/FP8  |  ✅ |  ✅ |  ✅ | ❌  | ✅|128K |
-|ERNIE-4.5-0.3B | BF16/WINT8/FP8  |  ✅ |  ✅ |  ✅ | ❌ | ✅| 128K |
+通过我们的文档了解如何下载模型，如何支持torch格式等：
+- [模型支持列表](./docs/zh/supported_models.md)
 
 ## 进阶用法
 
 
@@ -14,7 +14,7 @@
 
 #include "paddle/extension.h"
 
-void set_value_by_flag_and_id(const bool *stop_flags,
+void set_value_by_flags_and_idx(const bool *stop_flags,
                               int64_t *pre_ids_all,
                               const int64_t *input_ids,
                               const int *seq_lens_encoder,
@@ -50,7 +50,7 @@ void SetValueByFlagsAndIdx(const paddle::Tensor &pre_ids_all,
     int length = pre_ids_all_shape[1];
     int length_input_ids = input_ids.shape()[1];
 
-    set_value_by_flag_and_id(stop_flags.data<bool>(),
+    set_value_by_flags_and_idx(stop_flags.data<bool>(),
                              const_cast<int64_t *>(pre_ids_all.data<int64_t>()),
                              input_ids.data<int64_t>(),
                              seq_lens_encoder.data<int>(),
 
@@ -46,7 +46,7 @@ void update_inputs_kernel(bool *not_need_stop,
     not_need_stop[0] = stop_sum < stop_nums[0];
 }
 
-void UpdateInputes(const paddle::Tensor &stop_flags,
+void UpdateInputs(const paddle::Tensor &stop_flags,
                    const paddle::Tensor &not_need_stop,
                    const paddle::Tensor &seq_lens_this_time,
                    const paddle::Tensor &seq_lens_encoder,
@@ -90,4 +90,4 @@ PD_BUILD_STATIC_OP(update_inputs_cpu)
                     {"seq_lens_encoder", "seq_lens_encoder_out"},
                     {"seq_lens_decoder", "seq_lens_decoder_out"},
                     {"input_ids", "input_ids_out"}})
-    .SetKernelFn(PD_KERNEL(UpdateInputes));
+    .SetKernelFn(PD_KERNEL(UpdateInputs));
@@ -684,7 +684,6 @@ __global__ void append_decode_cache_int8_rope_qk_norm_kernel(
                                         // block_size, head_size // 2]
     T* __restrict__ qkv_out,
     const int* __restrict__ block_tables,     // [bsz, max_blocks_per_seq]
-    const int* __restrict__ batch_id_per_token,  // [num_tokens]
     const int* __restrict__ cu_seqlens_q,
     const int* __restrict__ seq_lens,          // [bsz]
     const int* __restrict__ seq_lens_encoder,  // [bsz]
 
@@ -565,7 +565,6 @@ void DecoderWriteCacheWithRoPEKernel(
               value_cache_out->data<uint8_t>(),
               reinterpret_cast<DataType_*>(qkv_out->data<T>()),
               block_tables.data<int>(),
-              batch_id_per_token.data<int>(),
               cu_seqlens_q.data<int>(),
               seq_lens.data<int>(),
               seq_lens_encoder.data<int>(),
@@ -729,7 +728,6 @@ void DecoderWriteCacheWithRoPEKernel(
               value_cache_out->data<uint8_t>(),
               reinterpret_cast<DataType_*>(qkv_out->data<T>()),
               block_tables.data<int>(),
-              batch_id_per_token.data<int>(),
               cu_seqlens_q.data<int>(),
               seq_lens.data<int>(),
               seq_lens_encoder.data<int>(),
 
@@ -449,8 +449,8 @@ __global__ void GQAVariableLengthRotaryQKNormKernel(
   const int half_lastdim = last_dim / 2;
   const int offset = (q_num_head + kv_num_head) * last_dim;
   const int all_head_num = elem_cnt / last_dim;
-  for (int gloabl_hi = global_warp_idx; gloabl_hi < all_head_num; gloabl_hi += all_warp_num) {
-    int64_t linear_index = gloabl_hi * last_dim + threadIdx.x * VecSize;
+  for (int global_hi = global_warp_idx; global_hi < all_head_num; global_hi += all_warp_num) {
+    int64_t linear_index = global_hi * last_dim + threadIdx.x * VecSize;
     const int token_idx = linear_index / offset;
     const int ori_bi = batch_id_per_token[token_idx];
     if (seq_lens[ori_bi] == 0) continue;
 
@@ -217,7 +217,7 @@ __global__ void append_cache_kv_c16(
 
   // load k_smem 64 rows 128 cols
   for (int fz = 0; fz < 4; fz++) { // 4 rows pre warp once, 16 rows all 4 warps once, need 4 iter
-    for (int fy = 0; fy < 2; fy++) { // 8 * 128b = 64 * bf16 noce, need 2 iter
+    for (int fy = 0; fy < 2; fy++) { // 8 * 128b = 64 * bf16 once, need 2 iter
       k_smem.load_128b_async<SharedMemFillMode::kNoFill>(
             k_smem_offset_w, cur_cache_k + k_read_idx, end_idx > 0);
       k_smem_offset_w =
@@ -235,7 +235,7 @@ __global__ void append_cache_kv_c16(
   // deal k_smem 64 rows 128 cols
   for (int fz = 0; fz < 1; fz++) { // 16 rows pre warp once, 64 rows all 4 warps once, need 1 iter
     uint32_t row_idx = wid * 16 + tid / 4;
-    for (int fy = 0; fy < 8; fy++) { // 2 * 128b = 16 * bf16 noce, need 8 iter
+    for (int fy = 0; fy < 8; fy++) { // 2 * 128b = 16 * bf16 once, need 8 iter
       uint32_t col_idx = fy * 16 + tid % 4 * 2;
       k_smem.ldmatrix_m8n8x4(k_smem_offset_r, kv_frag);
       // layout
@@ -278,7 +278,7 @@ __global__ void append_cache_kv_c16(
 
   // load v_smem 64 rows 128 cols
   for (int fz = 0; fz < 4; fz++) { // // 4 rows pre warp once, 16 rows all 4 warps once, need 4 iter
-    for (int fy = 0; fy < 2; fy++) { // 8 * 128b = 64 * bf16 noce, need 2 iter
+    for (int fy = 0; fy < 2; fy++) { // 8 * 128b = 64 * bf16 once, need 2 iter
       v_smem.load_128b_async<SharedMemFillMode::kNoFill>(
             v_smem_offset_w, cur_cache_v + v_read_idx, end_idx > 0);
       v_smem_offset_w =
@@ -296,7 +296,7 @@ __global__ void append_cache_kv_c16(
   // deal v_smem 64 rows 128 cols
   for (int fz = 0; fz < 1; fz++) { //  16 rows pre warp once, 64 rows all 4 warps once, need 1 iter
     uint32_t row_idx = wid * 16 + tid / 4;
-    for (int fy = 0; fy < 8; fy++) { // 2 * 128b = 16 * bf16 noce, need 8 iter
+    for (int fy = 0; fy < 8; fy++) { // 2 * 128b = 16 * bf16 once, need 8 iter
       uint32_t col_idx = fy * 16 + tid % 4 * 2;
       v_smem.ldmatrix_m8n8x4(v_smem_offset_r, kv_frag);
       // layout
@@ -400,7 +400,7 @@ __global__ void append_cache_kv_c8(
 
   // load v_smem 64 rows, 128 cols
   for (int fz = 0; fz < 4; fz++) { // 4 rows pre warp once, 16 rows all 4 warps once, need 4 iter
-    for (int fy = 0; fy < 1; fy++) { // 8 * 128b = 128 * uint8 noce, need 1 iter
+    for (int fy = 0; fy < 1; fy++) { // 8 * 128b = 128 * uint8 once, need 1 iter
       k_smem.load_128b_async<SharedMemFillMode::kNoFill>(
             k_smem_offset_w, cur_cache_k + k_read_idx, end_idx > 0);
       k_smem_offset_w =
@@ -418,7 +418,7 @@ __global__ void append_cache_kv_c8(
   // deal k_smem 64 rows, 128 cols
   for (int fz = 0; fz < 1; fz++) { // 16 rows pre warp once, 64 rows all 4 warps once, need 1 iter
     uint32_t row_idx = wid * 16 + tid / 4;
-    for (int fy = 0; fy < 4; fy++) { // 2 * 128b = 32 * uint8 noce, need 4 iter
+    for (int fy = 0; fy < 4; fy++) { // 2 * 128b = 32 * uint8 once, need 4 iter
       uint32_t col_idx = fy * 32 + tid % 4 * 2;
       k_smem.ldmatrix_m8n8x4(k_smem_offset_r, k_frag);
       // layout
@@ -466,7 +466,7 @@ __global__ void append_cache_kv_c8(
                           tid % 4 * num_elems_per_128b<CacheT>();
   // load v_smem 128 rows 64 cols
   for (int fy = 0; fy < 4; fy++) { // 8 rows pre warp once, 32 rows all 4 warps once, need 4 iter
-    for (int fz = 0; fz < 1; fz++) { // 4 * 128b = 64 * uint8 noce, need 1 iter
+    for (int fz = 0; fz < 1; fz++) { // 4 * 128b = 64 * uint8 once, need 1 iter
       v_smem.load_128b_async<SharedMemFillMode::kNoFill>(
               v_smem_offset_w, cur_cache_v + v_read_idx, end_idx > 0);
       v_smem_offset_w =
@@ -485,7 +485,7 @@ __global__ void append_cache_kv_c8(
   // deal v_smem 128 rows 64 cols
   for (int fy = 0; fy < 2; fy++) { // 16 rows pre warp once, 64 rows all 4 warps once, need 2 iter
     uint32_t dim_idx = fy * NUM_WARPS * 16 + wid * 16 + tid / 4;
-    for (int fz = 0; fz < 2; fz++) { // 2 * 128b = 32 * uint8 noce, need 2 iter
+    for (int fz = 0; fz < 2; fz++) { // 2 * 128b = 32 * uint8 once, need 2 iter
       uint32_t kv_idx = fz * 32 + tid % 4 * 2;
       v_smem.ldmatrix_m8n8x4(v_smem_offset_r, v_frag);
       // layout
@@ -614,7 +614,7 @@ __global__ void append_cache_kv_c4(
 
   // load k_smem 64 rows 128 cols
   for (int fz = 0; fz < 2; fz++) { // 4 rows pre warp once, 16 rows all 4 warps once, need 4 iter
-    for (int fy = 0; fy < 1; fy++) { // 4 * 128b = 128 * int4 noce, need 1 iter
+    for (int fy = 0; fy < 1; fy++) { // 4 * 128b = 128 * int4 once, need 1 iter
       k_smem.load_128b_async<SharedMemFillMode::kNoFill>(
             k_smem_offset_w, cur_cache_k + k_read_idx, end_idx > 0);
       k_smem_offset_w =
@@ -632,7 +632,7 @@ __global__ void append_cache_kv_c4(
   // deal k_smem 64 rows 128 cols
   for (int fz = 0; fz < 1; fz++) { // 16 rows pre warp once, 64 rows all 4 warps once, need 1 iter
     uint32_t row_idx = wid * 16 + tid / 4;
-    for (int fy = 0; fy < 2; fy++) { // 2 * 128b = 64 * int4 noce, need 2 iter
+    for (int fy = 0; fy < 2; fy++) { // 2 * 128b = 64 * int4 once, need 2 iter
       uint32_t col_idx = fy * 64 + tid % 4 * 2;
       k_smem.ldmatrix_m8n8x4(k_smem_offset_r, k_frag);
 
@@ -685,7 +685,7 @@ __global__ void append_cache_kv_c4(
                           tid % 2 * num_elems_per_128b<CacheT>();
   // load v_smem 128 rows 64 rows
   for (int fy = 0; fy < 2; fy++) { // 16 rows pre warp once, 64 rows all 4 warps once, need 2 iter
-    for (int fz = 0; fz < 1; fz++) { // 2 * 128b = 64 * int4 noce, need 1 iter
+    for (int fz = 0; fz < 1; fz++) { // 2 * 128b = 64 * int4 once, need 1 iter
       v_smem.load_128b_async<SharedMemFillMode::kNoFill>(
               v_smem_offset_w, cur_cache_v + v_read_idx, end_idx > 0);
       v_smem_offset_w =
@@ -704,7 +704,7 @@ __global__ void append_cache_kv_c4(
   // deal v_smem 128 rows 64 cols
   for (int fy = 0; fy < 2; fy++) { // 16 rows pre warp once, 64 rows all 4 warps once, need 2 iter
     uint32_t dim_idx = fy * NUM_WARPS * 16 + wid * 16 + tid / 4;
-    for (int fz = 0; fz < 1; fz++) { // 2 * 128b = 64 * int4 noce, need 1 iter
+    for (int fz = 0; fz < 1; fz++) { // 2 * 128b = 64 * int4 once, need 1 iter
       uint32_t kv_idx = fz * 64 + tid % 4 * 2;
       v_smem.ldmatrix_m8n8x4(v_smem_offset_r, v_frag);
       // layout
 
@@ -383,7 +383,7 @@ __global__ __launch_bounds__(Kernel_traits::kNThreads) void moba_decoder_attenti
 
 
 template<typename Kernel_traits, typename ParamType>
-inline __device__ float caluate_logit_scale(const int partition_num, const int pack_max_partition_num, ParamType &params, char * shared_mem, const int seq_len, const int *qk_gate_topk_idx_ptr) {
+inline __device__ float calculate_logit_scale(const int partition_num, const int pack_max_partition_num, ParamType &params, char * shared_mem, const int seq_len, const int *qk_gate_topk_idx_ptr) {
     constexpr int32_t kNFloatPacksize = 16 / sizeof(float);
     constexpr int32_t kNReduceThreads = Kernel_traits::kNReduceThreads;
     const int32_t bi = blockIdx.z;
@@ -524,7 +524,7 @@ __global__ void __launch_bounds__(Kernel_traits::kNReduceThreads) moba_decoder_a
     const int kv_head_idx = head_idx / Kernel_traits::kGqaGroupSize;
     const int * qk_gate_topk_idx_ptr = params.qk_gate_topk_idx_ptr + (bi * params.kv_head_num + kv_head_idx) * Kernel_traits::kMaxN;
 
-    float inv_global_exp_sum = caluate_logit_scale<Kernel_traits>(partition_num, pack_max_partition_num, params, shared_mem, seq_len, qk_gate_topk_idx_ptr);
+    float inv_global_exp_sum = calculate_logit_scale<Kernel_traits>(partition_num, pack_max_partition_num, params, shared_mem, seq_len, qk_gate_topk_idx_ptr);
 
 
     using T_vec = Vec<cuteType, kNReducePacksize>;
 
@@ -40,7 +40,7 @@ __global__ void write_encoder_cachekv_c16(
 
     if (seq_len == 0) return;
 
-    const int ramian_tokens = seq_len - block_idx;
+    const int remain_tokens = seq_len - block_idx;
 
     const int32_t *block_table_now = block_tables + bidb * max_blocks_per_seq;
     const uint32_t physical_block_number = block_table_now[blockIdx.x + seq_len_decoder[bidb] / kBlockSize];
@@ -51,7 +51,7 @@ __global__ void write_encoder_cachekv_c16(
 
         #pragma unroll
         for (int i = row_idx; i < kBlockSize; i += 128 / (kHeadDim / kPackSize)) {
-            if (i < ramian_tokens) {
+            if (i < remain_tokens) {
                 *reinterpret_cast<float4*>(cache + i * kHeadDim) = *reinterpret_cast<const float4*>(k_input + base_load_idx + i * kv_head_num * kHeadDim);
             }
         }
@@ -62,7 +62,7 @@ __global__ void write_encoder_cachekv_c16(
 
         #pragma unroll
         for (int i = row_idx; i < kBlockSize; i += 128 / (kHeadDim / kPackSize)) {
-            if (i < ramian_tokens) {
+            if (i < remain_tokens) {
                 *reinterpret_cast<float4*>(cache + i * kHeadDim) = *reinterpret_cast<const float4*>(v_input + base_load_idx + i * kv_head_num * kHeadDim);
             }
         }
Original file line number	Diff line number	Diff line change
`@@ -40,7 +40,7 @@ __global__ void write_encoder_cachekv_c16(`
`40`	`40`
`41`	`41`	`if (seq_len == 0) return;`
`42`	`42`
`43`		`- const int ramian_tokens = seq_len - block_idx;`
	`43`	`+ const int remain_tokens = seq_len - block_idx;`
`44`	`44`
`45`	`45`	`const int32_t *block_table_now = block_tables + bidb * max_blocks_per_seq;`
`46`	`46`	`const uint32_t physical_block_number = block_table_now[blockIdx.x + seq_len_decoder[bidb] / kBlockSize];`
`@@ -51,7 +51,7 @@ __global__ void write_encoder_cachekv_c16(`
`51`	`51`
`52`	`52`	`#pragma unroll`
`53`	`53`	`for (int i = row_idx; i < kBlockSize; i += 128 / (kHeadDim / kPackSize)) {`
`54`		`- if (i < ramian_tokens) {`
	`54`	`+ if (i < remain_tokens) {`
`55`	`55`	`*reinterpret_cast<float4*>(cache + i * kHeadDim) = *reinterpret_cast<const float4*>(k_input + base_load_idx + i * kv_head_num * kHeadDim);`
`56`	`56`	`}`
`57`	`57`	`}`
`@@ -62,7 +62,7 @@ __global__ void write_encoder_cachekv_c16(`
`62`	`62`
`63`	`63`	`#pragma unroll`
`64`	`64`	`for (int i = row_idx; i < kBlockSize; i += 128 / (kHeadDim / kPackSize)) {`
`65`		`- if (i < ramian_tokens) {`
	`65`	`+ if (i < remain_tokens) {`
`66`	`66`	`*reinterpret_cast<float4*>(cache + i * kHeadDim) = *reinterpret_cast<const float4*>(v_input + base_load_idx + i * kv_head_num * kHeadDim);`
`67`	`67`	`}`
`68`	`68`	`}`