Commit 6c6a6f1

Merge branch 'PaddlePaddle:develop' into develop

DesmonDay authored Dec 3, 2024
2 parents 2b553b4 + 2b3d7bf
Showing 32 changed files with 800 additions and 147 deletions.
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
@@ -1,4 +1,4 @@
-exclude: 'slm/model_zoo/gpt-3'
+exclude: 'slm/model_zoo/gpt-3;csrc/third_party'
 repos:
     # For Python files
     - repo: https://github.com/psf/black.git
@@ -61,4 +61,4 @@ repos:
         entry: python scripts/codestyle/check_dead_links.py
         language: python
         files: \.(md|markdown|rst)$
-        pass_filenames: true
+        pass_filenames: true
16 changes: 16 additions & 0 deletions README.md
@@ -207,6 +207,22 @@ python -u -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" run_finetune.py
 ```
 
 For more steps of the end-to-end LLM workflow, see the [PaddlePaddle LLM suite](./llm) introduction.
+We also provide a quick fine-tuning path that does not require cloning the source code:
+
+```python
+from paddlenlp.trl import SFTConfig, SFTTrainer
+from datasets import load_dataset
+
+dataset = load_dataset("ZHUI/alpaca_demo", split="train")
+
+training_args = SFTConfig(output_dir="Qwen/Qwen2.5-0.5B-SFT", device="gpu")
+trainer = SFTTrainer(
+    args=training_args,
+    model="Qwen/Qwen2.5-0.5B",
+    train_dataset=dataset,
+)
+trainer.train()
+```
 
 For more PaddleNLP content, see:
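Running the snippet above presumably requires `paddlenlp` and the `datasets` library to be installed and a GPU visible to Paddle (hence `device="gpu"`); any other base model name supported by PaddleNLP should work in place of `Qwen/Qwen2.5-0.5B`.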
4 changes: 4 additions & 0 deletions csrc/gpu/quant_int8.cu
@@ -65,7 +65,11 @@ __forceinline__ __device__ hip_bfloat16 add_mul<hip_bfloat16>(hip_bfloat16 a, hi
 #else
 template<>
 __forceinline__ __device__ __nv_bfloat16 add_mul<__nv_bfloat16>(__nv_bfloat16 a, __nv_bfloat16 b, __nv_bfloat16 c) {
+#if __CUDA_ARCH__ >= 800
   return __hmul(__hadd(a, b), c);
+#else
+  return (static_cast<float>(a) + static_cast<float>(b)) * static_cast<float>(c);
+#endif
 }
 #endif

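For context: the `__hadd`/`__hmul` intrinsics on `__nv_bfloat16` are only natively supported when compiling for compute capability 8.0 (Ampere) or newer, so the new `__CUDA_ARCH__` guard keeps the fast bf16 path there and gives older architectures a fallback that widens the operands to float, computes `(a + b) * c`, and narrows the result back to bf16.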
17 changes: 10 additions & 7 deletions csrc/setup_cuda.py
@@ -57,8 +57,7 @@ def strtobool(v):
 
 def get_gencode_flags():
     if not strtobool(os.getenv("FLAG_LLM_PDC", "False")):
-        prop = paddle.device.cuda.get_device_properties()
-        cc = prop.major * 10 + prop.minor
+        cc = get_sm_version()
         return ["-gencode", "arch=compute_{0},code=sm_{0}".format(cc)]
     else:
         # support more cuda archs
@@ -75,6 +74,7 @@ def get_gencode_flags():
 gencode_flags = get_gencode_flags()
 library_path = os.environ.get("LD_LIBRARY_PATH", "/usr/local/cuda/lib64")
 
+sm_version = get_sm_version()
 
 sources = [
     "./gpu/save_with_output.cc",
@@ -102,16 +102,11 @@ def get_gencode_flags():
     "./gpu/dequant_int8.cu",
     "./gpu/flash_attn_bwd.cc",
     "./gpu/tune_cublaslt_gemm.cu",
-    "./gpu/append_attention.cu",
-    "./gpu/append_attn/get_block_shape_and_split_kv_block.cu",
-    "./gpu/append_attn/decoder_write_cache_with_rope_kernel.cu",
-    "./gpu/append_attn/speculate_write_cache_with_rope_kernel.cu",
     "./gpu/sample_kernels/top_p_sampling_reject.cu",
     "./gpu/update_inputs_v2.cu",
     "./gpu/set_preids_token_penalty_multi_scores.cu",
     "./gpu/speculate_decoding_kernels/ngram_match.cc",
 ]
-sources += find_end_files("./gpu/append_attn/template_instantiation", ".cu")
 sources += find_end_files("./gpu/speculate_decoding_kernels", ".cu")
 
 nvcc_compile_args = gencode_flags
@@ -138,6 +133,14 @@ def get_gencode_flags():
 if cc >= 80:
     sources += ["gpu/int8_gemm_with_cutlass/gemm_dequant.cu"]
 
+sources += [
+    "./gpu/append_attention.cu",
+    "./gpu/append_attn/get_block_shape_and_split_kv_block.cu",
+    "./gpu/append_attn/decoder_write_cache_with_rope_kernel.cu",
+    "./gpu/append_attn/speculate_write_cache_with_rope_kernel.cu",
+]
+sources += find_end_files("./gpu/append_attn/template_instantiation", ".cu")
+
 if cc >= 89 and cuda_version >= 12.4:
     os.system("python utils/auto_gen_fp8_fp8_gemm_fused_kernels.py")
     os.system("python utils/auto_gen_fp8_fp8_dual_gemm_fused_kernels.py")
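The build script now asks a shared helper for the GPU's SM version instead of querying device properties inline. Judging from the deleted lines, the helper presumably reduces to something like this sketch:

```python
import paddle

def get_sm_version() -> int:
    # Compute capability of the current GPU as an integer, e.g. SM 8.0 -> 80.
    # This mirrors the two lines deleted from get_gencode_flags().
    prop = paddle.device.cuda.get_device_properties()
    return prop.major * 10 + prop.minor

# The value is used both for the -gencode flags and to gate which sources compile:
cc = get_sm_version()
gencode_flags = ["-gencode", "arch=compute_{0},code=sm_{0}".format(cc)]
```

The same value then gates architecture-specific sources elsewhere in the script, e.g. the CUTLASS int8 GEMM for `cc >= 80` and the FP8 kernels for `cc >= 89` with CUDA ≥ 12.4.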
4 changes: 2 additions & 2 deletions csrc/utils/tune_cublaslt_int8_gemm.py
@@ -23,7 +23,7 @@
 n1 = [6144, 4096, 28672, 4096]
 
 # llama3.1-405b mp=8
-k2 = [16384, 16384, 16384, 6656]
+k2 = [16384, 2048, 16384, 6656]
 n2 = [2560, 16384, 13312, 16384]
 
 # qwen2-1.5b
@@ -43,5 +43,5 @@
 
 # shape computation formulas
 # [qkv, out_linear, ffn1, ffn2]
-# k = [hidden_size, hidden_size, hidden_size, intermediate_size//mp_size]
+# k = [hidden_size, hidden_size//mp_size, hidden_size, intermediate_size//mp_size]
 # n = [((num_attention_heads//mp_size)+2*(num_key_value_heads//mp_size))*(hidden_size//num_attention_heads), hidden_size, 2*(intermediate_size//mp_size), hidden_size]
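The corrected formulas can be sanity-checked with a small helper; the function below is only an illustration of the comments above (its name and signature are mine, not from the script):

```python
def gemm_shapes(hidden_size, intermediate_size, num_attention_heads,
                num_key_value_heads, mp_size):
    """k/n GEMM dimensions for [qkv, out_linear, ffn1, ffn2], per the comments above."""
    head_dim = hidden_size // num_attention_heads
    k = [
        hidden_size,                         # qkv
        hidden_size // mp_size,              # out_linear (row-parallel input)
        hidden_size,                         # ffn1
        intermediate_size // mp_size,        # ffn2
    ]
    n = [
        ((num_attention_heads // mp_size)
         + 2 * (num_key_value_heads // mp_size)) * head_dim,  # qkv
        hidden_size,                         # out_linear
        2 * (intermediate_size // mp_size),  # ffn1 (2x suggests a gated FFN)
        hidden_size,                         # ffn2
    ]
    return k, n
```

With llama3.1-405b at mp=8 and hidden_size 16384, the out_linear k becomes 16384 // 8 = 2048, which is exactly the correction made to `k2`.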
2 changes: 1 addition & 1 deletion llm/experimental/ernie-3.5-se/modeling.py
@@ -135,7 +135,7 @@ class BFloatFInfo:
 
 def masked_fill(x, mask, value):
     y = paddle.full(x.shape, value, x.dtype)
-    return paddle.where(mask, y, x)
+    return paddle.where(mask.to("bool"), y, x)
 
 
 def scaled_dot_product_attention(
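The fix matters because `paddle.where` expects a boolean condition tensor, while attention masks are often built as 0/1 integers or floats; casting with `.to("bool")` makes the fill robust to either. A minimal sketch of the fixed function in use (the example values are mine):

```python
import paddle

def masked_fill(x, mask, value):
    # Broadcast `value` to x's shape/dtype, then select it wherever mask is true.
    y = paddle.full(x.shape, value, x.dtype)
    return paddle.where(mask.to("bool"), y, x)

scores = paddle.zeros([2, 3])
pad = paddle.to_tensor([[0, 0, 1], [0, 1, 1]])  # 0/1 integer padding mask
print(masked_fill(scores, pad, -1e9))  # padded positions become -1e9
```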
2 changes: 1 addition & 1 deletion llm/predict/export_model.py
@@ -58,7 +58,7 @@ def main():
     tensor_parallel_rank = hcg.get_model_parallel_rank()
 
     # set predictor type
-    predictor = create_predictor(predictor_args, model_args, tensor_parallel_degree, tensor_parallel_rank)
+    predictor = create_predictor(predictor_args, model_args)
     predictor.model.eval()
 
     predictor.model.to_static(
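Presumably `create_predictor` now resolves the tensor-parallel degree and rank on its own (the surrounding code still computes them from the hybrid communication group), so the call site passes only `predictor_args` and `model_args`.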
(diffs for the remaining 25 changed files not loaded)