Commit 6c6a6f1

Merge branch 'PaddlePaddle:develop' into develop

DesmonDay authored Dec 3, 2024
2 parents 2b553b4 + 2b3d7bf
Showing 32 changed files with 800 additions and 147 deletions.
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
@@ -1,4 +1,4 @@
-exclude: 'slm/model_zoo/gpt-3'
+exclude: 'slm/model_zoo/gpt-3;csrc/third_party'
 repos:
     # For Python files
     - repo: https://github.com/psf/black.git
@@ -61,4 +61,4 @@ repos:
         entry: python scripts/codestyle/check_dead_links.py
         language: python
         files: \.(md|markdown|rst)$
-        pass_filenames: true
+        pass_filenames: true
16 changes: 16 additions & 0 deletions README.md
@@ -207,6 +207,22 @@ python -u -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" run_finetune.py
 ```
 
 For more steps of the end-to-end LLM workflow, see the [PaddlePaddle LLM suite](./llm) introduction.
+We also provide a quick fine-tuning path that does not require cloning the source code:
+
+```python
+from paddlenlp.trl import SFTConfig, SFTTrainer
+from datasets import load_dataset
+
+dataset = load_dataset("ZHUI/alpaca_demo", split="train")
+
+training_args = SFTConfig(output_dir="Qwen/Qwen2.5-0.5B-SFT", device="gpu")
+trainer = SFTTrainer(
+    args=training_args,
+    model="Qwen/Qwen2.5-0.5B",
+    train_dataset=dataset,
+)
+trainer.train()
+```
 
 For more PaddleNLP content, see:
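Running the snippet above presumably requires `paddlenlp` and the `datasets` library to be installed and a GPU visible to Paddle (hence `device="gpu"`); any other base model name supported by PaddleNLP should work in place of `Qwen/Qwen2.5-0.5B`.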
4 changes: 4 additions & 0 deletions csrc/gpu/quant_int8.cu
@@ -65,7 +65,11 @@ __forceinline__ __device__ hip_bfloat16 add_mul<hip_bfloat16>(hip_bfloat16 a, hi
 #else
 template<>
 __forceinline__ __device__ __nv_bfloat16 add_mul<__nv_bfloat16>(__nv_bfloat16 a, __nv_bfloat16 b, __nv_bfloat16 c) {
+#if __CUDA_ARCH__ >= 800
   return __hmul(__hadd(a, b), c);
+#else
+  return (static_cast<float>(a) + static_cast<float>(b)) * static_cast<float>(c);
+#endif
 }
 #endif

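For context: the `__hadd`/`__hmul` intrinsics on `__nv_bfloat16` are only natively supported when compiling for compute capability 8.0 (Ampere) or newer, so the new `__CUDA_ARCH__` guard keeps the fast bf16 path there and gives older architectures a fallback that widens the operands to float, computes `(a + b) * c`, and narrows the result back to bf16.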
17 changes: 10 additions & 7 deletions csrc/setup_cuda.py
@@ -57,8 +57,7 @@ def strtobool(v):
 
 def get_gencode_flags():
     if not strtobool(os.getenv("FLAG_LLM_PDC", "False")):
-        prop = paddle.device.cuda.get_device_properties()
-        cc = prop.major * 10 + prop.minor
+        cc = get_sm_version()
         return ["-gencode", "arch=compute_{0},code=sm_{0}".format(cc)]
     else:
         # support more cuda archs
@@ -75,6 +74,7 @@ def get_gencode_flags():
 gencode_flags = get_gencode_flags()
 library_path = os.environ.get("LD_LIBRARY_PATH", "/usr/local/cuda/lib64")
 
+sm_version = get_sm_version()
 
 sources = [
     "./gpu/save_with_output.cc",
@@ -102,16 +102,11 @@ def get_gencode_flags():
     "./gpu/dequant_int8.cu",
     "./gpu/flash_attn_bwd.cc",
     "./gpu/tune_cublaslt_gemm.cu",
-    "./gpu/append_attention.cu",
-    "./gpu/append_attn/get_block_shape_and_split_kv_block.cu",
-    "./gpu/append_attn/decoder_write_cache_with_rope_kernel.cu",
-    "./gpu/append_attn/speculate_write_cache_with_rope_kernel.cu",
     "./gpu/sample_kernels/top_p_sampling_reject.cu",
     "./gpu/update_inputs_v2.cu",
     "./gpu/set_preids_token_penalty_multi_scores.cu",
     "./gpu/speculate_decoding_kernels/ngram_match.cc",
 ]
-sources += find_end_files("./gpu/append_attn/template_instantiation", ".cu")
 sources += find_end_files("./gpu/speculate_decoding_kernels", ".cu")
 
 nvcc_compile_args = gencode_flags
@@ -138,6 +133,14 @@ def get_gencode_flags():
 if cc >= 80:
     sources += ["gpu/int8_gemm_with_cutlass/gemm_dequant.cu"]
 
+sources += [
+    "./gpu/append_attention.cu",
+    "./gpu/append_attn/get_block_shape_and_split_kv_block.cu",
+    "./gpu/append_attn/decoder_write_cache_with_rope_kernel.cu",
+    "./gpu/append_attn/speculate_write_cache_with_rope_kernel.cu",
+]
+sources += find_end_files("./gpu/append_attn/template_instantiation", ".cu")
+
 if cc >= 89 and cuda_version >= 12.4:
     os.system("python utils/auto_gen_fp8_fp8_gemm_fused_kernels.py")
     os.system("python utils/auto_gen_fp8_fp8_dual_gemm_fused_kernels.py")
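The build script now asks a shared helper for the GPU's SM version instead of querying device properties inline. Judging from the deleted lines, the helper presumably reduces to something like this sketch:

```python
import paddle

def get_sm_version() -> int:
    # Compute capability of the current GPU as an integer, e.g. SM 8.0 -> 80.
    # This mirrors the two lines deleted from get_gencode_flags().
    prop = paddle.device.cuda.get_device_properties()
    return prop.major * 10 + prop.minor

# The value is used both for the -gencode flags and to gate which sources compile:
cc = get_sm_version()
gencode_flags = ["-gencode", "arch=compute_{0},code=sm_{0}".format(cc)]
```

The same value then gates architecture-specific sources elsewhere in the script, e.g. the CUTLASS int8 GEMM for `cc >= 80` and the FP8 kernels for `cc >= 89` with CUDA ≥ 12.4.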
4 changes: 2 additions & 2 deletions csrc/utils/tune_cublaslt_int8_gemm.py
@@ -23,7 +23,7 @@
 n1 = [6144, 4096, 28672, 4096]
 
 # llama3.1-405b mp=8
-k2 = [16384, 16384, 16384, 6656]
+k2 = [16384, 2048, 16384, 6656]
 n2 = [2560, 16384, 13312, 16384]
 
 # qwen2-1.5b
@@ -43,5 +43,5 @@
 
 # shape computation formulas
 # [qkv, out_linear, ffn1, ffn2]
-# k = [hidden_size, hidden_size, hidden_size, intermediate_size//mp_size]
+# k = [hidden_size, hidden_size//mp_size, hidden_size, intermediate_size//mp_size]
 # n = [((num_attention_heads//mp_size)+2*(num_key_value_heads//mp_size))*(hidden_size//num_attention_heads), hidden_size, 2*(intermediate_size//mp_size), hidden_size]
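The corrected formulas can be sanity-checked with a small helper; the function below is only an illustration of the comments above (its name and signature are mine, not from the script):

```python
def gemm_shapes(hidden_size, intermediate_size, num_attention_heads,
                num_key_value_heads, mp_size):
    """k/n GEMM dimensions for [qkv, out_linear, ffn1, ffn2], per the comments above."""
    head_dim = hidden_size // num_attention_heads
    k = [
        hidden_size,                         # qkv
        hidden_size // mp_size,              # out_linear (row-parallel input)
        hidden_size,                         # ffn1
        intermediate_size // mp_size,        # ffn2
    ]
    n = [
        ((num_attention_heads // mp_size)
         + 2 * (num_key_value_heads // mp_size)) * head_dim,  # qkv
        hidden_size,                         # out_linear
        2 * (intermediate_size // mp_size),  # ffn1 (2x suggests a gated FFN)
        hidden_size,                         # ffn2
    ]
    return k, n
```

With llama3.1-405b at mp=8 and hidden_size 16384, the out_linear k becomes 16384 // 8 = 2048, which is exactly the correction made to `k2`.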
2 changes: 1 addition & 1 deletion llm/experimental/ernie-3.5-se/modeling.py
@@ -135,7 +135,7 @@ class BFloatFInfo:
 
 def masked_fill(x, mask, value):
     y = paddle.full(x.shape, value, x.dtype)
-    return paddle.where(mask, y, x)
+    return paddle.where(mask.to("bool"), y, x)
 
 
 def scaled_dot_product_attention(
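The fix matters because `paddle.where` expects a boolean condition tensor, while attention masks are often built as 0/1 integers or floats; casting with `.to("bool")` makes the fill robust to either. A minimal sketch of the fixed function in use (the example values are mine):

```python
import paddle

def masked_fill(x, mask, value):
    # Broadcast `value` to x's shape/dtype, then select it wherever mask is true.
    y = paddle.full(x.shape, value, x.dtype)
    return paddle.where(mask.to("bool"), y, x)

scores = paddle.zeros([2, 3])
pad = paddle.to_tensor([[0, 0, 1], [0, 1, 1]])  # 0/1 integer padding mask
print(masked_fill(scores, pad, -1e9))  # padded positions become -1e9
```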
2 changes: 1 addition & 1 deletion llm/predict/export_model.py
@@ -58,7 +58,7 @@ def main():
     tensor_parallel_rank = hcg.get_model_parallel_rank()
 
     # set predictor type
-    predictor = create_predictor(predictor_args, model_args, tensor_parallel_degree, tensor_parallel_rank)
+    predictor = create_predictor(predictor_args, model_args)
     predictor.model.eval()
 
     predictor.model.to_static(
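Presumably `create_predictor` now resolves the tensor-parallel degree and rank on its own (the surrounding code still computes them from the hybrid communication group), so the call site passes only `predictor_args` and `model_args`.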
(diffs for the remaining 25 changed files not loaded)