From 34fcced27f53934d872893c52690f92e817c84a2 Mon Sep 17 00:00:00 2001 From: gmm <38800877+mmglove@users.noreply.github.com> Date: Thu, 20 Jun 2024 19:51:27 +0800 Subject: [PATCH 1/3] =?UTF-8?q?=E3=80=90benchmark=E3=80=91=20fix=20model?= =?UTF-8?q?=5Fzoo=20path=20(#8643)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add llama-7b_auto_dp2mp2pp2 benchmark sh * add llama-7b_auto_dp2mp2pp2 benchmark script for cinn * update llama-7b_auto_dp2mp2pp2 benchmark script * Update run_benchmark.sh * Update run_benchmark.sh * fix llama-7b_auto_dp2mp2pp2/benchmark_common * Update run_benchmark.sh * Update prepare.sh * Update prepare.sh * Update prepare.sh * Update prepare.sh * Update prepare.sh --- .../auto_tuner/llama_pretrain/benchmark_common/prepare.sh | 4 ++-- .../hybrid_parallelism/llama/benchmark_common/prepare.sh | 6 +++--- .../hybrid_parallelism/llama2/benchmark_common/prepare.sh | 4 ++-- .../hybrid_parallelism/qwen/benchmark_common/prepare.sh | 6 +++--- .../static/auto_parallel/llama2/benchmark_common/prepare.sh | 4 ++-- 5 files changed, 12 insertions(+), 12 deletions(-) diff --git a/tests/test_tipc/auto_tuner/llama_pretrain/benchmark_common/prepare.sh b/tests/test_tipc/auto_tuner/llama_pretrain/benchmark_common/prepare.sh index 5472a36e94c4..3e51c5dc6fb7 100644 --- a/tests/test_tipc/auto_tuner/llama_pretrain/benchmark_common/prepare.sh +++ b/tests/test_tipc/auto_tuner/llama_pretrain/benchmark_common/prepare.sh @@ -16,11 +16,11 @@ python -m pip install -r ../requirements.txt python -m pip install -r ../requirements-dev.txt # install fused_ln custom ops -cd ../model_zoo/gpt-3/external_ops/ +cd ../legacy/model_zoo/gpt-3/external_ops/ python setup.py install # install tool_helpers -cd ../../../llm/llama +cd ../../../../llm/llama python -m pip install tool_helpers rm -rf data && mkdir data diff --git a/tests/test_tipc/dygraph/hybrid_parallelism/llama/benchmark_common/prepare.sh 
b/tests/test_tipc/dygraph/hybrid_parallelism/llama/benchmark_common/prepare.sh index 0563a1aaabac..388b179e6905 100644 --- a/tests/test_tipc/dygraph/hybrid_parallelism/llama/benchmark_common/prepare.sh +++ b/tests/test_tipc/dygraph/hybrid_parallelism/llama/benchmark_common/prepare.sh @@ -16,11 +16,11 @@ python -m pip install -r ../requirements.txt python -m pip install -r ../requirements-dev.txt # install fused_ln custom ops -cd ../model_zoo/gpt-3/external_ops/ +cd ../legacy/model_zoo/gpt-3/external_ops/ python setup.py install # install tool_helpers -cd ../../../llm/llama +cd ../../../../llm/llama python -m pip install tool_helpers wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_ids.npy @@ -28,4 +28,4 @@ wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwe mkdir data mv llama_openwebtext_100k_ids.npy ./data -mv llama_openwebtext_100k_idx.npz ./data \ No newline at end of file +mv llama_openwebtext_100k_idx.npz ./data diff --git a/tests/test_tipc/dygraph/hybrid_parallelism/llama2/benchmark_common/prepare.sh b/tests/test_tipc/dygraph/hybrid_parallelism/llama2/benchmark_common/prepare.sh index 45fd82fad914..9405521c7b3f 100644 --- a/tests/test_tipc/dygraph/hybrid_parallelism/llama2/benchmark_common/prepare.sh +++ b/tests/test_tipc/dygraph/hybrid_parallelism/llama2/benchmark_common/prepare.sh @@ -16,11 +16,11 @@ python -m pip install -r ../requirements.txt python -m pip install -r ../requirements-dev.txt # install fused_ln custom ops -cd ../model_zoo/gpt-3/external_ops/ +cd ../legacy/model_zoo/gpt-3/external_ops/ python setup.py install # install tool_helpers -cd ../../../llm/llama +cd ../../../../llm/llama python -m pip install tool_helpers # download data diff --git a/tests/test_tipc/dygraph/hybrid_parallelism/qwen/benchmark_common/prepare.sh b/tests/test_tipc/dygraph/hybrid_parallelism/qwen/benchmark_common/prepare.sh index 1d8a79cc2a0e..bf6952c135ca 100644 --- 
a/tests/test_tipc/dygraph/hybrid_parallelism/qwen/benchmark_common/prepare.sh +++ b/tests/test_tipc/dygraph/hybrid_parallelism/qwen/benchmark_common/prepare.sh @@ -18,11 +18,11 @@ python -m pip install -r ../requirements-dev.txt python -m pip install tiktoken # install fused_ln custom ops -cd ../model_zoo/gpt-3/external_ops/ +cd ../legacy/model_zoo/gpt-3/external_ops/ python setup.py install # install tool_helpers -cd ../../../llm/qwen +cd ../../../../llm/qwen python -m pip install tool_helpers wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_ids.npy @@ -30,4 +30,4 @@ wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwe mkdir data mv llama_openwebtext_100k_ids.npy ./data -mv llama_openwebtext_100k_idx.npz ./data \ No newline at end of file +mv llama_openwebtext_100k_idx.npz ./data diff --git a/tests/test_tipc/static/auto_parallel/llama2/benchmark_common/prepare.sh b/tests/test_tipc/static/auto_parallel/llama2/benchmark_common/prepare.sh index edb4590e2f15..697d5d1d92e0 100644 --- a/tests/test_tipc/static/auto_parallel/llama2/benchmark_common/prepare.sh +++ b/tests/test_tipc/static/auto_parallel/llama2/benchmark_common/prepare.sh @@ -16,11 +16,11 @@ python -m pip install -r ../requirements.txt python -m pip install -r ../requirements-dev.txt # install fused_ln custom ops -cd ../model_zoo/gpt-3/external_ops/ +cd ../legacy/model_zoo/gpt-3/external_ops/ python setup.py install # install tool_helpers -cd ../../../llm/llama +cd ../../../../llm/llama python -m pip install tool_helpers # download data From 498f70988431be278dac618411fbfb0287853cd9 Mon Sep 17 00:00:00 2001 From: lugimzzz <63761690+lugimzzz@users.noreply.github.com> Date: Fri, 21 Jun 2024 10:51:22 +0800 Subject: [PATCH 2/3] [LLM] change llm content (#8627) * change llm * fix * fix * fix ci * fix * fix * fix --------- Co-authored-by: zhangjunjun04 --- docs/llm/peft.md | 2 +- .../llm/llama_single_gpu}/benchmark.py | 0 
.../llm/llama_single_gpu}/benchmark_utils.py | 0 llm/.gitignore | 12 - llm/Alignment/RM/models | 1 - llm/README.md | 84 ++-- llm/{Alignment => alignment}/README.md | 28 +- llm/{ => alignment/dpo}/dpo_argument.py | 0 .../dpo/run_dpo.py} | 0 .../PPO => alignment/ppo}/comm_utils.py | 0 .../PPO => alignment/ppo}/data/__init__.py | 0 .../PPO => alignment/ppo}/data/alpaca.py | 0 .../PPO => alignment/ppo}/data/base.py | 0 .../PPO => alignment/ppo}/data/preference.py | 0 .../PPO => alignment/ppo}/data/prompt_only.py | 0 .../PPO => alignment/ppo}/data/safe_rlhf.py | 0 .../PPO => alignment/ppo}/data/supervised.py | 0 .../PPO => alignment/ppo}/infer_utils.py | 0 .../PPO => alignment/ppo}/models/__init__.py | 0 .../ppo}/models/infer_model_utils.py | 0 .../PPO => alignment/ppo}/models/model_pp.py | 0 .../ppo}/models/pp_model_utils.py | 0 .../PPO => alignment/ppo}/models/ppo_model.py | 0 .../ppo}/models/ppo_model_utils.py | 0 .../ppo}/models/score_model.py | 0 .../ppo}/models/score_model_utils.py | 0 .../PPO => alignment/ppo}/ppo_trainer.py | 0 .../ppo_main.py => alignment/ppo/run_ppo.py} | 0 .../PPO => alignment/ppo}/tests/run_model.py | 0 .../ppo}/tests/test_export.py | 0 .../PPO => alignment/ppo}/trainer_utils.py | 0 llm/alignment/rm/models | 1 + .../RM => alignment/rm}/reward_trainer.py | 0 .../rm/run_reward.py} | 0 .../gpt-3}/run_pretrain_auto.py | 237 ++++------- .../gpt-3}/run_pretrain_auto_dp2mp2pp2.sh | 14 + .../llama}/README.md | 0 .../llama}/run_llama3.sh | 0 .../llama}/run_pretrain_auto.py | 0 .../llama}/run_pretrain_auto.sh | 0 .../llama}/run_pretrain_auto_static.py | 0 .../llama}/run_pretrain_auto_static.sh | 0 .../llama}/run_pretrain_auto_static_sp.sh | 0 .../llama}/run_pretrain_hand.py | 0 .../llama}/run_pretrain_hand.sh | 0 .../pretrain_argument_auto_dp2tp2pp2.json | 0 .../qwen}/run_pretrain_3D_auto.py | 0 .../qwen}/run_pretrain_3D_auto.sh | 0 .../pretrain-baichuan2_13b-sd8_stage2.json | 40 -- llm/benchmark.sh | 36 -- llm/config/baichuan/README.md | 15 + 
llm/config/baichuan/awq_argument.json | 23 + llm/config/baichuan/dpo_argument.json | 38 ++ .../baichuan}/gptq_argument.json | 5 +- llm/config/baichuan/lora_argument.json | 35 ++ .../baichuan/pretrain_argument.json} | 4 +- llm/config/baichuan/ptq_argument.json | 23 + llm/config/baichuan/qlora_argument.json | 34 ++ llm/{ => config}/bloom/README.md | 3 - .../bloom}/gptq_argument.json | 5 +- llm/{ => config}/bloom/lora_argument.json | 3 +- llm/{ => config}/bloom/pt_argument.json | 3 +- .../bloom}/ptq_argument.json | 5 +- llm/{ => config}/bloom/sft_argument.json | 3 +- llm/{ => config}/chatglm/README.md | 3 - .../chatglm}/gptq_argument.json | 5 +- llm/{ => config}/chatglm/lora_argument.json | 3 +- llm/{ => config}/chatglm/pt_argument.json | 2 +- llm/{ => config}/chatglm/ptq_argument.json | 5 +- llm/{ => config}/chatglm/sft_argument.json | 3 +- llm/{ => config}/chatglm2/README.md | 3 - .../chatglm2}/gptq_argument.json | 5 +- llm/{ => config}/chatglm2/lora_argument.json | 3 +- llm/{ => config}/chatglm2/pt_argument.json | 2 +- .../chatglm2}/ptq_argument.json | 5 +- llm/{ => config}/chatglm2/sft_argument.json | 7 +- llm/{ => config}/gemma/README.md | 0 llm/{ => config}/gemma/sft_argument.json | 12 +- llm/config/gpt-3/README.md | 5 + .../gpt-3}/lora_argument.json | 5 +- .../gpt-3/pretrain_argument.json} | 1 + llm/config/gpt-3/sft_argument.json | 33 ++ llm/{ => config}/llama/README.md | 12 +- llm/{ => config}/llama/awq_argument.json | 7 +- llm/{ => config}/llama/dpo_argument.json | 3 +- llm/config/llama/gptq_argument.json | 17 + llm/config/llama/lora_argument.json | 35 ++ .../llama/{ppo.json => ppo_argument.json} | 0 .../llama/pretrain_argument.json} | 7 +- llm/{qwen => config/llama}/pt_argument.json | 6 +- llm/{ => config}/llama/ptq_argument.json | 5 +- llm/{ => config}/llama/qlora_argument.json | 7 +- .../llama/{rm.json => rm_argument.json} | 0 llm/{ => config}/llama/sft_argument.json | 15 +- .../llama/wint8_lora_argument.json | 5 +- llm/{ => 
config}/mixtral/lora_argument.json | 5 +- .../mixtral/pretrain_argument.json} | 9 +- llm/{ => config}/mixtral/sft_argument.json | 11 +- llm/{ => config}/opt/README.md | 3 - llm/{ => config}/opt/lora_argument.json | 2 +- llm/{ => config}/opt/sft_argument.json | 2 +- llm/{ => config}/qwen/README.md | 4 - llm/{ => config}/qwen/dpo_argument.json | 3 +- llm/{ => config}/qwen/lora_argument.json | 8 +- .../qwen/pretrain_argument.json} | 7 +- llm/{llama => config/qwen}/pt_argument.json | 9 +- llm/{ => config}/qwen/sft_argument.json | 10 +- llm/docs/chat_template.md | 6 +- llm/docs/finetune.md | 36 +- llm/docs/inference.md | 62 +-- llm/docs/pretrain.rst | 4 +- llm/docs/quantization.md | 6 +- llm/{ => experimental}/ernie-3.5-se/README.md | 0 .../ernie-3.5-se/configuration.py | 0 .../ernie-3.5-se/conversion_utils.py | 0 llm/{ => experimental}/ernie-3.5-se/data.py | 0 .../ernie-tokenizer/sentencepiece.bpe.model | Bin .../ernie-tokenizer/special_tokens_map.json | 0 .../ernie-tokenizer/tokenizer_config.json | 0 .../ernie-3.5-se/ernie_dataset.py | 0 .../ernie-3.5-se/finetune_generation.py | 0 .../ernie-3.5-se/modeling.py | 0 .../ernie-3.5-se/predict_generation.py | 0 .../ernie-3.5-se/run_pretrain.py | 0 .../ernie-3.5-se/run_trainer_stage2.sh | 0 .../ernie-3.5-se/tokenizer.py | 0 llm/{ => experimental}/ernie-3.5-se/utils.py | 0 .../scripts}/run_sharding_v2.sh | 0 .../scripts}/run_trainer.sh | 0 .../scripts}/run_trainer_tp2cp2.sh | 0 .../scripts}/run_trainer_tp4pp2.sh | 0 .../scripts}/run_trainer_tp4sep2.sh | 0 llm/fused_layers.py | 1 - llm/gemma/sft_argument_7b.json | 32 -- llm/gemma/sft_argument_7b_sharding.json | 33 -- llm/gemma/sft_argument_sharding.json | 31 -- llm/glm/README.md | 102 ----- llm/glm/data.py | 67 --- llm/glm/finetune_generation.py | 188 --------- llm/glm/predict_generation.py | 151 ------- llm/glm/utils.py | 79 ---- llm/gpt-3/README.md | 205 --------- llm/gpt-3/finetune_generation.py | 250 ----------- llm/gpt-3/predict_generation.py | 165 -------- 
llm/gpt-3/run_pretrain.py | 1 - llm/gpt-3/tests/test_sequence_parallel.py | 98 ----- llm/gpt-3/utils.py | 393 ------------------ llm/llama/lora_argument_pissa.json | 33 -- llm/llama/megre_tp_and_pp.py | 88 ---- .../pretrain-baichuan2_13b-tp2sd4_stage2.json | 40 -- ...in-flagalpha_llama2_13b-tp2sd4_stage2.json | 40 -- ...ain-flagalpha_llama2_7b-tp2sd4_stage2.json | 40 -- ...retrain-linly_llama2_7b-tp2sd4_stage2.json | 40 -- .../pretrain-llama2_7b-tp2sd4_stage2.json | 40 -- .../pretrain-llama_13b-tp2sd4_stage2.json | 40 -- .../pretrain-llama_7b-tp2sd4_stage2.json | 40 -- llm/llama/run_pretrain.py | 1 - llm/llama/sft_pp_argument.json | 31 -- llm/llama/tests/test_pipeline_parallel.py | 132 ------ llm/llama/tests/test_sequence_parallel.py | 118 ------ .../tests/unified-ckpt-llama-500m/config.json | 17 - llm/merge_lora_params.py | 147 ------- llm/{llama/npu => npu/llama}/export_utils.py | 16 +- .../npu => npu/llama}/llama_npu_opt_lora.sh | 4 +- .../npu => npu/llama}/llama_npu_opt_ppt.sh | 0 .../npu => npu/llama}/llama_npu_opt_sft.sh | 4 +- llm/{ => predict}/export_model.py | 4 +- llm/{ => predict}/flask_server.py | 7 +- llm/{ => predict}/gradio_ui.py | 0 llm/{ => predict}/predictor.py | 2 +- llm/{ => predict}/request_flask_server.py | 0 llm/qwen/lora_argument_pissa.json | 33 -- llm/qwen/lora_argument_qwen2_7b.json | 32 -- llm/qwen/lora_argument_qwen2moe.json | 32 -- .../pretrain-qwen1.5_7b-tp2sd4_stage2.json | 41 -- llm/qwen/pretrain-qwen2_7b-tp2sd4_stage2.json | 41 -- llm/qwen/pretrain-qwen_7b-tp2sd4_stage2.json | 40 -- llm/qwen/pretrain_argument_tp2pp4.json | 40 -- llm/qwen/pt_argument_qwen2_7b.json | 33 -- llm/qwen/sft_argument_qwen2_7b.json | 31 -- llm/qwen/sft_argument_qwen2moe.json | 30 -- ...finetune_generation.py => run_finetune.py} | 52 +-- llm/run_pretrain.py | 12 +- llm/tests/test_best_pretrain_speed.py | 266 ------------ llm/tools/merge_lora_params.py | 222 ++++++++++ llm/{ => tools}/merge_tp_and_pp_params.py | 0 llm/{ => utils}/argument.py | 37 ++ llm/{ => 
utils}/data.py | 1 + llm/{llama => utils}/fused_layers.py | 0 llm/{ => utils}/quant.py | 0 llm/{llama => utils}/register_reshard.py | 0 llm/{ => utils}/utils.py | 10 + scripts/ci_approval/run_ci_approval.sh | 2 +- scripts/distribute/ci_case_auto.sh | 4 +- tests/llm/test_finetune.py | 2 +- tests/llm/test_finetune_prefix_tuning.py | 2 +- tests/llm/test_gradio.py | 2 +- tests/llm/test_long_sequence_strategies.py | 2 +- tests/llm/test_lora.py | 18 +- tests/llm/test_predictor.py | 4 +- tests/llm/test_ptq.py | 8 +- tests/llm/testing_utils.py | 6 +- .../benchmark_common/run_benchmark.sh | 8 +- .../configs/llama/train_infer_python.txt | 2 +- .../ft/benchmark_common/run_benchmark.sh | 4 +- tests/trainer/test_lora_unified_checkpoint.py | 4 +- tests/trainer/test_unified_checkpoint.py | 2 +- tests/transformers/test_chat_template.py | 8 +- 208 files changed, 945 insertions(+), 3828 deletions(-) rename {llm/llama => legacy/examples/benchmark/llm/llama_single_gpu}/benchmark.py (100%) rename {llm/llama => legacy/examples/benchmark/llm/llama_single_gpu}/benchmark_utils.py (100%) delete mode 100644 llm/.gitignore delete mode 120000 llm/Alignment/RM/models rename llm/{Alignment => alignment}/README.md (87%) rename llm/{ => alignment/dpo}/dpo_argument.py (100%) rename llm/{dpo_train.py => alignment/dpo/run_dpo.py} (100%) rename llm/{Alignment/PPO => alignment/ppo}/comm_utils.py (100%) rename llm/{Alignment/PPO => alignment/ppo}/data/__init__.py (100%) rename llm/{Alignment/PPO => alignment/ppo}/data/alpaca.py (100%) rename llm/{Alignment/PPO => alignment/ppo}/data/base.py (100%) rename llm/{Alignment/PPO => alignment/ppo}/data/preference.py (100%) rename llm/{Alignment/PPO => alignment/ppo}/data/prompt_only.py (100%) rename llm/{Alignment/PPO => alignment/ppo}/data/safe_rlhf.py (100%) rename llm/{Alignment/PPO => alignment/ppo}/data/supervised.py (100%) rename llm/{Alignment/PPO => alignment/ppo}/infer_utils.py (100%) rename llm/{Alignment/PPO => alignment/ppo}/models/__init__.py 
(100%) rename llm/{Alignment/PPO => alignment/ppo}/models/infer_model_utils.py (100%) rename llm/{Alignment/PPO => alignment/ppo}/models/model_pp.py (100%) rename llm/{Alignment/PPO => alignment/ppo}/models/pp_model_utils.py (100%) rename llm/{Alignment/PPO => alignment/ppo}/models/ppo_model.py (100%) rename llm/{Alignment/PPO => alignment/ppo}/models/ppo_model_utils.py (100%) rename llm/{Alignment/PPO => alignment/ppo}/models/score_model.py (100%) rename llm/{Alignment/PPO => alignment/ppo}/models/score_model_utils.py (100%) rename llm/{Alignment/PPO => alignment/ppo}/ppo_trainer.py (100%) rename llm/{Alignment/PPO/ppo_main.py => alignment/ppo/run_ppo.py} (100%) rename llm/{Alignment/PPO => alignment/ppo}/tests/run_model.py (100%) rename llm/{Alignment/PPO => alignment/ppo}/tests/test_export.py (100%) rename llm/{Alignment/PPO => alignment/ppo}/trainer_utils.py (100%) create mode 120000 llm/alignment/rm/models rename llm/{Alignment/RM => alignment/rm}/reward_trainer.py (100%) rename llm/{Alignment/RM/reward_main.py => alignment/rm/run_reward.py} (100%) rename llm/{gpt-3/auto_parallel => auto_parallel/gpt-3}/run_pretrain_auto.py (70%) rename llm/{gpt-3/auto_parallel => auto_parallel/gpt-3}/run_pretrain_auto_dp2mp2pp2.sh (72%) rename llm/{llama/auto_parallel => auto_parallel/llama}/README.md (100%) rename llm/{llama/auto_parallel => auto_parallel/llama}/run_llama3.sh (100%) rename llm/{llama/auto_parallel => auto_parallel/llama}/run_pretrain_auto.py (100%) rename llm/{llama/auto_parallel => auto_parallel/llama}/run_pretrain_auto.sh (100%) rename llm/{llama/auto_parallel => auto_parallel/llama}/run_pretrain_auto_static.py (100%) rename llm/{llama/auto_parallel => auto_parallel/llama}/run_pretrain_auto_static.sh (100%) rename llm/{llama/auto_parallel => auto_parallel/llama}/run_pretrain_auto_static_sp.sh (100%) rename llm/{llama/auto_parallel => auto_parallel/llama}/run_pretrain_hand.py (100%) rename llm/{llama/auto_parallel => 
auto_parallel/llama}/run_pretrain_hand.sh (100%) rename llm/{qwen/auto_parallel => auto_parallel/qwen}/pretrain_argument_auto_dp2tp2pp2.json (100%) rename llm/{qwen/auto_parallel => auto_parallel/qwen}/run_pretrain_3D_auto.py (100%) rename llm/{qwen/auto_parallel => auto_parallel/qwen}/run_pretrain_3D_auto.sh (100%) delete mode 100644 llm/baichuan/pretrain-baichuan2_13b-sd8_stage2.json delete mode 100644 llm/benchmark.sh create mode 100644 llm/config/baichuan/README.md create mode 100644 llm/config/baichuan/awq_argument.json create mode 100644 llm/config/baichuan/dpo_argument.json rename llm/{chatglm2 => config/baichuan}/gptq_argument.json (71%) create mode 100644 llm/config/baichuan/lora_argument.json rename llm/{baichuan/pretrain-baichuan2_7b-tp2sd4_stage2.json => config/baichuan/pretrain_argument.json} (90%) create mode 100644 llm/config/baichuan/ptq_argument.json create mode 100644 llm/config/baichuan/qlora_argument.json rename llm/{ => config}/bloom/README.md (92%) rename llm/{llama => config/bloom}/gptq_argument.json (72%) rename llm/{ => config}/bloom/lora_argument.json (91%) rename llm/{ => config}/bloom/pt_argument.json (92%) rename llm/{chatglm2 => config/bloom}/ptq_argument.json (79%) rename llm/{ => config}/bloom/sft_argument.json (91%) rename llm/{ => config}/chatglm/README.md (92%) rename llm/{bloom => config/chatglm}/gptq_argument.json (73%) rename llm/{ => config}/chatglm/lora_argument.json (91%) rename llm/{ => config}/chatglm/pt_argument.json (94%) rename llm/{ => config}/chatglm/ptq_argument.json (73%) rename llm/{ => config}/chatglm/sft_argument.json (91%) rename llm/{ => config}/chatglm2/README.md (91%) rename llm/{chatglm => config/chatglm2}/gptq_argument.json (73%) rename llm/{ => config}/chatglm2/lora_argument.json (91%) rename llm/{ => config}/chatglm2/pt_argument.json (94%) rename llm/{bloom => config/chatglm2}/ptq_argument.json (79%) rename llm/{ => config}/chatglm2/sft_argument.json (85%) rename llm/{ => config}/gemma/README.md (100%) 
rename llm/{ => config}/gemma/sft_argument.json (71%) create mode 100644 llm/config/gpt-3/README.md rename llm/{llama => config/gpt-3}/lora_argument.json (86%) rename llm/{gpt-3/pretrain-gpt_medium_en-stage2.json => config/gpt-3/pretrain_argument.json} (97%) create mode 100644 llm/config/gpt-3/sft_argument.json rename llm/{ => config}/llama/README.md (92%) rename llm/{ => config}/llama/awq_argument.json (76%) rename llm/{ => config}/llama/dpo_argument.json (92%) create mode 100644 llm/config/llama/gptq_argument.json create mode 100644 llm/config/llama/lora_argument.json rename llm/config/llama/{ppo.json => ppo_argument.json} (100%) rename llm/{llama/pretrain-llama2_13b-tp2sd4_stage2.json => config/llama/pretrain_argument.json} (83%) rename llm/{qwen => config/llama}/pt_argument.json (85%) rename llm/{ => config}/llama/ptq_argument.json (83%) rename llm/{ => config}/llama/qlora_argument.json (84%) rename llm/config/llama/{rm.json => rm_argument.json} (100%) rename llm/{ => config}/llama/sft_argument.json (68%) rename llm/{ => config}/llama/wint8_lora_argument.json (89%) rename llm/{ => config}/mixtral/lora_argument.json (88%) rename llm/{llama/pretrain-ziya_llama_13b-tp2sd4_stage2.json => config/mixtral/pretrain_argument.json} (79%) rename llm/{ => config}/mixtral/sft_argument.json (74%) rename llm/{ => config}/opt/README.md (88%) rename llm/{ => config}/opt/lora_argument.json (94%) rename llm/{ => config}/opt/sft_argument.json (94%) rename llm/{ => config}/qwen/README.md (96%) rename llm/{ => config}/qwen/dpo_argument.json (93%) rename llm/{ => config}/qwen/lora_argument.json (82%) rename llm/{qwen/pretrain_argument_stage2.json => config/qwen/pretrain_argument.json} (84%) rename llm/{llama => config/qwen}/pt_argument.json (81%) rename llm/{ => config}/qwen/sft_argument.json (78%) rename llm/{ => experimental}/ernie-3.5-se/README.md (100%) rename llm/{ => experimental}/ernie-3.5-se/configuration.py (100%) rename llm/{ => 
experimental}/ernie-3.5-se/conversion_utils.py (100%) rename llm/{ => experimental}/ernie-3.5-se/data.py (100%) rename llm/{ => experimental}/ernie-3.5-se/ernie-tokenizer/sentencepiece.bpe.model (100%) rename llm/{ => experimental}/ernie-3.5-se/ernie-tokenizer/special_tokens_map.json (100%) rename llm/{ => experimental}/ernie-3.5-se/ernie-tokenizer/tokenizer_config.json (100%) rename llm/{ => experimental}/ernie-3.5-se/ernie_dataset.py (100%) rename llm/{ => experimental}/ernie-3.5-se/finetune_generation.py (100%) rename llm/{ => experimental}/ernie-3.5-se/modeling.py (100%) rename llm/{ => experimental}/ernie-3.5-se/predict_generation.py (100%) rename llm/{ => experimental}/ernie-3.5-se/run_pretrain.py (100%) rename llm/{ => experimental}/ernie-3.5-se/run_trainer_stage2.sh (100%) rename llm/{ => experimental}/ernie-3.5-se/tokenizer.py (100%) rename llm/{ => experimental}/ernie-3.5-se/utils.py (100%) rename llm/{llama => experimental/scripts}/run_sharding_v2.sh (100%) rename llm/{llama => experimental/scripts}/run_trainer.sh (100%) rename llm/{llama => experimental/scripts}/run_trainer_tp2cp2.sh (100%) rename llm/{llama => experimental/scripts}/run_trainer_tp4pp2.sh (100%) rename llm/{llama => experimental/scripts}/run_trainer_tp4sep2.sh (100%) delete mode 120000 llm/fused_layers.py delete mode 100644 llm/gemma/sft_argument_7b.json delete mode 100644 llm/gemma/sft_argument_7b_sharding.json delete mode 100644 llm/gemma/sft_argument_sharding.json delete mode 100644 llm/glm/README.md delete mode 100644 llm/glm/data.py delete mode 100644 llm/glm/finetune_generation.py delete mode 100644 llm/glm/predict_generation.py delete mode 100644 llm/glm/utils.py delete mode 100644 llm/gpt-3/README.md delete mode 100644 llm/gpt-3/finetune_generation.py delete mode 100644 llm/gpt-3/predict_generation.py delete mode 120000 llm/gpt-3/run_pretrain.py delete mode 100644 llm/gpt-3/tests/test_sequence_parallel.py delete mode 100644 llm/gpt-3/utils.py delete mode 100644 
llm/llama/lora_argument_pissa.json delete mode 100644 llm/llama/megre_tp_and_pp.py delete mode 100644 llm/llama/pretrain-baichuan2_13b-tp2sd4_stage2.json delete mode 100644 llm/llama/pretrain-flagalpha_llama2_13b-tp2sd4_stage2.json delete mode 100644 llm/llama/pretrain-flagalpha_llama2_7b-tp2sd4_stage2.json delete mode 100644 llm/llama/pretrain-linly_llama2_7b-tp2sd4_stage2.json delete mode 100644 llm/llama/pretrain-llama2_7b-tp2sd4_stage2.json delete mode 100644 llm/llama/pretrain-llama_13b-tp2sd4_stage2.json delete mode 100644 llm/llama/pretrain-llama_7b-tp2sd4_stage2.json delete mode 120000 llm/llama/run_pretrain.py delete mode 100644 llm/llama/sft_pp_argument.json delete mode 100644 llm/llama/tests/test_pipeline_parallel.py delete mode 100644 llm/llama/tests/test_sequence_parallel.py delete mode 100644 llm/llama/tests/unified-ckpt-llama-500m/config.json delete mode 100644 llm/merge_lora_params.py rename llm/{llama/npu => npu/llama}/export_utils.py (91%) rename llm/{llama/npu => npu/llama}/llama_npu_opt_lora.sh (94%) rename llm/{llama/npu => npu/llama}/llama_npu_opt_ppt.sh (100%) rename llm/{llama/npu => npu/llama}/llama_npu_opt_sft.sh (95%) rename llm/{ => predict}/export_model.py (96%) rename llm/{ => predict}/flask_server.py (98%) rename llm/{ => predict}/gradio_ui.py (100%) rename llm/{ => predict}/predictor.py (99%) rename llm/{ => predict}/request_flask_server.py (100%) delete mode 100644 llm/qwen/lora_argument_pissa.json delete mode 100644 llm/qwen/lora_argument_qwen2_7b.json delete mode 100644 llm/qwen/lora_argument_qwen2moe.json delete mode 100644 llm/qwen/pretrain-qwen1.5_7b-tp2sd4_stage2.json delete mode 100644 llm/qwen/pretrain-qwen2_7b-tp2sd4_stage2.json delete mode 100644 llm/qwen/pretrain-qwen_7b-tp2sd4_stage2.json delete mode 100644 llm/qwen/pretrain_argument_tp2pp4.json delete mode 100644 llm/qwen/pt_argument_qwen2_7b.json delete mode 100644 llm/qwen/sft_argument_qwen2_7b.json delete mode 100644 llm/qwen/sft_argument_qwen2moe.json rename 
llm/{finetune_generation.py => run_finetune.py} (95%) delete mode 100644 llm/tests/test_best_pretrain_speed.py create mode 100644 llm/tools/merge_lora_params.py rename llm/{ => tools}/merge_tp_and_pp_params.py (100%) rename llm/{ => utils}/argument.py (89%) rename llm/{ => utils}/data.py (99%) rename llm/{llama => utils}/fused_layers.py (100%) rename llm/{ => utils}/quant.py (100%) rename llm/{llama => utils}/register_reshard.py (100%) rename llm/{ => utils}/utils.py (99%) diff --git a/docs/llm/peft.md b/docs/llm/peft.md index 234756e0f71b..f720138c6d23 100644 --- a/docs/llm/peft.md +++ b/docs/llm/peft.md @@ -277,4 +277,4 @@ key function 该函数会遍历整个权重参数列表,对于每个权重参数weight,统计所有进行梯度更新的参数,最后将信息打印出来。 ``` -更详细的使用可以参考[finetuning 脚本](https://github.com/PaddlePaddle/PaddleNLP/blob/develop/llm/causallm/finetune_generation.py)版本, 以及对应的启动脚本编写方式(写在 [README.md](https://github.com/PaddlePaddle/PaddleNLP/blob/develop/llm/causallm/README.md)文件中)。 +更详细的使用可以参考[finetuning 脚本](https://github.com/PaddlePaddle/PaddleNLP/blob/develop/llm/run_finetune.py)版本, 以及对应的启动脚本编写方式(写在 [README.md](https://github.com/PaddlePaddle/PaddleNLP/blob/develop/llm/causallm/README.md)文件中)。 diff --git a/llm/llama/benchmark.py b/legacy/examples/benchmark/llm/llama_single_gpu/benchmark.py similarity index 100% rename from llm/llama/benchmark.py rename to legacy/examples/benchmark/llm/llama_single_gpu/benchmark.py diff --git a/llm/llama/benchmark_utils.py b/legacy/examples/benchmark/llm/llama_single_gpu/benchmark_utils.py similarity index 100% rename from llm/llama/benchmark_utils.py rename to legacy/examples/benchmark/llm/llama_single_gpu/benchmark_utils.py diff --git a/llm/.gitignore b/llm/.gitignore deleted file mode 100644 index d81fdef50031..000000000000 --- a/llm/.gitignore +++ /dev/null @@ -1,12 +0,0 @@ -# tmp files -infer.json -output.json - -# data -AdvertiseGen.tar.gz - -# checkpoints -checkpoints/ - -# inference_model -inference*/ \ No newline at end of file diff --git a/llm/Alignment/RM/models 
b/llm/Alignment/RM/models deleted file mode 120000 index 39963209bbb5..000000000000 --- a/llm/Alignment/RM/models +++ /dev/null @@ -1 +0,0 @@ -../PPO/models \ No newline at end of file diff --git a/llm/README.md b/llm/README.md index 36311c9980d1..c3009a1ceab2 100644 --- a/llm/README.md +++ b/llm/README.md @@ -19,17 +19,17 @@ ## 🛠️ 支持模型列表 🛠️ -| Model | Pretrain | SFT | LoRA | Prefix Tuning | Quantization | Weight convert | -| --- | --- | --- | --- | --- | --- | --- | -| [LLaMA/LLaMA2](./llama) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | -| [Baichuan/Baichuan2](./llama) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | -| [ChatGLM-6B](./chatglm) | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | -| [ChatGLM2/ChatGLM3](./chatglm2) | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | -| [Qwen](./qwen) | ✅ | ✅ | ✅ | ✅ | 🚧 | ✅ |j -| [Bloom](./bloom) | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | -| [GPT-3](./gpt-3) | ✅ | ✅ | 🚧 | 🚧 | 🚧 | ✅ | -| [OPT](./opt) | 🚧 | ✅ | ✅ | 🚧 | 🚧 | ✅ | -| [GLM](./glm) | ❌ | ✅ | ✅ | 🚧 | 🚧 | ✅ | +| Model | Pretrain | SFT | LoRA | Prefix Tuning | DPO | Quantization | Weight convert | +| --- | --- | --- | --- | --- | --- | --- | --- | +| [LLaMA](./llama) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| [Qwen](./qwen) | ✅ | ✅ | ✅ | ✅ | ✅ | 🚧 | ✅ | +| [Mixtral](./mixtral) | ✅ | ✅ | ✅ | ❌ | 🚧 |🚧 | 🚧 | +| [Baichuan/Baichuan2](./llama) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| [ChatGLM-6B](./chatglm) | ❌ | ✅ | ✅ | ✅ | 🚧 | ✅ | ❌ | +| [ChatGLM2/ChatGLM3](./chatglm2) | ❌ | ✅ | ✅ | ✅ | 🚧 | ✅ | ✅ | +| [Bloom](./bloom) | ❌ | ✅ | ✅ | ✅ |🚧 | ✅ | ✅ | +| [GPT-3](./gpt-3) | ✅ | ✅ | 🚧 | 🚧 |🚧 | 🚧 | ✅ | +| [OPT](./opt) | 🚧 | ✅ | ✅ | 🚧 | 🚧 |🚧 | ✅ | * ✅: Supported * 🚧: In Progress @@ -39,7 +39,7 @@ ## 🚀 快速开始 🚀 ### 1. 预训练 -PaddleNLP将飞桨4D并行策略加入到Trainer API中, 用户只需修改Trainer配置即可使用不同的分布式策略。目前工具链提供[LLaMA/LLaMA2](./llama)、[GPT-3](./gpt-3)、[Qwen](./qwen)、[Baichuan/Baichuan2](./llama) 等模型预训练功能,更多模型支持持续更新中。 +PaddleNLP将飞桨4D并行策略加入到Trainer API中, 用户只需修改Trainer配置即可使用不同的分布式策略。目前工具链提供[LLaMA/LLaMA2](./llama)、[GPT-3](./gpt-3)、[Qwen](./qwen)、[Baichuan/Baichuan2](./llama)、[Mixtral](./mixtral) 等模型预训练功能,更多模型支持持续更新中。
llm @@ -54,7 +54,7 @@ PaddleNLP将飞桨4D并行策略加入到Trainer API中, 用户只需修改Tra 我们在此处提供了更详细的[预训练数据制作](),[分布式策略支持情况]( https://paddlenlp.readthedocs.io/zh/latest/llm/pretraining/index.html#model-capability),[性能测试报告文档](https://paddlenlp.readthedocs.io/zh/latest/llm/pretraining/index.html#model-performance),参见: https://paddlenlp.readthedocs.io/zh/latest/llm/pretraining/index.html. 大模型权重列表参见[此处](https://paddlenlp.readthedocs.io/zh/latest/llm/pretraining/index.html#model-weight) -此项目支持了LLaMA、GPT-3、BaiChuan、Qwen 等大模型的预训练。用户切换配置config文件,即可一键运行。 +此项目支持了LLaMA、GPT-3、BaiChuan、Qwen、Mixtral 等大模型的预训练。用户切换配置config文件,即可一键运行。 数据详细制作流程可参考[此处](https://paddlenlp.readthedocs.io/zh/latest/llm/pretraining/dataset.html) : https://paddlenlp.readthedocs.io/zh/latest/llm/pretraining/dataset.html @@ -79,30 +79,26 @@ mv llama_openwebtext_100k.idx ./data ```shell # 编译自定义算子,可选 -cd ../model_zoo/gpt-3/external_ops/ && python3 setup.py install && cd - +cd ../legacy/model_zoo/gpt-3/external_ops/ && python3 setup.py install && cd - -# llama 模型预训练 -python -u -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" run_pretrain.py ./llama/pretrain-llama2_7b-tp2sd4_stage2.json - -# Qwen 模型预训练 -python -u -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" run_pretrain.py ./qwen/pretrain_argument_stage2.json +# 模型预训练参考 +python -u -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" run_pretrain.py ./config/llama/pretrain_argument.json ``` 注意: 1. 建议使用paddle develop版本训练,需要安装`pip install tool_helpers visualdl==2.5.3`等相关缺失whl包 2. `use_flash_attention` 需要在A100机器开启,建议使用cuda11.8环境。 -3. `use_fused_rms_norm` 需要安装[此目录](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/model_zoo/gpt-3/external_ops)下的自定义OP, `python setup.py install`。如果安装后仍然找不到算子,需要额外设置PYTHONPATH +3. `use_fused_rms_norm` 需要安装自定义算子。如果安装后仍然找不到算子,需要额外设置PYTHONPATH 4. `continue_training` 表示从现有的预训练模型加载训练。7b模型初始loss大概为2.xx, 随机初始化模型loss从11.x左右下降。 -5. 当前脚本为sharding版本,需要4D并行训练(数据、sharding、张量、流水线并行)的用户,请参考 `run_trainer_tp4pp2.sh`脚本。 -6. 
多机训练时,若各机器使用的训练数据文件位置相同(例如挂载共享硬盘情况),请指定`--share_folder true`使全局0号卡制作缓存数据。否则默认各台机器的0号卡独立制作缓存数据, -7. 若数据集文件夹中存在默认缓存文件夹`index-cache/`,则额外指定的`--data_cache`不生效,训练时优先加载默认缓存文件夹中的内容。 +5. 多机训练时,若各机器使用的训练数据文件位置相同(例如挂载共享硬盘情况),请指定`--share_folder true`使全局0号卡制作缓存数据。否则默认各台机器的0号卡独立制作缓存数据, +6. 若数据集文件夹中存在默认缓存文件夹`index-cache/`,则额外指定的`--data_cache`不生效,训练时优先加载默认缓存文件夹中的内容。 ### 2. 精调 PaddleNLP支持多个主流大模型的SFT、LoRA、Prefix Tuning等精调策略,提供统一、高效精调方案: - **统一训练入口**。飞桨大模型套件精调方案可适配业界主流大模型,用户只需修改配置文件,即能在单卡或多卡(支持4D并行分布式策略)进行多种大模型精调。 -- **高效数据和分布式策略**。Zero Padding零填充优化策略有效减少了pad token的占比,提高模型训练效率高达100%。独创PEFT结合低比特和分布式并行策略,大幅降低大模型精调硬件门槛,支持单卡(A100 80G)百亿模型微调、单机(A100 80G * 8)千亿模型微调。 +- **高效数据和分布式策略**。Zero Padding零填充优化策略结合FlashMask策略有效提升模型训练效率。独创PEFT结合低比特和分布式并行策略,大幅降低大模型精调硬件门槛,支持单卡(A100 80G)百亿模型微调、单机(A100 80G * 8)千亿模型微调。 - **支持多轮对话**。支持统一对话模板,支持多轮对话高效训练,详参[多轮对话文档](./docs/chat_template.md)。 @@ -137,26 +133,26 @@ tar -zxvf AdvertiseGen.tar.gz **全参精调:SFT** ```bash -# 四卡llama SFT启动命令参考 -python -u -m paddle.distributed.launch --gpus "0,1,2,3" finetune_generation.py ./llama/sft_argument.json +# SFT启动命令参考 +python -u -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" run_finetune.py ./config/llama/sft_argument.json ``` **LoRA** ```bash -# 单卡llama LoRA启动命令参考 -python finetune_generation.py ./llama/lora_argument.json +# LoRA启动命令参考 +python run_finetune.py ./config/llama/lora_argument.json ``` **Prefix Tuning** ```bash -# 单卡llama Prefix Tuning启动命令参考 -python finetune_generation.py ./llama/pt_argument.json +# Prefix Tuning启动命令参考 +python run_finetune.py ./config/llama/pt_argument.json ``` 更多大模型精调分布式使用文档、训练细节和效果请参见[大模型精调教程](./docs/finetune.md)。 ### 3. 
对齐 -我们支持DPO等偏好对齐策略。 +我们支持DPO等偏好对齐策略。DPO策略采用zero_padding策略,结合FlashMask策略,有效提升模型训练效率。 **数据准备**: @@ -189,10 +185,10 @@ wget https://bj.bcebos.com/paddlenlp/datasets/examples/ultrafeedback_binarized.t tar -zxvf ultrafeedback_binarized.tar.gz ``` -**全参精调:SFT** +**全参DPO** ```bash -# 四卡llama SFT启动命令参考 -python -u -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" dpo_train.py ./llama/dpo_argument.json +# DPO启动命令参考 +python -u -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" ./dpo/run_dpo.py ./config/llama/dpo_argument.json ``` ### 4. 量化 @@ -215,10 +211,10 @@ python -u -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" dpo_train.py ./ ``` # PTQ 量化启动命令参考 -python finetune_generation.py ./llama/ptq_argument.json +python run_finetune.py ./config/llama/ptq_argument.json # GPTQ 量化启动命令参考 -python finetune_generation.py ./llama/ptq_argument.json +python run_finetune.py ./config/llama/ptq_argument.json ``` 更多技术细节和模型量化使用详见[量化文档](./docs/quantization.md)。 @@ -231,13 +227,13 @@ PaddleNLP除了提供常用模型推理外,还提供了高性能推理,内 ```shell # 动态图模型推理命令参考 -python predictor.py --model_name_or_path meta-llama/Llama-2-7b-chat --data_file ./data/dev.json --dtype float16 +python ./predict/predictor.py --model_name_or_path meta-llama/Llama-2-7b-chat --data_file ./data/dev.json --dtype float16 # 静态图模型推理命令参考 # step1 : 静态图导出 -python export_model.py --model_name_or_path meta-llama/Llama-2-7b-chat --output_path ./inference --dtype float16 +python ./predict/export_model.py --model_name_or_path meta-llama/Llama-2-7b-chat --output_path ./inference --dtype float16 # step2: 静态图推理 -python predictor.py --model_name_or_path ./inference --data_file ./data/dev.json --dtype float16 --mode static +python ./predict/predictor.py --model_name_or_path ./inference --data_file ./data/dev.json --dtype float16 --mode static ``` - **InferenceModel 高性能推理**:PaddleNLP 还提供了高性能推理模型加快并行推理的速度,同时支持FP16、Prefix Tuning、WINT8、A8W8多种推理方式。 @@ -253,13 +249,13 @@ python predictor.py --model_name_or_path ./inference --data_file ./data/dev.json 
```shell # 高性能动态图模型推理命令参考 -python predictor.py --model_name_or_path meta-llama/Llama-2-7b-chat --inference_model --dtype float16 +python ./predict/predictor.py --model_name_or_path meta-llama/Llama-2-7b-chat --inference_model --dtype float16 # 高性能静态图模型推理命令参考 # step1 : 静态图导出 -python export_model.py --model_name_or_path meta-llama/Llama-2-7b-chat --inference_model --output_path ./inference --dtype float16 +python ./predict/export_model.py --model_name_or_path meta-llama/Llama-2-7b-chat --inference_model --output_path ./inference --dtype float16 # step2: 静态图推理 -python predictor.py --model_name_or_path ./inference --inference_model --dtype "float16" --mode "static" +python ./predict/predictor.py --model_name_or_path ./inference --inference_model --dtype "float16" --mode "static" ``` 更多常用模型推理和高性能模型使用方法详见[大模型推理文档](./docs/inference.md)。 @@ -277,7 +273,7 @@ python predictor.py --model_name_or_path ./inference --inference_model --dtype " 我们提供了一套基于动态图推理的简单易用UI服务化部署脚本,用户可以快速部署服务化推理。 ``` -python -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" flask_server.py \ +python -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" ./predict/flask_server.py \ --model_name_or_path meta-llama/Llama-2-7b-chat \ --port 8010 \ --flask_port 8011 \ @@ -287,7 +283,7 @@ python -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" flask_server.py \ - `flask_port`: Flask服务端口号,默认8010。 - 其他参数请参见[推理文档](./docs/inference.md)中推理参数配置。 -此外,如果想通过API脚本的方式跑推理,可参考:`./request_flask_server.py` 文件。 +此外,如果想通过API脚本的方式跑推理,可参考:`./predict/request_flask_server.py` 文件。
diff --git a/llm/Alignment/README.md b/llm/alignment/README.md similarity index 87% rename from llm/Alignment/README.md rename to llm/alignment/README.md index fbf978dc208c..7a9a54408d92 100644 --- a/llm/Alignment/README.md +++ b/llm/alignment/README.md @@ -8,7 +8,7 @@ ``` . -├── PPO # PPO 训练相关目录 +├── ppo # PPO 训练相关目录 │ ├── comm_utils.py # 通信相关工具py文件 │ ├── data # 数据集相关目录 │ │ ├── alpaca.py # alpaca(raw)数据集py文件 @@ -28,16 +28,16 @@ │ │ ├── ppo_model_utils.py # PPO loss等模型策略py文件 │ │ ├── score_model.py # score model模型定义py文件 │ │ └── score_model_utils.py # score model基类及工具py文件 -│ ├── ppo_main.py # RLHF训练脚本 +│ ├── run_ppo.py # RLHF训练脚本 │ ├── ppo_trainer.py # RLHF训练执行器py脚本 │ ├── tests # 测试相关目录 │ │ ├── run_model.py │ │ └── test_export.py │ └── trainer_utils.py # Trainer补丁及工具py脚本 ├── README.md -└── RM # Reward Model 训练相关目录 - ├── models -> ../PPO/models - ├── reward_main.py # reward model训练脚本 +└── rm # Reward Model 训练相关目录 + ├── models -> ../ppo/models + ├── run_reward.py # reward model训练脚本 └── reward_trainer.py # reward训练执行器py脚本 ``` @@ -179,14 +179,14 @@ PPO 完整的训练过程包括以下 3 个阶段,如下图所示(来自[Dee 2. 
Reward Model Fine-Tuning -使用 `reward_main.py` 脚本根据 `rm.json` 训练奖励模型 +使用 `run_reward.py` 脚本根据 `rm_argument.json` 训练奖励模型 ``` -cd RM -python -u -m paddle.distributed.launch reward_main.py ../../config/llama/rm.json +cd rm +python -u -m paddle.distributed.launch run_reward.py ../../config/llama/rm_argument.json ``` -`rm.json` 中的绝大部分参数释义同[LLM 精调](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/llm#2-%E7%B2%BE%E8%B0%83),不再赘述;稍有区别的是 `train_datasets`/`eval_datasets` 分别使用数据集定义注册时的`NAME`属性给出训练和验证集。另外对于奖励模型训练有以下特殊参数配置及释义(使用 PKU-Alignment/PKU-SafeRLHF 中的默认值): +`rm_argument.json` 中的绝大部分参数释义同[LLM 精调](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/llm#2-%E7%B2%BE%E8%B0%83),不再赘述;稍有区别的是 `train_datasets`/`eval_datasets` 分别使用数据集定义注册时的`NAME`属性给出训练和验证集。另外对于奖励模型训练有以下特殊参数配置及释义(使用 PKU-Alignment/PKU-SafeRLHF 中的默认值): - `normalize_score_during_training`:是否在训练过程中对奖励进行 normalize,默认为 `False`。 - `normalizer_type`:使用 normalizer 时计算 mean、var 的方式,可选`"RunningMeanStd", "ExponentialMovingAverage"`。 @@ -196,15 +196,15 @@ python -u -m paddle.distributed.launch reward_main.py ../../config/llama/rm.json 3. 
RLHF: -RLHF 阶段需要 actor model、reference model、critic model、reward model 四个模型;actor-model/reference-model 使用 SFT 模型进行 initialize/frozen;critic-model/reward-model 使用 reward 模型进行 initialize/frozen (另外注意若 SFT 使用 LoRA 请先将 LoRA 权重合并)。这里使用 PKU-Alignment/PKU-SafeRLHF 提供的 SFT 模型([PKU-Alignment/alpaca-7b-reproduced](https://huggingface.co/PKU-Alignment/alpaca-7b-reproduced))和 reward 模型([PKU-Alignment/beaver-7b-v1.0-reward](https://huggingface.co/PKU-Alignment/beaver-7b-v1.0-reward),注意该模型只关注 helpful 未考量 harmless)作为示例,使用 `ppo_main.py` 脚本根据 `ppo.json` 进行 RLHF 训练。 +RLHF 阶段需要 actor model、reference model、critic model、reward model 四个模型;actor-model/reference-model 使用 SFT 模型进行 initialize/frozen;critic-model/reward-model 使用 reward 模型进行 initialize/frozen (另外注意若 SFT 使用 LoRA 请先将 LoRA 权重合并)。这里使用 PKU-Alignment/PKU-SafeRLHF 提供的 SFT 模型([PKU-Alignment/alpaca-7b-reproduced](https://huggingface.co/PKU-Alignment/alpaca-7b-reproduced))和 reward 模型([PKU-Alignment/beaver-7b-v1.0-reward](https://huggingface.co/PKU-Alignment/beaver-7b-v1.0-reward),注意该模型只关注 helpful 未考量 harmless)作为示例,使用 `run_ppo.py` 脚本根据 `ppo_argument.json` 进行 RLHF 训练。 ``` # 类型提升 warning 暂时通过 loglevel 屏蔽,待后续修复 -cd PPO -PYTHONPATH=../../ GLOG_minloglevel=2 python -u -m paddle.distributed.launch ppo_main.py ../../config/llama/ppo.json +cd ppo +PYTHONPATH=../../ GLOG_minloglevel=2 python -u -m paddle.distributed.launch run_ppo.py ../../config/llama/ppo_argument.json ``` -`ppo.json` 中的绝大部分参数释义同[LLM 精调](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/llm#2-%E7%B2%BE%E8%B0%83),不再赘述,重点给出以下参数配置及释义(使用 PKU-Alignment/PKU-SafeRLHF 中的默认值): +`ppo_argument.json` 中的绝大部分参数释义同[LLM 精调](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/llm#2-%E7%B2%BE%E8%B0%83),不再赘述,重点给出以下参数配置及释义(使用 PKU-Alignment/PKU-SafeRLHF 中的默认值): - `train_datasets`:使用数据集定义注册时的`NAME`属性给出训练集。 - `eval_datasets`:使用数据集定义注册时的`NAME`属性给出验证集。 @@ -230,7 +230,7 @@ PYTHONPATH=../../ GLOG_minloglevel=2 python -u -m paddle.distributed.launch ppo_ 此外为了支持更高性、更大规模的 RLHF 训练提供了以下特殊参数配置,可以按需使用: 
- `use_fusemt`:安装 paddlenlp_ops 后将在 rollout 生成时开启生成加速(开启流水线并行时不支持生成加速),通过此设置可以禁用生成加速。 - `eval_mode`:支持为空或者设置为 "single"、"tensor_parallel";通常可以在使用流水线并行训练时设置为"tensor_parallel",以此在 rollout 生成阶段使用非流水线并行模型并进行生成加速。 -- `offload_level`:支持设置为"freeze_model"、"optimizer"、"train_model"或者同时使用(空格分隔),分别指示 reward+reference 两个冻结模型、actor+critic 两个训练模型的优化器状态和模型参数的 offload/reload,用于在不同阶段 model/optimizer 使用结束后及时 offload 并在下次使用时 reload 相应参数权重以节省显存。 +- `offload_level`:支持设置为"freeze_model"、"optimizer"、"train_model"或者同时使用(空格分隔),分别指示 reward+reference 两个冻结模型、actor+critic 两个训练模型的优化器状态和模型参数的 offload/reload,用于在不同阶段 model/optimizer 使用结束后及时 offload 并在下次使用时 reload 相应参数权重以节省显存。 另外注意,在使用流水线并行时(pipeline_parallel_degree大于1)建议将 `dataloader_drop_last` 设置为 true, 以此避免不同batch size带来的问题。 diff --git a/llm/dpo_argument.py b/llm/alignment/dpo/dpo_argument.py similarity index 100% rename from llm/dpo_argument.py rename to llm/alignment/dpo/dpo_argument.py diff --git a/llm/dpo_train.py b/llm/alignment/dpo/run_dpo.py similarity index 100% rename from llm/dpo_train.py rename to llm/alignment/dpo/run_dpo.py diff --git a/llm/Alignment/PPO/comm_utils.py b/llm/alignment/ppo/comm_utils.py similarity index 100% rename from llm/Alignment/PPO/comm_utils.py rename to llm/alignment/ppo/comm_utils.py diff --git a/llm/Alignment/PPO/data/__init__.py b/llm/alignment/ppo/data/__init__.py similarity index 100% rename from llm/Alignment/PPO/data/__init__.py rename to llm/alignment/ppo/data/__init__.py diff --git a/llm/Alignment/PPO/data/alpaca.py b/llm/alignment/ppo/data/alpaca.py similarity index 100% rename from llm/Alignment/PPO/data/alpaca.py rename to llm/alignment/ppo/data/alpaca.py diff --git a/llm/Alignment/PPO/data/base.py b/llm/alignment/ppo/data/base.py similarity index 100% rename from llm/Alignment/PPO/data/base.py rename to llm/alignment/ppo/data/base.py diff --git a/llm/Alignment/PPO/data/preference.py b/llm/alignment/ppo/data/preference.py similarity index 100% rename from llm/Alignment/PPO/data/preference.py rename 
to llm/alignment/ppo/data/preference.py diff --git a/llm/Alignment/PPO/data/prompt_only.py b/llm/alignment/ppo/data/prompt_only.py similarity index 100% rename from llm/Alignment/PPO/data/prompt_only.py rename to llm/alignment/ppo/data/prompt_only.py diff --git a/llm/Alignment/PPO/data/safe_rlhf.py b/llm/alignment/ppo/data/safe_rlhf.py similarity index 100% rename from llm/Alignment/PPO/data/safe_rlhf.py rename to llm/alignment/ppo/data/safe_rlhf.py diff --git a/llm/Alignment/PPO/data/supervised.py b/llm/alignment/ppo/data/supervised.py similarity index 100% rename from llm/Alignment/PPO/data/supervised.py rename to llm/alignment/ppo/data/supervised.py diff --git a/llm/Alignment/PPO/infer_utils.py b/llm/alignment/ppo/infer_utils.py similarity index 100% rename from llm/Alignment/PPO/infer_utils.py rename to llm/alignment/ppo/infer_utils.py diff --git a/llm/Alignment/PPO/models/__init__.py b/llm/alignment/ppo/models/__init__.py similarity index 100% rename from llm/Alignment/PPO/models/__init__.py rename to llm/alignment/ppo/models/__init__.py diff --git a/llm/Alignment/PPO/models/infer_model_utils.py b/llm/alignment/ppo/models/infer_model_utils.py similarity index 100% rename from llm/Alignment/PPO/models/infer_model_utils.py rename to llm/alignment/ppo/models/infer_model_utils.py diff --git a/llm/Alignment/PPO/models/model_pp.py b/llm/alignment/ppo/models/model_pp.py similarity index 100% rename from llm/Alignment/PPO/models/model_pp.py rename to llm/alignment/ppo/models/model_pp.py diff --git a/llm/Alignment/PPO/models/pp_model_utils.py b/llm/alignment/ppo/models/pp_model_utils.py similarity index 100% rename from llm/Alignment/PPO/models/pp_model_utils.py rename to llm/alignment/ppo/models/pp_model_utils.py diff --git a/llm/Alignment/PPO/models/ppo_model.py b/llm/alignment/ppo/models/ppo_model.py similarity index 100% rename from llm/Alignment/PPO/models/ppo_model.py rename to llm/alignment/ppo/models/ppo_model.py diff --git 
a/llm/Alignment/PPO/models/ppo_model_utils.py b/llm/alignment/ppo/models/ppo_model_utils.py similarity index 100% rename from llm/Alignment/PPO/models/ppo_model_utils.py rename to llm/alignment/ppo/models/ppo_model_utils.py diff --git a/llm/Alignment/PPO/models/score_model.py b/llm/alignment/ppo/models/score_model.py similarity index 100% rename from llm/Alignment/PPO/models/score_model.py rename to llm/alignment/ppo/models/score_model.py diff --git a/llm/Alignment/PPO/models/score_model_utils.py b/llm/alignment/ppo/models/score_model_utils.py similarity index 100% rename from llm/Alignment/PPO/models/score_model_utils.py rename to llm/alignment/ppo/models/score_model_utils.py diff --git a/llm/Alignment/PPO/ppo_trainer.py b/llm/alignment/ppo/ppo_trainer.py similarity index 100% rename from llm/Alignment/PPO/ppo_trainer.py rename to llm/alignment/ppo/ppo_trainer.py diff --git a/llm/Alignment/PPO/ppo_main.py b/llm/alignment/ppo/run_ppo.py similarity index 100% rename from llm/Alignment/PPO/ppo_main.py rename to llm/alignment/ppo/run_ppo.py diff --git a/llm/Alignment/PPO/tests/run_model.py b/llm/alignment/ppo/tests/run_model.py similarity index 100% rename from llm/Alignment/PPO/tests/run_model.py rename to llm/alignment/ppo/tests/run_model.py diff --git a/llm/Alignment/PPO/tests/test_export.py b/llm/alignment/ppo/tests/test_export.py similarity index 100% rename from llm/Alignment/PPO/tests/test_export.py rename to llm/alignment/ppo/tests/test_export.py diff --git a/llm/Alignment/PPO/trainer_utils.py b/llm/alignment/ppo/trainer_utils.py similarity index 100% rename from llm/Alignment/PPO/trainer_utils.py rename to llm/alignment/ppo/trainer_utils.py diff --git a/llm/alignment/rm/models b/llm/alignment/rm/models new file mode 120000 index 000000000000..46643733d940 --- /dev/null +++ b/llm/alignment/rm/models @@ -0,0 +1 @@ +../ppo/models \ No newline at end of file diff --git a/llm/Alignment/RM/reward_trainer.py b/llm/alignment/rm/reward_trainer.py similarity index 100% 
rename from llm/Alignment/RM/reward_trainer.py rename to llm/alignment/rm/reward_trainer.py diff --git a/llm/Alignment/RM/reward_main.py b/llm/alignment/rm/run_reward.py similarity index 100% rename from llm/Alignment/RM/reward_main.py rename to llm/alignment/rm/run_reward.py diff --git a/llm/gpt-3/auto_parallel/run_pretrain_auto.py b/llm/auto_parallel/gpt-3/run_pretrain_auto.py similarity index 70% rename from llm/gpt-3/auto_parallel/run_pretrain_auto.py rename to llm/auto_parallel/gpt-3/run_pretrain_auto.py index 0ee470d37255..5afb828d0e2f 100644 --- a/llm/gpt-3/auto_parallel/run_pretrain_auto.py +++ b/llm/auto_parallel/gpt-3/run_pretrain_auto.py @@ -18,7 +18,6 @@ import random import sys import types -from collections import OrderedDict from dataclasses import dataclass, field from typing import List, Optional @@ -33,10 +32,10 @@ from paddlenlp.transformers import ( AutoTokenizer, CosineAnnealingWithWarmupDecay, - LinearAnnealingWithWarmupDecay, GPTConfig, GPTForCausalLMAuto, GPTPretrainingCriterionAuto, + LinearAnnealingWithWarmupDecay, ) from paddlenlp.utils.log import logger @@ -50,11 +49,10 @@ print_rank_0, ) -def add_start_docstrings(*docstr): +def add_start_docstrings(*docstr): def docstring_decorator(fn): - fn.__doc__ = "".join(docstr) + (fn.__doc__ - if fn.__doc__ is not None else "") + fn.__doc__ = "".join(docstr) + (fn.__doc__ if fn.__doc__ is not None else "") return fn return docstring_decorator @@ -70,22 +68,19 @@ class PreTrainingArguments(TrainingArguments): decay_steps: float = field( default=None, metadata={ - "help": - "The steps use to control the learing rate. If the step > decay_steps, will use the min_learning_rate." + "help": "The steps use to control the learing rate. If the step > decay_steps, will use the min_learning_rate." }, ) enable_linear_fused_grad_add: bool = field( default=False, metadata={ - "help": - "Enable fused linear grad add strategy, which will reduce elementwise add for grad accumulation in the backward of nn.Linear ." 
+ "help": "Enable fused linear grad add strategy, which will reduce elementwise add for grad accumulation in the backward of nn.Linear ." }, ) fused_linear_param_grad_add: bool = field( default=False, metadata={ - "help": - "Enable fused_linear_param_grad pass, which should replace add_n_op with add_op for gradients accumulation." + "help": "Enable fused_linear_param_grad pass, which should replace add_n_op with add_op for gradients accumulation." }, ) job_schedule_profiler_start: int = field( @@ -97,27 +92,19 @@ class PreTrainingArguments(TrainingArguments): metadata={"help": "The step to end job_schedule_profiler."}, ) pipeline_schedule_mode: str = field( - default="1F1B", - metadata={ - "help": - "The pipeline schedule mode, support FThenB, 1F1B, VPP and Eager-1F1B." - }) - sr: Optional[int] = field( - default=0, metadata={"help": "The count of chunks without recompute."}) + default="1F1B", metadata={"help": "The pipeline schedule mode, support FThenB, 1F1B, VPP and Eager-1F1B."} + ) + sr: Optional[int] = field(default=0, metadata={"help": "The count of chunks without recompute."}) refined_ops_patterns: Optional[List[str]] = field( - default=None, metadata={"help": "The pattern of refined recompute."}) + default=None, metadata={"help": "The pattern of refined recompute."} + ) virtual_pipeline_seg_method: str = field( - default="LlamaDecoderLayerAuto", - metadata={ - "help": "The seg method of spliting pp layer for virtual pipeline." - }) + default="LlamaDecoderLayerAuto", metadata={"help": "The seg method of spliting pp layer for virtual pipeline."} + ) # NOTE(gongenlei): new add autotuner_benchmark autotuner_benchmark: bool = field( default=False, - metadata={ - "help": - "Weather to run benchmark by autotuner. True for from_scratch and pad_max_length." - }, + metadata={"help": "Weather to run benchmark by autotuner. 
True for from_scratch and pad_max_length."}, ) def __post_init__(self): @@ -140,8 +127,7 @@ def __post_init__(self): if self.fused_linear_param_grad_add: fused_passes = self.strategy.fused_passes fused_passes.enable = True - fused_passes.fused_passes_list.append( - "fused_linear_param_grad_add_pass") + fused_passes.fused_passes_list.append("fused_linear_param_grad_add_pass") logger.info(self.strategy) @@ -155,39 +141,28 @@ class DataArguments: """ input_dir: str = field( - default=None, - metadata={ - "help": - "The name of the dataset to use (via the datasets library)." - }) - split: str = field(default="949,50,1", - metadata={"help": "Train/valid/test data split."}) + default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} + ) + split: str = field(default="949,50,1", metadata={"help": "Train/valid/test data split."}) max_seq_length: int = field( default=1024, metadata={ - "help": - "The maximum total input sequence length after tokenization. Sequences longer " + "help": "The maximum total input sequence length after tokenization. Sequences longer " "than this will be truncated, sequences shorter will be padded." }, ) share_folder: bool = field( default=False, - metadata={ - "help": - "Use share folder for data dir and output dir on multi machine." 
- }, + metadata={"help": "Use share folder for data dir and output dir on multi machine."}, ) - data_impl: str = field( - default="mmap", - metadata={"help": "The format of the preprocessed data."}) + data_impl: str = field(default="mmap", metadata={"help": "The format of the preprocessed data."}) skip_warmup: bool = field( default=True, metadata={"help": "Whether to skip the warmup process of mmap files."}, ) - data_cache: str = field( - default=None, metadata={"help": "The path of the cached dataset."}) + data_cache: str = field(default=None, metadata={"help": "The path of the cached dataset."}) @dataclass @@ -197,52 +172,35 @@ class ModelArguments: """ model_type: Optional[str] = field( - default="llama", - metadata={"help": "Only support for llama pre-training for now."}) + default="llama", metadata={"help": "Only support for llama pre-training for now."} + ) model_name_or_path: str = field( default="__internal_testing__/tiny-random-llama", metadata={ - "help": - "Path to pretrained model or model identifier from https://paddlenlp.readthedocs.io/zh/latest/model_zoo/transformers.html" + "help": "Path to pretrained model or model identifier from https://paddlenlp.readthedocs.io/zh/latest/model_zoo/transformers.html" }, ) tokenizer_name_or_path: Optional[str] = field( - default=None, - metadata={ - "help": - "Pretrained tokenizer name or path if not the same as model_name" - }) + default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} + ) config_name: Optional[str] = field( - default=None, - metadata={ - "help": - "Pretrained config name or path if not the same as model_name" - }) + default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} + ) vocab_size: Optional[int] = field( default=None, metadata={ - "help": - ".Vocabulary size of the Llama model. Defines the number of different tokens that can be represented by the `inputs_ids`" + "help": ".Vocabulary size of the Llama model. 
Defines the number of different tokens that can be represented by the `inputs_ids`" }, ) - hidden_size: Optional[int] = field( - default=None, - metadata={"help": "Dimension of the hidden representations."}) - intermediate_size: Optional[int] = field( - default=None, - metadata={"help": "Dimension of the MLP representations."}) + hidden_size: Optional[int] = field(default=None, metadata={"help": "Dimension of the hidden representations."}) + intermediate_size: Optional[int] = field(default=None, metadata={"help": "Dimension of the MLP representations."}) num_hidden_layers: Optional[int] = field( - default=None, - metadata={ - "help": "Number of hidden layers in the Transformer encoder." - }) + default=None, metadata={"help": "Number of hidden layers in the Transformer encoder."} + ) num_attention_heads: Optional[int] = field( default=None, - metadata={ - "help": - "Number of attention heads for each attention layer in the Transformer encoder." - }, + metadata={"help": "Number of attention heads for each attention layer in the Transformer encoder."}, ) use_flash_attention: bool = field( default=False, @@ -258,9 +216,7 @@ class ModelArguments: ) fuse_attention_ffn: bool = field( default=False, - metadata={ - "help": "whether to fuse first up and gate proj in mlp block" - }, + metadata={"help": "whether to fuse first up and gate proj in mlp block"}, ) recompute_granularity: str = field( default="full", @@ -273,15 +229,12 @@ class ModelArguments: continue_training: bool = field( default=False, metadata={ - "help": - "Pre-training from existing paddlenlp model weights. Default False and model will train from scratch. If set True, the model_name_or_path argument must exist in the paddlenlp models." + "help": "Pre-training from existing paddlenlp model weights. Default False and model will train from scratch. If set True, the model_name_or_path argument must exist in the paddlenlp models." 
}, ) - hidden_dropout_prob: float = field( - default=0.1, metadata={"help": "The hidden dropout prob."}) - attention_probs_dropout_prob: float = field( - default=0.1, metadata={"help": "The attention hidden dropout prob."}) + hidden_dropout_prob: float = field(default=0.1, metadata={"help": "The hidden dropout prob."}) + attention_probs_dropout_prob: float = field(default=0.1, metadata={"help": "The attention hidden dropout prob."}) sequence_parallel: bool = field( default=False, @@ -297,16 +250,12 @@ class ModelArguments: ) no_recompute_layers: Optional[List[int]] = field( default=None, - metadata={ - "help": - "Specify the full transformer layers that should not be recomputed." - }, + metadata={"help": "Specify the full transformer layers that should not be recomputed."}, ) pp_recompute_interval: int = field( default=1, metadata={ - "help": - "The interval for the number of layers at which recomputation occurs. A value of 0 indicates no recomputation. Default is 0." + "help": "The interval for the number of layers at which recomputation occurs. A value of 0 indicates no recomputation. Default is 0." 
}, ) recompute_use_reentrant: bool = field( @@ -323,30 +272,27 @@ def create_pretrained_dataset( need_data=True, ): - check_data_split(data_args.split, training_args.do_train, - training_args.do_eval, training_args.do_predict) + check_data_split(data_args.split, training_args.do_train, training_args.do_eval, training_args.do_predict) train_val_test_num_samples = [ - training_args.per_device_train_batch_size * - training_args.data_parallel_degree * training_args.max_steps * - training_args.gradient_accumulation_steps, - training_args.per_device_eval_batch_size * - training_args.data_parallel_degree * training_args.eval_iters * - (training_args.max_steps // training_args.eval_steps + 1), - training_args.per_device_eval_batch_size * - training_args.data_parallel_degree * training_args.test_iters, + training_args.per_device_train_batch_size + * training_args.data_parallel_degree + * training_args.max_steps + * training_args.gradient_accumulation_steps, + training_args.per_device_eval_batch_size + * training_args.data_parallel_degree + * training_args.eval_iters + * (training_args.max_steps // training_args.eval_steps + 1), + training_args.per_device_eval_batch_size * training_args.data_parallel_degree * training_args.test_iters, ] print_rank_0(" > datasets target sizes (minimum size):") if training_args.do_train: - print_rank_0(" train: {}".format( - train_val_test_num_samples[0])) + print_rank_0(" train: {}".format(train_val_test_num_samples[0])) if training_args.do_eval: - print_rank_0(" validation: {}".format( - train_val_test_num_samples[1])) + print_rank_0(" validation: {}".format(train_val_test_num_samples[1])) if training_args.do_predict: - print_rank_0(" test: {}".format( - train_val_test_num_samples[2])) + print_rank_0(" test: {}".format(train_val_test_num_samples[2])) # Build the datasets. 
train_dataset, valid_dataset, test_dataset = build_train_valid_test_datasets( @@ -399,9 +345,9 @@ def get_train_data_file(args): return args.input_dir.split() else: files = [ - os.path.join(args.input_dir, f) for f in os.listdir(args.input_dir) - if (os.path.isfile(os.path.join(args.input_dir, f)) and ( - "_idx.npz" in str(f) or ".idx" in str(f))) + os.path.join(args.input_dir, f) + for f in os.listdir(args.input_dir) + if (os.path.isfile(os.path.join(args.input_dir, f)) and ("_idx.npz" in str(f) or ".idx" in str(f))) ] files = [x.replace("_idx.npz", "") for x in files] files = [x.replace(".idx", "") for x in files] # add @@ -419,7 +365,6 @@ def get_train_data_file(args): class PretrainingTrainer(AutoTrainer): - def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -441,8 +386,7 @@ def print_config(args, key=""): logger.info("{:^40}".format("{} Configuration Arguments".format(key))) logger.info("{:30}: {}".format("paddle commit id", paddle.version.commit)) - logger.info("{:30}: {}".format("paddlenlp commit id", - paddlenlp.version.commit)) + logger.info("{:30}: {}".format("paddlenlp commit id", paddlenlp.version.commit)) for a in dir(args): if a[:2] != "__": # don't print double underscore methods @@ -467,12 +411,10 @@ def init_seed(seed: int = 1234, args=None): dp_degree=args.data_parallel_degree, pp_degree=args.pipeline_parallel_degree, mp_degree=args.tensor_parallel_degree, - sharding_degree= - 1, # auto_parallel's sharding is not orthogonal with dp, mp and pp + sharding_degree=1, # auto_parallel's sharding is not orthogonal with dp, mp and pp ) - global_seed, local_seed, random_seed = _get_distributed_seeds( - args.seed, topo) + global_seed, local_seed, random_seed = _get_distributed_seeds(args.seed, topo) paddle.seed(local_seed) random.seed(random_seed) @@ -480,8 +422,8 @@ def init_seed(seed: int = 1234, args=None): logger.info( "The global seed is set to {}, local seed is set to {} and " - "random seed is set to {}.".format(global_seed, 
local_seed, - random_seed)) + "random seed is set to {}.".format(global_seed, local_seed, random_seed) + ) else: random.seed(args.seed) np.random.seed(args.seed) @@ -489,14 +431,11 @@ def init_seed(seed: int = 1234, args=None): def main(): - parser = PdArgumentParser( - (ModelArguments, DataArguments, PreTrainingArguments)) + parser = PdArgumentParser((ModelArguments, DataArguments, PreTrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): - model_args, data_args, training_args = parser.parse_json_file( - json_file=os.path.abspath(sys.argv[1])) + model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) else: - model_args, data_args, training_args = parser.parse_args_into_dataclasses( - ) + model_args, data_args, training_args = parser.parse_args_into_dataclasses() if training_args.enable_linear_fused_grad_add: from fused_layers import mock_layers @@ -524,15 +463,12 @@ def main(): # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, world_size: {training_args.world_size}, " - + - f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16 or training_args.bf16}" + + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16 or training_args.bf16}" ) # Detecting last checkpoint. last_checkpoint = None - if os.path.isdir( - training_args.output_dir - ) and training_args.do_train and not training_args.overwrite_output_dir: + if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: last_checkpoint = get_last_checkpoint(training_args.output_dir) if last_checkpoint is not None and training_args.resume_from_checkpoint is None: logger.info( @@ -540,41 +476,35 @@ def main(): "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." 
) - config_class, model_class, criterion_class = MODEL_CLASSES[ - model_args.model_type] + config_class, model_class, criterion_class = MODEL_CLASSES[model_args.model_type] - tokenizer = AutoTokenizer.from_pretrained( - model_args.tokenizer_name_or_path) + tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name_or_path) config = config_class.from_pretrained(model_args.model_name_or_path) config.seq_length = data_args.max_seq_length # There are some technique extend RotaryEmbedding context. so don't change max_position_embeddings if not model_args.continue_training: - config.max_position_embeddings = max(config.max_position_embeddings, - data_args.max_seq_length) + config.max_position_embeddings = max(config.max_position_embeddings, data_args.max_seq_length) if not model_args.continue_training: - config.vocab_size = max(config.vocab_size, - ((tokenizer.vocab_size - 1) // 128 + 1) * 128) - logger.info( - f"Reset vocab size to {config.vocab_size} for batter amp peformance." - ) + config.vocab_size = max(config.vocab_size, ((tokenizer.vocab_size - 1) // 128 + 1) * 128) + logger.info(f"Reset vocab size to {config.vocab_size} for batter amp peformance.") if model_args.no_recompute_layers is not None: model_args.no_recompute_layers.sort() config.vocab_size = model_args.vocab_size if model_args.vocab_size is not None else config.vocab_size config.hidden_size = model_args.hidden_size if model_args.hidden_size is not None else config.hidden_size - config.intermediate_size = (model_args.intermediate_size - if model_args.intermediate_size is not None - else config.intermediate_size) - config.num_hidden_layers = (model_args.num_hidden_layers - if model_args.num_hidden_layers is not None - else config.num_hidden_layers) - config.num_attention_heads = (model_args.num_attention_heads - if model_args.num_attention_heads is not None - else config.num_attention_heads) + config.intermediate_size = ( + model_args.intermediate_size if model_args.intermediate_size is not None 
else config.intermediate_size + ) + config.num_hidden_layers = ( + model_args.num_hidden_layers if model_args.num_hidden_layers is not None else config.num_hidden_layers + ) + config.num_attention_heads = ( + model_args.num_attention_heads if model_args.num_attention_heads is not None else config.num_attention_heads + ) config.use_flash_attention = model_args.use_flash_attention config.use_fused_rms_norm = model_args.use_fused_rms_norm @@ -615,10 +545,7 @@ def main(): if training_args.recompute: def fn(layer): - if hasattr( - layer, - "enable_recompute") and (layer.enable_recompute is False - or layer.enable_recompute == 0): + if hasattr(layer, "enable_recompute") and (layer.enable_recompute is False or layer.enable_recompute == 0): layer.enable_recompute = True model.apply(fn) diff --git a/llm/gpt-3/auto_parallel/run_pretrain_auto_dp2mp2pp2.sh b/llm/auto_parallel/gpt-3/run_pretrain_auto_dp2mp2pp2.sh similarity index 72% rename from llm/gpt-3/auto_parallel/run_pretrain_auto_dp2mp2pp2.sh rename to llm/auto_parallel/gpt-3/run_pretrain_auto_dp2mp2pp2.sh index 9219cd27e3a3..71578bb81532 100755 --- a/llm/gpt-3/auto_parallel/run_pretrain_auto_dp2mp2pp2.sh +++ b/llm/auto_parallel/gpt-3/run_pretrain_auto_dp2mp2pp2.sh @@ -1,3 +1,17 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ export PYTHONPATH="../../../":$PYTHONPATH export FLAGS_cudnn_deterministic=1 export FLAGS_embedding_deterministic=1 diff --git a/llm/llama/auto_parallel/README.md b/llm/auto_parallel/llama/README.md similarity index 100% rename from llm/llama/auto_parallel/README.md rename to llm/auto_parallel/llama/README.md diff --git a/llm/llama/auto_parallel/run_llama3.sh b/llm/auto_parallel/llama/run_llama3.sh similarity index 100% rename from llm/llama/auto_parallel/run_llama3.sh rename to llm/auto_parallel/llama/run_llama3.sh diff --git a/llm/llama/auto_parallel/run_pretrain_auto.py b/llm/auto_parallel/llama/run_pretrain_auto.py similarity index 100% rename from llm/llama/auto_parallel/run_pretrain_auto.py rename to llm/auto_parallel/llama/run_pretrain_auto.py diff --git a/llm/llama/auto_parallel/run_pretrain_auto.sh b/llm/auto_parallel/llama/run_pretrain_auto.sh similarity index 100% rename from llm/llama/auto_parallel/run_pretrain_auto.sh rename to llm/auto_parallel/llama/run_pretrain_auto.sh diff --git a/llm/llama/auto_parallel/run_pretrain_auto_static.py b/llm/auto_parallel/llama/run_pretrain_auto_static.py similarity index 100% rename from llm/llama/auto_parallel/run_pretrain_auto_static.py rename to llm/auto_parallel/llama/run_pretrain_auto_static.py diff --git a/llm/llama/auto_parallel/run_pretrain_auto_static.sh b/llm/auto_parallel/llama/run_pretrain_auto_static.sh similarity index 100% rename from llm/llama/auto_parallel/run_pretrain_auto_static.sh rename to llm/auto_parallel/llama/run_pretrain_auto_static.sh diff --git a/llm/llama/auto_parallel/run_pretrain_auto_static_sp.sh b/llm/auto_parallel/llama/run_pretrain_auto_static_sp.sh similarity index 100% rename from llm/llama/auto_parallel/run_pretrain_auto_static_sp.sh rename to llm/auto_parallel/llama/run_pretrain_auto_static_sp.sh diff --git a/llm/llama/auto_parallel/run_pretrain_hand.py b/llm/auto_parallel/llama/run_pretrain_hand.py similarity index 100% rename from llm/llama/auto_parallel/run_pretrain_hand.py 
rename to llm/auto_parallel/llama/run_pretrain_hand.py diff --git a/llm/llama/auto_parallel/run_pretrain_hand.sh b/llm/auto_parallel/llama/run_pretrain_hand.sh similarity index 100% rename from llm/llama/auto_parallel/run_pretrain_hand.sh rename to llm/auto_parallel/llama/run_pretrain_hand.sh diff --git a/llm/qwen/auto_parallel/pretrain_argument_auto_dp2tp2pp2.json b/llm/auto_parallel/qwen/pretrain_argument_auto_dp2tp2pp2.json similarity index 100% rename from llm/qwen/auto_parallel/pretrain_argument_auto_dp2tp2pp2.json rename to llm/auto_parallel/qwen/pretrain_argument_auto_dp2tp2pp2.json diff --git a/llm/qwen/auto_parallel/run_pretrain_3D_auto.py b/llm/auto_parallel/qwen/run_pretrain_3D_auto.py similarity index 100% rename from llm/qwen/auto_parallel/run_pretrain_3D_auto.py rename to llm/auto_parallel/qwen/run_pretrain_3D_auto.py diff --git a/llm/qwen/auto_parallel/run_pretrain_3D_auto.sh b/llm/auto_parallel/qwen/run_pretrain_3D_auto.sh similarity index 100% rename from llm/qwen/auto_parallel/run_pretrain_3D_auto.sh rename to llm/auto_parallel/qwen/run_pretrain_3D_auto.sh diff --git a/llm/baichuan/pretrain-baichuan2_13b-sd8_stage2.json b/llm/baichuan/pretrain-baichuan2_13b-sd8_stage2.json deleted file mode 100644 index 51d55556a9c1..000000000000 --- a/llm/baichuan/pretrain-baichuan2_13b-sd8_stage2.json +++ /dev/null @@ -1,40 +0,0 @@ -{ - "model_name_or_path": "baichuan-inc/Baichuan2-13B-Base", - "tokenizer_name_or_path": "baichuan-inc/Baichuan2-13B-Base", - "input_dir": "./data", - "output_dir": "./checkpoints/baichuan_pretrain_ckpts", - "per_device_train_batch_size": 1, - "gradient_accumulation_steps": 8, - "per_device_eval_batch_size": 2, - "tensor_parallel_degree": 1, - "pipeline_parallel_degree": 1, - "sharding": "stage2", - "virtual_pp_degree": 1, - "sequence_parallel": 0, - "use_flash_attention": true, - "use_fused_rms_norm": true, - "use_fused_rope": true, - "max_seq_length": 4096, - "learning_rate": 3e-05, - "min_learning_rate": 3e-06, - "warmup_steps": 
1000, - "logging_steps": 1, - "max_steps": 10000, - "save_steps": 5000, - "eval_steps": 1000, - "weight_decay": 0.01, - "bf16": true, - "fp16_opt_level": "O2", - "warmup_ratio": 0.01, - "max_grad_norm": 1.0, - "dataloader_num_workers": 1, - "continue_training": 1, - "do_train": true, - "do_eval": true, - "do_predict": true, - "disable_tqdm": true, - "recompute": true, - "distributed_dataloader": 1, - "recompute_granularity": "full", - "save_total_limit": 2 - } diff --git a/llm/benchmark.sh b/llm/benchmark.sh deleted file mode 100644 index d49858b42b76..000000000000 --- a/llm/benchmark.sh +++ /dev/null @@ -1,36 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -export PYTHONPATH=$(dirname $(pwd)):$PYTHONPATH - -export FLAGS_control_flow_use_new_executor=1 -export FLAGS_new_executor_serial_run=1 -export FLAGS_allocator_strategy=naive_best_fit -export FLAGS_fraction_of_gpu_memory_to_use=0.92 - -export FLAGS_use_autotune=1 -export FLAGS_cublaslt_exhaustive_search_times=10 -export FLAGS_cache_inference_while_scope=1 - - -python predictor.py \ - --model_name_or_path ./llama7b-inference_model_fp16 \ - --dtype float16 \ - --src_length 300 \ - --max_length 100 \ - --output_file "infer.json" \ - --mode "static" \ - --batch_size 1 \ - --benchmark \ - --inference_model diff --git a/llm/config/baichuan/README.md b/llm/config/baichuan/README.md new file mode 100644 index 000000000000..98bf760a6caa --- /dev/null +++ b/llm/config/baichuan/README.md @@ -0,0 +1,15 @@ +# Baichuan + +## 1. 模型介绍 + +**支持模型权重:** + +| Model | +| ---------------------------------| +| baichuan-inc/Baichuan-7B | +| baichuan-inc/Baichuan-13B-Base | +| baichuan-inc/Baichuan-13B-Chat | +| baichuan-inc/Baichuan2-7B-Base | +| baichuan-inc/Baichuan2-7B-Chat | +| baichuan-inc/Baichuan2-13B-Base | +| baichuan-inc/Baichuan2-13B-Chat | diff --git a/llm/config/baichuan/awq_argument.json b/llm/config/baichuan/awq_argument.json new file mode 100644 index 000000000000..23c1884ed768 --- /dev/null +++ b/llm/config/baichuan/awq_argument.json @@ -0,0 +1,23 @@ +{ + "model_name_or_path": "baichuan-inc/Baichuan2-7B-Base", + "per_device_train_batch_size": 8, + "per_device_eval_batch_size": 8, + "eval_accumulation_steps":16, + "src_length": 1024, + "max_length": 2048, + "fp16": true, + "fp16_opt_level": "O2", + "dataset_name_or_path": "./data", + "output_dir": "./checkpoints/ptq_ckpts", + "do_eval": true, + "eval_with_do_generation": false, + "do_ptq": true, + "quant_type": "weight_only_int4", + "weight_quant_method": "groupwise", + "ptq_step": 16, + "smooth": true, + "auto_clip": true, + "autoclip_step": 1, + "do_awq": true, + "unified_checkpoint": true + } \ No newline at end of 
file diff --git a/llm/config/baichuan/dpo_argument.json b/llm/config/baichuan/dpo_argument.json new file mode 100644 index 000000000000..376caef0eda7 --- /dev/null +++ b/llm/config/baichuan/dpo_argument.json @@ -0,0 +1,38 @@ +{ + "model_name_or_path": "baichuan-inc/Baichuan2-7B-Base", + "train_dataset_path": "./data/train.jsonl", + "dev_dataset_path": "./data/dev.jsonl", + "output_dir": "./checkpoints/dpo_ckpts", + "per_device_train_batch_size": 1, + "gradient_accumulation_steps": 8, + "per_device_eval_batch_size": 1, + "num_train_epochs": 1, + "max_steps": 100, + "learning_rate": 1e-06, + "warmup_steps": 10, + "logging_steps": 1, + "evaluation_strategy": "steps", + "save_strategy": "steps", + "eval_steps": 100, + "save_steps": 500, + "max_seq_len": 4096, + "max_prompt_len": 2048, + "bf16": true, + "fp16_opt_level": "O2", + "do_train": true, + "do_eval": true, + "disable_tqdm": true, + "load_best_model_at_end": true, + "tensor_parallel_degree": 8, + "sharding_parallel_degree": 1, + "sharding": "stage1", + "use_flash_attention": true, + "recompute": false, + "recompute_granularity": "full", + "dpo_beta": 0.1, + "benchmark": false, + "dpo_loss_type": "sigmoid", + "dpo_label_smoothing": 0.0, + "unified_checkpoint": true, + "autotuner_benchmark":false + } diff --git a/llm/chatglm2/gptq_argument.json b/llm/config/baichuan/gptq_argument.json similarity index 71% rename from llm/chatglm2/gptq_argument.json rename to llm/config/baichuan/gptq_argument.json index 9285e8b628ad..593773a268e2 100644 --- a/llm/chatglm2/gptq_argument.json +++ b/llm/config/baichuan/gptq_argument.json @@ -1,5 +1,5 @@ { - "model_name_or_path": "./checkpoints/chatglm2_sft_ckpts", + "model_name_or_path": "baichuan-inc/Baichuan2-7B-Base", "per_device_train_batch_size": 8, "per_device_eval_batch_size": 8, "eval_accumulation_steps":16, @@ -8,9 +8,10 @@ "fp16": true, "fp16_opt_level": "O2", "dataset_name_or_path": "./data", - "output_dir": "./checkpoints/chatglm2_gptq_ckpts", + "output_dir": 
"./checkpoints/gptq_ckpts", "do_eval": true, "eval_with_do_generation": false, "do_gptq": true, + "unified_checkpoint": true, "gptq_step": 8 } \ No newline at end of file diff --git a/llm/config/baichuan/lora_argument.json b/llm/config/baichuan/lora_argument.json new file mode 100644 index 000000000000..8d2702551f4b --- /dev/null +++ b/llm/config/baichuan/lora_argument.json @@ -0,0 +1,35 @@ +{ + "model_name_or_path": "baichuan-inc/Baichuan2-7B-Base", + "dataset_name_or_path": "./data", + "output_dir": "./checkpoints/lora_ckpts", + "per_device_train_batch_size": 4, + "gradient_accumulation_steps": 4, + "per_device_eval_batch_size": 8, + "eval_accumulation_steps":16, + "num_train_epochs": 3, + "learning_rate": 3e-04, + "warmup_steps": 30, + "logging_steps": 1, + "evaluation_strategy": "epoch", + "save_strategy": "epoch", + "src_length": 1024, + "max_length": 2048, + "fp16": true, + "fp16_opt_level": "O2", + "do_train": true, + "do_eval": true, + "disable_tqdm": true, + "load_best_model_at_end": true, + "eval_with_do_generation": false, + "metric_for_best_model": "accuracy", + "recompute": true, + "save_total_limit": 1, + "tensor_parallel_degree": 1, + "pipeline_parallel_degree": 1, + "sharding_parallel_degree": 1, + "sharding": "stage1", + "lora": true, + "zero_padding": false, + "unified_checkpoint": true, + "use_flash_attention": true + } diff --git a/llm/baichuan/pretrain-baichuan2_7b-tp2sd4_stage2.json b/llm/config/baichuan/pretrain_argument.json similarity index 90% rename from llm/baichuan/pretrain-baichuan2_7b-tp2sd4_stage2.json rename to llm/config/baichuan/pretrain_argument.json index da31682d6949..aeb17cf475a4 100644 --- a/llm/baichuan/pretrain-baichuan2_7b-tp2sd4_stage2.json +++ b/llm/config/baichuan/pretrain_argument.json @@ -2,12 +2,13 @@ "model_name_or_path": "baichuan-inc/Baichuan2-7B-Base", "tokenizer_name_or_path": "baichuan-inc/Baichuan2-7B-Base", "input_dir": "./data", - "output_dir": "./checkpoints/baichuan_pretrain_ckpts", + "output_dir": 
"./checkpoints/pretrain_ckpts", "per_device_train_batch_size": 2, "gradient_accumulation_steps": 8, "per_device_eval_batch_size": 2, "tensor_parallel_degree": 2, "pipeline_parallel_degree": 1, + "sharding_parallel_degree": 4, "sharding": "stage2", "virtual_pp_degree": 1, "sequence_parallel": 0, @@ -36,5 +37,6 @@ "recompute": false, "distributed_dataloader": 1, "recompute_granularity": "full", + "unified_checkpoint": true, "save_total_limit": 2 } diff --git a/llm/config/baichuan/ptq_argument.json b/llm/config/baichuan/ptq_argument.json new file mode 100644 index 000000000000..f15164f44eef --- /dev/null +++ b/llm/config/baichuan/ptq_argument.json @@ -0,0 +1,23 @@ +{ + "model_name_or_path": "baichuan-inc/Baichuan2-7B-Base", + "per_device_train_batch_size": 8, + "per_device_eval_batch_size": 8, + "eval_accumulation_steps":16, + "src_length": 1024, + "max_length": 2048, + "fp16": true, + "fp16_opt_level": "O2", + "dataset_name_or_path": "./data", + "output_dir": "./checkpoints/ptq_ckpts", + "do_eval": true, + "eval_with_do_generation": false, + "do_ptq": true, + "ptq_step": 16, + "unified_checkpoint": true, + "smooth": true, + "smooth_step": 16, + "smooth_all_linears": true, + "smooth_piecewise_search": true, + "smooth_k_piece": 3, + "smooth_search_piece": true +} \ No newline at end of file diff --git a/llm/config/baichuan/qlora_argument.json b/llm/config/baichuan/qlora_argument.json new file mode 100644 index 000000000000..c820bcff63df --- /dev/null +++ b/llm/config/baichuan/qlora_argument.json @@ -0,0 +1,34 @@ +{ + "model_name_or_path": "baichuan-inc/Baichuan2-7B-Base", + "dataset_name_or_path": "./data", + "output_dir": "./checkpoints/qlora_ckpts", + "per_device_train_batch_size": 4, + "gradient_accumulation_steps": 4, + "per_device_eval_batch_size": 8, + "eval_accumulation_steps":16, + "num_train_epochs": 3, + "learning_rate": 3e-04, + "warmup_steps": 30, + "logging_steps": 1, + "evaluation_strategy": "epoch", + "save_strategy": "epoch", + "src_length": 1024, + 
"max_length": 2048, + "fp16": true, + "fp16_opt_level": "O2", + "do_train": true, + "do_eval": true, + "disable_tqdm": true, + "load_best_model_at_end": true, + "eval_with_do_generation": false, + "metric_for_best_model": "accuracy", + "recompute": true, + "save_total_limit": 1, + "tensor_parallel_degree": 1, + "pipeline_parallel_degree": 1, + "lora": true, + "zero_padding": false, + "use_flash_attention": true, + "unified_checkpoint": true, + "weight_quantize_algo": "nf4" + } \ No newline at end of file diff --git a/llm/bloom/README.md b/llm/config/bloom/README.md similarity index 92% rename from llm/bloom/README.md rename to llm/config/bloom/README.md index 2cdeafa66968..52311561818a 100644 --- a/llm/bloom/README.md +++ b/llm/config/bloom/README.md @@ -20,6 +20,3 @@ BLOOM是一种自回归大型语言模型(LLM),在大量文本数据上训练 | bigscience/bloomz-7b1-p3 | | bigscience/bloomz-7b1 | | bellegroup/belle-7b-2m | - -## 2. 模型精调 -请参考[LLM全流程工具介绍](../README.md) diff --git a/llm/llama/gptq_argument.json b/llm/config/bloom/gptq_argument.json similarity index 72% rename from llm/llama/gptq_argument.json rename to llm/config/bloom/gptq_argument.json index 75944f076c29..615286908be0 100644 --- a/llm/llama/gptq_argument.json +++ b/llm/config/bloom/gptq_argument.json @@ -1,5 +1,5 @@ { - "model_name_or_path": "./checkpoints/llama_sft_ckpts", + "model_name_or_path": "bigscience/bloomz-7b1-mt", "per_device_train_batch_size": 8, "per_device_eval_batch_size": 8, "eval_accumulation_steps":16, @@ -8,9 +8,10 @@ "fp16": true, "fp16_opt_level": "O2", "dataset_name_or_path": "./data", - "output_dir": "./checkpoints/llama_gptq_ckpts", + "output_dir": "./checkpoints/gptq_ckpts", "do_eval": true, "eval_with_do_generation": false, "do_gptq": true, + "unified_checkpoint": true, "gptq_step": 8 } \ No newline at end of file diff --git a/llm/bloom/lora_argument.json b/llm/config/bloom/lora_argument.json similarity index 91% rename from llm/bloom/lora_argument.json rename to llm/config/bloom/lora_argument.json index 
6867ecaeedf2..d36d821a35ce 100644 --- a/llm/bloom/lora_argument.json +++ b/llm/config/bloom/lora_argument.json @@ -1,7 +1,7 @@ { "model_name_or_path": "bigscience/bloomz-7b1-mt", "dataset_name_or_path": "./data", - "output_dir": "./checkpoints/bloom_lora_ckpts", + "output_dir": "./checkpoints/lora_ckpts", "per_device_train_batch_size": 4, "gradient_accumulation_steps": 4, "per_device_eval_batch_size": 8, @@ -28,5 +28,6 @@ "pipeline_parallel_degree": 1, "lora": true, "zero_padding": false, + "unified_checkpoint": true, "use_flash_attention": false } \ No newline at end of file diff --git a/llm/bloom/pt_argument.json b/llm/config/bloom/pt_argument.json similarity index 92% rename from llm/bloom/pt_argument.json rename to llm/config/bloom/pt_argument.json index 30d6839369cc..44801b6eb623 100644 --- a/llm/bloom/pt_argument.json +++ b/llm/config/bloom/pt_argument.json @@ -1,7 +1,7 @@ { "model_name_or_path": "bigscience/bloomz-7b1-mt", "dataset_name_or_path": "./data", - "output_dir": "./checkpoints/bloom_pt_ckpts", + "output_dir": "./checkpoints/pt_ckpts", "per_device_train_batch_size": 4, "gradient_accumulation_steps": 4, "per_device_eval_batch_size": 8, @@ -28,5 +28,6 @@ "pipeline_parallel_degree": 1, "prefix_tuning": true, "zero_padding": false, + "unified_checkpoint": true, "use_flash_attention": false } \ No newline at end of file diff --git a/llm/chatglm2/ptq_argument.json b/llm/config/bloom/ptq_argument.json similarity index 79% rename from llm/chatglm2/ptq_argument.json rename to llm/config/bloom/ptq_argument.json index 46a57083584a..fff4560700e7 100644 --- a/llm/chatglm2/ptq_argument.json +++ b/llm/config/bloom/ptq_argument.json @@ -1,5 +1,5 @@ { - "model_name_or_path": "./checkpoints/chatglm2_sft_ckpts", + "model_name_or_path": "bigscience/bloomz-7b1-mt", "per_device_train_batch_size": 8, "per_device_eval_batch_size": 8, "eval_accumulation_steps":16, @@ -8,7 +8,7 @@ "fp16": true, "fp16_opt_level": "O2", "dataset_name_or_path": "./data", - "output_dir": 
"./checkpoints/chatglm2_ptq_ckpts", + "output_dir": "./checkpoints/ptq_ckpts", "do_eval": true, "eval_with_do_generation": false, "do_ptq": true, @@ -18,5 +18,6 @@ "smooth_all_linears": true, "smooth_piecewise_search": true, "smooth_k_piece": 3, + "unified_checkpoint": true, "smooth_search_piece": true } \ No newline at end of file diff --git a/llm/bloom/sft_argument.json b/llm/config/bloom/sft_argument.json similarity index 91% rename from llm/bloom/sft_argument.json rename to llm/config/bloom/sft_argument.json index 2c793576b7e0..31b020da30a1 100644 --- a/llm/bloom/sft_argument.json +++ b/llm/config/bloom/sft_argument.json @@ -1,7 +1,7 @@ { "model_name_or_path": "bigscience/bloomz-7b1-mt", "dataset_name_or_path": "./data", - "output_dir": "./checkpoints/bloom_sft_ckpts", + "output_dir": "./checkpoints/sft_ckpts", "per_device_train_batch_size": 4, "gradient_accumulation_steps": 4, "per_device_eval_batch_size": 8, @@ -27,5 +27,6 @@ "tensor_parallel_degree": 4, "pipeline_parallel_degree": 1, "zero_padding": false, + "unified_checkpoint": true, "use_flash_attention": false } \ No newline at end of file diff --git a/llm/chatglm/README.md b/llm/config/chatglm/README.md similarity index 92% rename from llm/chatglm/README.md rename to llm/config/chatglm/README.md index 281a7ceea61f..c8cfb4f8b28b 100644 --- a/llm/chatglm/README.md +++ b/llm/config/chatglm/README.md @@ -14,6 +14,3 @@ ChatGLM-6B 是一个开源的、支持中英双语问答的对话语言模型, ## 2. 模型协议 ChatGLM-6B 模型的权重的使用需要遵循[License](../../paddlenlp/transformers/chatglm/LICENSE)。 - -## 3. 
模型精调 -请参考[LLM全流程工具介绍](../README.md) diff --git a/llm/bloom/gptq_argument.json b/llm/config/chatglm/gptq_argument.json similarity index 73% rename from llm/bloom/gptq_argument.json rename to llm/config/chatglm/gptq_argument.json index 6a5cb7e882a7..d509f6aed280 100644 --- a/llm/bloom/gptq_argument.json +++ b/llm/config/chatglm/gptq_argument.json @@ -1,5 +1,5 @@ { - "model_name_or_path": "./checkpoints/bloom_sft_ckpts", + "model_name_or_path": "THUDM/chatglm-6b", "per_device_train_batch_size": 8, "per_device_eval_batch_size": 8, "eval_accumulation_steps":16, @@ -8,9 +8,10 @@ "fp16": true, "fp16_opt_level": "O2", "dataset_name_or_path": "./data", - "output_dir": "./checkpoints/bloom_gptq_ckpts", + "output_dir": "./checkpoints/gptq_ckpts", "do_eval": true, "eval_with_do_generation": false, "do_gptq": true, + "unified_checkpoint": true, "gptq_step": 8 } \ No newline at end of file diff --git a/llm/chatglm/lora_argument.json b/llm/config/chatglm/lora_argument.json similarity index 91% rename from llm/chatglm/lora_argument.json rename to llm/config/chatglm/lora_argument.json index af49af041d72..11069e723f8f 100644 --- a/llm/chatglm/lora_argument.json +++ b/llm/config/chatglm/lora_argument.json @@ -1,7 +1,7 @@ { "model_name_or_path": "THUDM/chatglm-6b", "dataset_name_or_path": "./data", - "output_dir": "./checkpoints/chatglm_lora_ckpts", + "output_dir": "./checkpoints/lora_ckpts", "per_device_train_batch_size": 4, "gradient_accumulation_steps": 4, "per_device_eval_batch_size": 8, @@ -28,5 +28,6 @@ "pipeline_parallel_degree": 1, "lora": true, "zero_padding": false, + "unified_checkpoint": true, "use_flash_attention": false } \ No newline at end of file diff --git a/llm/chatglm/pt_argument.json b/llm/config/chatglm/pt_argument.json similarity index 94% rename from llm/chatglm/pt_argument.json rename to llm/config/chatglm/pt_argument.json index 03158f7f127f..54c95fd56744 100644 --- a/llm/chatglm/pt_argument.json +++ b/llm/config/chatglm/pt_argument.json @@ -1,7 +1,7 @@ { 
"model_name_or_path": "THUDM/chatglm-6b", "dataset_name_or_path": "./data", - "output_dir": "./checkpoints/chatglm_pt_ckpts", + "output_dir": "./checkpoints/pt_ckpts", "per_device_train_batch_size": 4, "gradient_accumulation_steps": 4, "per_device_eval_batch_size": 8, diff --git a/llm/chatglm/ptq_argument.json b/llm/config/chatglm/ptq_argument.json similarity index 73% rename from llm/chatglm/ptq_argument.json rename to llm/config/chatglm/ptq_argument.json index 63474a9e0a19..64b6e480776b 100644 --- a/llm/chatglm/ptq_argument.json +++ b/llm/config/chatglm/ptq_argument.json @@ -1,5 +1,5 @@ { - "model_name_or_path": "./checkpoints/llama_sft_ckpts", + "model_name_or_path": "THUDM/chatglm-6b", "per_device_train_batch_size": 8, "per_device_eval_batch_size": 8, "eval_accumulation_steps":16, @@ -8,9 +8,10 @@ "fp16": true, "fp16_opt_level": "O2", "dataset_name_or_path": "./data", - "output_dir": "./checkpoints/llama_ptq_ckpts", + "output_dir": "./checkpoints/ptq_ckpts", "do_eval": true, "eval_with_do_generation": false, "do_ptq": true, + "unified_checkpoint": true, "ptq_step": 16 } \ No newline at end of file diff --git a/llm/chatglm/sft_argument.json b/llm/config/chatglm/sft_argument.json similarity index 91% rename from llm/chatglm/sft_argument.json rename to llm/config/chatglm/sft_argument.json index 8309f28f1439..73286c3bb5c8 100644 --- a/llm/chatglm/sft_argument.json +++ b/llm/config/chatglm/sft_argument.json @@ -1,7 +1,7 @@ { "model_name_or_path": "THUDM/chatglm-6b", "dataset_name_or_path": "./data", - "output_dir": "./checkpoints/chatglm_sft_ckpts", + "output_dir": "./checkpoints/sft_ckpts", "per_device_train_batch_size": 4, "gradient_accumulation_steps": 4, "per_device_eval_batch_size": 8, @@ -27,5 +27,6 @@ "tensor_parallel_degree": 4, "pipeline_parallel_degree": 1, "zero_padding": false, + "unified_checkpoint": true, "use_flash_attention": false } \ No newline at end of file diff --git a/llm/chatglm2/README.md b/llm/config/chatglm2/README.md similarity index 91% 
rename from llm/chatglm2/README.md rename to llm/config/chatglm2/README.md index f04166f5bd50..0929e7b20fac 100644 --- a/llm/chatglm2/README.md +++ b/llm/config/chatglm2/README.md @@ -15,6 +15,3 @@ ChatGLM2-6B 是开源中英双语对话模型 [ChatGLM-6B](https://github.com/TH ChatGLM2-6B 模型的权重的使用需要遵循[License](../../paddlenlp/transformers/chatglm_v2/LICENSE)。 - -## 3. 模型精调 -请参考[LLM全流程工具介绍](../README.md) diff --git a/llm/chatglm/gptq_argument.json b/llm/config/chatglm2/gptq_argument.json similarity index 73% rename from llm/chatglm/gptq_argument.json rename to llm/config/chatglm2/gptq_argument.json index 8b1c07742ba8..137f036a0552 100644 --- a/llm/chatglm/gptq_argument.json +++ b/llm/config/chatglm2/gptq_argument.json @@ -1,5 +1,5 @@ { - "model_name_or_path": "./checkpoints/chatglm_sft_ckpts", + "model_name_or_path": "THUDM/chatglm2-6b", "per_device_train_batch_size": 8, "per_device_eval_batch_size": 8, "eval_accumulation_steps":16, @@ -8,9 +8,10 @@ "fp16": true, "fp16_opt_level": "O2", "dataset_name_or_path": "./data", - "output_dir": "./checkpoints/chatglm_gptq_ckpts", + "output_dir": "./checkpoints/gptq_ckpts", "do_eval": true, "eval_with_do_generation": false, "do_gptq": true, + "unified_checkpoint": true, "gptq_step": 8 } \ No newline at end of file diff --git a/llm/chatglm2/lora_argument.json b/llm/config/chatglm2/lora_argument.json similarity index 91% rename from llm/chatglm2/lora_argument.json rename to llm/config/chatglm2/lora_argument.json index c88636b9bd1d..6e734fc1f2a8 100644 --- a/llm/chatglm2/lora_argument.json +++ b/llm/config/chatglm2/lora_argument.json @@ -1,7 +1,7 @@ { "model_name_or_path": "THUDM/chatglm2-6b", "dataset_name_or_path": "./data", - "output_dir": "./checkpoints/chatglm2_lora_ckpts", + "output_dir": "./checkpoints/lora_ckpts", "per_device_train_batch_size": 4, "gradient_accumulation_steps": 4, "per_device_eval_batch_size": 8, @@ -28,5 +28,6 @@ "pipeline_parallel_degree": 1, "lora": true, "zero_padding": false, + "unified_checkpoint": true, 
"use_flash_attention": false } \ No newline at end of file diff --git a/llm/chatglm2/pt_argument.json b/llm/config/chatglm2/pt_argument.json similarity index 94% rename from llm/chatglm2/pt_argument.json rename to llm/config/chatglm2/pt_argument.json index a10f9b4d788c..52a80b837686 100644 --- a/llm/chatglm2/pt_argument.json +++ b/llm/config/chatglm2/pt_argument.json @@ -1,7 +1,7 @@ { "model_name_or_path": "THUDM/chatglm2-6b", "dataset_name_or_path": "./data", - "output_dir": "./checkpoints/chatglm2_pt_ckpts", + "output_dir": "./checkpoints/pt_ckpts", "per_device_train_batch_size": 4, "gradient_accumulation_steps": 4, "per_device_eval_batch_size": 8, diff --git a/llm/bloom/ptq_argument.json b/llm/config/chatglm2/ptq_argument.json similarity index 79% rename from llm/bloom/ptq_argument.json rename to llm/config/chatglm2/ptq_argument.json index 21a28735ecc1..806c80a3cf63 100644 --- a/llm/bloom/ptq_argument.json +++ b/llm/config/chatglm2/ptq_argument.json @@ -1,5 +1,5 @@ { - "model_name_or_path": "./checkpoints/bloom_sft_ckpts", + "model_name_or_path": "./checkpoints/sft_ckpts", "per_device_train_batch_size": 8, "per_device_eval_batch_size": 8, "eval_accumulation_steps":16, @@ -8,11 +8,12 @@ "fp16": true, "fp16_opt_level": "O2", "dataset_name_or_path": "./data", - "output_dir": "./checkpoints/bloom_ptq_ckpts", + "output_dir": "./checkpoints/ptq_ckpts", "do_eval": true, "eval_with_do_generation": false, "do_ptq": true, "ptq_step": 16, + "unified_checkpoint": true, "smooth": true, "smooth_step": 16, "smooth_all_linears": true, diff --git a/llm/chatglm2/sft_argument.json b/llm/config/chatglm2/sft_argument.json similarity index 85% rename from llm/chatglm2/sft_argument.json rename to llm/config/chatglm2/sft_argument.json index 8508d9676379..ee2ffb4ee7ae 100644 --- a/llm/chatglm2/sft_argument.json +++ b/llm/config/chatglm2/sft_argument.json @@ -1,7 +1,7 @@ { "model_name_or_path": "THUDM/chatglm2-6b", "dataset_name_or_path": "./data", - "output_dir": 
"./checkpoints/chatglm2_sft_ckpts", + "output_dir": "./checkpoints/sft_ckpts", "per_device_train_batch_size": 4, "gradient_accumulation_steps": 4, "per_device_eval_batch_size": 8, @@ -24,8 +24,9 @@ "metric_for_best_model": "accuracy", "recompute": true, "save_total_limit": 1, - "sharding_parallel_degree": 4, - "sharding": "stage3", + "sharding_parallel_degree": 8, + "sharding": "stage2", "zero_padding": false, + "unified_checkpoint": true, "use_flash_attention": false } \ No newline at end of file diff --git a/llm/gemma/README.md b/llm/config/gemma/README.md similarity index 100% rename from llm/gemma/README.md rename to llm/config/gemma/README.md diff --git a/llm/gemma/sft_argument.json b/llm/config/gemma/sft_argument.json similarity index 71% rename from llm/gemma/sft_argument.json rename to llm/config/gemma/sft_argument.json index 45a483d7e52a..15d9c3b93807 100644 --- a/llm/gemma/sft_argument.json +++ b/llm/config/gemma/sft_argument.json @@ -1,7 +1,7 @@ { - "model_name_or_path": "google/gemma-2b/", + "model_name_or_path": "google/gemma-2b", "dataset_name_or_path": "./data", - "output_dir": "./checkpoints/gemma_sft_ckpts", + "output_dir": "./checkpoints/sft_ckpts", "per_device_train_batch_size": 2, "gradient_accumulation_steps": 1, "per_device_eval_batch_size": 8, @@ -24,7 +24,11 @@ "metric_for_best_model": "accuracy", "recompute": true, "save_total_limit": 1, - "tensor_parallel_degree": 2, + "tensor_parallel_degree": 1, + "pipeline_parallel_degree": 1, + "sharding_parallel_degree": 8, + "sharding": "stage2", "zero_padding": false, - "use_flash_attention": false + "unified_checkpoint": true, + "use_flash_attention": true } \ No newline at end of file diff --git a/llm/config/gpt-3/README.md b/llm/config/gpt-3/README.md new file mode 100644 index 000000000000..472c2f74cd42 --- /dev/null +++ b/llm/config/gpt-3/README.md @@ -0,0 +1,5 @@ +# GPT + +## 1. 
模型介绍 + +GPT-3是一种预训练语言模型,它能够模拟人类语言思维和表达。GPT-3拥有巨大的参数,包含了1750亿个参数,这使得它具有强大的语言理解和生成能力。它可以完成的任务包括文本生成、文本摘要、回答问题、翻译、阅读理解等。GPT-3的预训练过程使用了大量的语料库,包括互联网上的大量文本。它通过分析这些文本,学习如何生成和理解人类语言。GPT-3在自然语言处理领域具有很高的影响力,它可以模拟人类对话和生成文本,这使得它在许多应用领域都有广泛的应用,比如智能客服、自然语言处理、游戏设计等。 diff --git a/llm/llama/lora_argument.json b/llm/config/gpt-3/lora_argument.json similarity index 86% rename from llm/llama/lora_argument.json rename to llm/config/gpt-3/lora_argument.json index 6817215e0c74..1ed0576d951b 100644 --- a/llm/llama/lora_argument.json +++ b/llm/config/gpt-3/lora_argument.json @@ -1,7 +1,7 @@ { - "model_name_or_path": "facebook/llama-7b", + "model_name_or_path": "gpt2-medium-en", "dataset_name_or_path": "./data", - "output_dir": "./checkpoints/llama_lora_ckpts", + "output_dir": "./checkpoints/gpt_lora_ckpts", "per_device_train_batch_size": 4, "gradient_accumulation_steps": 4, "per_device_eval_batch_size": 8, @@ -28,5 +28,6 @@ "pipeline_parallel_degree": 1, "lora": true, "zero_padding": false, + "unified_checkpoint": true, "use_flash_attention": false } diff --git a/llm/gpt-3/pretrain-gpt_medium_en-stage2.json b/llm/config/gpt-3/pretrain_argument.json similarity index 97% rename from llm/gpt-3/pretrain-gpt_medium_en-stage2.json rename to llm/config/gpt-3/pretrain_argument.json index 3d7685a9696d..3959956bd21d 100644 --- a/llm/gpt-3/pretrain-gpt_medium_en-stage2.json +++ b/llm/config/gpt-3/pretrain_argument.json @@ -33,6 +33,7 @@ "disable_tqdm": true, "recompute": false, "distributed_dataloader": 1, + "unified_checkpoint": true, "recompute_granularity": "full", "save_total_limit": 2 } diff --git a/llm/config/gpt-3/sft_argument.json b/llm/config/gpt-3/sft_argument.json new file mode 100644 index 000000000000..76d50ec28628 --- /dev/null +++ b/llm/config/gpt-3/sft_argument.json @@ -0,0 +1,33 @@ +{ + "model_name_or_path": "gpt2-medium-en", + "dataset_name_or_path": "./data", + "output_dir": "./checkpoints/sft_ckpts", + "per_device_train_batch_size": 4, + "gradient_accumulation_steps": 4, + 
"per_device_eval_batch_size": 8, + "eval_accumulation_steps":16, + "num_train_epochs": 3, + "learning_rate": 3e-05, + "warmup_steps": 30, + "logging_steps": 1, + "evaluation_strategy": "epoch", + "save_strategy": "epoch", + "src_length": 1024, + "max_length": 2048, + "fp16": true, + "fp16_opt_level": "O2", + "do_train": true, + "do_eval": true, + "disable_tqdm": true, + "load_best_model_at_end": true, + "eval_with_do_generation": false, + "metric_for_best_model": "accuracy", + "recompute": true, + "save_total_limit": 1, + "tensor_parallel_degree": 1, + "pipeline_parallel_degree": 1, + "lora": true, + "zero_padding": false, + "unified_checkpoint": true, + "use_flash_attention": false + } diff --git a/llm/llama/README.md b/llm/config/llama/README.md similarity index 92% rename from llm/llama/README.md rename to llm/config/llama/README.md index c707c0cd64ac..bda1959533d7 100644 --- a/llm/llama/README.md +++ b/llm/config/llama/README.md @@ -16,6 +16,10 @@ | meta-llama/Llama-2-13b-chat | | meta-llama/Llama-2-70b | | meta-llama/Llama-2-70b-chat | +|meta-llama/Meta-Llama-3-8B| +|meta-llama/Meta-Llama-3-8B-Instruct| +|meta-llama/Meta-Llama-3-70B| +|meta-llama/Meta-Llama-3-70B-Instruct| | ziqingyang/chinese-llama-7b | | ziqingyang/chinese-llama-13b | | ziqingyang/chinese-alpaca-7b | @@ -48,11 +52,3 @@ tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat") LLaMA 模型的权重的使用则需要遵循[License](../../paddlenlp/transformers/llama/LICENSE)。 Llama2 模型的权重的使用则需要遵循[License](../../paddlenlp/transformers/llama/Llama2.LICENSE)。 - - -## 3. 预训练 - -请参考[LLM全流程工具介绍](../README.md) - -## 4. 
模型精调 -请参考[LLM全流程工具介绍](../README.md) diff --git a/llm/llama/awq_argument.json b/llm/config/llama/awq_argument.json similarity index 76% rename from llm/llama/awq_argument.json rename to llm/config/llama/awq_argument.json index 21a9bcdb13b3..7ae7f55b678c 100644 --- a/llm/llama/awq_argument.json +++ b/llm/config/llama/awq_argument.json @@ -1,14 +1,14 @@ { - "model_name_or_path": "./checkpoints/llama_sft_ckpts", + "model_name_or_path": "meta-llama/Meta-Llama-3-8B", "per_device_train_batch_size": 8, "per_device_eval_batch_size": 8, "eval_accumulation_steps":16, "src_length": 1024, "max_length": 2048, - "fp16": true, + "bf16": true, "fp16_opt_level": "O2", "dataset_name_or_path": "./data", - "output_dir": "./checkpoints/llama_ptq_ckpts", + "output_dir": "./checkpoints/ptq_ckpts", "do_eval": true, "eval_with_do_generation": false, "do_ptq": true, @@ -18,5 +18,6 @@ "smooth": true, "auto_clip": true, "autoclip_step": 1, + "unified_checkpoint": true, "do_awq": true } \ No newline at end of file diff --git a/llm/llama/dpo_argument.json b/llm/config/llama/dpo_argument.json similarity index 92% rename from llm/llama/dpo_argument.json rename to llm/config/llama/dpo_argument.json index 7aa86b342128..b30fcc86478c 100644 --- a/llm/llama/dpo_argument.json +++ b/llm/config/llama/dpo_argument.json @@ -1,5 +1,5 @@ { - "model_name_or_path": "meta-llama/Llama-2-7b-chat", + "model_name_or_path": "meta-llama/Meta-Llama-3-8B", "train_dataset_path": "./data/train.jsonl", "dev_dataset_path": "./data/dev.jsonl", "output_dir": "./checkpoints/dpo_ckpts", @@ -34,5 +34,6 @@ "benchmark": false, "dpo_loss_type": "sigmoid", "dpo_label_smoothing": 0.0, + "unified_checkpoint": true, "autotuner_benchmark":false } diff --git a/llm/config/llama/gptq_argument.json b/llm/config/llama/gptq_argument.json new file mode 100644 index 000000000000..bbc2ac60d5a7 --- /dev/null +++ b/llm/config/llama/gptq_argument.json @@ -0,0 +1,17 @@ +{ + "model_name_or_path": "meta-llama/Meta-Llama-3-8B", + 
"per_device_train_batch_size": 8, + "per_device_eval_batch_size": 8, + "eval_accumulation_steps":16, + "src_length": 1024, + "max_length": 2048, + "bf16": true, + "fp16_opt_level": "O2", + "dataset_name_or_path": "./data", + "output_dir": "./checkpoints/gptq_ckpts", + "do_eval": true, + "eval_with_do_generation": false, + "do_gptq": true, + "unified_checkpoint": true, + "gptq_step": 8 + } \ No newline at end of file diff --git a/llm/config/llama/lora_argument.json b/llm/config/llama/lora_argument.json new file mode 100644 index 000000000000..3b4374529880 --- /dev/null +++ b/llm/config/llama/lora_argument.json @@ -0,0 +1,35 @@ +{ + "model_name_or_path": "meta-llama/Meta-Llama-3-8B", + "dataset_name_or_path": "./data", + "output_dir": "./checkpoints/lora_ckpts", + "per_device_train_batch_size": 4, + "gradient_accumulation_steps": 4, + "per_device_eval_batch_size": 8, + "eval_accumulation_steps":16, + "num_train_epochs": 3, + "learning_rate": 3e-04, + "warmup_steps": 30, + "logging_steps": 1, + "evaluation_strategy": "epoch", + "save_strategy": "epoch", + "src_length": 1024, + "max_length": 2048, + "bf16": true, + "fp16_opt_level": "O2", + "do_train": true, + "do_eval": true, + "disable_tqdm": true, + "load_best_model_at_end": true, + "eval_with_do_generation": false, + "metric_for_best_model": "accuracy", + "recompute": true, + "save_total_limit": 1, + "tensor_parallel_degree": 1, + "pipeline_parallel_degree": 1, + "sharding": "stage1", + "lora": true, + "zero_padding": false, + "use_flash_attention": true, + "unified_checkpoint": true, + "pissa": false + } diff --git a/llm/config/llama/ppo.json b/llm/config/llama/ppo_argument.json similarity index 100% rename from llm/config/llama/ppo.json rename to llm/config/llama/ppo_argument.json diff --git a/llm/llama/pretrain-llama2_13b-tp2sd4_stage2.json b/llm/config/llama/pretrain_argument.json similarity index 83% rename from llm/llama/pretrain-llama2_13b-tp2sd4_stage2.json rename to llm/config/llama/pretrain_argument.json 
index 3dbfd8c1e12c..dff5b322337e 100644 --- a/llm/llama/pretrain-llama2_13b-tp2sd4_stage2.json +++ b/llm/config/llama/pretrain_argument.json @@ -1,8 +1,8 @@ { - "model_name_or_path": "meta-llama/Llama-2-13b", - "tokenizer_name_or_path": "meta-llama/Llama-2-13b", + "model_name_or_path": "meta-llama/Meta-Llama-3-8B", + "tokenizer_name_or_path": "meta-llama/Meta-Llama-3-8B", "input_dir": "./data", - "output_dir": "./checkpoints/llama2_pretrain_ckpts", + "output_dir": "./checkpoints/pretrain_ckpts", "per_device_train_batch_size": 1, "gradient_accumulation_steps": 16, "per_device_eval_batch_size": 2, @@ -36,5 +36,6 @@ "recompute": false, "distributed_dataloader": 1, "recompute_granularity": "full", + "unified_checkpoint": true, "save_total_limit": 2 } diff --git a/llm/qwen/pt_argument.json b/llm/config/llama/pt_argument.json similarity index 85% rename from llm/qwen/pt_argument.json rename to llm/config/llama/pt_argument.json index 3500215eb3da..66c336cc4b87 100644 --- a/llm/qwen/pt_argument.json +++ b/llm/config/llama/pt_argument.json @@ -1,7 +1,7 @@ { - "model_name_or_path": "qwen/qwen-7b", + "model_name_or_path": "meta-llama/Meta-Llama-3-8B", "dataset_name_or_path": "./data", - "output_dir": "./checkpoints/qwen_pt_ckpts", + "output_dir": "./checkpoints/pt_ckpts", "per_device_train_batch_size": 4, "gradient_accumulation_steps": 4, "per_device_eval_batch_size": 8, @@ -28,5 +28,5 @@ "pipeline_parallel_degree": 1, "prefix_tuning": true, "zero_padding": false, - "use_flash_attention": false + "use_flash_attention": true } diff --git a/llm/llama/ptq_argument.json b/llm/config/llama/ptq_argument.json similarity index 83% rename from llm/llama/ptq_argument.json rename to llm/config/llama/ptq_argument.json index 0a64f3818834..79cc82e8d5d7 100644 --- a/llm/llama/ptq_argument.json +++ b/llm/config/llama/ptq_argument.json @@ -1,11 +1,11 @@ { - "model_name_or_path": "./checkpoints/llama_sft_ckpts", + "model_name_or_path": "meta-llama/Meta-Llama-3-8B", 
"per_device_train_batch_size": 8, "per_device_eval_batch_size": 8, "eval_accumulation_steps":16, "src_length": 1024, "max_length": 2048, - "fp16": true, + "bf16": true, "fp16_opt_level": "O2", "dataset_name_or_path": "./data", "output_dir": "./checkpoints/llama_ptq_ckpts", @@ -13,6 +13,7 @@ "eval_with_do_generation": false, "do_ptq": true, "ptq_step": 16, + "unified_checkpoint": true, "smooth": true, "smooth_step": 16, "smooth_all_linears": true, diff --git a/llm/llama/qlora_argument.json b/llm/config/llama/qlora_argument.json similarity index 84% rename from llm/llama/qlora_argument.json rename to llm/config/llama/qlora_argument.json index 38775ac03948..30963715d2af 100644 --- a/llm/llama/qlora_argument.json +++ b/llm/config/llama/qlora_argument.json @@ -1,7 +1,7 @@ { - "model_name_or_path": "facebook/llama-7b", + "model_name_or_path": "meta-llama/Meta-Llama-3-8B", "dataset_name_or_path": "./data", - "output_dir": "./checkpoints/llama_lora_ckpts", + "output_dir": "./checkpoints/lora_ckpts", "per_device_train_batch_size": 4, "gradient_accumulation_steps": 4, "per_device_eval_batch_size": 8, @@ -14,7 +14,7 @@ "save_strategy": "epoch", "src_length": 1024, "max_length": 2048, - "fp16": true, + "bf16": true, "fp16_opt_level": "O2", "do_train": true, "do_eval": true, @@ -29,5 +29,6 @@ "lora": true, "zero_padding": false, "use_flash_attention": false, + "unified_checkpoint": true, "weight_quantize_algo": "nf4" } \ No newline at end of file diff --git a/llm/config/llama/rm.json b/llm/config/llama/rm_argument.json similarity index 100% rename from llm/config/llama/rm.json rename to llm/config/llama/rm_argument.json diff --git a/llm/llama/sft_argument.json b/llm/config/llama/sft_argument.json similarity index 68% rename from llm/llama/sft_argument.json rename to llm/config/llama/sft_argument.json index 34b36a3bc023..9af167187555 100644 --- a/llm/llama/sft_argument.json +++ b/llm/config/llama/sft_argument.json @@ -1,9 +1,9 @@ { - "model_name_or_path": "facebook/llama-7b", + 
"model_name_or_path": "meta-llama/Meta-Llama-3-8B", "dataset_name_or_path": "./data", "output_dir": "./checkpoints/llama_sft_ckpts", - "per_device_train_batch_size": 4, - "gradient_accumulation_steps": 4, + "per_device_train_batch_size": 1, + "gradient_accumulation_steps": 2, "per_device_eval_batch_size": 8, "eval_accumulation_steps":16, "num_train_epochs": 3, @@ -14,7 +14,7 @@ "save_strategy": "epoch", "src_length": 1024, "max_length": 2048, - "fp16": true, + "bf16": true, "fp16_opt_level": "O2", "do_train": true, "do_eval": true, @@ -22,10 +22,13 @@ "load_best_model_at_end": true, "eval_with_do_generation": false, "metric_for_best_model": "accuracy", - "recompute": true, + "recompute": false, "save_total_limit": 1, - "tensor_parallel_degree": 4, + "tensor_parallel_degree": 1, "pipeline_parallel_degree": 1, + "pipeline_parallel_config": "disable_p2p_cache_shape", + "sharding": "stage2", "zero_padding": false, + "unified_checkpoint": true, "use_flash_attention": false } \ No newline at end of file diff --git a/llm/llama/wint8_lora_argument.json b/llm/config/llama/wint8_lora_argument.json similarity index 89% rename from llm/llama/wint8_lora_argument.json rename to llm/config/llama/wint8_lora_argument.json index 97d9f96d6419..fbce73a89e50 100644 --- a/llm/llama/wint8_lora_argument.json +++ b/llm/config/llama/wint8_lora_argument.json @@ -1,5 +1,5 @@ { - "model_name_or_path": "facebook/llama-7b", + "model_name_or_path": "meta-llama/Meta-Llama-3-8B", "dataset_name_or_path": "./data", "output_dir": "./checkpoints/llama_lora_ckpts", "per_device_train_batch_size": 4, @@ -14,7 +14,7 @@ "save_strategy": "epoch", "src_length": 1024, "max_length": 2048, - "fp16": true, + "bf16": true, "fp16_opt_level": "O2", "do_train": true, "do_eval": true, @@ -29,5 +29,6 @@ "lora": true, "zero_padding": false, "use_flash_attention": false, + "unified_checkpoint": true, "weight_quantize_algo": "weight_only_int8" } \ No newline at end of file diff --git a/llm/mixtral/lora_argument.json 
b/llm/config/mixtral/lora_argument.json similarity index 88% rename from llm/mixtral/lora_argument.json rename to llm/config/mixtral/lora_argument.json index 507c0f76e798..e70bd58a5eb7 100644 --- a/llm/mixtral/lora_argument.json +++ b/llm/config/mixtral/lora_argument.json @@ -1,7 +1,7 @@ { "model_name_or_path": "mistralai/Mixtral-8x7B-Instruct-v0.1", "dataset_name_or_path": "./data", - "output_dir": "./checkpoints/mixtral_lora_ckpts", + "output_dir": "./checkpoints/lora_ckpts", "per_device_train_batch_size": 4, "gradient_accumulation_steps": 4, "per_device_eval_batch_size": 8, @@ -28,5 +28,6 @@ "pipeline_parallel_degree": 1, "lora": true, "zero_padding": false, - "use_flash_attention": false + "unified_checkpoint": true, + "use_flash_attention": true } diff --git a/llm/llama/pretrain-ziya_llama_13b-tp2sd4_stage2.json b/llm/config/mixtral/pretrain_argument.json similarity index 79% rename from llm/llama/pretrain-ziya_llama_13b-tp2sd4_stage2.json rename to llm/config/mixtral/pretrain_argument.json index bd227877bfd2..efd3823fa988 100644 --- a/llm/llama/pretrain-ziya_llama_13b-tp2sd4_stage2.json +++ b/llm/config/mixtral/pretrain_argument.json @@ -1,12 +1,12 @@ { - "model_name_or_path": "idea-ccnl/ziya-llama-13b-v1", - "tokenizer_name_or_path": "idea-ccnl/ziya-llama-13b-v1", + "model_name_or_path": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "tokenizer_name_or_path": "mistralai/Mixtral-8x7B-Instruct-v0.1", "input_dir": "./data", - "output_dir": "./checkpoints/ziya_pretrain_ckpts", + "output_dir": "./checkpoints/pretrain_ckpts", "per_device_train_batch_size": 1, "gradient_accumulation_steps": 16, "per_device_eval_batch_size": 2, - "tensor_parallel_degree": 2, + "tensor_parallel_degree": 8, "pipeline_parallel_degree": 1, "sharding": "stage2", "virtual_pp_degree": 1, @@ -36,5 +36,6 @@ "recompute": false, "distributed_dataloader": 1, "recompute_granularity": "full", + "unified_checkpoint": true, "save_total_limit": 2 } diff --git a/llm/mixtral/sft_argument.json 
b/llm/config/mixtral/sft_argument.json similarity index 74% rename from llm/mixtral/sft_argument.json rename to llm/config/mixtral/sft_argument.json index 3e778b913ffc..b11bb80380a0 100644 --- a/llm/mixtral/sft_argument.json +++ b/llm/config/mixtral/sft_argument.json @@ -1,9 +1,9 @@ { "model_name_or_path": "mistralai/Mixtral-8x7B-Instruct-v0.1", "dataset_name_or_path": "./data", - "output_dir": "./checkpoints/mixtral_sft_ckpts", - "per_device_train_batch_size": 4, - "gradient_accumulation_steps": 4, + "output_dir": "./checkpoints/sft_ckpts", + "per_device_train_batch_size": 1, + "gradient_accumulation_steps": 16, "per_device_eval_batch_size": 8, "eval_accumulation_steps":16, "num_train_epochs": 3, @@ -26,5 +26,8 @@ "save_total_limit": 1, "tensor_parallel_degree": 8, "sharding": "stage2", - "pipeline_parallel_degree": 1 + "pipeline_parallel_degree": 1, + "zero_padding": false, + "unified_checkpoint": true, + "use_flash_attention": true } diff --git a/llm/opt/README.md b/llm/config/opt/README.md similarity index 88% rename from llm/opt/README.md rename to llm/config/opt/README.md index 98b3f140fbfb..3b77d6304b14 100644 --- a/llm/opt/README.md +++ b/llm/config/opt/README.md @@ -17,6 +17,3 @@ |facebook/opt-66b | |facebook/opt-iml-1.3b | |opt-iml-max-1.3b | - -## 2. 
模型精调 -请参考[LLM全流程工具介绍](../README.md) diff --git a/llm/opt/lora_argument.json b/llm/config/opt/lora_argument.json similarity index 94% rename from llm/opt/lora_argument.json rename to llm/config/opt/lora_argument.json index 75193e47238d..2ddeb5f2a9f8 100644 --- a/llm/opt/lora_argument.json +++ b/llm/config/opt/lora_argument.json @@ -1,7 +1,7 @@ { "model_name_or_path": "facebook/opt-125m", "dataset_name_or_path": "./data", - "output_dir": "./checkpoints/opt_lora_ckpts", + "output_dir": "./checkpoints/lora_ckpts", "per_device_train_batch_size": 4, "gradient_accumulation_steps": 4, "per_device_eval_batch_size": 8, diff --git a/llm/opt/sft_argument.json b/llm/config/opt/sft_argument.json similarity index 94% rename from llm/opt/sft_argument.json rename to llm/config/opt/sft_argument.json index 4eed122fa3cb..2b4f03b842bc 100644 --- a/llm/opt/sft_argument.json +++ b/llm/config/opt/sft_argument.json @@ -1,7 +1,7 @@ { "model_name_or_path": "facebook/opt-125m", "dataset_name_or_path": "./data", - "output_dir": "./checkpoints/opt_sft_ckpts", + "output_dir": "./checkpoints/sft_ckpts", "per_device_train_batch_size": 4, "gradient_accumulation_steps": 4, "per_device_eval_batch_size": 8, diff --git a/llm/qwen/README.md b/llm/config/qwen/README.md similarity index 96% rename from llm/qwen/README.md rename to llm/config/qwen/README.md index 22ac37c19e17..ce32fd88d5b5 100644 --- a/llm/qwen/README.md +++ b/llm/config/qwen/README.md @@ -55,7 +55,3 @@ | Qwen/Qwen2-72B-Instruct | | Qwen/Qwen2-57B-A14B | | Qwen/Qwen2-57B-A14B-Instruct | - - -## 2. 
模型精调 -请参考[LLM全流程工具介绍](../README.md) diff --git a/llm/qwen/dpo_argument.json b/llm/config/qwen/dpo_argument.json similarity index 93% rename from llm/qwen/dpo_argument.json rename to llm/config/qwen/dpo_argument.json index 19884cfaefc0..716cdba59da6 100644 --- a/llm/qwen/dpo_argument.json +++ b/llm/config/qwen/dpo_argument.json @@ -1,5 +1,5 @@ { - "model_name_or_path": "qwen/qwen-7b", + "model_name_or_path": "Qwen/Qwen2-7B", "train_dataset_path": "./data/train.jsonl", "dev_dataset_path": "./data/dev.jsonl", "output_dir": "./checkpoints/dpo_ckpts", @@ -32,6 +32,7 @@ "recompute_granularity": "full", "dpo_beta": 0.1, "benchmark": false, + "unified_checkpoint": true, "dpo_loss_type": "sigmoid", "dpo_label_smoothing": 0.0, "autotuner_benchmark":false diff --git a/llm/qwen/lora_argument.json b/llm/config/qwen/lora_argument.json similarity index 82% rename from llm/qwen/lora_argument.json rename to llm/config/qwen/lora_argument.json index 321a2ee3354f..aeb0d5d61f92 100644 --- a/llm/qwen/lora_argument.json +++ b/llm/config/qwen/lora_argument.json @@ -1,7 +1,7 @@ { - "model_name_or_path": "qwen/qwen-7b", + "model_name_or_path": "Qwen/Qwen2-7B", "dataset_name_or_path": "./data", - "output_dir": "./checkpoints/qwen_lora_ckpts", + "output_dir": "./checkpoints/lora_ckpts", "per_device_train_batch_size": 4, "gradient_accumulation_steps": 4, "per_device_eval_batch_size": 8, @@ -27,6 +27,8 @@ "tensor_parallel_degree": 1, "pipeline_parallel_degree": 1, "lora": true, + "unified_checkpoint": true, "zero_padding": false, - "use_flash_attention": false + "use_flash_attention": true, + "pissa": false } diff --git a/llm/qwen/pretrain_argument_stage2.json b/llm/config/qwen/pretrain_argument.json similarity index 84% rename from llm/qwen/pretrain_argument_stage2.json rename to llm/config/qwen/pretrain_argument.json index 1345021f3d19..99d37d832874 100644 --- a/llm/qwen/pretrain_argument_stage2.json +++ b/llm/config/qwen/pretrain_argument.json @@ -1,8 +1,8 @@ { - "model_name_or_path": 
"qwen/qwen-7b", - "tokenizer_name_or_path": "qwen/qwen-7b", + "model_name_or_path": "Qwen/Qwen2-7B", + "tokenizer_name_or_path": "Qwen/Qwen2-7B", "input_dir": "./data", - "output_dir": "./checkpoints/qwen_pretrain_ckpts", + "output_dir": "./checkpoints/pretrain_ckpts", "per_device_train_batch_size": 2, "gradient_accumulation_steps": 1, "per_device_eval_batch_size": 2, @@ -35,5 +35,6 @@ "recompute": true, "distributed_dataloader": 1, "recompute_granularity": "full", + "unified_checkpoint": true, "save_total_limit": 2 } diff --git a/llm/llama/pt_argument.json b/llm/config/qwen/pt_argument.json similarity index 81% rename from llm/llama/pt_argument.json rename to llm/config/qwen/pt_argument.json index 501e09c47160..b70e4a144c75 100644 --- a/llm/llama/pt_argument.json +++ b/llm/config/qwen/pt_argument.json @@ -1,7 +1,7 @@ { - "model_name_or_path": "facebook/llama-7b", + "model_name_or_path": "Qwen/Qwen2-7B", "dataset_name_or_path": "./data", - "output_dir": "./checkpoints/llama_pt_ckpts", + "output_dir": "./checkpoints/pt_ckpts", "per_device_train_batch_size": 4, "gradient_accumulation_steps": 4, "per_device_eval_batch_size": 8, @@ -14,7 +14,7 @@ "save_strategy": "epoch", "src_length": 1024, "max_length": 2048, - "fp16": true, + "bf16": true, "fp16_opt_level": "O2", "do_train": true, "do_eval": true, @@ -27,6 +27,5 @@ "tensor_parallel_degree": 1, "pipeline_parallel_degree": 1, "prefix_tuning": true, - "zero_padding": false, - "use_flash_attention": false + "use_flash_attention": true } diff --git a/llm/qwen/sft_argument.json b/llm/config/qwen/sft_argument.json similarity index 78% rename from llm/qwen/sft_argument.json rename to llm/config/qwen/sft_argument.json index 38daa1d0f293..21b1e0da7f74 100644 --- a/llm/qwen/sft_argument.json +++ b/llm/config/qwen/sft_argument.json @@ -1,7 +1,7 @@ { - "model_name_or_path": "qwen/qwen-7b", + "model_name_or_path": "Qwen/Qwen2-7B", "dataset_name_or_path": "./data", - "output_dir": "./checkpoints/qwen_sft_ckpts", + "output_dir": 
"./checkpoints/sft_ckpts", "per_device_train_batch_size": 1, "gradient_accumulation_steps": 4, "per_device_eval_batch_size": 8, @@ -24,8 +24,10 @@ "metric_for_best_model": "accuracy", "recompute": true, "save_total_limit": 1, - "tensor_parallel_degree": 4, + "tensor_parallel_degree": 1, "pipeline_parallel_degree": 1, + "sharding": "stage2", "zero_padding": false, - "use_flash_attention": false + "unified_checkpoint": true, + "use_flash_attention": true } diff --git a/llm/docs/chat_template.md b/llm/docs/chat_template.md index 6c9e699c8468..e8ad37167f26 100644 --- a/llm/docs/chat_template.md +++ b/llm/docs/chat_template.md @@ -36,14 +36,14 @@ ... ``` -其次就是将构造好的`chat_template.json`文件传入到 `llm/finetune_generation.py` 模块当中: +其次就是将构造好的`chat_template.json`文件传入到 `llm/run_finetune.py` 模块当中: * 使用模型自带chat-template > 并不是所有的模型支持chat-template,PaddleNLP 正在全力支持,可根据是否有下载 `chat_template.json` 文件来判断该模型是否支持 chat-template。 ```shell -python finetune_generation.py ... --model_name_or_path qwen/qwen-7b-chat --chat_template qwen/qwen-7b-chat +python run_finetune.py ... --model_name_or_path qwen/qwen-7b-chat --chat_template qwen/qwen-7b-chat ``` 此时当 `chat_template` 参数和 `model_name_or_path` 参数一致时,此时将默认使用模型自带的chat_template.json` 文件。 @@ -51,7 +51,7 @@ python finetune_generation.py ... --model_name_or_path qwen/qwen-7b-chat --chat_ * 使用自定义 chat-template ```shell -python finetune_generation.py ... --chat_template ./qwen_14b_chat_template.json +python run_finetune.py ... --chat_template ./qwen_14b_chat_template.json ``` 1. 
当 `chat_template` 参数和 `model_name_or_path` 参数一致时,此时将默认使用模型自带的 `chat_template.json` 文件。 diff --git a/llm/docs/finetune.md b/llm/docs/finetune.md index 79bd7eb84dfe..b590a09739b7 100644 --- a/llm/docs/finetune.md +++ b/llm/docs/finetune.md @@ -70,28 +70,21 @@ git clone 代码到本地,即可开始。 SFT(Supervised Fine-Tuning)模型全参微调依托飞桨提出的[4D混合分布式并行](https://ai.baidu.com/forum/topic/show/987996)能力,支持使用Trainer API轻松切换数据并行(DP)、[张量并行(TP, Tensor Parallelism)](https://arxiv.org/abs/1909.08053)、[流水线并行(PP, Pipeline Parallelism)](https://arxiv.org/abs/1811.06965)(目前仅支持Llama)等多种分布式训练策略。 ``` -# 张量并行分布式训练(常用) -python -u -m paddle.distributed.launch --gpus "0,1,2,3" finetune_generation.py ./llama/sft_argument.json - -# 目前ChatGLM2、OPT不支持张量并行,默认使用Sharding策略 -python -u -m paddle.distributed.launch --gpus "0,1,2,3" finetune_generation.py ./chatglm2/sft_argument.json - -# 张量并行&流水线并行分布式训练(目前仅支持Llama) -python -u -m paddle.distributed.launch --gpus "0,1,2,3" finetune_generation.py ./llama/sft_pp_argument.json +python -u -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" run_finetune.py ./config/llama/sft_argument.json ``` 1. `zero_padding`设为True有助于提高训练效率。建议将`per_device_train_batch_size`设为1,使用`gradient_accumulation_steps`控制batch size,适当调整`max_length`取值。 2. 设置`use_flash_attention`为True使用FlashAttention。 +3. 
SFT API支持4D并行策略,可以通过控制`tensor_parallel_degree`、`pipeline_parallel_degree`、 `sharding`、`sharding_parallel_degree`调整并行训练策略。 ### 2.4 LoRA ``` # 单卡训练 -python finetune_generation.py ./llama/lora_argument.json +python run_finetune.py ./config/llama/lora_argument.json -# 张量并行分布式训练(ChatGLM2、OPT不支持张量并行) -# 将lora_argument.json中tensor_parallel_degree修改为2 -python -u -m paddle.distributed.launch --gpus "0,1" finetune_generation.py ./llama/lora_argument.json +# 张量并行分布式训练 +python -u -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" run_finetune.py ./config/llama/lora_argument.json ``` **Note:** @@ -107,11 +100,10 @@ python -u -m paddle.distributed.launch --gpus "0,1" finetune_generation.py ./ ``` # 单卡训练 -python finetune_generation.py ./llama/pt_argument.json +python run_finetune.py ./config/llama/pt_argument.json -# 张量并行分布式训练(ChatGLM2、OPT不支持张量并行) -# 将pt_argument.json中tensor_parallel_degree修改为2 -python -u -m paddle.distributed.launch --gpus "0,1" finetune_generation.py ./llama/pt_argument.json +# 张量并行分布式训练 +python -u -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" run_finetune.py ./config/llama/pt_argument.json ``` **Note:** @@ -198,7 +190,7 @@ python -u -m paddle.distributed.launch --gpus "0,1" finetune_generation.py ./ ## 4.分布式策略参数合并 -我们使用张量并行(TP,Tensor Parallelism)和 流水线并行(PP,Pipeline Parallelism)训练过程中,为了节省TP参数合并时间通常在中间checkpoint将参数存储为多个TP和PP参数分片,可以使用提供的分片合并参数脚本进行参数合并。 +**如果开启unified_checkpoint则不需要合参**。我们使用张量并行(TP,Tensor Parallelism)和 流水线并行(PP,Pipeline Parallelism)训练过程中,为了节省TP参数合并时间通常在中间checkpoint将参数存储为多个TP和PP参数分片,可以使用提供的分片合并参数脚本进行参数合并。 ``` python merge_tp_and_pp_params.py \ @@ -216,16 +208,18 @@ python merge_tp_and_pp_params.py \ 为了后续的**压缩**和**静态图推理**方便,我们提供LoRA参数合并脚本,可以将LoRA参数合并到主干模型并保存相应的权重。 ``` python merge_lora_params.py \ - --lora_path ./checkpoints/llama_lora_ckpts \ - --merge_lora_model_path ./checkpoints/llama_lora_merge \ + --model_name_or_path ./checkpoints/sft_ckpts \ + --lora_path ./checkpoints/lora_ckpts \ + --output_path ./checkpoints/lora_merge \ --device "gpu" \ - 
--low_gpu_mem True + --safe_serialization True ```   脚本参数介绍
- `lora_path`: LoRA参数和配置路径,对LoRA参数进行初始化,默认为None。 +- `model_name_or_path`: 必须,主干模型参数路径,默认为None。 - `output_path`: 必须,合并参数后保存路径,默认为None。 - `device`: 运行环境,默认为gpu。 -- `low_gpu_mem`:降低合参时候所需显存,默认为False。如果合参时显存不足,建议开启 +- `safe_serialization`: 是否保存为safetensor格式,默认为True。
diff --git a/llm/docs/inference.md b/llm/docs/inference.md index a20e3a32d614..9660778a22ef 100644 --- a/llm/docs/inference.md +++ b/llm/docs/inference.md @@ -17,7 +17,7 @@ PaddleNLP 提供了动态图推理和静态图推理两种方式,方便用户 ### 1.1 动态图推理 ```shell # 动态图模型推理命令参考 -python predictor.py --model_name_or_path meta-llama/Llama-2-7b-chat --data_file ./data/dev.json --dtype float16 +python ./predict/predictor.py --model_name_or_path meta-llama/Llama-2-7b-chat --data_file ./data/dev.json --dtype float16 ``` 对于LoRA、PrefixTuning 模型只需额外传入相应的lora_path或prefix_path即可,如:`--lora_path ./checkpoints/llama_lora_ckpts`或`--prefix_path ./checkpoints/llama_prefix_ckpts`,详见推理参数减少。 @@ -26,9 +26,9 @@ python predictor.py --model_name_or_path meta-llama/Llama-2-7b-chat --data_file ```shell # 静态图模型推理命令参考, LoRA需要先合并参数,Prefix Tuning暂不支持 # step1 : 静态图导出 -python export_model.py --model_name_or_path meta-llama/Llama-2-7b-chat --output_path ./inference --dtype float16 +python ./predict/export_model.py --model_name_or_path meta-llama/Llama-2-7b-chat --output_path ./inference --dtype float16 # step2: 静态图推理 -python predictor.py --model_name_or_path ./inference --data_file ./data/dev.json --dtype float16 --mode static +python ./predict/predictor.py --model_name_or_path ./inference --data_file ./data/dev.json --dtype float16 --mode static ``` ## 2. 
高性能模型推理 @@ -86,7 +86,7 @@ git clone https://github.com/PaddlePaddle/PaddleNLP #GPU设备安装自定义算子 cd ./paddlenlp/csrc && python setup_cuda.py install #XPU设备安装自定义算子 -cd ./paddlenlp/csrc/xpu/src && sh cmake_build.sh +cd ./paddlenlp/csrc/xpu/src && sh cmake_build.sh ``` ### 2.3 关闭BlockAttention的高性能推理 @@ -95,16 +95,16 @@ cd ./paddlenlp/csrc/xpu/src && sh cmake_build.sh ```shell # 动态图模型推理命令参考 -python predictor.py --model_name_or_path meta-llama/Llama-2-7b-chat --inference_model --dtype float16 +python ./predict/predictor.py --model_name_or_path meta-llama/Llama-2-7b-chat --inference_model --dtype float16 # PrefixTuning动态图推理参考 -python predictor.py --model_name_or_path meta-llama/Llama-2-7b-chat --inference_model --dtype float16 --export_precache true --prefix_path ./checkpoints/llama_prefix_ckpts +python ./predict/predictor.py --model_name_or_path meta-llama/Llama-2-7b-chat --inference_model --dtype float16 --export_precache true --prefix_path ./checkpoints/llama_prefix_ckpts # Weight Only Int8 动态图推理参考 -python predictor.py --model_name_or_path meta-llama/Llama-2-7b-chat --inference_model --dtype float16 --quant_type weight_only_int8 +python ./predict/predictor.py --model_name_or_path meta-llama/Llama-2-7b-chat --inference_model --dtype float16 --quant_type weight_only_int8 # PTQ-A8W8推理命令参考 -python predictor.py --model_name_or_path checkpoints/llama_ptq_ckpts --inference_model --dtype float16 +python ./predict/predictor.py --model_name_or_path checkpoints/llama_ptq_ckpts --inference_model --dtype float16 ``` **Note**: 1. 
LoRA 模型在推理之前是需要合并参数,详细可见:[合并 LoRA 参数](https://github.com/PaddlePaddle/PaddleNLP/blob/develop/llm/merge_lora_params.py)。 @@ -115,16 +115,16 @@ python predictor.py --model_name_or_path checkpoints/llama_ptq_ckpts --inference **step1:动转静** ```shell # 动转静命令参考 -python export_model.py --model_name_or_path meta-llama/Llama-2-7b-chat --inference_model --output_path ./inference --dtype float16 +python ./predict/export_model.py --model_name_or_path meta-llama/Llama-2-7b-chat --inference_model --output_path ./inference --dtype float16 # PrefixTuning动转静命令参考 -python export_model.py --model_name_or_path meta-llama/Llama-2-7b-chat --inference_model --output_path ./inference --dtype float16 --export_precache true +python ./predict/export_model.py --model_name_or_path meta-llama/Llama-2-7b-chat --inference_model --output_path ./inference --dtype float16 --export_precache true # Weight Only Int8 动转静命令参考 -python export_model.py --model_name_or_path meta-llama/Llama-2-7b-chat --inference_model --output_path ./inference --dtype float16 --quant_type weight_only_int8 +python ./predict/export_model.py --model_name_or_path meta-llama/Llama-2-7b-chat --inference_model --output_path ./inference --dtype float16 --quant_type weight_only_int8 # PTQ-A8W8动转静命令参考 -python export_model.py --model_name_or_path checkpoints/llama_ptq_ckpts --inference_model --output_path ./inference --dtype float16 +python ./predict/export_model.py --model_name_or_path checkpoints/llama_ptq_ckpts --inference_model --output_path ./inference --dtype float16 ``` **Note**: 1. 
LoRA 模型在推理之前是需要合并参数,详细可见:[合并 LoRA 参数](https://github.com/PaddlePaddle/PaddleNLP/blob/develop/llm/merge_lora_params.py)。 @@ -135,13 +135,13 @@ python export_model.py --model_name_or_path checkpoints/llama_ptq_ckpts --infere **step2:静态图推理** ```shell # 静态图推理命令参考 -python predictor.py --model_name_or_path ./inference --inference_model --quant_type weight_only_int8 --dtype "float16" --mode "static" +python ./predict/predictor.py --model_name_or_path ./inference --inference_model --quant_type weight_only_int8 --dtype "float16" --mode "static" # PrefixTuning静态图推理命令参考 -python predictor.py --model_name_or_path ./inference --inference_model --quant_type weight_only_int8 --dtype "float16" --mode "static" --export_precache true --prefix_path ./checkpoints/llama_prefix_ckpts +python ./predict/predictor.py --model_name_or_path ./inference --inference_model --quant_type weight_only_int8 --dtype "float16" --mode "static" --export_precache true --prefix_path ./checkpoints/llama_prefix_ckpts # Weight Only Int8 静态图推理命令参考 -python predictor.py --model_name_or_path ./inference --inference_model --quant_type weight_only_int8 --dtype "float16" --mode "static" --quant_type weight_only_int8 +python ./predict/predictor.py --model_name_or_path ./inference --inference_model --quant_type weight_only_int8 --dtype "float16" --mode "static" --quant_type weight_only_int8 # PTQ-A8W8静态图推理命令参考 # 以下环境变量用于开启int8矩阵乘的算法选择以获得更快的推理速度,打开之后第一次执行会执行算法选择从而导致速度较慢。 @@ -149,7 +149,7 @@ export FLAGS_use_autotune=1 export FLAGS_cublaslt_exhaustive_search_times=10 export FLAGS_cache_inference_while_scope=1 -python predictor.py --model_name_or_path ./inference --inference_model --quant_type weight_only_int8 --dtype "float16" --mode "static" +python ./predict/predictor.py --model_name_or_path ./inference --inference_model --quant_type weight_only_int8 --dtype "float16" --mode "static" ``` **Note**: 1. 
LoRA 模型在推理之前是需要合并参数,详细可见:[合并 LoRA 参数](https://github.com/PaddlePaddle/PaddleNLP/blob/develop/llm/merge_lora_params.py)。 @@ -164,50 +164,50 @@ python predictor.py --model_name_or_path ./inference --inference_model --quant_ ```shell # 动态图模型推理命令参考 -python predictor.py --model_name_or_path meta-llama/Llama-2-7b-chat --inference_model --dtype float16 --block_attn +python ./predict/predictor.py --model_name_or_path meta-llama/Llama-2-7b-chat --inference_model --dtype float16 --block_attn # XPU设备动态图模型推理命令参考 -python predictor.py --model_name_or_path meta-llama/Llama-2-7b-chat --inference_model --dtype float16 --block_attn --device xpu +python ./predict/predictor.py --model_name_or_path meta-llama/Llama-2-7b-chat --inference_model --dtype float16 --block_attn --device xpu # Weight Only Int8 动态图推理参考 -python predictor.py --model_name_or_path meta-llama/Llama-2-7b-chat --inference_model --dtype float16 --quant_type weight_only_int8 --block_attn +python ./predict/predictor.py --model_name_or_path meta-llama/Llama-2-7b-chat --inference_model --dtype float16 --quant_type weight_only_int8 --block_attn # PTQ-A8W8推理命令参考 -python predictor.py --model_name_or_path checkpoints/llama_ptq_ckpts --inference_model --dtype float16 --block_attn +python ./predict/predictor.py --model_name_or_path checkpoints/llama_ptq_ckpts --inference_model --dtype float16 --block_attn # CacheKV 动态量化推理命令参考 -python predictor.py --model_name_or_path meta-llama/Llama-2-7b-chat --inference_model --dtype float16 --block_attn --cachekv_int8 +python ./predict/predictor.py --model_name_or_path meta-llama/Llama-2-7b-chat --inference_model --dtype float16 --block_attn --cachekv_int8 ``` #### 2.4.2 静态图推理 **step1:动转静** ```shell # 动转静命令参考 -python export_model.py --model_name_or_path meta-llama/Llama-2-7b-chat --inference_model --output_path ./inference --dtype float16 --block_attn +python ./predict/export_model.py --model_name_or_path meta-llama/Llama-2-7b-chat --inference_model --output_path ./inference --dtype float16 
--block_attn # XPU设备动转静命令参考 -python export_model.py --model_name_or_path meta-llama/Llama-2-7b-chat --inference_model --output_path ./inference --dtype float16 --block_attn --device xpu +python ./predict/export_model.py --model_name_or_path meta-llama/Llama-2-7b-chat --inference_model --output_path ./inference --dtype float16 --block_attn --device xpu # Weight Only Int8 动转静命令参考 -python export_model.py --model_name_or_path meta-llama/Llama-2-7b-chat --inference_model --output_path ./inference --dtype float16 --quant_type weight_only_int8 --block_attn +python ./predict/export_model.py --model_name_or_path meta-llama/Llama-2-7b-chat --inference_model --output_path ./inference --dtype float16 --quant_type weight_only_int8 --block_attn # PTQ-A8W8动转静命令参考 -python export_model.py --model_name_or_path checkpoints/llama_ptq_ckpts --inference_model --output_path ./inference --dtype float16 --block_attn +python ./predict/export_model.py --model_name_or_path checkpoints/llama_ptq_ckpts --inference_model --output_path ./inference --dtype float16 --block_attn # CacheKV 动态量化动转静命令参考 -python export_model.py --model_name_or_path meta-llama/Llama-2-7b-chat --inference_model --output_path ./inference --dtype float16 --block_attn --cachekv_int8 +python ./predict/export_model.py --model_name_or_path meta-llama/Llama-2-7b-chat --inference_model --output_path ./inference --dtype float16 --block_attn --cachekv_int8 ``` **step2:静态图推理** ```shell # 静态图推理命令参考 -python predictor.py --model_name_or_path ./inference --inference_model --dtype "float16" --mode "static" --block_attn +python ./predict/predictor.py --model_name_or_path ./inference --inference_model --dtype "float16" --mode "static" --block_attn # XPU设备静态图推理命令参考 -python predictor.py --model_name_or_path ./inference --inference_model --dtype "float16" --mode "static" --block_attn --device xpu +python ./predict/predictor.py --model_name_or_path ./inference --inference_model --dtype "float16" --mode "static" --block_attn --device xpu # 
Weight Only Int8 静态图推理命令参考 -python predictor.py --model_name_or_path ./inference --inference_model --dtype "float16" --mode "static" --quant_type weight_only_int8 --block_attn +python ./predict/predictor.py --model_name_or_path ./inference --inference_model --dtype "float16" --mode "static" --quant_type weight_only_int8 --block_attn # PTQ-A8W8静态图推理命令参考 # 以下环境变量用于开启int8矩阵乘的算法选择以获得更快的推理速度,打开之后第一次执行会执行算法选择从而导致速度较慢。 @@ -215,10 +215,10 @@ export FLAGS_use_autotune=1 export FLAGS_cublaslt_exhaustive_search_times=10 export FLAGS_cache_inference_while_scope=1 -python predictor.py --model_name_or_path ./inference --inference_model --dtype "float16" --mode "static" --block_attn +python ./predict/predictor.py --model_name_or_path ./inference --inference_model --dtype "float16" --mode "static" --block_attn # CacheKV 动态量化8静态图推理命令参考 -python predictor.py --model_name_or_path ./inference --inference_model --dtype "float16" --mode "static" --cachekv_int8 --block_attn +python ./predict/predictor.py --model_name_or_path ./inference --inference_model --dtype "float16" --mode "static" --cachekv_int8 --block_attn ``` **Note**: 1. 
使用Weight Only Int8 推理需要额外传入 `quant_type`。 diff --git a/llm/docs/pretrain.rst b/llm/docs/pretrain.rst index 987e6c53f90d..d0fd203b97e3 100644 --- a/llm/docs/pretrain.rst +++ b/llm/docs/pretrain.rst @@ -68,10 +68,10 @@ git clone 代码到本地,即可开始。 cd ../model_zoo/gpt-3/external_ops/ && python3 setup.py install && cd - # llama 模型预训练 - python -u -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" run_pretrain.py ./llama/pretrain-llama2_7b-tp2sd4_stage2.json + python -u -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" run_pretrain.py ./config/llama/pretrain_argument.json # Qwen 模型预训练 - python -u -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" run_pretrain.py ./qwen/pretrain_argument_stage2.json + python -u -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" run_pretrain.py ./config/qwen/pretrain_argument.json 注意: diff --git a/llm/docs/quantization.md b/llm/docs/quantization.md index 101c18f4441a..eadaa77397a2 100644 --- a/llm/docs/quantization.md +++ b/llm/docs/quantization.md @@ -58,19 +58,19 @@ git clone 代码到本地,即可开始。 ### 2.3 PTQ 量化 ``` -python finetune_generation.py ./llama/ptq_argument.json +python run_finetune.py ./config/llama/ptq_argument.json ``` ### 2.4 GPTQ 量化 ``` -python finetune_generation.py ./llama/gptq_argument.json +python run_finetune.py ./config/llama/gptq_argument.json ``` ### 2.5 AWQ 量化 ``` -python finetune_generation.py ./llama/awq_argument.json +python run_finetune.py ./config/llama/awq_argument.json ``` ### 2.6 量化参数介绍 diff --git a/llm/ernie-3.5-se/README.md b/llm/experimental/ernie-3.5-se/README.md similarity index 100% rename from llm/ernie-3.5-se/README.md rename to llm/experimental/ernie-3.5-se/README.md diff --git a/llm/ernie-3.5-se/configuration.py b/llm/experimental/ernie-3.5-se/configuration.py similarity index 100% rename from llm/ernie-3.5-se/configuration.py rename to llm/experimental/ernie-3.5-se/configuration.py diff --git a/llm/ernie-3.5-se/conversion_utils.py b/llm/experimental/ernie-3.5-se/conversion_utils.py similarity index 
100% rename from llm/ernie-3.5-se/conversion_utils.py rename to llm/experimental/ernie-3.5-se/conversion_utils.py diff --git a/llm/ernie-3.5-se/data.py b/llm/experimental/ernie-3.5-se/data.py similarity index 100% rename from llm/ernie-3.5-se/data.py rename to llm/experimental/ernie-3.5-se/data.py diff --git a/llm/ernie-3.5-se/ernie-tokenizer/sentencepiece.bpe.model b/llm/experimental/ernie-3.5-se/ernie-tokenizer/sentencepiece.bpe.model similarity index 100% rename from llm/ernie-3.5-se/ernie-tokenizer/sentencepiece.bpe.model rename to llm/experimental/ernie-3.5-se/ernie-tokenizer/sentencepiece.bpe.model diff --git a/llm/ernie-3.5-se/ernie-tokenizer/special_tokens_map.json b/llm/experimental/ernie-3.5-se/ernie-tokenizer/special_tokens_map.json similarity index 100% rename from llm/ernie-3.5-se/ernie-tokenizer/special_tokens_map.json rename to llm/experimental/ernie-3.5-se/ernie-tokenizer/special_tokens_map.json diff --git a/llm/ernie-3.5-se/ernie-tokenizer/tokenizer_config.json b/llm/experimental/ernie-3.5-se/ernie-tokenizer/tokenizer_config.json similarity index 100% rename from llm/ernie-3.5-se/ernie-tokenizer/tokenizer_config.json rename to llm/experimental/ernie-3.5-se/ernie-tokenizer/tokenizer_config.json diff --git a/llm/ernie-3.5-se/ernie_dataset.py b/llm/experimental/ernie-3.5-se/ernie_dataset.py similarity index 100% rename from llm/ernie-3.5-se/ernie_dataset.py rename to llm/experimental/ernie-3.5-se/ernie_dataset.py diff --git a/llm/ernie-3.5-se/finetune_generation.py b/llm/experimental/ernie-3.5-se/finetune_generation.py similarity index 100% rename from llm/ernie-3.5-se/finetune_generation.py rename to llm/experimental/ernie-3.5-se/finetune_generation.py diff --git a/llm/ernie-3.5-se/modeling.py b/llm/experimental/ernie-3.5-se/modeling.py similarity index 100% rename from llm/ernie-3.5-se/modeling.py rename to llm/experimental/ernie-3.5-se/modeling.py diff --git a/llm/ernie-3.5-se/predict_generation.py 
b/llm/experimental/ernie-3.5-se/predict_generation.py similarity index 100% rename from llm/ernie-3.5-se/predict_generation.py rename to llm/experimental/ernie-3.5-se/predict_generation.py diff --git a/llm/ernie-3.5-se/run_pretrain.py b/llm/experimental/ernie-3.5-se/run_pretrain.py similarity index 100% rename from llm/ernie-3.5-se/run_pretrain.py rename to llm/experimental/ernie-3.5-se/run_pretrain.py diff --git a/llm/ernie-3.5-se/run_trainer_stage2.sh b/llm/experimental/ernie-3.5-se/run_trainer_stage2.sh similarity index 100% rename from llm/ernie-3.5-se/run_trainer_stage2.sh rename to llm/experimental/ernie-3.5-se/run_trainer_stage2.sh diff --git a/llm/ernie-3.5-se/tokenizer.py b/llm/experimental/ernie-3.5-se/tokenizer.py similarity index 100% rename from llm/ernie-3.5-se/tokenizer.py rename to llm/experimental/ernie-3.5-se/tokenizer.py diff --git a/llm/ernie-3.5-se/utils.py b/llm/experimental/ernie-3.5-se/utils.py similarity index 100% rename from llm/ernie-3.5-se/utils.py rename to llm/experimental/ernie-3.5-se/utils.py diff --git a/llm/llama/run_sharding_v2.sh b/llm/experimental/scripts/run_sharding_v2.sh similarity index 100% rename from llm/llama/run_sharding_v2.sh rename to llm/experimental/scripts/run_sharding_v2.sh diff --git a/llm/llama/run_trainer.sh b/llm/experimental/scripts/run_trainer.sh similarity index 100% rename from llm/llama/run_trainer.sh rename to llm/experimental/scripts/run_trainer.sh diff --git a/llm/llama/run_trainer_tp2cp2.sh b/llm/experimental/scripts/run_trainer_tp2cp2.sh similarity index 100% rename from llm/llama/run_trainer_tp2cp2.sh rename to llm/experimental/scripts/run_trainer_tp2cp2.sh diff --git a/llm/llama/run_trainer_tp4pp2.sh b/llm/experimental/scripts/run_trainer_tp4pp2.sh similarity index 100% rename from llm/llama/run_trainer_tp4pp2.sh rename to llm/experimental/scripts/run_trainer_tp4pp2.sh diff --git a/llm/llama/run_trainer_tp4sep2.sh b/llm/experimental/scripts/run_trainer_tp4sep2.sh similarity index 100% rename from 
llm/llama/run_trainer_tp4sep2.sh rename to llm/experimental/scripts/run_trainer_tp4sep2.sh diff --git a/llm/fused_layers.py b/llm/fused_layers.py deleted file mode 120000 index b183f45159cc..000000000000 --- a/llm/fused_layers.py +++ /dev/null @@ -1 +0,0 @@ -llama/fused_layers.py \ No newline at end of file diff --git a/llm/gemma/sft_argument_7b.json b/llm/gemma/sft_argument_7b.json deleted file mode 100644 index 16eba55bed9e..000000000000 --- a/llm/gemma/sft_argument_7b.json +++ /dev/null @@ -1,32 +0,0 @@ -{ - "model_name_or_path": "google/gemma-7b", - "dataset_name_or_path": "./data", - "output_dir": "./checkpoints/gemma_sft_ckpts", - "per_device_train_batch_size": 8, - "gradient_accumulation_steps": 1, - "per_device_eval_batch_size": 8, - "eval_accumulation_steps":1, - "num_train_epochs": 3, - "learning_rate": 3e-06, - "warmup_steps": 30, - "logging_steps": 1, - "evaluation_strategy": "epoch", - "save_strategy": "epoch", - "src_length": 512, - "max_length": 1024, - "bf16": true, - "fp16_opt_level": "O2", - "do_train": true, - "do_eval": true, - "do_predict": true, - "disable_tqdm": true, - "load_best_model_at_end": true, - "eval_with_do_generation": false, - "metric_for_best_model": "accuracy", - "recompute": true, - "save_total_limit": 1, - "tensor_parallel_degree": 8, - "pipeline_parallel_degree": 1, - "zero_padding": false, - "use_flash_attention": false -} \ No newline at end of file diff --git a/llm/gemma/sft_argument_7b_sharding.json b/llm/gemma/sft_argument_7b_sharding.json deleted file mode 100644 index ca04affdb243..000000000000 --- a/llm/gemma/sft_argument_7b_sharding.json +++ /dev/null @@ -1,33 +0,0 @@ -{ - "model_name_or_path": "google/gemma-7b", - "dataset_name_or_path": "./data", - "output_dir": "./checkpoints/llama_sft_ckpts", - "per_device_train_batch_size": 1, - "gradient_accumulation_steps": 1, - "per_device_eval_batch_size": 8, - "eval_accumulation_steps":1, - "num_train_epochs": 3, - "learning_rate": 3e-06, - "warmup_steps": 30, - 
"logging_steps": 1, - "evaluation_strategy": "epoch", - "save_strategy": "epoch", - "src_length": 1024, - "max_length": 2048, - "fp16": true, - "fp16_opt_level": "O2", - "do_train": true, - "do_eval": true, - "do_predict": true, - "disable_tqdm": true, - "load_best_model_at_end": true, - "eval_with_do_generation": false, - "metric_for_best_model": "accuracy", - "recompute": true, - "save_total_limit": 1, - "sharding_parallel_degree": 8, - "sharding": "stage3", - "pipeline_parallel_degree": 1, - "zero_padding": false, - "use_flash_attention": false -} \ No newline at end of file diff --git a/llm/gemma/sft_argument_sharding.json b/llm/gemma/sft_argument_sharding.json deleted file mode 100644 index d462645e2235..000000000000 --- a/llm/gemma/sft_argument_sharding.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "model_name_or_path": "google/gemma-2b/", - "dataset_name_or_path": "./data", - "output_dir": "./checkpoints/chatglm2_sft_ckpts", - "per_device_train_batch_size": 1, - "gradient_accumulation_steps": 1, - "per_device_eval_batch_size": 1, - "eval_accumulation_steps":1, - "num_train_epochs": 3, - "learning_rate": 3e-05, - "warmup_steps": 30, - "logging_steps": 1, - "evaluation_strategy": "epoch", - "save_strategy": "epoch", - "src_length": 512, - "max_length": 1024, - "fp16": true, - "fp16_opt_level": "O2", - "do_train": true, - "do_eval": true, - "disable_tqdm": true, - "load_best_model_at_end": true, - "eval_with_do_generation": false, - "metric_for_best_model": "accuracy", - "recompute": true, - "save_total_limit": 1, - "sharding_parallel_degree": 2, - "sharding": "stage3", - "zero_padding": false, - "use_flash_attention": false - } \ No newline at end of file diff --git a/llm/glm/README.md b/llm/glm/README.md deleted file mode 100644 index 86bc69d571e6..000000000000 --- a/llm/glm/README.md +++ /dev/null @@ -1,102 +0,0 @@ -# GLM - -## 1. 
模型介绍 - -[General Language Model (GLM)](https://arxiv.org/abs/2103.10360) 是以自回归填空作为训练目标的通用语言模型,可用于各类理解和生成任务。 - -现有预训练框架包括以 BERT 为代表的自编码模型,以 GPT 为代表的自回归模型和以 T5 为代表的编码-解码模型。但这些框架均不能完全支持自然语言理解、无条件生成和条件生成这三类主要任务。为了解决这一问题,我们提出了基于自回归填空任务的通用语言模型(GLM)。GLM 使用 2D 位置编码和任意顺序预测改进了填空预训练过程,在自然语言理解任务上超越了 BERT 和 T5。同时,GLM 的预训练过程基于多种任务,填空长度和数量各不相同。在自然语言理解、无条件生成和条件生成任务上,GLM 均超过了具有相同参数规模和训练数据量的 BERT、T5 和 GPT 模型。除此之外,GLM 还以 BERT Large 1.25 倍参数量的规模取得了当前最优的效果,证明了其在不同下游任务上良好的泛化能力。 - - -**支持模型权重:** - -| Model | -|----------------------------------| -| THUDM/glm-large-chinese | -| THUDM/glm-10b-chinese | - -## 3. 模型精调 - -### SFT - -``` -python -m paddle.distributed.launch --gpus "0,1,2,3" finetune_generation.py \ ---model_name_or_path THUDM/glm-large-chinese \ ---num_train_epochs 4 \ ---learning_rate 3e-5 \ ---warmup_ratio 0.06 \ ---weight_decay 0.1 \ ---label_smoothing 0.1 \ ---save_steps 100 \ ---logging_steps 1 \ ---eval_steps 100 \ ---output_dir ./checkpoints/glm-large-chinese \ ---src_length 608 \ ---tgt_length 160 \ ---min_tgt_length 55 \ ---length_penalty 0.7 \ ---no_repeat_ngram_size 3 \ ---num_beams 5 \ ---select_topk True \ ---per_device_eval_batch_size 2 \ ---per_device_train_batch_size 2 \ ---max_grad_norm 1.0 \ ---lr_scheduler_type linear \ ---fp16 \ ---fp16_opt_level O2 \ ---recompute \ ---do_train \ ---do_eval -``` - -### 单卡LoRA微调 - -``` -python finetune_generation.py \ ---model_name_or_path THUDM/glm-large-chinese \ ---num_train_epochs 4 \ ---learning_rate 3e-5 \ ---warmup_ratio 0.06 \ ---weight_decay 0.1 \ ---label_smoothing 0.1 \ ---save_steps 100 \ ---logging_steps 1 \ ---eval_steps 100 \ ---output_dir ./checkpoints/glm-large-chinese \ ---src_length 608 \ ---tgt_length 160 \ ---min_tgt_length 55 \ ---length_penalty 0.7 \ ---no_repeat_ngram_size 3 \ ---num_beams 5 \ ---select_topk True \ ---per_device_eval_batch_size 2 \ ---per_device_train_batch_size 2 \ ---max_grad_norm 1.0 \ ---lr_scheduler_type linear \ ---fp16 \ ---fp16_opt_level O2 \ ---recompute \ ---do_train \ 
---do_eval \ ---lora True -``` - -其中参数释义如下: - -- `model_name_or_path`: 预训练模型内置名称或者模型所在目录,默认为`THUDM/glm-large-chinese`。 -- `src_length`: 上下文的最大输入长度,默认为608. -- `tgt_length`: 生成文本的最大长度,默认为160. -- `min_tgt_length`: 生成文本的最小长度,默认为55. -- `length_penalty`: 生成解码时的长度惩罚因子,默认为0.7. -- `num_beams`: 搜索方向数量,默认为5。 -- `label_smoothing`: 标签平滑因子,默认为0.1. -- `lr_decay_ratio`: 学习率衰减因子,默认为0.1. -- `lora`: 是否使用LoRA技术. - - -## 3.4 动态图推理 - -``` -python predict_generation.py \ - --model_name_or_path THUDM/glm-large-chinese -``` diff --git a/llm/glm/data.py b/llm/glm/data.py deleted file mode 100644 index 40f5f3320a64..000000000000 --- a/llm/glm/data.py +++ /dev/null @@ -1,67 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import numpy as np - - -def custom_convert_example(example, tokenizer, data_args, is_test=True): - source = None - title = None - target = None - if "source" in example and "title" in example: - source = example["source"] - if "title" in example.keys(): - title = example["title"] - elif "context" in example and "answer" in example: - source = example["context"] - if "answer" in example.keys(): - title = example["answer"] - else: - assert False, "Source and title are not in the input dictionary, nor are context and answer." 
- if "target" in example.keys(): - target = example["target"] - elif "question" in example.keys(): - target = example["question"] - example["text_a"] = "答案:" + title + "," + "上下文:" + source - example["text_b"] = "在已知答案的前提下,问题:" + target - inputs = tokenizer.encode(example["text_a"], max_length=data_args.src_length - 1, truncation=True) - inputs["input_ids"] = inputs["input_ids"][:-1] + [tokenizer.gmask_token_id] + inputs["input_ids"][-1:] - pad_length = data_args.src_length - len(inputs["input_ids"]) - inputs["input_ids"] = np.array([inputs["input_ids"] + [tokenizer.pad_token_id] * pad_length]) - inputs["attention_mask"] = np.array([inputs["attention_mask"] + [1] + [0] * pad_length]) - sep = inputs["input_ids"].shape[1] - inputs = tokenizer.build_inputs_for_generation( - inputs, - max_gen_length=data_args.tgt_length, - targets=" " + example["text_b"] if not is_test else None, - padding="max_length", - ) - - for input_name in inputs.keys(): - inputs[input_name] = inputs[input_name].squeeze(0) - if is_test: - inputs["position_ids"] = inputs["position_ids"][:, : inputs["input_ids"].shape[-1]] - labels = tokenizer.encode( - " " + example["text_b"], add_special_tokens=False, max_length=data_args.tgt_length - 1 - )["input_ids"] - loss_mask = [0] * sep + [1] * len(labels) + [0] * (data_args.tgt_length - len(labels)) - labels = ( - [0] * sep - + labels - + [tokenizer.eop_token_id] - + [tokenizer.pad_token_id] * (data_args.tgt_length - len(labels) - 1) - ) - inputs["label_ids"] = labels - inputs["loss_mask"] = loss_mask - return inputs diff --git a/llm/glm/finetune_generation.py b/llm/glm/finetune_generation.py deleted file mode 100644 index e8779d68f3ee..000000000000 --- a/llm/glm/finetune_generation.py +++ /dev/null @@ -1,188 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import sys -from dataclasses import dataclass, field -from functools import partial - -import paddle -from data import custom_convert_example -from utils import GLMTrainer - -from paddlenlp.data import DefaultDataCollator -from paddlenlp.datasets import load_dataset -from paddlenlp.metrics import BLEU, Rouge1, Rouge2, RougeL -from paddlenlp.peft import LoRAConfig, LoRAModel -from paddlenlp.trainer import PdArgumentParser, TrainingArguments, get_last_checkpoint -from paddlenlp.transformers import AutoModelForConditionalGeneration, AutoTokenizer -from paddlenlp.utils.log import logger - - -@dataclass -class DataArgument: - task_name: str = field(default="dureader_qg", metadata={"help": "The name of task."}) - src_length: int = field(default=608, metadata={"help": "The max length of source text."}) - tgt_length: int = field(default=160, metadata={"help": "The max length of target text."}) - min_tgt_length: int = field(default=55, metadata={"help": "The min length of target text."}) - length_penalty: float = field(default=0.7, metadata={"help": "The length penalty."}) - no_repeat_ngram_size: int = field(default=3, metadata={"help": "The no repeat ngram size."}) - num_beams: int = field(default=5, metadata={"help": "The number of beams."}) - select_topk: bool = field(default=True, metadata={"help": "Whether to select top k tokens for generation."}) - top_p: float = field( - default=0.0, metadata={"help": "The cumulative probability for top-p-filtering in the 'sampling' strategy."} - ) - top_k: int = field( - default=0, - metadata={ - "help": "The 
number of highest probability tokens to keep for top-k-filtering in the 'sampling' strategy." - }, - ) - no_block_position: bool = field(default=False) - - -@dataclass -class ModelArgument: - model_name_or_path: str = field( - default="THUDM/glm-2b", metadata={"help": "Build-in pretrained model name or the path to local model."} - ) - label_smoothing: float = field(default=0.1, metadata={"help": "The label smoothing parameter."}) - lr_decay_ratio: float = field(default=0.1, metadata={"help": "The ratio for learning rate decrease"}) - lora: bool = field(default=False, metadata={"help": "Whether to use LoRA technique"}) - - -def main(): - parser = PdArgumentParser((ModelArgument, DataArgument, TrainingArguments)) - if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): - model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) - else: - model_args, data_args, training_args = parser.parse_args_into_dataclasses() - - training_args.print_config(model_args, "Model") - training_args.print_config(data_args, "Data") - setattr(training_args, "label_smoothing", model_args.label_smoothing) - setattr(training_args, "lr_decay_ratio", model_args.lr_decay_ratio) - - paddle.set_device(training_args.device) - - # Log on each process the small summary: - logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, world_size: {training_args.world_size}, " - + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16 or training_args.bf16}" - ) - - # Detecting last checkpoint. - last_checkpoint = None - if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: - last_checkpoint = get_last_checkpoint(training_args.output_dir) - if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 1: - raise ValueError( - f"Output directory ({training_args.output_dir}) already exists and is not empty. 
" - "Use --overwrite_output_dir to overcome." - ) - elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: - logger.info( - f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " - "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." - ) - - dtype = None - if training_args.fp16_opt_level == "O2": - if training_args.fp16: - dtype = "float16" - if training_args.bf16: - dtype = "bfloat16" - - # Load the pretrained language model. - model = AutoModelForConditionalGeneration.from_pretrained( - model_args.model_name_or_path, - output_predict=True, - parallel_output=True, - dtype=dtype, # todo enable set dtype to avoid additional mem usage - tensor_parallel_degree=training_args.tensor_parallel_degree, - tensor_parallel_rank=training_args.tensor_parallel_rank, - ) - if model_args.lora: - # TODO: hardcode parameters for now. Change after MergedLoRA is introduced - lora_config = LoRAConfig( - target_modules=[".*query_key_value.*"], - r=4, - lora_alpha=8, - merge_weights=True, - tensor_parallel_degree=training_args.tensor_parallel_degree, - dtype=dtype, - ) - model = LoRAModel(model, lora_config) - model.mark_only_lora_as_trainable() - model.print_trainable_parameters() - - tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path) - - # Load the dataset. 
- train_ds, dev_ds = load_dataset(data_args.task_name, splits=["train", "dev"]) - trans_func = partial(custom_convert_example, tokenizer=tokenizer, data_args=data_args) - train_ds = train_ds.map(partial(trans_func, is_test=False)) - test_ds = dev_ds.map(trans_func) - - collate_fn = DefaultDataCollator() - - def compute_metrics(eval_preds): - rouge1 = Rouge1() - rouge2 = Rouge2() - rougel = RougeL() - bleu4 = BLEU(n_size=4) - predictions = [x[x != -100] for x in eval_preds.predictions] - references = [x[x != -100] for x in eval_preds.label_ids] - - # for pred in predictions: - - rouge1_score = rouge1.score(predictions, references) - rouge2_score = rouge2.score(predictions, references) - for pred, ref in zip(predictions, references): - rougel.add_inst(pred, [ref]) - bleu4.add_inst(pred, [ref]) - return { - "rouge1": rouge1_score, - "rouge2": rouge2_score, - "rougel": rougel.score(), - "bleu4": bleu4.score(), - } - - trainer = GLMTrainer( - model=model, - args=training_args, - train_dataset=train_ds, - eval_dataset=dev_ds, - tokenizer=tokenizer, - compute_metrics=compute_metrics, - do_generation=True, - data_collator=collate_fn, - ) - if training_args.fp16_opt_level == "O2": - trainer.disable_autocast_context_manager() - - if training_args.do_train: - train_result = trainer.train(resume_from_checkpoint=last_checkpoint) - trainer.save_model(merge_tensor_parallel=training_args.tensor_parallel_degree > 1) - trainer.log_metrics("train", train_result.metrics) - trainer.save_metrics("train", train_result.metrics) - trainer.save_state() - - if training_args.do_eval: - eval_result = trainer.evaluate(test_ds) - trainer.log_metrics("test", eval_result) - - -if __name__ == "__main__": - main() diff --git a/llm/glm/predict_generation.py b/llm/glm/predict_generation.py deleted file mode 100644 index 41dd6b3459af..000000000000 --- a/llm/glm/predict_generation.py +++ /dev/null @@ -1,151 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle -from paddle.distributed import fleet - -from paddlenlp.peft import LoRAConfig, LoRAModel -from paddlenlp.transformers import ( - AutoConfig, - AutoModelForConditionalGeneration, - AutoTokenizer, -) - - -def parse_arguments(): - import argparse - - parser = argparse.ArgumentParser() - parser.add_argument( - "--model_name_or_path", default="THUDM/glm-large-chinese", required=True, help="The directory of model." - ) - parser.add_argument("--lora_path", default=None, help="The directory of LoRA parameters. Default to None") - parser.add_argument( - "--merge_tensor_parallel_path", default=None, help="The directory of model to merge tensor parallel parts." 
- ) - parser.add_argument("--batch_size", type=int, default=2, help="The batch size of data.") - parser.add_argument("--src_length", type=int, default=200, help="The batch size of data.") - parser.add_argument("--tgt_length", type=int, default=20, help="The batch size of data.") - return parser.parse_args() - - -def batchfy_text(texts, batch_size): - batch_texts = [] - batch_start = 0 - while batch_start < len(texts): - batch_texts += [texts[batch_start : min(batch_start + batch_size, len(texts))]] - batch_start += batch_size - return batch_texts - - -class Predictor(object): - def __init__(self, args): - self.tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path) - self.batch_size = args.batch_size - self.args = args - - tensor_parallel_degree = paddle.distributed.get_world_size() - tensor_parallel_rank = 0 - if tensor_parallel_degree > 1: - strategy = fleet.DistributedStrategy() - strategy.hybrid_configs = { - "dp_degree": 1, - "mp_degree": tensor_parallel_degree, - "pp_degree": 1, - "sharding_degree": 1, - } - fleet.init(is_collective=True, strategy=strategy) - hcg = fleet.get_hybrid_communicate_group() - tensor_parallel_rank = hcg.get_model_parallel_rank() - - if self.args.lora_path is not None: - lora_config = LoRAConfig.from_pretrained(self.args.lora_path) - dtype = lora_config.dtype - else: - config = AutoConfig.from_pretrained(args.model_name_or_path) - dtype = config.dtype if config.dtype is not None else "float32" - - self.model = AutoModelForConditionalGeneration.from_pretrained( - args.model_name_or_path, - tensor_parallel_degree=tensor_parallel_degree, - tensor_parallel_rank=tensor_parallel_rank, - dtype=dtype, - ) - if self.args.lora_path is not None: - self.model = LoRAModel.from_pretrained(self.model, self.args.lora_path) - self.model.eval() - - def preprocess(self, input_text): - input_text = [text.strip() + "[gMASK]" for text in input_text] - inputs = self.tokenizer( - input_text, - return_tensors="np", - add_special_tokens=True, - 
padding=True, - max_length=self.args.src_length, - truncation=True, - truncation_side="left", - ) - inputs = self.tokenizer.build_inputs_for_generation(inputs, max_gen_length=self.args.tgt_length) - inputs_tensor = {} - for key, value in inputs.items(): - inputs_tensor[key] = paddle.to_tensor(value) - return inputs_tensor - - def infer(self, inputs): - result = self.model.generate( - **inputs, - decode_strategy="sampling", - top_k=1, - max_length=self.args.tgt_length, - eos_token_id=self.tokenizer.eop_token_id, - pad_token_id=self.tokenizer.pad_token_id, - ) - result = result[0] - return result - - def postprocess(self, infer_data): - result = [] - for x in infer_data.tolist(): - res = self.tokenizer.decode(x, skip_special_tokens=True) - result.append(res) - out_dict = {"result": result} - return out_dict - - def predict(self, texts): - input_map = self.preprocess(texts) - infer_result = self.infer(input_map) - output = self.postprocess(infer_result) - return output - - -if __name__ == "__main__": - args = parse_arguments() - predictor = Predictor(args) - all_texts = [ - "答案:年基准利率4.35%,上下文:从实际看,贷款的基本条件是: 一是中国大陆居民,年龄在60岁以下; 二是有稳定的住址和工作或经营地点; 三是有稳定的收入来源; 四是无不良信用记录,贷款用途不能作为炒股,赌博等行为; 五是具有完全民事行为能力。在已知答案的前提下,问题:", - "答案:U系列,上下文:U系列是最好的,采用国际顶尖技术(由格力自主研发)双级变频压缩机,提高压缩机运转效率,制冷制热能力更强劲;1赫兹变频技术,使空调相当于一个15 W电灯泡,更加节能省电;送风面积广,风力大;生态风,净化空气。非常不错,现在国美在做活动,可以了解一下。在已知答案的前提下,问题:", - ] - batch_texts = batchfy_text(all_texts, args.batch_size) - for bs, texts in enumerate(batch_texts): - outputs = predictor.predict(texts) - for text, result in zip(texts, outputs["result"]): - print("{}\n{}".format(text, result)) - - if args.merge_tensor_parallel_path is not None: - predictor.model.save_pretrained( - save_dir=args.merge_tensor_parallel_path, - merge_tensor_parallel=True, - ) - predictor.tokenizer.save_pretrained(args.merge_tensor_parallel_path) diff --git a/llm/glm/utils.py b/llm/glm/utils.py deleted file mode 100644 index d3b9e8919aa7..000000000000 --- a/llm/glm/utils.py +++ /dev/null @@ 
-1,79 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from typing import Any, Dict, List, Optional, Tuple, Union - -import numpy as np -import paddle -import paddle.nn as nn - -from paddlenlp.trainer import Trainer - - -class GLMTrainer(Trainer): - def __init__(self, do_generation: bool, **kwargs): - super().__init__(**kwargs) - self.do_generation = do_generation - - def prediction_step( - self, - model: nn.Layer, - inputs: Dict[str, Union[paddle.Tensor, Any]], - prediction_loss_only: bool, - ignore_keys: Optional[List[str]] = None, - ) -> Tuple[Optional[paddle.Tensor], Optional[paddle.Tensor], Optional[paddle.Tensor]]: - - if not self.do_generation: - return super().prediction_step(model, inputs, prediction_loss_only, ignore_keys) - - model.eval() - with paddle.no_grad(): - tokens = model.generate( - input_ids=inputs["input_ids"], - position_ids=inputs["position_ids"], - attention_mask=inputs["attention_mask"], - decode_strategy="sampling", - top_k=1, - repetition_penalty=2.0, - bos_token_id=self.tokenizer.sop_token_id, - eos_token_id=self.tokenizer.eop_token_id, - pad_token_id=self.tokenizer.pad_token_id, - )[0] - all_preds = [] - for pred_tokens in tokens: - all_preds.append(pred_tokens[pred_tokens != self.tokenizer.pad_token_id].tolist()) - max_pred_length = max([len(x) for x in all_preds]) - for index, preds in enumerate(all_preds): - all_preds[index] = preds + [-100] * (max_pred_length - 
len(preds)) - - all_labels = [] - for label, mask in zip(inputs["labels"].numpy(), inputs["loss_mask"].numpy()): - label = label[mask.astype("bool")] - label = [x for x in label[label != self.tokenizer.pad_token_id]] - all_labels.append(label) - max_label_length = max([len(x) for x in all_labels]) - for index, labels in enumerate(all_labels): - all_labels[index] = labels + [-100] * (max_label_length - len(labels)) - - return (None, paddle.to_tensor(all_preds), paddle.to_tensor(all_labels)) - - def log(self, logs: Dict[str, float], **kwargs) -> None: - - if self.state.epoch is not None: - logs["epoch"] = round(self.state.epoch, 4) - - if "eval_loss" in logs: - logs["eval_ppl"] = np.exp(logs["eval_loss"]) - output = {**logs, **{"step": self.state.global_step}} - self.state.log_history.append(output) - self.control = self.callback_handler.on_log(self.args, self.state, self.control, logs, **kwargs) diff --git a/llm/gpt-3/README.md b/llm/gpt-3/README.md deleted file mode 100644 index a0c387158d43..000000000000 --- a/llm/gpt-3/README.md +++ /dev/null @@ -1,205 +0,0 @@ -# GPT - -## 1. 模型介绍 - -GPT-3是一种预训练语言模型,它能够模拟人类语言思维和表达。GPT-3拥有巨大的参数,包含了1750亿个参数,这使得它具有强大的语言理解和生成能力。它可以完成的任务包括文本生成、文本摘要、回答问题、翻译、阅读理解等。GPT-3的预训练过程使用了大量的语料库,包括互联网上的大量文本。它通过分析这些文本,学习如何生成和理解人类语言。GPT-3在自然语言处理领域具有很高的影响力,它可以模拟人类对话和生成文本,这使得它在许多应用领域都有广泛的应用,比如智能客服、自然语言处理、游戏设计等。 - -## 2. 预训练 - -预训练数据制作参考[此处](../../model_zoo/ernie-1.0/preprocess/docs/OpenWebText2.md) - -为了方便用户运行测试本模型,本项目提供了处理好的100k条doc的训练样本: -```shell -wget https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_ids.npy -wget https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_idx.npz -``` - -将所有预处理得到的文件统一放入一个文件夹中,以备训练使用: - -``` -mkdir data -mv gpt_en_dataset_300m_ids.npy ./data -mv gpt_en_dataset_300m_idx.npz ./data -``` - -注意: -1. 需要paddle develop版本训练,需要安装`pip install tool_helpers visualdl==2.5.3`等相关缺失whl包 -2. `use_flash_attention` 需要在A100机器开启。建议使用cuda11.8环境。 - -使用下面脚本,即可在gpt2-medium-en的基础上,继续训练. 
-```shell -task_name="gpt3_hybrid" -export PYTHONPATH="../../PaddleNLP/" -export FLAGS_cudnn_deterministic=True -log_dir="log" -rm -rf $log_dir - -python -u -m paddle.distributed.launch \ - --gpus "0,1,2,3,4,5,6,7" \ - --log_dir ${log_dir} \ - run_pretrain.py \ - --model_name_or_path gpt2-medium-en \ - --tokenizer_name_or_path gpt2-medium-en \ - --input_dir "./data" \ - --output_dir "output/$task_name" \ - --split 949,50,1 \ - --max_seq_length 1024 \ - --per_device_train_batch_size 1 \ - --per_device_eval_batch_size 1 \ - --tensor_parallel_degree 1 \ - --pipeline_parallel_degree 1 \ - --sequence_parallel 0 \ - --fuse_attention_qkv 0 \ - --use_flash_attention 0 \ - --fp16 \ - --fp16_opt_level "O2" \ - --scale_loss 1024 \ - --learning_rate 0.00001 \ - --min_learning_rate 0.000005 \ - --max_steps 10000 \ - --save_steps 5000 \ - --weight_decay 0.01 \ - --warmup_ratio 0.01 \ - --max_grad_norm 1.0 \ - --logging_steps 1\ - --continue_training \ - --dataloader_num_workers 1 \ - --sharding "stage2" \ - --eval_steps 1000 \ - --report_to "visualdl" \ - --disable_tqdm true \ - --recompute 1 \ - --gradient_accumulation_steps 2 \ - --do_train \ - --do_eval \ - --device "gpu" -``` - -其中参数释义如下: - -- `model_name_or_path`: 预训练模型内置名称或者模型所在目录,默认为`gpt2-medium-en`。 -- `tokenizer_name_or_path`: tokenizer名称或者tokenizer所在目录,默认为`gpt2-medium-en`。 -- `input_dir`: 预训练数据所在目录。 -- `output_dir`: 模型参数及日志保存目录。 -- `split`: 预训练数据切分比例,默认为949,50,1。 -- `max_seq_length`: 预训练最大序列长度,默认为1024。 -- `per_device_train_batch_size`: 单卡训练batch_size大小,默认为1。 -- `per_device_eval_batch_size`: 单卡评估batch_size大小,默认为1。 -- `tensor_parallel_degree`: 模型并行数量。 -- `pipeline_parallel_degree`: 流水线并行数量。 -- `sequence_parallel`: 序列并行数量。需要当`tensor_parallel_degree>1`时,使用序列并行。注意:当模型规模较小、batch_size较小、sequence_length较小时,不建议使用序列并行。 -- `fuse_attention_qkv`:在MultiHeadAttention中使用qkv线性层融合 -- `use_flash_attention`:使用flash attention技术,注意此处需要在A100机器开启, 建议使用cuda11.8环境。 -- `fp16`: 使用 float16 精度进行模型训练和推理。 -- `fp16_opt_level`: float16 精度训练模式,`O2`表示纯 
float16 训练。 -- `scale_loss`: float16 精度训练时,损失值的缩放比例。微调时建议使用1024,预训练时建议调大。 -- `learning_rate`: 参数更新的学习率。 -- `min_learning_rate`: 最小学习率。 -- `max_steps`: 模型训练步数。 -- `save_steps`: 模型参数保存的间隔步数。 -- `weight_decay`: 权重衰减系数。 -- `warmup_ratio`: warmup比例。 -- `max_grad_norm`: 梯度裁剪系数。 -- `logging_steps`: 训练日志打印的间隔步数。 -- `continue_training`: 是否继续训练模型。 -- `dataloader_num_workers`: dataloader进程数。 -- `sharding`: sharding切分策略,包含stage1、stage2、stage3。 -- `eval_steps`: 模型评估的间隔步数。 -- `recompute`: 使用重计算策略,开启后可节省训练显存。 -- `gradient_accumulation_steps`: 模型参数梯度累积的步数,可用于扩大 batch size。实际的 batch_size = per_device_train_batch_size * gradient_accumulation_steps。 -- `do_train`: 是否训练模型。 -- `do_eval`: 是否评估模型。 -- `lora`: 是否使用LoRA技术。 - - - - -## 3. 微调 -### SFT - -```shell -task_name="gpt3_hybrid" -export PYTHONPATH="../../PaddleNLP/" -export FLAGS_cudnn_deterministic=True -log_dir="log" -rm -rf $log_dir - -python -u -m paddle.distributed.launch \ - --gpus "0" \ - --log_dir ${log_dir} \ - finetune_generation.py \ - --model_name_or_path gpt2-medium-en \ - --output_dir "output/$task_name" \ - --per_device_train_batch_size 2 \ - --per_device_eval_batch_size 1 \ - --tensor_parallel_degree 1 \ - --pipeline_parallel_degree 1 \ - --sequence_parallel 0 \ - --fp16 \ - --fp16_opt_level "O2" \ - --scale_loss 1024 \ - --learning_rate 0.00001 \ - --max_steps 10000 \ - --save_steps 5000 \ - --weight_decay 0.01 \ - --warmup_ratio 0.01 \ - --max_grad_norm 1.0 \ - --logging_steps 1\ - --dataloader_num_workers 1 \ - --sharding "stage2" \ - --eval_steps 1000 \ - --report_to "visualdl" \ - --disable_tqdm true \ - --recompute 1 \ - --gradient_accumulation_steps 2 \ - --do_train \ - --do_eval \ - --device "gpu" -``` - -### LoRA - -```shell -export PYTHONPATH="../../PaddleNLP/" -export FLAGS_cudnn_deterministic=True -log_dir="log" -rm -rf $log_dir - -python finetune_generation.py \ - --model_name_or_path gpt2-medium-en \ - --output_dir "output/$task_name" \ - --per_device_train_batch_size 2 \ - --per_device_eval_batch_size 1 
\ - --tensor_parallel_degree 1 \ - --pipeline_parallel_degree 1 \ - --sequence_parallel 0 \ - --fp16 \ - --fp16_opt_level "O2" \ - --scale_loss 1024 \ - --learning_rate 3e-4 \ - --max_steps 10000 \ - --save_steps 5000 \ - --weight_decay 0.01 \ - --warmup_ratio 0.01 \ - --max_grad_norm 1.0 \ - --logging_steps 1\ - --dataloader_num_workers 1 \ - --sharding "stage2" \ - --eval_steps 1000 \ - --report_to "visualdl" \ - --disable_tqdm true \ - --recompute 1 \ - --gradient_accumulation_steps 2 \ - --do_train \ - --do_eval \ - --device "gpu" \ - --lora -``` - - -## 3. 动态图推理 - -```shell -python predict_generation.py - -``` diff --git a/llm/gpt-3/finetune_generation.py b/llm/gpt-3/finetune_generation.py deleted file mode 100644 index 0d0df71d8100..000000000000 --- a/llm/gpt-3/finetune_generation.py +++ /dev/null @@ -1,250 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -import sys -from dataclasses import dataclass, field -from functools import partial - -import paddle -from utils import ( - DataCollatorForSupervisedDataset, - GPTTrainer, - compute_metrics, - convert_example, -) - -from paddlenlp.datasets import load_dataset -from paddlenlp.peft import LoRAConfig, LoRAModel -from paddlenlp.trainer import ( - PdArgumentParser, - TrainingArguments, - get_last_checkpoint, - set_seed, -) -from paddlenlp.transformers import ( - AutoTokenizer, - GPTConfig, - GPTForCausalLM, - GPTForCausalLMPipe, -) -from paddlenlp.utils.log import logger - -MODEL_CLASSES = { - "gpt": (GPTConfig, GPTForCausalLM), -} - - -@dataclass -class DataArgument: - task_name: str = field(default="squad", metadata={"help": "The name of task."}) - src_length: int = field(default=1024, metadata={"help": "The max length of source text."}) - tgt_length: int = field(default=142, metadata={"help": "The max length of target text."}) - generate_num: int = field(default=0, metadata={"help": "Save first k examples generation result in dev dataset"}) - - -@dataclass -class ModelArgument: - model_type: str = field( - default="gpt-cn", metadata={"help": "Build-in pretrained model from the different model type."} - ) - model_name_or_path: str = field( - default="gpt-cpm-large-cn", metadata={"help": "Build-in pretrained model name or the path to local model."} - ) - use_flash_attn: bool = field(default=False, metadata={"help": "Whether to use flash attention"}) - enable_fuse_transformer: bool = field( - default=False, - metadata={"help": "gpt, enable_fuse_transformer"}, - ) - - fuse_attention_qkv: bool = field( - default=False, - metadata={"help": "gpt, fuse_attention_qkv"}, - ) - eval_with_do_generation: bool = field( - default=True, metadata={"help": "Evaluate with generation, instead for calc loss."} - ) - lr_decay_ratio: float = field(default=0.1, metadata={"help": "The ratio for learning rate decrease"}) - # lora - lora: bool = field(default=False, 
metadata={"help": "Whether to use LoRA technique"}) - lora_path: str = field(default=None, metadata={"help": "Initialize lora state dict."}) - lora_rank: int = field(default=8, metadata={"help": "Lora attention dimension"}) - merge_weights: bool = field( - default=False, metadata={"help": "Merge weights of the original model and the Lora model"} - ) - - -def main(): - parser = PdArgumentParser((ModelArgument, DataArgument, TrainingArguments)) - if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): - model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) - else: - model_args, data_args, training_args = parser.parse_args_into_dataclasses() - # data_args.always_pad_to_max_length = False - data_args.always_pad_to_max_length = training_args.pipeline_parallel_degree > 1 - setattr(training_args, "lr_decay_ratio", model_args.lr_decay_ratio) - - training_args.print_config(model_args, "Model") - training_args.print_config(data_args, "Data") - training_args.tgt_length = data_args.tgt_length - paddle.set_device(training_args.device) - - set_seed(seed=training_args.seed) - - # Log on each process the small summary: - logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, world_size: {training_args.world_size}, " - + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16 or training_args.bf16}" - ) - - # Detecting last checkpoint. - last_checkpoint = None - if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: - last_checkpoint = get_last_checkpoint(training_args.output_dir) - if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 1: - raise ValueError( - f"Output directory ({training_args.output_dir}) already exists and is not empty. " - "Use --overwrite_output_dir to overcome." 
- ) - elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: - logger.info( - f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " - "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." - ) - - # Set the dtype for loading model - dtype = "float32" - if training_args.fp16_opt_level == "O2": - if training_args.fp16: - dtype = "float16" - if training_args.bf16: - dtype = "bfloat16" - - config_class, model_class = MODEL_CLASSES[model_args.model_type] - if training_args.pipeline_parallel_degree > 1: - model_class = GPTForCausalLMPipe - # Load the tokenizer - tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path) - tokenizer.padding_side = "left" - - # Load and set the pretrained configuration - config = config_class.from_pretrained(model_args.model_name_or_path) - config.enable_fuse_transformer = model_args.enable_fuse_transformer - config.fuse_attention_qkv = model_args.fuse_attention_qkv - config.use_flash_attn = model_args.use_flash_attn - config.use_recompute = training_args.recompute - - config.tensor_parallel_degree = training_args.tensor_parallel_degree - config.tensor_parallel_rank = training_args.tensor_parallel_rank - config.ignore_index = tokenizer.pad_token_id - - model = model_class.from_pretrained( - model_args.model_name_or_path, - config=config, - dtype=dtype, - ) - if model_args.lora: - if model_args.lora_path is None: - target_modules = [ - ".*qkv_proj.*", - ".*q_proj.*", - ".*k_proj.*", - ".*v_proj.*", - ".*linear1.*", - ".*linear2.*", - ".*out_proj.*", - ] - lora_config = LoRAConfig( - target_modules=target_modules, - r=model_args.lora_rank, - lora_alpha=2 * model_args.lora_rank, - merge_weights=model_args.merge_weights, - tensor_parallel_degree=training_args.tensor_parallel_degree, - dtype=dtype, - ) - model = LoRAModel(model, lora_config) - else: - model = LoRAModel.from_pretrained(model=model, lora_path=model_args.lora_path) - 
model.mark_only_lora_as_trainable() - model.print_trainable_parameters() - - # Load the dataset. - if training_args.do_train or training_args.do_eval: - train_ds, dev_ds = load_dataset(data_args.task_name, splits=["train_v1", "dev_v1"]) - trans_func = partial( - convert_example, - tokenizer=tokenizer, - max_source_length=data_args.src_length, - max_target_length=data_args.tgt_length, - ) - - if training_args.do_train: - train_ds = train_ds.map(partial(trans_func)) - if training_args.do_eval: - is_test = model_args.eval_with_do_generation - dev_ds = dev_ds.map(partial(trans_func, is_test=is_test)) - - collate_fn = DataCollatorForSupervisedDataset( - tokenizer, max_length=1024 if data_args.always_pad_to_max_length else 0 - ) - - def compute_metrics_trainer(eval_preds, tokenizer): - all_preds = [] - all_labels = [] - preds = eval_preds.predictions - preds = [x[x != -100] for x in preds] - all_preds.extend(tokenizer.batch_decode(preds, skip_special_tokens=True, clean_up_tokenization_spaces=False)) - labels = [x[x != -100] for x in eval_preds.label_ids] - all_labels.extend(tokenizer.batch_decode(labels, skip_special_tokens=True, clean_up_tokenization_spaces=False)) - - all_preds = [pred.strip() for pred in all_preds] - all_labels = [label.strip() for label in all_labels] - all_preds = [pred.strip("question:") for pred in all_preds] - all_labels = [label.strip("question:") for label in all_labels] - - eval_result = compute_metrics(all_preds, all_labels) - return eval_result - - compute_metrics_func = partial( - compute_metrics_trainer, - tokenizer=tokenizer, - ) - - trainer = GPTTrainer( - model=model, - args=training_args, - train_dataset=train_ds if training_args.do_train else None, - eval_dataset=dev_ds if training_args.do_eval else None, - tokenizer=tokenizer, - compute_metrics=compute_metrics_func - if (model_args.eval_with_do_generation and training_args.do_eval) - else None, - do_generation=model_args.eval_with_do_generation, - data_collator=collate_fn, - ) - - if 
training_args.do_train: - train_result = trainer.train(resume_from_checkpoint=last_checkpoint) - trainer.save_model(merge_tensor_parallel=training_args.tensor_parallel_degree > 1) - trainer.log_metrics("train", train_result.metrics) - trainer.save_metrics("train", train_result.metrics) - trainer.save_state() - - if training_args.do_eval: - eval_result = trainer.evaluate() - trainer.log_metrics("test", eval_result) - - -if __name__ == "__main__": - main() diff --git a/llm/gpt-3/predict_generation.py b/llm/gpt-3/predict_generation.py deleted file mode 100644 index 060bcb9f8cf1..000000000000 --- a/llm/gpt-3/predict_generation.py +++ /dev/null @@ -1,165 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-from __future__ import annotations - -import paddle -from utils import get_hcg, init_dist_env, set_seed - -from paddlenlp.transformers import ( - GPTChineseTokenizer, - GPTConfig, - GPTForCausalLM, - GPTTokenizer, -) - -MODEL_CLASSES = { - "gpt2": (GPTForCausalLM, GPTTokenizer), - "gpt2-cn": (GPTForCausalLM, GPTChineseTokenizer), -} - - -def parse_arguments(): - import argparse - - parser = argparse.ArgumentParser() - parser.add_argument("--model_type", default="gpt2-cn", help="The directory of model.") - parser.add_argument("--model_name_or_path", default="gpt-cpm-large-cn", help="The directory of model.") - parser.add_argument("--save_onepiece_model_path", default=None, help="The directory of model.") - parser.add_argument("--batch_size", type=int, default=1, help="The batch size of data.") - parser.add_argument("--src_length", type=int, default=200, help="The batch size of data.") - parser.add_argument("--tgt_length", type=int, default=200, help="The batch size of data.") - parser.add_argument("--seed", type=int, default=20, help="the seed of parameter initialization") - return parser.parse_args() - - -def batchfy_text(texts, batch_size): - batch_texts = [] - batch_start = 0 - while batch_start < len(texts): - batch_texts += [texts[batch_start : min(batch_start + batch_size, len(texts))]] - batch_start += batch_size - return batch_texts - - -class Predictor(object): - def __init__(self, args=None, tokenizer=None, model=None, **kwargs): - model_class, tokenizer_class = MODEL_CLASSES[args.model_type] - self.tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path) - self.tokenizer.padding_side = "left" - self.batch_size = args.batch_size - self.args = args - self.src_length = self.args.src_length - self.tgt_length = self.args.tgt_length - - tensor_parallel_degree = paddle.distributed.get_world_size() - tensor_parallel_rank = 0 - if tensor_parallel_degree > 1: - hcg = get_hcg() - tensor_parallel_rank = hcg.get_model_parallel_rank() - - config = 
GPTConfig.from_pretrained(args.model_name_or_path) - dtype = config.dtype if config.dtype is not None else "float16" - - self.model = GPTForCausalLM.from_pretrained( - args.model_name_or_path, - dtype=dtype, - tensor_parallel_degree=tensor_parallel_degree, - tensor_parallel_rank=tensor_parallel_rank, - ) - if self.tokenizer.pad_token_id is None: - self.tokenizer.pad_token_id = self.model.config.pad_token_id - self.model.eval() - - def preprocess(self, input_text): - inputs = self.tokenizer( - input_text, - return_tensors="np", - padding=True, - max_length=self.src_length, - ) - inputs_tensor = {} - for key, value in inputs.items(): - inputs_tensor[key] = paddle.to_tensor(value) - return inputs_tensor - - def infer(self, inputs): - if self.model.config.dtype == "float32" or self.model.config.dtype is None: - with paddle.no_grad(): - result = self.model.generate( - **inputs, - max_length=self.tgt_length, - bos_token_id=self.tokenizer.bos_token_id, - eos_token_id=self.tokenizer.eol_token_id, - pad_token_id=self.tokenizer.pad_token_id, - decode_strategy="sampling", - top_k=1, - ) - else: - with paddle.no_grad(): - with paddle.amp.auto_cast(False, level="O2", dtype=self.model.config.dtype): - result = self.model.generate( - **inputs, - max_length=self.tgt_length, - bos_token_id=self.tokenizer.bos_token_id, - eos_token_id=self.tokenizer.eol_token_id, - pad_token_id=self.tokenizer.pad_token_id, - decode_strategy="sampling", - top_k=1, - ) - result = result[0] - return result - - def postprocess(self, infer_data): - result = [] - for x in infer_data.tolist(): - res = self.tokenizer.convert_ids_to_string(x) - result.append(res) - out_dict = {"result": result} - return out_dict - - def predict(self, texts): - input_map = self.preprocess(texts) - infer_result = self.infer(input_map) - output = self.postprocess(infer_result) - return output - - def save_onepiece_model(self, save_onepiece_model_path): - self.model.save_pretrained(save_dir=save_onepiece_model_path, 
merge_tensor_parallel=True) - paddle.distributed.barrier() - self.tokenizer.save_pretrained(save_onepiece_model_path) - paddle.distributed.barrier() - - -def predict(): - args = parse_arguments() - - # Init the fleet config - tensor_parallel_degree = paddle.distributed.get_world_size() - if tensor_parallel_degree > 1: - init_dist_env(tensor_parallel_degree=tensor_parallel_degree, seed=args.seed) - set_seed(args.seed) - - predictor = Predictor(args) - all_texts = ["问题:中国的首都是哪里?答案:北京。\n问题:苹果的CEO是谁? 答案:", "问题:中国的首都是哪里?答案:北京。\n问题:广东的省会是哪个城市? 答案:"] - batch_texts = batchfy_text(all_texts, args.batch_size) - for bs, texts in enumerate(batch_texts): - outputs = predictor.predict(texts) - for text, result in zip(texts, outputs["result"]): - print(result) - if args.save_onepiece_model_path is not None: - predictor.save_onepiece_model(args.save_onepiece_model_path) - - -if __name__ == "__main__": - predict() diff --git a/llm/gpt-3/run_pretrain.py b/llm/gpt-3/run_pretrain.py deleted file mode 120000 index f4873c94b357..000000000000 --- a/llm/gpt-3/run_pretrain.py +++ /dev/null @@ -1 +0,0 @@ -../run_pretrain.py \ No newline at end of file diff --git a/llm/gpt-3/tests/test_sequence_parallel.py b/llm/gpt-3/tests/test_sequence_parallel.py deleted file mode 100644 index b8284695d652..000000000000 --- a/llm/gpt-3/tests/test_sequence_parallel.py +++ /dev/null @@ -1,98 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import unittest - -import numpy as np -import paddle -import paddle.distributed.fleet as fleet -from paddle.distributed.fleet.meta_parallel.pipeline_parallel import PipelineParallel - -from paddlenlp.transformers import GPTConfig, GPTForCausalLM, GPTForCausalLMPipe - - -class TestGPT(unittest.TestCase): - def test_sequence_model(self): - model_name_or_path = "gpt2-medium-en" - seq_len = 1024 - batch_size = 2 - input_ids = paddle.arange(100, 100 + batch_size * seq_len, dtype="int64").reshape([batch_size, seq_len]) - labels = paddle.arange(101, 101 + batch_size * seq_len, dtype="int64").reshape([batch_size, seq_len]) - - world_size = paddle.distributed.get_world_size() - pp_degree = 2 - tp_degree = world_size // pp_degree - strategy = fleet.DistributedStrategy() - strategy.hybrid_configs = { - "dp_degree": 1, - "mp_degree": tp_degree, - "pp_degree": pp_degree, - "sharding_degree": 1, - } - strategy.pipeline_configs = {"enable_partial_send_recv": False if pp_degree > 1 else True} - fleet.init(is_collective=True, strategy=strategy) - hcg = fleet.get_hybrid_communicate_group() - mp_group = hcg.get_model_parallel_group() - tensor_parallel_rank = mp_group.rank - - if pp_degree > 1: - model_class = GPTForCausalLMPipe - else: - model_class = GPTForCausalLM - - config = GPTConfig.from_pretrained(model_name_or_path) - config.seq_length = seq_len - config.use_flash_attention = False - config.fuse_attention_qkv = False - config.recompute_granularity = "full" - config.virtual_pp_degree = 1 - config.use_recompute = False - - config.tensor_parallel_degree = tp_degree - config.tensor_parallel_rank = tensor_parallel_rank - config.tensor_parallel_output = False - # when tp_degree > 1, sequence_parallel can be set to True - config.sequence_parallel = True - config.fuse_sequence_parallel_allreduce = False - - model = model_class.from_pretrained(model_name_or_path, config=config, dtype="float32") - model.eval() - - if pp_degree > 1: - pp_model = PipelineParallel(layers=model, 
hcg=hcg, strategy=strategy) - pp_model.accumulate_steps = batch_size # for micro_batch_size * acc_steps == batch_size - ret_mp_pp = pp_model.eval_batch(data=[input_ids, labels], compute_loss=True) - else: - ret_mp_pp = model(input_ids=input_ids, labels=labels)[0] - - # run model for single device - config.tensor_parallel_degree = 1 - config.tensor_parallel_rank = -1 - config.sequence_parallel = False - single_model = GPTForCausalLM.from_pretrained(model_name_or_path, config=config, dtype="float32") - single_model.eval() - ret_single = single_model(input_ids=input_ids, labels=labels)[0] - - # output all results - print(f"ret mp{tp_degree} pp{pp_degree}", float(ret_mp_pp)) - print("ret single", float(ret_single)) - - diff = (ret_single - ret_mp_pp) / ret_single - print(f"diff: {float(diff)}") - np.testing.assert_allclose(float(ret_single), ret_mp_pp, rtol=1.5e-7) - - -if __name__ == "__main__": - TestGPT().test_sequence_model() -# python -m paddle.distributed.launch --gpus 0,1,2,3 tests/test_pipeline_parallel.py diff --git a/llm/gpt-3/utils.py b/llm/gpt-3/utils.py deleted file mode 100644 index 647b9d05356f..000000000000 --- a/llm/gpt-3/utils.py +++ /dev/null @@ -1,393 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import copy -import random -import re -from typing import Any, Dict, List, Optional, Tuple, Union - -import numpy as np -import paddle -import paddle.distributed as dist -import paddle.nn as nn -from paddle.distributed import fleet -from paddle.distributed.fleet.meta_parallel import get_rng_state_tracker -from paddle.optimizer.lr import LambdaDecay -from rouge import Rouge - -from paddlenlp.data import DataCollatorForSeq2Seq -from paddlenlp.metrics import BLEU -from paddlenlp.trainer import Trainer -from paddlenlp.utils.log import logger - -PREFIX_CHECKPOINT_DIR = "model_state" -_re_checkpoint = re.compile(r"^" + PREFIX_CHECKPOINT_DIR + r"\.tp(\d+)" + ".pdparams$") - - -_hcg = None - - -def set_hcg(hcg): - global _hcg - _hcg = hcg - - -def get_hcg(): - global _hcg - return _hcg - - -def set_seed(seed): - # NOTE(shenliang03): For parameter init seed: - # seed: dp/mp_undistributed_paramter/sharding is same; others is different - # For compute seed(dropout): - # global seed: only mp group is same. 
- # local seed: all groups are different - - hcg = get_hcg() - if paddle.distributed.get_world_size() > 1: - # obtain rank message of hybrid parallel - - mp_rank = hcg.get_model_parallel_rank() - mp_size = hcg.get_model_parallel_world_size() - - pp_rank = hcg.get_stage_id() - pp_size = hcg.get_pipe_parallel_world_size() - - dp_rank = hcg.get_data_parallel_rank() - dp_size = hcg.get_data_parallel_world_size() - - sharding_rank = hcg.get_sharding_parallel_rank() - # sharding_size = hcg.get_sharding_parallel_world_size() - else: - mp_rank, mp_size = 0, 1 - pp_rank, pp_size = 0, 1 - dp_rank, dp_size = 0, 1 - sharding_rank, _ = 0, 1 - - # NOTE: the commented seeds are set only for precision validation - # seed += 100 * pp_rank - random_seed = seed + 100 * pp_rank - random.seed(random_seed) - np.random.seed(random_seed) - - # seed = mp_rank + - # pp_rank * (mp_size) + - # dp_rank * (mp_size * pp_size) + - # sharding_rank * (mp_size * pp_size * dp_size) - # seed offset is order to avoid conflicts with the parameter initialization seed - - seed_offset = seed + 1024 + paddle.distributed.get_world_size() - global_seed = ( - seed_offset - + pp_rank * (mp_size) - + dp_rank * (mp_size * pp_size) - + sharding_rank * (mp_size * pp_size * dp_size) - ) - - seed_offset += paddle.distributed.get_world_size() - local_seed = ( - seed_offset - + mp_rank - + pp_rank * (mp_size) - + dp_rank * (mp_size * pp_size) - + sharding_rank * (mp_size * pp_size * dp_size) - ) - - tracker = get_rng_state_tracker() - tracker.add("global_seed", global_seed) - tracker.add("local_seed", local_seed) - - paddle.seed(global_seed) - - logger.info( - "The global seed is set to {}, local seed is set to {} and " - "random seed is set to {}.".format(global_seed, local_seed, random_seed) - ) - - -def create_hcg(strategy, hcg_name="HybridCommunicateGroup"): - if hcg_name == "HybridCommunicateGroup": - fleet.init(is_collective=True, strategy=strategy) - hcg = fleet.get_hybrid_communicate_group() - else: - 
dist.init_parallel_env() - hcg = eval("{}".format(hcg_name))(strategy) - - return hcg - - -def init_dist_env( - tensor_parallel_degree=1, sharding_parallel_degree=1, pipeline_parallel_degree=1, data_parallel_degree=1, seed=1 -): - - strategy = fleet.DistributedStrategy() - - def is_segment_parallel_supported(): - import inspect - - members = [name for (name, date) in inspect.getmembers(fleet.HybridCommunicateGroup)] - return "get_sep_parallel_world_size" in members - - if tensor_parallel_degree == 1 and sharding_parallel_degree == 1: - if is_segment_parallel_supported(): - order = ["pp", "dp", "sharding", "sep", "mp"] - else: - order = ["pp", "dp", "sharding", "mp"] - else: - if is_segment_parallel_supported(): - order = ["dp", "pp", "sharding", "sep", "mp"] - else: - order = ["dp", "pp", "sharding", "mp"] - - strategy.hybrid_configs = { - "dp_degree": data_parallel_degree, - "mp_degree": tensor_parallel_degree, - "pp_degree": pipeline_parallel_degree, - "sharding_degree": sharding_parallel_degree, - "order": order, - } - - # TODO(wawltor) The inference parallel do not support the pipeline mode - - """ - if pipeline_parallel_degree > 1: - if "sequence_parallel" in config.Model: - if config.Model.sequence_parallel: - assert config.Global.enable_partial_send_recv is False, ( - "if config.Distributed.pp_degree > 1 and config.Model.sequence_parallel is True, " - "config.Global.enable_partial_send_recv should be set False." 
- ) - - strategy.pipeline_configs = { - "accumulate_steps": config.Global.local_batch_size // config.Global.micro_batch_size, - "micro_batch_size": config.Global.micro_batch_size, - "enable_partial_send_recv": config.Global.enable_partial_send_recv, - } - """ - - # set control in tensor parallel - strategy.tensor_parallel_configs = {"tensor_init_seed": seed} - - hcg = create_hcg(strategy) - set_hcg(hcg) - - -def convert_example( - example, - tokenizer, - max_source_length, - max_target_length, - is_test=False, -): - """ - Convert an example into necessary features. - """ - # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results - # in one example possible giving several features when a context is long, each of those features having a - # context that overlaps a bit the context of the previous feature. - # NOTE: Almost the same functionality as HuggingFace's prepare_train_features function. The main difference is - # that HugggingFace uses ArrowTable as basic data structure, while we use list of dictionary instead. 
- context = example["context"] - question = example["question"] - try: - answer = example["answers"][0] - except Exception: - print(example["context"]) - print(example["question"]) - print(example["answers"]) - print(example["answer_starts"]) - print(example["is_impossible"]) - - input_seq = f"answer: {answer} context: {context} " - output_seq = f"question: {question} " - - outputs = tokenizer( - output_seq, - max_length=max_target_length, - # pad_to_max_seq_len=True, - truncation_strategy="longest_first", - return_attention_mask=False, - return_token_type_ids=False, - ) - inputs = tokenizer( - input_seq, - max_length=max_source_length, - # pad_to_max_seq_len=True, - truncation_strategy="longest_first", - return_attention_mask=False, - return_length=False, - ) - - final = {} - for k in outputs.keys(): - final[k] = inputs[k] + outputs[k] - if k == "input_ids": - final["labels"] = [tokenizer.pad_token_id] * len(inputs["input_ids"]) + outputs[k] - if is_test: - return dict(input_ids=inputs["input_ids"], labels=outputs["input_ids"]) - - # shift inputs and labels - final["input_ids"] = final["input_ids"][:-1] - final["labels"] = final["labels"][1:] - return final - - -def compute_metrics(preds, targets): - assert len(preds) == len(targets), ( - "The length of pred_responses should be equal to the length of " - "target_responses. 
But received {} and {}.".format(len(preds), len(targets)) - ) - rouge = Rouge() - bleu4 = BLEU(n_size=4) - scores = [] - for pred, target in zip(preds, targets): - try: - score = rouge.get_scores(" ".join(pred), " ".join(target)) - scores.append([score[0]["rouge-1"]["f"], score[0]["rouge-2"]["f"], score[0]["rouge-l"]["f"]]) - except ValueError: - scores.append([0, 0, 0]) - bleu4.add_inst(pred, [target]) - rouge1 = np.mean([i[0] for i in scores]) - rouge2 = np.mean([i[1] for i in scores]) - rougel = np.mean([i[2] for i in scores]) - - rouge1 = round(rouge1, 4) - rouge2 = round(rouge2, 4) - rougel = round(rougel, 4) - bleu4 = round(bleu4.score(), 4) - return dict( - rouge1=rouge1, - rouge2=rouge2, - rougel=rougel, - bleu4=bleu4, - ) - - -class DataCollatorForSupervisedDataset(DataCollatorForSeq2Seq): - """Collate examples for supervised fine-tuning.""" - - def __call__(self, features, return_tensors=None): - # Deep copy to avoid modifying features in-place - batch = copy.deepcopy(features) - if return_tensors is None: - return_tensors = self.return_tensors - labels = [feature["labels"] for feature in batch] if "labels" in batch[0].keys() else None - # We have to pad the labels before calling `tokenizer.pad` as this method won't pad them and needs them of the - # same length to return tensors. 
- if labels is not None: - # Note(gongenlei): In pipeline, max_label_length = self.max_length - if self.padding == "max_length" and self.max_length is not None: - max_label_length = self.max_length - else: - max_label_length = max(len(l) for l in labels) - if self.pad_to_multiple_of is not None: - max_label_length = ( - (max_label_length + self.pad_to_multiple_of - 1) - // self.pad_to_multiple_of - * self.pad_to_multiple_of - ) - - padding_side = self.tokenizer.padding_side - for feature in batch: - remainder = [self.tokenizer.pad_token_id] * (max_label_length - len(feature["labels"])) - if isinstance(feature["labels"], list): - feature["labels"] = ( - feature["labels"] + remainder if padding_side == "right" else remainder + feature["labels"] - ) - elif padding_side == "right": - feature["labels"] = np.concatenate([feature["labels"], remainder]).astype(np.int64) - else: - feature["labels"] = np.concatenate([remainder, feature["labels"]]).astype(np.int64) - - batch = self.tokenizer.pad( - batch, - padding=self.padding, - max_length=self.max_length, - pad_to_multiple_of=self.pad_to_multiple_of, - return_tensors=return_tensors, - return_attention_mask=self.return_attention_mask, - ) - - return batch - - -class GPTTrainer(Trainer): - def __init__(self, do_generation: bool, **kwargs): - super().__init__(**kwargs) - self.do_generation = do_generation - - def prediction_step( - self, - model: nn.Layer, - inputs: Dict[str, Union[paddle.Tensor, Any]], - prediction_loss_only: bool, - ignore_keys: Optional[List[str]] = None, - ) -> Tuple[Optional[paddle.Tensor], Optional[paddle.Tensor], Optional[paddle.Tensor]]: - - if prediction_loss_only: - return super().prediction_step(model, inputs, prediction_loss_only, ignore_keys) - elif not self.do_generation: - loss, logits, labels = super().prediction_step(model, inputs, prediction_loss_only, ignore_keys) - # argmax here to avoid gather all logits, which is too memory-consuming. 
- # keepdim in order to maintain the same shape as logits - return (loss, logits.argmax(axis=-1, keepdim=True), labels) - - model.eval() - - preds = model.generate( - input_ids=inputs["input_ids"], - attention_mask=inputs["attention_mask"] if "attention_mask" in inputs else None, - max_length=self.args.tgt_length, - min_length=0, - use_cache=True, - temperature=1.0, - top_k=1, - top_p=1.0, - repetition_penalty=1.0, - decode_strategy="sampling", - )[0] - all_labels = [] - for label in inputs["labels"].numpy(): - label = [x for x in label[label != self.tokenizer.pad_token_id]] - all_labels.append(label) - max_label_length = max([len(x) for x in all_labels]) - for index, labels in enumerate(all_labels): - all_labels[index] = labels + [-100] * (max_label_length - len(labels)) - - return (None, paddle.to_tensor(preds), paddle.to_tensor(all_labels)) - - def create_scheduler(self, num_training_steps: int): - num_warmup_steps = ( - self.args.warmup_steps if self.args.warmup_steps > 0 else self.args.warmup_ratio * num_training_steps - ) - - def lr_lambda(current_step: int): - if current_step < num_warmup_steps: - return float(current_step) / float(max(1, num_warmup_steps)) - else: - decay_step_ratio = (current_step - num_warmup_steps) / (num_training_steps - num_warmup_steps) - return 1.0 - (1.0 - self.args.lr_decay_ratio) * decay_step_ratio - - if self.lr_scheduler is None: - self.lr_scheduler = LambdaDecay(self.args.learning_rate, lr_lambda, last_epoch=-1) - return self.lr_scheduler - - def log(self, logs: Dict[str, float], **kwargs) -> None: - if "loss" in logs: - logs["ppl"] = np.exp(logs["loss"]) - if "eval_loss" in logs: - logs["eval_ppl"] = np.exp(logs["eval_loss"]) - - super(GPTTrainer, self).log(logs, **kwargs) diff --git a/llm/llama/lora_argument_pissa.json b/llm/llama/lora_argument_pissa.json deleted file mode 100644 index ba9e770add54..000000000000 --- a/llm/llama/lora_argument_pissa.json +++ /dev/null @@ -1,33 +0,0 @@ -{ - "model_name_or_path": 
"facebook/llama-7b", - "dataset_name_or_path": "./data", - "output_dir": "./checkpoints/llama_lora_ckpts", - "per_device_train_batch_size": 4, - "gradient_accumulation_steps": 32, - "per_device_eval_batch_size": 8, - "eval_accumulation_steps":16, - "num_train_epochs": 3, - "learning_rate": 2e-05, - "warmup_steps": 10, - "logging_steps": 1, - "evaluation_strategy": "epoch", - "save_strategy": "epoch", - "src_length": 1024, - "max_length": 2048, - "fp16": true, - "fp16_opt_level": "O2", - "do_train": true, - "do_eval": true, - "disable_tqdm": true, - "load_best_model_at_end": true, - "eval_with_do_generation": false, - "metric_for_best_model": "accuracy", - "recompute": true, - "save_total_limit": 1, - "tensor_parallel_degree": 1, - "pipeline_parallel_degree": 1, - "lora": true, - "pissa": false, - "zero_padding": false, - "use_flash_attention": false - } diff --git a/llm/llama/megre_tp_and_pp.py b/llm/llama/megre_tp_and_pp.py deleted file mode 100644 index 1758ecf59710..000000000000 --- a/llm/llama/megre_tp_and_pp.py +++ /dev/null @@ -1,88 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os - -import paddle - -from paddlenlp.transformers import LlamaConfig, LlamaForCausalLM -from paddlenlp.utils.log import logger - - -def merge_pipeline_parallel(tp_degree, pp_degree, path): - tp_state_dict_list = [] - for tp in range(tp_degree): - tp_state_dict = {} - for pp in range(pp_degree): - tmp = paddle.load(os.path.join(path, f"model_state.tp{tp:0>2d}_pp{pp:0>2d}.pdparams"), return_numpy=True) - for k, v in tmp.items(): - tp_state_dict[k] = v - - tp_state_dict_list.append(tp_state_dict) - - return tp_state_dict_list - - -def merge_tensor_parallel(cls, state_dict_list, config) -> None: - """the entry of converting config and converting model file - - Args: - input_dir (str | None): the input dir which contains `pytorch_model.bin` and `config.json` file - config (PretrainedConfig): the PretrainedConfig instance of model - """ - name_action_mappings = cls._get_tensor_parallel_mappings(config, is_split=False) - state_keys_map = cls._resolve_prefix_keys(name_action_mappings.keys(), state_dict_list[0].keys()) - - for k, v in state_keys_map.items(): - name_action_mappings[v] = name_action_mappings.pop(k) - - state_dict_to_save = {} - for key in state_dict_list[0].keys(): - tensor = state_dict_list[0][key] - if key in name_action_mappings: - ret = [x[key] for x in state_dict_list] - action = name_action_mappings.pop(key) - tensor = action(ret) - - state_dict_to_save[key] = tensor - - if len(name_action_mappings) > 0: - for x in name_action_mappings.keys(): - logger.warning(f"key <{x}> need to merge tensor parallel but we can't find in model state.") - - print("Finally, we merging state dict to fellowing tensors.") - for k, v in state_dict_to_save.items(): - print(k, v.shape, v.dtype) - - return state_dict_to_save - - -def main(): - tp_degree = 2 - pp_degree = 2 - model_name_or_path = "temp_dir_to_your_ckpt" - - assert tp_degree > 1 - assert pp_degree > 1 - config = LlamaConfig.from_pretrained(model_name_or_path) - cls = LlamaForCausalLM - - 
tp_state_dict_list = merge_pipeline_parallel(tp_degree, pp_degree, model_name_or_path) - state_dict_to_save = merge_tensor_parallel(cls=cls, state_dict_list=tp_state_dict_list, config=config) - print("saving") - paddle.save(state_dict_to_save, os.path.join(model_name_or_path, "model_state.pdparams")) - - -if __name__ == "__main__": - main() diff --git a/llm/llama/pretrain-baichuan2_13b-tp2sd4_stage2.json b/llm/llama/pretrain-baichuan2_13b-tp2sd4_stage2.json deleted file mode 100644 index 928ef5510687..000000000000 --- a/llm/llama/pretrain-baichuan2_13b-tp2sd4_stage2.json +++ /dev/null @@ -1,40 +0,0 @@ -{ - "model_name_or_path": "baichuan-inc/Baichuan2-13B-Base", - "tokenizer_name_or_path": "baichuan-inc/Baichuan2-13B-Base", - "input_dir": "./data", - "output_dir": "./checkpoints/baichuan_pretrain_ckpts", - "per_device_train_batch_size": 1, - "gradient_accumulation_steps": 16, - "per_device_eval_batch_size": 2, - "tensor_parallel_degree": 2, - "pipeline_parallel_degree": 1, - "sharding": "stage2", - "virtual_pp_degree": 1, - "sequence_parallel": 0, - "use_flash_attention": true, - "use_fused_rms_norm": true, - "use_fused_rope": true, - "max_seq_length": 4096, - "learning_rate": 3e-05, - "min_learning_rate": 3e-06, - "warmup_steps": 30, - "logging_steps": 1, - "max_steps": 10000, - "save_steps": 5000, - "eval_steps": 1000, - "weight_decay": 0.01, - "bf16": true, - "fp16_opt_level": "O2", - "warmup_ratio": 0.01, - "max_grad_norm": 1.0, - "dataloader_num_workers": 1, - "continue_training": 1, - "do_train": true, - "do_eval": true, - "do_predict": true, - "disable_tqdm": true, - "recompute": true, - "distributed_dataloader": 1, - "recompute_granularity": "full", - "save_total_limit": 2 - } diff --git a/llm/llama/pretrain-flagalpha_llama2_13b-tp2sd4_stage2.json b/llm/llama/pretrain-flagalpha_llama2_13b-tp2sd4_stage2.json deleted file mode 100644 index 6840fc73b24b..000000000000 --- a/llm/llama/pretrain-flagalpha_llama2_13b-tp2sd4_stage2.json +++ /dev/null @@ -1,40 +0,0 
@@ -{ - "model_name_or_path": "FlagAlpha/Llama2-Chinese-13b-Chat", - "tokenizer_name_or_path": "FlagAlpha/Llama2-Chinese-13b-Chat", - "input_dir": "./data", - "output_dir": "./checkpoints/flagalpha_pretrain_ckpts", - "per_device_train_batch_size": 1, - "gradient_accumulation_steps": 16, - "per_device_eval_batch_size": 2, - "tensor_parallel_degree": 2, - "pipeline_parallel_degree": 1, - "sharding": "stage2", - "virtual_pp_degree": 1, - "sequence_parallel": 0, - "use_flash_attention": true, - "use_fused_rms_norm": true, - "use_fused_rope": true, - "max_seq_length": 4096, - "learning_rate": 3e-05, - "min_learning_rate": 3e-06, - "warmup_steps": 30, - "logging_steps": 1, - "max_steps": 10000, - "save_steps": 5000, - "eval_steps": 1000, - "weight_decay": 0.01, - "bf16": true, - "fp16_opt_level": "O2", - "warmup_ratio": 0.01, - "max_grad_norm": 1.0, - "dataloader_num_workers": 1, - "continue_training": 1, - "do_train": true, - "do_eval": true, - "do_predict": true, - "disable_tqdm": true, - "recompute": false, - "distributed_dataloader": 1, - "recompute_granularity": "full", - "save_total_limit": 2 - } diff --git a/llm/llama/pretrain-flagalpha_llama2_7b-tp2sd4_stage2.json b/llm/llama/pretrain-flagalpha_llama2_7b-tp2sd4_stage2.json deleted file mode 100644 index f2edb150e011..000000000000 --- a/llm/llama/pretrain-flagalpha_llama2_7b-tp2sd4_stage2.json +++ /dev/null @@ -1,40 +0,0 @@ -{ - "model_name_or_path": "FlagAlpha/Llama2-Chinese-7b-Chat", - "tokenizer_name_or_path": "FlagAlpha/Llama2-Chinese-7b-Chat", - "input_dir": "./data", - "output_dir": "./checkpoints/flagalpha_pretrain_ckpts", - "per_device_train_batch_size": 2, - "gradient_accumulation_steps": 8, - "per_device_eval_batch_size": 2, - "tensor_parallel_degree": 2, - "pipeline_parallel_degree": 1, - "sharding": "stage2", - "virtual_pp_degree": 1, - "sequence_parallel": 0, - "use_flash_attention": true, - "use_fused_rms_norm": true, - "use_fused_rope": true, - "max_seq_length": 4096, - "learning_rate": 3e-05, - 
"min_learning_rate": 3e-06, - "warmup_steps": 30, - "logging_steps": 1, - "max_steps": 10000, - "save_steps": 5000, - "eval_steps": 1000, - "weight_decay": 0.01, - "bf16": true, - "fp16_opt_level": "O2", - "warmup_ratio": 0.01, - "max_grad_norm": 1.0, - "dataloader_num_workers": 1, - "continue_training": 1, - "do_train": true, - "do_eval": true, - "do_predict": true, - "disable_tqdm": true, - "recompute": false, - "distributed_dataloader": 1, - "recompute_granularity": "full", - "save_total_limit": 2 - } diff --git a/llm/llama/pretrain-linly_llama2_7b-tp2sd4_stage2.json b/llm/llama/pretrain-linly_llama2_7b-tp2sd4_stage2.json deleted file mode 100644 index 4f6965a3bd3a..000000000000 --- a/llm/llama/pretrain-linly_llama2_7b-tp2sd4_stage2.json +++ /dev/null @@ -1,40 +0,0 @@ -{ - "model_name_or_path": "linly-ai/chinese-llama-2-7b", - "tokenizer_name_or_path": "linly-ai/chinese-llama-2-7b", - "input_dir": "./data", - "output_dir": "./checkpoints/linly_pretrain_ckpts", - "per_device_train_batch_size": 2, - "gradient_accumulation_steps": 8, - "per_device_eval_batch_size": 2, - "tensor_parallel_degree": 2, - "pipeline_parallel_degree": 1, - "sharding": "stage2", - "virtual_pp_degree": 1, - "sequence_parallel": 0, - "use_flash_attention": true, - "use_fused_rms_norm": true, - "use_fused_rope": true, - "max_seq_length": 2048, - "learning_rate": 3e-05, - "min_learning_rate": 3e-06, - "warmup_steps": 30, - "logging_steps": 1, - "max_steps": 10000, - "save_steps": 5000, - "eval_steps": 1000, - "weight_decay": 0.01, - "bf16": true, - "fp16_opt_level": "O2", - "warmup_ratio": 0.01, - "max_grad_norm": 1.0, - "dataloader_num_workers": 1, - "continue_training": 1, - "do_train": true, - "do_eval": true, - "do_predict": true, - "disable_tqdm": true, - "recompute": false, - "distributed_dataloader": 1, - "recompute_granularity": "full", - "save_total_limit": 2 - } diff --git a/llm/llama/pretrain-llama2_7b-tp2sd4_stage2.json b/llm/llama/pretrain-llama2_7b-tp2sd4_stage2.json deleted file 
mode 100644 index 33b20ec2b568..000000000000 --- a/llm/llama/pretrain-llama2_7b-tp2sd4_stage2.json +++ /dev/null @@ -1,40 +0,0 @@ -{ - "model_name_or_path": "meta-llama/Llama-2-7b", - "tokenizer_name_or_path": "meta-llama/Llama-2-7b", - "input_dir": "./data", - "output_dir": "./checkpoints/llama2_pretrain_ckpts", - "per_device_train_batch_size": 2, - "gradient_accumulation_steps": 8, - "per_device_eval_batch_size": 2, - "tensor_parallel_degree": 2, - "pipeline_parallel_degree": 1, - "sharding": "stage2", - "virtual_pp_degree": 1, - "sequence_parallel": 0, - "use_flash_attention": true, - "use_fused_rms_norm": true, - "use_fused_rope": true, - "max_seq_length": 4096, - "learning_rate": 3e-05, - "min_learning_rate": 3e-06, - "warmup_steps": 30, - "logging_steps": 1, - "max_steps": 10000, - "save_steps": 5000, - "eval_steps": 1000, - "weight_decay": 0.01, - "bf16": true, - "fp16_opt_level": "O2", - "warmup_ratio": 0.01, - "max_grad_norm": 1.0, - "dataloader_num_workers": 1, - "continue_training": 1, - "do_train": true, - "do_eval": true, - "do_predict": true, - "disable_tqdm": true, - "recompute": false, - "distributed_dataloader": 1, - "recompute_granularity": "full", - "save_total_limit": 2 - } diff --git a/llm/llama/pretrain-llama_13b-tp2sd4_stage2.json b/llm/llama/pretrain-llama_13b-tp2sd4_stage2.json deleted file mode 100644 index 545665f502d3..000000000000 --- a/llm/llama/pretrain-llama_13b-tp2sd4_stage2.json +++ /dev/null @@ -1,40 +0,0 @@ -{ - "model_name_or_path": "facebook/llama-13b", - "tokenizer_name_or_path": "facebook/llama-13b", - "input_dir": "./data", - "output_dir": "./checkpoints/llama_pretrain_ckpts", - "per_device_train_batch_size": 1, - "gradient_accumulation_steps": 16, - "per_device_eval_batch_size": 2, - "tensor_parallel_degree": 2, - "pipeline_parallel_degree": 1, - "sharding": "stage2", - "virtual_pp_degree": 1, - "sequence_parallel": 0, - "use_flash_attention": true, - "use_fused_rms_norm": true, - "use_fused_rope": true, - "max_seq_length": 
4096, - "learning_rate": 3e-05, - "min_learning_rate": 3e-06, - "warmup_steps": 30, - "logging_steps": 1, - "max_steps": 10000, - "save_steps": 5000, - "eval_steps": 1000, - "weight_decay": 0.01, - "bf16": true, - "fp16_opt_level": "O2", - "warmup_ratio": 0.01, - "max_grad_norm": 1.0, - "dataloader_num_workers": 1, - "continue_training": 1, - "do_train": true, - "do_eval": true, - "do_predict": true, - "disable_tqdm": true, - "recompute": false, - "distributed_dataloader": 1, - "recompute_granularity": "full", - "save_total_limit": 2 - } diff --git a/llm/llama/pretrain-llama_7b-tp2sd4_stage2.json b/llm/llama/pretrain-llama_7b-tp2sd4_stage2.json deleted file mode 100644 index 12e1029e09bc..000000000000 --- a/llm/llama/pretrain-llama_7b-tp2sd4_stage2.json +++ /dev/null @@ -1,40 +0,0 @@ -{ - "model_name_or_path": "facebook/llama-7b", - "tokenizer_name_or_path": "facebook/llama-7b", - "input_dir": "./data", - "output_dir": "./checkpoints/llama_pretrain_ckpts", - "per_device_train_batch_size": 2, - "gradient_accumulation_steps": 8, - "per_device_eval_batch_size": 2, - "tensor_parallel_degree": 2, - "pipeline_parallel_degree": 1, - "sharding": "stage2", - "virtual_pp_degree": 1, - "sequence_parallel": 0, - "use_flash_attention": true, - "use_fused_rms_norm": true, - "use_fused_rope": true, - "max_seq_length": 2048, - "learning_rate": 3e-05, - "min_learning_rate": 3e-06, - "warmup_steps": 30, - "logging_steps": 1, - "max_steps": 10000, - "save_steps": 5000, - "eval_steps": 1000, - "weight_decay": 0.01, - "bf16": true, - "fp16_opt_level": "O2", - "warmup_ratio": 0.01, - "max_grad_norm": 1.0, - "dataloader_num_workers": 1, - "continue_training": 1, - "do_train": true, - "do_eval": true, - "do_predict": true, - "disable_tqdm": true, - "recompute": false, - "distributed_dataloader": 1, - "recompute_granularity": "full", - "save_total_limit": 2 - } diff --git a/llm/llama/run_pretrain.py b/llm/llama/run_pretrain.py deleted file mode 120000 index f4873c94b357..000000000000 --- 
a/llm/llama/run_pretrain.py +++ /dev/null @@ -1 +0,0 @@ -../run_pretrain.py \ No newline at end of file diff --git a/llm/llama/sft_pp_argument.json b/llm/llama/sft_pp_argument.json deleted file mode 100644 index 8f03f20e97d7..000000000000 --- a/llm/llama/sft_pp_argument.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "model_name_or_path": "facebook/llama-7b", - "dataset_name_or_path": "./data", - "output_dir": "./checkpoints/llama_sft_ckpts", - "per_device_train_batch_size": 1, - "gradient_accumulation_steps": 8, - "per_device_eval_batch_size": 8, - "eval_accumulation_steps":16, - "num_train_epochs": 3, - "learning_rate": 3e-05, - "warmup_steps": 30, - "logging_steps": 1, - "evaluation_strategy": "epoch", - "save_strategy": "epoch", - "src_length": 256, - "max_length": 512, - "fp16": true, - "fp16_opt_level": "O2", - "do_train": true, - "do_eval": true, - "disable_tqdm": true, - "load_best_model_at_end": true, - "eval_with_do_generation": false, - "recompute": true, - "save_total_limit": 1, - "tensor_parallel_degree": 2, - "pipeline_parallel_degree": 2, - "pipeline_parallel_config": "disable_p2p_cache_shape", - "zero_padding": false, - "use_flash_attention": false - } \ No newline at end of file diff --git a/llm/llama/tests/test_pipeline_parallel.py b/llm/llama/tests/test_pipeline_parallel.py deleted file mode 100644 index a232642e2987..000000000000 --- a/llm/llama/tests/test_pipeline_parallel.py +++ /dev/null @@ -1,132 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import numpy as np -import paddle -import paddle.distributed.fleet as fleet -from paddle.distributed.fleet.meta_parallel.pipeline_parallel import PipelineParallel - -from paddlenlp.transformers import AutoModelForCausalLM, AutoModelForCausalLMPipe - - -class TestLlama(unittest.TestCase): - def test_pipeline_model(self): - world_size = paddle.distributed.get_world_size() - pp_degree = world_size - tp_degree = 1 - if world_size > 2: - pp_degree = 2 - assert world_size % pp_degree == 0 - tp_degree = world_size // pp_degree - - pp_degree = -1 - if pp_degree == -1: - tp_degree = world_size - pp_degree = 1 - - strategy = fleet.DistributedStrategy() - strategy.hybrid_configs = { - "dp_degree": 1, - "mp_degree": tp_degree, - "pp_degree": pp_degree, - "sharding_degree": 1, - } - fleet.init(is_collective=True, strategy=strategy) - hcg = fleet.get_hybrid_communicate_group() - - if pp_degree > 1: - model_class = AutoModelForCausalLMPipe - else: - model_class = AutoModelForCausalLM - - model_name_or_path = "./llama-7b-2l" - # model_name_or_path = "__internal_testing__/tiny-random-llama" - model = model_class.from_pretrained( - model_name_or_path, - tensor_parallel_degree=tp_degree, - tensor_parallel_rank=hcg.get_model_parallel_rank(), - tensor_parallel_output=False, - # use_flash_attention=True, - ) - - model.eval() - - input_ids = paddle.to_tensor([[x for x in range(100, 110)]], dtype="int64") - labels = paddle.to_tensor([[x for x in range(101, 111)]], dtype="int64") - attention_mask = None - - if pp_degree > 1: - pp_model = PipelineParallel(layers=model, hcg=hcg, strategy=strategy) - ret = pp_model.eval_batch(data=[input_ids, labels], compute_loss=True) - else: - ret = model(input_ids=input_ids, labels=labels, attention_mask=attention_mask) - ret = ret[0] - - # np.testing.assert_allclose(ret.item(), 10.49988270, atol=1e-7) - print(f"ret mp{tp_degree} 
pp", ret.item()) - ret_mp_pp = ret.item() - - single_model = AutoModelForCausalLM.from_pretrained( - model_name_or_path, - tensor_parallel_output=False, - ) - single_model.eval() - ret = single_model(input_ids=input_ids, labels=labels, attention_mask=attention_mask) - print("ret single", ret[0].item()) - print( - f"diff: {(ret[0].item()- ret_mp_pp)/ret[0].item()}", - ) - np.testing.assert_allclose(ret[0].item(), ret_mp_pp, rtol=1.5e-7) - # 15.526779174804688 - # 16.879518508911133 - - -if __name__ == "__main__": - TestLlama().test_pipeline_model() - -# 3 bugs to fix in paddlepaddle -# pp_layers.py -# def _construct_shared_comm(self): -# shared_comm = {} -# if self._topo.get_dim("pipe") == 1: -# return shared_comm - -# topology.py -# def _set_p2p_group(self): -# self.send_next_group = None -# self.send_prev_group = None -# self.recv_next_group = None -# self.recv_prev_group = None -# if self._pp_degree <= 1: -# return - -# pipeline_parallel.py -# def _load_micro_batch(self, cache_id, stage=None): -# inputs = self.data -# if stage == "fisrt": -# assert self.is_pipeline_first_stage() -# assert len(inputs) == 2, "length of input should be 2" -# return self._load_micro_batch_impl(inputs[0], cache_id) -# elif stage== "last": -# assert self.is_pipeline_last_stage() -# assert len(inputs) == 2, "length of input should be 2" -# return self._load_micro_batch_impl(inputs[1], cache_id) -# else: -# inputs = None -# -# -# CUDA_VISIBLE_DEVICES=2 PYTHONPATH=./ pytest -s -v tests/test_pipeline_parallel.py -# PYTHONPATH=/ssd2/zhonghui03/Datasets/PaddleNLP:$PYTHONPATH PYTHONPATH=$PYTHONPATH:./ python -m paddle.distributed.launch --gpus 0,1,2,3 tests/test_pipeline_parallel.py diff --git a/llm/llama/tests/test_sequence_parallel.py b/llm/llama/tests/test_sequence_parallel.py deleted file mode 100644 index f46330e85cd5..000000000000 --- a/llm/llama/tests/test_sequence_parallel.py +++ /dev/null @@ -1,118 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import numpy as np -import paddle -import paddle.distributed.fleet as fleet -from paddle.distributed.fleet.meta_parallel.pipeline_parallel import PipelineParallel - -from paddlenlp.transformers import LlamaConfig, LlamaForCausalLM, LlamaForCausalLMPipe - - -class TestLlama(unittest.TestCase): - def test_sequence_model(self): - world_size = paddle.distributed.get_world_size() - pp_degree = world_size - tp_degree = 1 - - if world_size > 2: - pp_degree = 2 - assert world_size % pp_degree == 0 - tp_degree = world_size // pp_degree - - strategy = fleet.DistributedStrategy() - strategy.hybrid_configs = { - "dp_degree": 1, - "mp_degree": tp_degree, - "pp_degree": pp_degree, - "sharding_degree": 1, - } - strategy.pipeline_configs = {"enable_partial_send_recv": False if pp_degree > 1 else True} - fleet.init(is_collective=True, strategy=strategy) - hcg = fleet.get_hybrid_communicate_group() - mp_group = hcg.get_model_parallel_group() - tensor_parallel_rank = mp_group.rank - - if pp_degree > 1: - model_class = LlamaForCausalLMPipe - else: - model_class = LlamaForCausalLM - - # model_name_or_path = "facebook/llama-7b" - model_name_or_path = "__internal_testing__/tiny-random-llama" - - seq_len = 2048 - batch_size = 2 - - config = LlamaConfig.from_pretrained(model_name_or_path) - config.seq_length = seq_len - config.use_flash_attention = False - config.use_fused_rms_norm = False - config.fuse_attention_qkv = False 
- config.recompute_granularity = "full" - config.virtual_pp_degree = 1 - config.use_recompute = False - - config.tensor_parallel_degree = tp_degree - config.tensor_parallel_rank = tensor_parallel_rank - config.tensor_parallel_output = False - config.sequence_parallel = True - - config.fuse_sequence_parallel_allreduce = False - - # hidden_size = 4096 - model = model_class.from_pretrained( - model_name_or_path, - config=config, - dtype="float32", - ) - - model.eval() - - input_ids = paddle.arange(100, 100 + batch_size * seq_len, dtype="int64").reshape([batch_size, seq_len]) - labels = paddle.arange(101, 101 + batch_size * seq_len, dtype="int64").reshape([batch_size, seq_len]) - - attention_mask = None - if pp_degree > 1: - pp_model = PipelineParallel(layers=model, hcg=hcg, strategy=strategy) - pp_model.accumulate_steps = batch_size # for micro_batch_size * acc_steps == batch_size - ret = pp_model.eval_batch(data=[input_ids, labels], compute_loss=True) - else: - # pp_model = PipelineParallel(layers=model, hcg=hcg, strategy=strategy) - # pp_model.data = [input_ids, labels] - # ret = pp_model._forward_step(None) - ret = model(input_ids=input_ids, labels=labels, attention_mask=attention_mask) - ret = ret[0] - - # np.testing.assert_allclose(ret.item(), 10.49988270, atol=1e-7) - print(f"ret mp{tp_degree} pp{pp_degree}", ret.item()) - ret_mp_pp = ret.item() - - single_model = LlamaForCausalLM.from_pretrained(model_name_or_path, config=config) - single_model.eval() - ret = single_model(input_ids=input_ids, labels=labels, attention_mask=attention_mask) - print("ret single", ret[0].item()) - print( - f"diff: {(ret[0].item()- ret_mp_pp)/ret[0].item()}", - ) - np.testing.assert_allclose(ret[0].item(), ret_mp_pp, rtol=1.5e-7) - - -if __name__ == "__main__": - TestLlama().test_sequence_model() - -# CUDA_VISIBLE_DEVICES=2 PYTHONPATH=./ pytest -s -v tests/test_pipeline_parallel.py -# PYTHONPATH=/ssd2/zhonghui03/Datasets/PaddleNLP:$PYTHONPATH PYTHONPATH=$PYTHONPATH:./ python -m 
paddle.distributed.launch --gpus 0,1,2,3 tests/test_pipeline_parallel.py diff --git a/llm/llama/tests/unified-ckpt-llama-500m/config.json b/llm/llama/tests/unified-ckpt-llama-500m/config.json deleted file mode 100644 index 470d93f73fa4..000000000000 --- a/llm/llama/tests/unified-ckpt-llama-500m/config.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "architectures": [ - "LlamaForCausalLM" - ], - "bos_token_id": 1, - "eos_token_id": 2, - "hidden_size": 2048, - "initializer_range": 0.02, - "intermediate_size": 5504, - "max_position_embeddings": 2048, - "model_type": "llama", - "num_attention_heads": 8, - "num_hidden_layers": 8, - "pad_token_id": 0, - "rms_norm_eps": 1e-06, - "vocab_size": 32000 -} diff --git a/llm/merge_lora_params.py b/llm/merge_lora_params.py deleted file mode 100644 index 065a2585ebc0..000000000000 --- a/llm/merge_lora_params.py +++ /dev/null @@ -1,147 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import argparse -import copy -import os - -import paddle - -from paddlenlp.peft import LoRAConfig, LoRAModel - -try: - from paddle.nn.quant import weight_dequantize, weight_quantize -except: - weight_dequantize = None - weight_quantize = None -try: - from paddlenlp.quantization.qlora import qlora_weight_quantize_dequantize -except: - qlora_weight_quantize_dequantize = None - -from paddlenlp.quantization.quantization_config import QuantizationConfig -from paddlenlp.transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer -from paddlenlp.transformers.utils import device_guard -from paddlenlp.utils.env import CONFIG_NAME - - -def parse_arguments(): - parser = argparse.ArgumentParser() - parser.add_argument("--model_name_or_path", default=None, help="The directory of pretrained model.") - parser.add_argument( - "--lora_path", default=None, required=True, help="The directory of LoRA parameters. Default to None" - ) - parser.add_argument( - "--merge_lora_model_path", - default=None, - required=True, - help="The directory of merged parameters. Default to None", - ) - parser.add_argument("--device", type=str, default="gpu", help="Device") - parser.add_argument( - "--low_gpu_mem", type=bool, default=False, help="Whether to use low gpu memory. 
Default to False" - ) - return parser.parse_args() - - -def weight_process(name, quant_config, lora_config, state_dict): - weight = state_dict.pop(name + ".weight").cuda() - if quant_config.weight_quantize_algo is None: - pass - elif quant_config.weight_quantize_algo in ["nf4", "fp4"]: - weight = qlora_weight_quantize_dequantize( - weight, - quant_algo=quant_config.weight_quantize_algo, - double_quant=quant_config.weight_double_quant, - block_size=quant_config.weight_blocksize, - double_quant_block_size=quant_config.weight_double_quant_block_size, - ) - elif quant_config.weight_quantize_algo in ["weight_only_int8"]: - out, scale = weight_quantize(weight, algo=quant_config.weight_quantize_algo) - weight = weight_dequantize(out, scale) - else: - raise ValueError(f"quant_config.weight_quantize_algo {quant_config.weight_quantize_algo} is not supported.") - lora_A = state_dict.pop(name + ".lora_A").cuda() - lora_B = state_dict.pop(name + ".lora_B").cuda() - scaling = lora_config.lora_alpha / lora_config.r - state_dict[name + ".weight"] = (weight + lora_A @ lora_B * scaling).cpu() - - -def merge(): - args = parse_arguments() - paddle.set_device(args.device) - - lora_config = LoRAConfig.from_pretrained(args.lora_path) - if lora_config.base_model_name_or_path is None: - if args.model_name_or_path is not None: - raise ValueError("We can not find a valid model_name_or_path.") - else: - lora_config.base_model_name_or_path = args.model_name_or_path - - if os.path.isfile(os.path.join(args.lora_path, CONFIG_NAME)): - config = AutoConfig.from_pretrained(args.lora_path) - elif args.model_name_or_path is not None: - config = AutoConfig.from_pretrained(args.model_name_or_path) - else: - raise ValueError( - f"We can not find config.json in lora_path: {args.lora_path} or find a valid model_name_or_path." 
- ) - config.dtype = lora_config.dtype - if ( - lora_config.dtype == "bfloat16" or config.quantization_config.weight_quantize_algo in ["nf4", "fp4"] - ) and args.device == "cpu": - raise ValueError("We can not apply bfloat16 or nf4/fp4 lora merge on cpu.") - - if args.low_gpu_mem and args.device == "gpu": - quant_config = copy.deepcopy(config.quantization_config) - config.quantization_config = QuantizationConfig() - lora_config.merge_weights = False - with device_guard(): - model = AutoModelForCausalLM.from_pretrained( - lora_config.base_model_name_or_path, - config=config, - low_cpu_mem_usage=True, - ) - model = LoRAModel.from_pretrained(model=model, lora_path=args.lora_path, lora_config=lora_config) - model.eval() - model_state_dict = model.model.state_dict() - lora_name_list = [] - for key in model_state_dict.keys(): - if "lora_A" in key: - lora_name_list.append(key[:-7]) - for name in lora_name_list: - weight_process(name, quant_config, lora_config, model_state_dict) - else: - model = AutoModelForCausalLM.from_pretrained( - lora_config.base_model_name_or_path, - config=config, - low_cpu_mem_usage=args.low_gpu_mem, - ) - lora_config.merge_weights = True - model = LoRAModel.from_pretrained(model=model, lora_path=args.lora_path, lora_config=lora_config) - model.eval() - model_state_dict = model.model.state_dict() - for key in list(model_state_dict): - if "lora" in key: - del model_state_dict[key] - if "quant" in key: - del model_state_dict[key] - model.model.config.quantization_config = QuantizationConfig() - model.model.save_pretrained(args.merge_lora_model_path, state_dict=model_state_dict) - - tokenizer = AutoTokenizer.from_pretrained(lora_config.base_model_name_or_path) - tokenizer.save_pretrained(args.merge_lora_model_path) - - -if __name__ == "__main__": - merge() diff --git a/llm/llama/npu/export_utils.py b/llm/npu/llama/export_utils.py similarity index 91% rename from llm/llama/npu/export_utils.py rename to llm/npu/llama/export_utils.py index 
db7a1f4ad27f..84bd0018a767 100644 --- a/llm/llama/npu/export_utils.py +++ b/llm/npu/llama/export_utils.py @@ -1,11 +1,11 @@ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -61,23 +61,23 @@ def process_params(model_path): for op in tqdm(block.ops, desc="processing the linear layer for NPU"): if op.type == "matmul_v2": w_name = op.input_arg_names[-1] - if w_name.endswith("qkv_weight") and op.attr("trans_y") == False: + if w_name.endswith("qkv_weight") and not op.attr("trans_y"): op._set_attr("trans_y", True) w = block.var(w_name) trans_weight(w) - elif w_name.endswith("out_proj_weight") and op.attr("trans_y") == False: + elif w_name.endswith("out_proj_weight") and not op.attr("trans_y"): op._set_attr("trans_y", True) w = block.var(w_name) trans_weight(w) - elif w_name.endswith("ffn1_weight") and op.attr("trans_y") == False: + elif w_name.endswith("ffn1_weight") and not op.attr("trans_y"): op._set_attr("trans_y", True) w = block.var(w_name) trans_weight(w) - elif w_name.endswith("ffn2_weight") and op.attr("trans_y") == False: + elif w_name.endswith("ffn2_weight") and not op.attr("trans_y"): op._set_attr("trans_y", True) w = block.var(w_name) trans_weight(w) - elif w_name == "llama_lm_head_0.w_0" and op.attr("trans_y") == False: + elif w_name == "llama_lm_head_0.w_0" and not op.attr("trans_y"): op._set_attr("trans_y", True) w = block.var(w_name) trans_weight(w) diff --git a/llm/llama/npu/llama_npu_opt_lora.sh b/llm/npu/llama/llama_npu_opt_lora.sh similarity index 94% rename from 
llm/llama/npu/llama_npu_opt_lora.sh rename to llm/npu/llama/llama_npu_opt_lora.sh index 6523dbae2321..65992492f587 100644 --- a/llm/llama/npu/llama_npu_opt_lora.sh +++ b/llm/npu/llama/llama_npu_opt_lora.sh @@ -27,12 +27,12 @@ source /usr/local/Ascend/ascend-toolkit/set_env.sh rm -rf lora_bf16_llama_N1C8 rm -rf output/lora_bf16_llama_N1C8 -ps aux | grep "finetune_generation.py" | grep -v grep | awk '{print $2}' | xargs kill -9 +ps aux | grep "run_finetune.py" | grep -v grep | awk '{print $2}' | xargs kill -9 export PYTHONPATH=../../../:$PYTHONPATH python -u -m paddle.distributed.launch \ --devices "0,1,2,3,4,5,6,7" \ --log_dir "./lora_bf16_llama_N1C8" \ - ../../finetune_generation.py \ + ../../run_finetune.py \ --device "npu" \ --model_name_or_path "meta-llama/Llama-2-13b-chat" \ --dataset_name_or_path "data/" \ diff --git a/llm/llama/npu/llama_npu_opt_ppt.sh b/llm/npu/llama/llama_npu_opt_ppt.sh similarity index 100% rename from llm/llama/npu/llama_npu_opt_ppt.sh rename to llm/npu/llama/llama_npu_opt_ppt.sh diff --git a/llm/llama/npu/llama_npu_opt_sft.sh b/llm/npu/llama/llama_npu_opt_sft.sh similarity index 95% rename from llm/llama/npu/llama_npu_opt_sft.sh rename to llm/npu/llama/llama_npu_opt_sft.sh index e0e7e5ccbaea..64ada00e420c 100644 --- a/llm/llama/npu/llama_npu_opt_sft.sh +++ b/llm/npu/llama/llama_npu_opt_sft.sh @@ -33,11 +33,11 @@ export MULTI_STREAM_MEMORY_REUSE=1 export PYTHONPATH=../../../:$PYTHONPATH rm -rf sft_bf16_llama_N1C8 rm -rf output/sft_bf16_llama_N1C8 -ps aux | grep "finetune_generation.py" | grep -v grep | awk '{print $2}' | xargs kill -9 +ps aux | grep "run_finetune.py" | grep -v grep | awk '{print $2}' | xargs kill -9 python -u -m paddle.distributed.launch \ --devices "0,1,2,3,4,5,6,7" \ --log_dir "./sft_bf16_llama_N1C8" \ - ../../finetune_generation.py \ + ../../run_finetune.py \ --device "npu" \ --model_name_or_path "meta-llama/Llama-2-13b" \ --dataset_name_or_path "data/" \ diff --git a/llm/export_model.py b/llm/predict/export_model.py 
similarity index 96% rename from llm/export_model.py rename to llm/predict/export_model.py index 5f5dd30bd97c..ce0a5ed76c02 100644 --- a/llm/export_model.py +++ b/llm/predict/export_model.py @@ -18,9 +18,9 @@ import paddle from paddle.distributed import fleet -from predictor import ModelArgument, PredictorArgument, create_predictor +from predict.predictor import ModelArgument, PredictorArgument, create_predictor from tqdm import tqdm -from utils import generate_rank_mapping, get_infer_model_path +from utils.utils import generate_rank_mapping, get_infer_model_path from paddlenlp.trainer import PdArgumentParser from paddlenlp.utils.log import logger diff --git a/llm/flask_server.py b/llm/predict/flask_server.py similarity index 98% rename from llm/flask_server.py rename to llm/predict/flask_server.py index aba727fa6311..d467d6dac688 100644 --- a/llm/flask_server.py +++ b/llm/predict/flask_server.py @@ -22,7 +22,12 @@ import requests from filelock import FileLock -from predictor import BasePredictor, ModelArgument, PredictorArgument, create_predictor +from predict.predictor import ( + BasePredictor, + ModelArgument, + PredictorArgument, + create_predictor, +) from paddlenlp.trainer import PdArgumentParser from paddlenlp.utils.log import logger diff --git a/llm/gradio_ui.py b/llm/predict/gradio_ui.py similarity index 100% rename from llm/gradio_ui.py rename to llm/predict/gradio_ui.py diff --git a/llm/predictor.py b/llm/predict/predictor.py similarity index 99% rename from llm/predictor.py rename to llm/predict/predictor.py index f8f39577cfb6..262a21fa6a0b 100644 --- a/llm/predictor.py +++ b/llm/predict/predictor.py @@ -28,7 +28,7 @@ import paddle.incubate.multiprocessing as mp from paddle.base.framework import in_cinn_mode, in_pir_executor_mode from paddle.distributed import fleet -from utils import ( +from utils.utils import ( dybatch_preprocess, get_alibi_slopes, get_default_max_decoding_length, diff --git a/llm/request_flask_server.py 
b/llm/predict/request_flask_server.py similarity index 100% rename from llm/request_flask_server.py rename to llm/predict/request_flask_server.py diff --git a/llm/qwen/lora_argument_pissa.json b/llm/qwen/lora_argument_pissa.json deleted file mode 100644 index e3e51eb1bee0..000000000000 --- a/llm/qwen/lora_argument_pissa.json +++ /dev/null @@ -1,33 +0,0 @@ -{ - "model_name_or_path": "qwen/qwen-7b", - "dataset_name_or_path": "./data", - "output_dir": "./checkpoints/qwen_lora_ckpts", - "per_device_train_batch_size": 4, - "gradient_accumulation_steps": 32, - "per_device_eval_batch_size": 8, - "eval_accumulation_steps":16, - "num_train_epochs": 3, - "learning_rate": 2e-05, - "warmup_steps": 10, - "logging_steps": 1, - "evaluation_strategy": "epoch", - "save_strategy": "epoch", - "src_length": 1024, - "max_length": 2048, - "bf16": true, - "fp16_opt_level": "O2", - "do_train": true, - "do_eval": true, - "disable_tqdm": true, - "load_best_model_at_end": true, - "eval_with_do_generation": false, - "metric_for_best_model": "accuracy", - "recompute": true, - "save_total_limit": 1, - "tensor_parallel_degree": 1, - "pipeline_parallel_degree": 1, - "lora": true, - "pissa": true, - "zero_padding": false, - "use_flash_attention": false - } diff --git a/llm/qwen/lora_argument_qwen2_7b.json b/llm/qwen/lora_argument_qwen2_7b.json deleted file mode 100644 index 7cf88075ab87..000000000000 --- a/llm/qwen/lora_argument_qwen2_7b.json +++ /dev/null @@ -1,32 +0,0 @@ -{ - "model_name_or_path": "Qwen/Qwen2-7B", - "dataset_name_or_path": "./data", - "output_dir": "./checkpoints/qwen2_7b__lora_ckpts", - "per_device_train_batch_size": 4, - "gradient_accumulation_steps": 4, - "per_device_eval_batch_size": 8, - "eval_accumulation_steps": 16, - "num_train_epochs": 3, - "learning_rate": 3e-04, - "warmup_steps": 30, - "logging_steps": 1, - "evaluation_strategy": "epoch", - "save_strategy": "epoch", - "src_length": 2048, - "max_length": 4096, - "bf16": true, - "fp16_opt_level": "O2", - "do_train": 
true, - "do_eval": true, - "disable_tqdm": true, - "load_best_model_at_end": true, - "eval_with_do_generation": false, - "metric_for_best_model": "accuracy", - "recompute": true, - "save_total_limit": 1, - "tensor_parallel_degree": 1, - "pipeline_parallel_degree": 1, - "lora": true, - "zero_padding": false, - "use_flash_attention": false -} \ No newline at end of file diff --git a/llm/qwen/lora_argument_qwen2moe.json b/llm/qwen/lora_argument_qwen2moe.json deleted file mode 100644 index 0344e3885ba0..000000000000 --- a/llm/qwen/lora_argument_qwen2moe.json +++ /dev/null @@ -1,32 +0,0 @@ -{ - "model_name_or_path": "Qwen/Qwen1.5-MoE-A2.7B", - "dataset_name_or_path": "./data", - "output_dir": "./checkpoints/qwen2moe_lora_ckpts", - "per_device_train_batch_size": 4, - "gradient_accumulation_steps": 4, - "per_device_eval_batch_size": 8, - "eval_accumulation_steps":16, - "num_train_epochs": 3, - "learning_rate": 3e-04, - "warmup_steps": 30, - "logging_steps": 1, - "evaluation_strategy": "epoch", - "save_strategy": "epoch", - "src_length": 1024, - "max_length": 32768, - "bf16": true, - "fp16_opt_level": "O2", - "do_train": true, - "do_eval": true, - "disable_tqdm": true, - "load_best_model_at_end": true, - "eval_with_do_generation": false, - "metric_for_best_model": "accuracy", - "recompute": true, - "save_total_limit": 1, - "tensor_parallel_degree": 8, - "pipeline_parallel_degree": 1, - "lora": true, - "zero_padding": false, - "use_flash_attention": false - } \ No newline at end of file diff --git a/llm/qwen/pretrain-qwen1.5_7b-tp2sd4_stage2.json b/llm/qwen/pretrain-qwen1.5_7b-tp2sd4_stage2.json deleted file mode 100644 index 5bcdc1158680..000000000000 --- a/llm/qwen/pretrain-qwen1.5_7b-tp2sd4_stage2.json +++ /dev/null @@ -1,41 +0,0 @@ -{ - "model_name_or_path": "Qwen/Qwen1.5-7B", - "tokenizer_name_or_path": "Qwen/Qwen1.5-7B", - "input_dir": "./data", - "output_dir": "./checkpoints/qwen1.5_7b_pretrain_ckpts", - "per_device_train_batch_size": 2, - 
"gradient_accumulation_steps": 8, - "per_device_eval_batch_size": 2, - "tensor_parallel_degree": 2, - "pipeline_parallel_degree": 1, - "sharding_parallel_degree": 4, - "sharding": "stage2", - "virtual_pp_degree": 1, - "sequence_parallel": 0, - "use_flash_attention": true, - "use_fused_rms_norm": true, - "use_fused_rope": true, - "max_seq_length": 4096, - "learning_rate": 3e-05, - "min_learning_rate": 3e-06, - "warmup_steps": 30, - "logging_steps": 1, - "max_steps": 10000, - "save_steps": 5000, - "eval_steps": 1000, - "weight_decay": 0.01, - "bf16": true, - "fp16_opt_level": "O2", - "warmup_ratio": 0.01, - "max_grad_norm": 1.0, - "dataloader_num_workers": 1, - "continue_training": 1, - "do_train": true, - "do_eval": true, - "do_predict": true, - "disable_tqdm": true, - "recompute": true, - "distributed_dataloader": 1, - "recompute_granularity": "full", - "save_total_limit": 2 - } diff --git a/llm/qwen/pretrain-qwen2_7b-tp2sd4_stage2.json b/llm/qwen/pretrain-qwen2_7b-tp2sd4_stage2.json deleted file mode 100644 index d67ddfc01c6a..000000000000 --- a/llm/qwen/pretrain-qwen2_7b-tp2sd4_stage2.json +++ /dev/null @@ -1,41 +0,0 @@ -{ - "model_name_or_path": "Qwen/Qwen2-7B", - "tokenizer_name_or_path": "Qwen/Qwen2-7B", - "input_dir": "./data", - "output_dir": "./checkpoints/qwen2_7b_pretrain_ckpts", - "per_device_train_batch_size": 2, - "gradient_accumulation_steps": 8, - "per_device_eval_batch_size": 2, - "tensor_parallel_degree": 2, - "pipeline_parallel_degree": 1, - "sharding_parallel_degree": 4, - "sharding": "stage2", - "virtual_pp_degree": 1, - "sequence_parallel": 0, - "use_flash_attention": true, - "use_fused_rms_norm": true, - "use_fused_rope": true, - "max_seq_length": 4096, - "learning_rate": 3e-05, - "min_learning_rate": 3e-06, - "warmup_steps": 30, - "logging_steps": 1, - "max_steps": 10000, - "save_steps": 5000, - "eval_steps": 1000, - "weight_decay": 0.01, - "bf16": true, - "fp16_opt_level": "O2", - "warmup_ratio": 0.01, - "max_grad_norm": 1.0, - 
"dataloader_num_workers": 1, - "continue_training": 1, - "do_train": true, - "do_eval": true, - "do_predict": true, - "disable_tqdm": true, - "recompute": false, - "distributed_dataloader": 1, - "recompute_granularity": "full", - "save_total_limit": 2 - } diff --git a/llm/qwen/pretrain-qwen_7b-tp2sd4_stage2.json b/llm/qwen/pretrain-qwen_7b-tp2sd4_stage2.json deleted file mode 100644 index ed2bdb9cf7f3..000000000000 --- a/llm/qwen/pretrain-qwen_7b-tp2sd4_stage2.json +++ /dev/null @@ -1,40 +0,0 @@ -{ - "model_name_or_path": "qwen/qwen-7b", - "tokenizer_name_or_path": "qwen/qwen-7b", - "input_dir": "./data", - "output_dir": "./checkpoints/qwen_pretrain_ckpts", - "per_device_train_batch_size": 2, - "gradient_accumulation_steps": 8, - "per_device_eval_batch_size": 2, - "tensor_parallel_degree": 2, - "pipeline_parallel_degree": 1, - "sharding": "stage2", - "virtual_pp_degree": 1, - "sequence_parallel": 0, - "use_flash_attention": true, - "use_fused_rms_norm": true, - "use_fused_rope": true, - "max_seq_length": 4096, - "learning_rate": 3e-05, - "min_learning_rate": 3e-06, - "warmup_steps": 30, - "logging_steps": 1, - "max_steps": 10000, - "save_steps": 5000, - "eval_steps": 1000, - "weight_decay": 0.01, - "bf16": true, - "fp16_opt_level": "O2", - "warmup_ratio": 0.01, - "max_grad_norm": 1.0, - "dataloader_num_workers": 1, - "continue_training": 1, - "do_train": true, - "do_eval": true, - "do_predict": true, - "disable_tqdm": true, - "recompute": false, - "distributed_dataloader": 1, - "recompute_granularity": "full", - "save_total_limit": 2 - } diff --git a/llm/qwen/pretrain_argument_tp2pp4.json b/llm/qwen/pretrain_argument_tp2pp4.json deleted file mode 100644 index f2272ca3b7c6..000000000000 --- a/llm/qwen/pretrain_argument_tp2pp4.json +++ /dev/null @@ -1,40 +0,0 @@ -{ - "model_name_or_path": "qwen/qwen-7b", - "tokenizer_name_or_path": "qwen/qwen-7b", - "input_dir": "./data", - "output_dir": "./checkpoints/qwen_pretrain_ckpts", - "per_device_train_batch_size": 1, - 
"gradient_accumulation_steps": 16, - "per_device_eval_batch_size": 16, - "tensor_parallel_degree": 2, - "pipeline_parallel_degree": 4, - "sharding": "stage1", - "virtual_pp_degree": 1, - "sequence_parallel": 0, - "use_flash_attention": true, - "use_fused_rms_norm": true, - "use_fused_rope": true, - "max_seq_length": 4096, - "learning_rate": 3e-05, - "min_learning_rate": 3e-06, - "warmup_steps": 30, - "logging_steps": 1, - "max_steps": 10000, - "save_steps": 5000, - "eval_steps": 1000, - "weight_decay": 0.01, - "bf16": true, - "fp16_opt_level": "O2", - "warmup_ratio": 0.01, - "max_grad_norm": 1.0, - "dataloader_num_workers": 1, - "continue_training": 1, - "do_train": true, - "do_eval": true, - "do_predict": true, - "disable_tqdm": true, - "recompute": true, - "distributed_dataloader": 1, - "recompute_granularity": "full", - "save_total_limit": 2 - } diff --git a/llm/qwen/pt_argument_qwen2_7b.json b/llm/qwen/pt_argument_qwen2_7b.json deleted file mode 100644 index 5d7c85c32503..000000000000 --- a/llm/qwen/pt_argument_qwen2_7b.json +++ /dev/null @@ -1,33 +0,0 @@ -{ - "model_name_or_path": "Qwen/Qwen2-7B", - "dataset_name_or_path": "./data", - "output_dir": "./checkpoints/qwen2_7b_pt_ckpts", - "per_device_train_batch_size": 4, - "gradient_accumulation_steps": 4, - "per_device_eval_batch_size": 8, - "eval_accumulation_steps": 16, - "num_train_epochs": 3, - "learning_rate": 3e-02, - "warmup_steps": 30, - "logging_steps": 1, - "evaluation_strategy": "epoch", - "save_strategy": "epoch", - "src_length": 2048, - "max_length": 4096, - "bf16": true, - "fp16_opt_level": "O2", - "do_train": true, - "do_eval": true, - "disable_tqdm": true, - "load_best_model_at_end": true, - "eval_with_do_generation": false, - "metric_for_best_model": "accuracy", - "recompute": true, - "save_total_limit": 1, - "tensor_parallel_degree": 1, - "pipeline_parallel_degree": 1, - "prefix_tuning": true, - "zero_padding": false, - "use_flash_attention": false - } - \ No newline at end of file diff --git 
a/llm/qwen/sft_argument_qwen2_7b.json b/llm/qwen/sft_argument_qwen2_7b.json deleted file mode 100644 index 70822bcc04f9..000000000000 --- a/llm/qwen/sft_argument_qwen2_7b.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "model_name_or_path": "Qwen/Qwen2-7B", - "dataset_name_or_path": "./data", - "output_dir": "./checkpoints/qwen2-7b_sft_ckpts", - "per_device_train_batch_size": 1, - "gradient_accumulation_steps": 4, - "per_device_eval_batch_size": 8, - "eval_accumulation_steps":16, - "num_train_epochs": 3, - "learning_rate": 3e-05, - "warmup_steps": 30, - "logging_steps": 1, - "evaluation_strategy": "epoch", - "save_strategy": "epoch", - "src_length": 2048, - "max_length": 4096, - "bf16": true, - "fp16_opt_level": "O2", - "do_train": true, - "do_eval": true, - "disable_tqdm": true, - "load_best_model_at_end": true, - "eval_with_do_generation": false, - "metric_for_best_model": "accuracy", - "recompute": true, - "save_total_limit": 1, - "tensor_parallel_degree": 4, - "pipeline_parallel_degree": 1, - "zero_padding": false, - "use_flash_attention": false - } diff --git a/llm/qwen/sft_argument_qwen2moe.json b/llm/qwen/sft_argument_qwen2moe.json deleted file mode 100644 index 75d3a93500f5..000000000000 --- a/llm/qwen/sft_argument_qwen2moe.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "model_name_or_path": "Qwen/Qwen1.5-MoE-A2.7B", - "dataset_name_or_path": "./data", - "output_dir": "./checkpoints/qwen2moe_sft_ckpts", - "per_device_train_batch_size": 4, - "gradient_accumulation_steps": 4, - "per_device_eval_batch_size": 8, - "eval_accumulation_steps":16, - "num_train_epochs": 3, - "learning_rate": 3e-05, - "warmup_steps": 30, - "logging_steps": 1, - "evaluation_strategy": "epoch", - "save_strategy": "epoch", - "src_length": 1024, - "max_length": 32768, - "bf16": true, - "fp16_opt_level": "O2", - "do_train": true, - "do_eval": true, - "disable_tqdm": true, - "load_best_model_at_end": true, - "eval_with_do_generation": false, - "metric_for_best_model": "accuracy", - "recompute": true, - 
"save_total_limit": 1, - "tensor_parallel_degree": 8, - "sharding": "stage2", - "pipeline_parallel_degree": 1 -} \ No newline at end of file diff --git a/llm/finetune_generation.py b/llm/run_finetune.py similarity index 95% rename from llm/finetune_generation.py rename to llm/run_finetune.py index 8c72c553fb70..683bd871946b 100644 --- a/llm/finetune_generation.py +++ b/llm/run_finetune.py @@ -14,20 +14,18 @@ import json import os import sys -from dataclasses import dataclass, field from functools import partial -from typing import Optional import paddle -from argument import ( +from utils.argument import ( DataArgument, GenerateArgument, ModelArgument, QuantArgument, TrainingArguments, ) -from data import get_convert_example -from utils import ( +from utils.data import get_convert_example +from utils.utils import ( CausalLMTrainer, ZeroPaddingIterDatasetCallback, compute_metrics, @@ -54,44 +52,16 @@ Llama3Tokenizer, LlamaTokenizer, ) -from paddlenlp.transformers.configuration_utils import LlmMetaConfig, llmmetaclass +from paddlenlp.transformers.configuration_utils import LlmMetaConfig from paddlenlp.utils.log import logger # Fine-tune Environment Variables to support sharding stage1 overlap optimization. 
os.environ["USE_CASUAL_MASK"] = "False" -def add_start_docstrings(*docstr): - def docstring_decorator(fn): - fn.__doc__ = "".join(docstr) + (fn.__doc__ if fn.__doc__ is not None else "") - return fn - - return docstring_decorator - - -@dataclass -@llmmetaclass -@add_start_docstrings(TrainingArguments.__doc__) -class FinetuneArguments(TrainingArguments): - decay_steps: int = field( - default=0, - metadata={"help": "The steps use to control the learing rate."}, - ) - tensor_parallel_output: Optional[bool] = field( - default=False, - metadata={"help": "whether to output logits in distributed status"}, - ) - - -def read_local_dataset(path): - with open(path, "r", encoding="utf-8") as fp: - for line in fp: - yield json.loads(line.strip()) - - def main(): # Arguments - parser = PdArgumentParser((GenerateArgument, QuantArgument, ModelArgument, DataArgument, FinetuneArguments)) + parser = PdArgumentParser((GenerateArgument, QuantArgument, ModelArgument, DataArgument, TrainingArguments)) # Support format as "args.json --arg1 value1 --arg2 value2.” # In case of conflict, command line arguments take precedence. 
if len(sys.argv) >= 2 and sys.argv[1].endswith(".json"): @@ -161,6 +131,8 @@ def main(): model_config.hidden_dropout_prob = model_args.hidden_dropout_prob if hasattr(model_config, "attention_probs_dropout_prob"): model_config.attention_probs_dropout_prob = model_args.attention_probs_dropout_prob + if hasattr(model_config, "ignore_index"): + model_config.ignore_index = -100 if model_args.fuse_attention_qkv is not None: model_config.fuse_attention_qkv = model_args.fuse_attention_qkv @@ -169,7 +141,7 @@ def main(): model_config.seq_length = data_args.max_length - print("Final model config:", model_config) + logger.info(f"Final model config: {model_config}") model_class = AutoModelForCausalLM if training_args.pipeline_parallel_degree > 1: @@ -342,7 +314,7 @@ def neft_post_hook(module, input, output): train_ds = train_ds.skip(consumed_samples) if training_args.pipeline_parallel_degree > 1: - from data import convert_example_common + from utils.data import convert_example_common trans_func = partial(convert_example_common, tokenizer=tokenizer, data_args=data_args) else: @@ -584,7 +556,7 @@ def compute_metrics_do_generation(eval_preds): # QAT if quant_args.do_qat: - from quant import create_qat_model + from utils.quant import create_qat_model trainer.model = create_qat_model(quant_args, trainer.model, dtype) train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint) @@ -599,7 +571,7 @@ def compute_metrics_do_generation(eval_preds): raise NotImplementedError( "PTQ strategy not supported for LoRA model. Please merge lora parameters to pretrain model first." ) - from quant import ( + from utils.quant import ( apply_autoclip, apply_ptq, apply_shift, @@ -635,7 +607,7 @@ def compute_metrics_do_generation(eval_preds): raise NotImplementedError( "PTQ strategy not supported for LoRA model. Please merge lora parameters to pretrain model first." 
) - from quant import apply_gptq + from utils.quant import apply_gptq ptq_dataloader = trainer.get_ptq_dataloader(ptq_ds) apply_gptq(quant_args, trainer, ptq_dataloader) diff --git a/llm/run_pretrain.py b/llm/run_pretrain.py index 04a6fc22dc24..12364e47118f 100644 --- a/llm/run_pretrain.py +++ b/llm/run_pretrain.py @@ -85,6 +85,14 @@ class PreTrainingArguments(TrainingArguments): default=False, metadata={"help": "Weather to run benchmark by autotuner. True for from_scratch and pad_max_length."}, ) + unified_checkpoint: bool = field( + default=False, + metadata={"help": "Enable fused linear grad add strategy."}, + ) + unified_checkpoint_config: Optional[str] = field( + default="", + metadata={"help": "Configs to unify hybrid parallel checkpoint.\n"}, + ) def __post_init__(self): super().__post_init__() @@ -360,7 +368,7 @@ def main(): training_args.no_recompute_layers.sort() if training_args.enable_linear_fused_grad_add: - from fused_layers import mock_layers + from utils.fused_layers import mock_layers mock_layers() @@ -473,7 +481,7 @@ def main(): model_class = AutoModelForCausalLMPipe if "LLama" in str(config.architectures): try: - from register_reshard import register_pp_reshard_information + from utils.register_reshard import register_pp_reshard_information register_pp_reshard_information(config.num_hidden_layers) except: diff --git a/llm/tests/test_best_pretrain_speed.py b/llm/tests/test_best_pretrain_speed.py deleted file mode 100644 index 79f6ea455a5c..000000000000 --- a/llm/tests/test_best_pretrain_speed.py +++ /dev/null @@ -1,266 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# import copy -import json -import os -import shutil - -# import numpy as np -from llama.tests.parallel_launch import TestMultipleGpus - -# export NVIDIA_TF32_OVERRIDE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=xgbe0 -# export NCCL_IB_TIMEOUT=22 -# export NCCL_DEBUG=INFO -# export NCCL_IB_DISABLE=1 -# export NCCL_IB_GDR_LEVEL=4 -# export NCCL_SOCKET_IFNAME=eth2 - - -environment_variables = { - # "NCCL_ALGO": "Tree", - # "NVIDIA_TF32_OVERRIDE": "0", - "NCCL_IB_TIMEOUT": "22", - # "NCCL_DEBUG": "INFO", - # "FLAGS_embedding_deterministic": "1", - # "FLAGS_cudnn_deterministic": "1", - # "Flags_mp_aysnc_allreduce": "1", - # "Flags_skip_mp_c_identity": "1", - # "FLAGS_shard_norm_align_dp": "0", - # "FLAGS_shard_use_reduce": "1", - "test_ci_no_save_model": "1", -} - -pretrain_arguments = { - "learning_rate": 1e-04, - "min_learning_rate": 1e-05, - "warmup_steps": 100, - "logging_steps": 1, - "max_steps": 10, - "save_steps": 2000, - "eval_steps": 1000, - "continue_training": 0, - "skip_memory_metrics": 0, - "do_train": "true", - "do_eval": "false", - "do_predict": "false", - "disable_tqdm": "true", - "save_total_limit": 2, -} - -best_pretrain_config_for_a100_80g = { - # "qwen/qwen-7b": "./qwen/pretrain_argument_stage2.json", - # "baichuan-inc/Baichuan2-13B-Base" "./llama/pretrain-baichuan2_13b-tp4sd2_stage2.json", - # "baichuan-inc/Baichuan2-13B-Base": "./llama/pretrain-baichuan2_13b-tp2sd4_stage2.json", - "facebook/llama-7b": "./llama/pretrain-llama_7b-tp2sd4_stage2.json", - "facebook/llama-13b": "./llama/pretrain-llama_13b-tp2sd4_stage2.json", 
- "meta-llama/Llama-2-7b": "./llama/pretrain-llama2_7b-tp2sd4_stage2.json", - "meta-llama/Llama-2-13b": "./llama/pretrain-llama2_13b-tp2sd4_stage2.json", - "qwen/qwen-7b": "./qwen/pretrain-qwen_7b-tp2sd4_stage2.json", - "baichuan-inc/Baichuan2-13B-Base": "./baichuan/pretrain-baichuan2_13b-sd8_stage2.json", - "baichuan-inc/Baichuan2-7B-Base": "./baichuan/pretrain-baichuan2_7b-tp2sd4_stage2.json", - "FlagAlpha/Llama2-Chinese-13b-Chat": "./llama/pretrain-flagalpha_llama2_13b-tp2sd4_stage2.json", - "FlagAlpha/Llama2-Chinese-7b-Chat": "./llama/pretrain-flagalpha_llama2_7b-tp2sd4_stage2.json", - "linly-ai/chinese-llama-2-7b": "./llama/pretrain-linly_llama2_7b-tp2sd4_stage2.json", - "idea-ccnl/ziya-llama-13b-v1": "./llama/pretrain-ziya_llama_13b-tp2sd4_stage2.json", -} - - -def log_test_result(model_name_or_path, config_name, config, log_dir="log"): - model_name_or_path = model_name_or_path - max_seq_len = config["max_seq_length"] - distribued_info = config_name.split("b-")[-1].split(".json")[0] - speed = "NA" - memory = "NA" - config_name = config_name - time = "NA" - - file_path = os.path.join(log_dir, "workerlog.n0.c0") - - get_memory_cmd = ( - "grep -aE 'gpu_mem_max_memory_reserved ' " + file_path + " | awk '{print $8}' | awk -F '\x1b' '{print $1}'" - ) - get_time_cmd = ( - "grep -aE 'gpu_mem_max_memory_reserved ' " - + file_path - + " | awk -F '[' '{print $3}' | awk -F ',' '{print $1}'" - ) - get_ips_cmd = "grep -aE 'global_step: ' " + file_path + " | awk -F ',' '{print $6}' | awk '{print $2}' " - - import subprocess - - res = subprocess.check_output(get_memory_cmd, shell=True, text=True) - if "MB" in res: - memory = res.strip() - - res = subprocess.check_output(get_time_cmd, shell=True, text=True) - if len(res) > 0: - time = res.strip() - - res = subprocess.check_output(get_ips_cmd, shell=True, text=True) - ips = [float(x) for x in res.strip().split()] - if len(ips) > 4: - ips = sum(ips[2:-2]) / (len(ips) - 4) - speed = round(ips * max_seq_len / 8, 2) - - 
write_result( - [ - f"`{model_name_or_path}`", - max_seq_len, - f"`{distribued_info}`", - speed, - memory, - f"`{config_name}`", - time, - ] - ) - - return res - - -result_title = r"""| 模型 | 序列长度 | 分布式策略 | 速度(`tokens/card/sec`) | 显存占用(`MB^1`) | 配置文件| 测试时间 |""" -result_file_name = "results_of_best_pretrain_config_for_a100_80g.md" - - -def write_result(res): - fileds_name = [x.strip() for x in result_title.split("|")[1:-1]] - assert len(fileds_name) == len(res) - - def format_list_to_str(lst): - content = "|".join([""] + ["{:10}".format(x) for x in lst] + [""]) - return content - - if not os.path.exists(result_file_name): - with open(result_file_name, "w") as f: - f.write(format_list_to_str(fileds_name) + "\n") - f.write(format_list_to_str([" :-: "] * len(fileds_name)) + "\n") - - with open(result_file_name, "a+") as f: - f.write(format_list_to_str(res) + "\n") - - -def remove_logs(log_dir="log"): - if os.path.exists(log_dir): - shutil.rmtree(log_dir) - - -def remove_ckpt(ckpt_dir): - if os.path.exists(ckpt_dir): - shutil.rmtree(ckpt_dir) - - -class TestModelOnN1C8(TestMultipleGpus): - def setUp(self): - os.environ.update(environment_variables) - - def test_facebook_llama_7b(self): - name = "facebook/llama-7b" - arguments = json.load(open(best_pretrain_config_for_a100_80g[name], "r")) - arguments.update(pretrain_arguments) - remove_logs() - remove_ckpt(arguments["output_dir"]) - self.run_n1c8("run_pretrain.py", **arguments) - log_test_result(name, best_pretrain_config_for_a100_80g[name], arguments, log_dir="log") - - def test_facebook_llama_13b(self): - name = "facebook/llama-13b" - arguments = json.load(open(best_pretrain_config_for_a100_80g[name], "r")) - arguments.update(pretrain_arguments) - remove_logs() - remove_ckpt(arguments["output_dir"]) - self.run_n1c8("run_pretrain.py", **arguments) - log_test_result(name, best_pretrain_config_for_a100_80g[name], arguments, log_dir="log") - - def test_metallama_Llama2_7b(self): - name = "meta-llama/Llama-2-7b" - arguments 
= json.load(open(best_pretrain_config_for_a100_80g[name], "r")) - arguments.update(pretrain_arguments) - remove_logs() - remove_ckpt(arguments["output_dir"]) - self.run_n1c8("run_pretrain.py", **arguments) - log_test_result(name, best_pretrain_config_for_a100_80g[name], arguments, log_dir="log") - - def test_metallama_Llama2_13b(self): - name = "meta-llama/Llama-2-13b" - arguments = json.load(open(best_pretrain_config_for_a100_80g[name], "r")) - arguments.update(pretrain_arguments) - remove_logs() - remove_ckpt(arguments["output_dir"]) - self.run_n1c8("run_pretrain.py", **arguments) - log_test_result(name, best_pretrain_config_for_a100_80g[name], arguments, log_dir="log") - - def test_qwen_qwen_7b(self): - name = "qwen/qwen-7b" - arguments = json.load(open(best_pretrain_config_for_a100_80g[name], "r")) - arguments.update(pretrain_arguments) - remove_logs() - remove_ckpt(arguments["output_dir"]) - self.run_n1c8("run_pretrain.py", **arguments) - log_test_result(name, best_pretrain_config_for_a100_80g[name], arguments, log_dir="log") - - def test_baichuan_Baichuan2_13B_Base(self): - name = "baichuan-inc/Baichuan2-13B-Base" - arguments = json.load(open(best_pretrain_config_for_a100_80g[name], "r")) - arguments.update(pretrain_arguments) - remove_logs() - remove_ckpt(arguments["output_dir"]) - self.run_n1c8("run_pretrain.py", **arguments) - log_test_result(name, best_pretrain_config_for_a100_80g[name], arguments, log_dir="log") - - def test_baichuan_Baichuan2_7B_Base(self): - name = "baichuan-inc/Baichuan2-7B-Base" - arguments = json.load(open(best_pretrain_config_for_a100_80g[name], "r")) - arguments.update(pretrain_arguments) - remove_logs() - remove_ckpt(arguments["output_dir"]) - self.run_n1c8("run_pretrain.py", **arguments) - log_test_result(name, best_pretrain_config_for_a100_80g[name], arguments, log_dir="log") - - def test_FlagAlpha_Llama2Chinese_13b_Chat(self): - name = "FlagAlpha/Llama2-Chinese-13b-Chat" - arguments = 
json.load(open(best_pretrain_config_for_a100_80g[name], "r")) - arguments.update(pretrain_arguments) - remove_logs() - remove_ckpt(arguments["output_dir"]) - self.run_n1c8("run_pretrain.py", **arguments) - log_test_result(name, best_pretrain_config_for_a100_80g[name], arguments, log_dir="log") - - def test_FlagAlpha_Llama2Chinese_7b_Chat(self): - name = "FlagAlpha/Llama2-Chinese-7b-Chat" - arguments = json.load(open(best_pretrain_config_for_a100_80g[name], "r")) - arguments.update(pretrain_arguments) - remove_logs() - remove_ckpt(arguments["output_dir"]) - self.run_n1c8("run_pretrain.py", **arguments) - log_test_result(name, best_pretrain_config_for_a100_80g[name], arguments, log_dir="log") - - def test_linlyai_chinesellama2_7b(self): - name = "linly-ai/chinese-llama-2-7b" - arguments = json.load(open(best_pretrain_config_for_a100_80g[name], "r")) - arguments.update(pretrain_arguments) - remove_logs() - remove_ckpt(arguments["output_dir"]) - self.run_n1c8("run_pretrain.py", **arguments) - log_test_result(name, best_pretrain_config_for_a100_80g[name], arguments, log_dir="log") - - def test_ideaccnl_ziyallama_13b(self): - name = "idea-ccnl/ziya-llama-13b-v1" - arguments = json.load(open(best_pretrain_config_for_a100_80g[name], "r")) - arguments.update(pretrain_arguments) - remove_logs() - remove_ckpt(arguments["output_dir"]) - self.run_n1c8("run_pretrain.py", **arguments) - log_test_result(name, best_pretrain_config_for_a100_80g[name], arguments, log_dir="log") diff --git a/llm/tools/merge_lora_params.py b/llm/tools/merge_lora_params.py new file mode 100644 index 000000000000..06d2e2d7a9bd --- /dev/null +++ b/llm/tools/merge_lora_params.py @@ -0,0 +1,222 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +import copy +import math +import os + +import numpy as np +import paddle +from paddle.nn.quant import weight_dequantize + +from paddlenlp.peft import LoRAConfig, LoRAModel + +try: + from paddlenlp.quantization.qlora import qlora_weight_quantize_dequantize + from paddlenlp.quantization.quantization_config import QuantizationConfig + from paddlenlp.quantization.quantization_linear import QuantizationLinear +except: + pass + +from paddlenlp.trainer.argparser import strtobool +from paddlenlp.transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer +from paddlenlp.transformers.utils import device_guard +from paddlenlp.utils.env import CONFIG_NAME +from paddlenlp.utils.log import logger + + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument("--model_name_or_path", default=None, required=True, type=str, help="The directory of model.") + parser.add_argument( + "--lora_path", default=None, type=str, required=True, help="The directory of LoRA parameters. 
Default to None" + ) + parser.add_argument("--output_path", default=None, type=str, required=True, help="The directory of saved model ") + parser.add_argument("--safe_serialization", default="False", type=strtobool, help="Whether save as safetensor.") + parser.add_argument( + "--device", + type=str, + default="gpu", + choices=["gpu", "npu", "cpu"], + help="Device for selecting for merging lora weights, currently only supports gpu/npu/cpu.", + ) + return parser.parse_args() + + +def weight_process(name, quant_config, lora_config, state_dict, device): + target_device = device if device == "cpu" else device + ":0" + + if (name + ".weight") not in state_dict.keys(): + return + + if quant_config.weight_quantize_algo is None: + return + elif quant_config.weight_quantize_algo in ["nf4", "fp4"]: + weight = state_dict.pop(name + ".weight").to(target_device) + state_dict[name + ".weight"] = qlora_weight_quantize_dequantize( + weight, + quant_algo=quant_config.weight_quantize_algo, + double_quant=quant_config.weight_double_quant, + block_size=quant_config.weight_blocksize, + double_quant_block_size=quant_config.weight_double_quant_block_size, + ).cpu() + elif quant_config.weight_quantize_algo in ["weight_only_int8"]: + quant_weight = state_dict.pop(name + ".quant_weight").to(target_device) + quant_scale = state_dict.pop(name + ".quant_scale").to(target_device) + state_dict[name + ".weight"] = weight_dequantize(quant_weight, quant_scale, out_dtype=lora_config.dtype).cpu() + else: + raise ValueError(f"quant_config.weight_quantize_algo {quant_config.weight_quantize_algo} is not supported.") + + +def lora_process(name, lora_config, state_dict, device, lora_state_dict=None): + target_device = device if device == "cpu" else device + ":0" + + if (name + ".weight") not in state_dict.keys(): + return + + weight = state_dict.pop(name + ".weight") + if lora_state_dict is None: + lora_A = state_dict.pop(name + ".lora_A") + lora_B = state_dict.pop(name + ".lora_B") + else: + lora_A = 
lora_state_dict.pop(name + ".lora_A") + lora_B = lora_state_dict.pop(name + ".lora_B") + if device != "cpu": + weight = weight.to(target_device) + lora_A = lora_A.to(target_device) + lora_B = lora_B.to(target_device) + if not lora_config.rslora: + scaling = lora_config.lora_alpha / lora_config.r + else: + scaling = lora_config.lora_alpha / math.sqrt(lora_config.r) + + if device == "cpu" and weight.dtype.name == "BF16": + weight = weight.astype("float32") + lora_A = lora_A.astype("float32") + lora_B = lora_B.astype("float32") + out = (weight + lora_A @ lora_B * scaling).astype("bfloat16") + else: + out = (weight + lora_A @ lora_B * scaling).cpu() + + state_dict[name + ".weight"] = out + + +def merge_old_lora(lora_config, args): + lora_config.merge_weight = True + model = AutoModelForCausalLM.from_pretrained( + args.model_name_or_path, + dtype=lora_config.dtype, + ) + model = LoRAModel.from_pretrained(model, args.lora_path) + model.eval() + model_state_dict = model.model.state_dict() + for key in list(model_state_dict): + if "lora" in key: + del model_state_dict[key] + return model, model_state_dict + + +def read_file(file_name): + if file_name.endswith("safetensors"): + try: + from paddlenlp.utils.safetensors import fast_load_file as load_file + except: + from safetensors.numpy import load_file + + read_tensors = load_file(file_name) + for key in list(read_tensors.keys()): + if isinstance(read_tensors[key], np.ndarray): + with device_guard("cpu"): + read_tensors[key] = paddle.Tensor(read_tensors.pop(key), zero_copy=True) + else: + with device_guard("cpu"): + read_tensors = paddle.load(file_name) + return read_tensors + + +def save_file(output_path, file_name, tensors, safe_serialization=True): + if safe_serialization: + from safetensors.numpy import save_file as _save_file + + if file_name == "model_state.pdparams": + file_name = "model.safetensors" + + for key in list(tensors.keys()): + if isinstance(tensors[key], paddle.Tensor): + tensors[key] = 
tensors.pop(key).cpu().numpy() + _save_file(tensors, os.path.join(output_path, file_name), metadata={"format": "np"}) + else: + paddle.save(tensors, os.path.join(output_path, file_name)) + + +def merge(): + args = parse_arguments() + paddle.set_device(args.device) + + lora_config = LoRAConfig.from_pretrained(args.lora_path) + if os.path.isfile(os.path.join(args.lora_path, CONFIG_NAME)): + config = AutoConfig.from_pretrained(args.lora_path) + elif args.model_name_or_path is not None: + config = AutoConfig.from_pretrained(args.model_name_or_path) + else: + raise ValueError( + f"We can not find config.json in lora_path: {args.lora_path} or find a valid model_name_or_path." + ) + config.dtype = lora_config.dtype + quant_config = copy.deepcopy(config.quantization_config) + lora_config.merge_weights = False + tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path) + tokenizer.save_pretrained(args.output_path) + + if lora_config.enable_lora_list is not None: + model, model_state_dict = merge_old_lora(lora_config, args) + else: + if quant_config.weight_quantize_algo in ["nf4", "fp4"]: + config.quantization_config = QuantizationConfig() + with device_guard(args.device): + model = AutoModelForCausalLM.from_pretrained( + args.model_name_or_path, + config=config, + low_cpu_mem_usage=True, + ) + logger.info("load model done") + model = LoRAModel.from_pretrained(model=model, lora_path=args.lora_path, lora_config=lora_config) + logger.info("load lora model done") + if quant_config.weight_quantize_algo in ["weight_only_int8"]: + model.config.quantization_config = QuantizationConfig() + model.eval() + model_state_dict = model.model.state_dict() + if quant_config.weight_quantize_algo in ["nf4", "fp4", "weight_only_int8"]: + for name, layer in model.model.named_sublayers(): + if isinstance(layer, paddle.nn.Linear) or isinstance(layer, QuantizationLinear): + weight_process(name, quant_config, lora_config, model_state_dict, args.device) + + lora_name_list = [] + for key in 
model_state_dict.keys(): + if "lora_A" in key: + lora_name_list.append(key[:-7]) + for name in lora_name_list: + lora_process(name, lora_config, model_state_dict, args.device) + + logger.info("Begin to save merged model") + if args.safe_serialization: + model.model.save_pretrained( + args.output_path, state_dict=model_state_dict, safe_serialization=args.safe_serialization + ) + else: + model.model.save_pretrained(args.output_path, state_dict=model_state_dict, max_shard_size="100GB") + + +if __name__ == "__main__": + merge() diff --git a/llm/merge_tp_and_pp_params.py b/llm/tools/merge_tp_and_pp_params.py similarity index 100% rename from llm/merge_tp_and_pp_params.py rename to llm/tools/merge_tp_and_pp_params.py diff --git a/llm/argument.py b/llm/utils/argument.py similarity index 89% rename from llm/argument.py rename to llm/utils/argument.py index 79ce3fe4df16..67ad7c5dbe2a 100644 --- a/llm/argument.py +++ b/llm/utils/argument.py @@ -16,10 +16,21 @@ from paddlenlp.trainer import TrainingArguments from paddlenlp.trainer.trainer_utils import IntervalStrategy +from paddlenlp.transformers.configuration_utils import llmmetaclass from paddlenlp.utils.log import logger +def add_start_docstrings(*docstr): + def docstring_decorator(fn): + fn.__doc__ = "".join(docstr) + (fn.__doc__ if fn.__doc__ is not None else "") + return fn + + return docstring_decorator + + @dataclass +@llmmetaclass +@add_start_docstrings(TrainingArguments.__doc__) class TrainingArguments(TrainingArguments): benchmark: bool = field(default=False, metadata={"help": "Whether runs benchmark"}) # NOTE(gongenlei): new add autotuner_benchmark @@ -27,6 +38,22 @@ class TrainingArguments(TrainingArguments): default=False, metadata={"help": "Weather to run benchmark by autotuner. 
True for from_scratch and pad_max_length."}, ) + decay_steps: int = field( + default=0, + metadata={"help": "The steps use to control the learing rate."}, + ) + tensor_parallel_output: Optional[bool] = field( + default=False, + metadata={"help": "whether to output logits in distributed status"}, + ) + unified_checkpoint: bool = field( + default=False, + metadata={"help": "Unify hybrid parallel checkpoint."}, + ) + unified_checkpoint_config: Optional[str] = field( + default="", + metadata={"help": "Configs to unify hybrid parallel checkpoint.\n"}, + ) def __post_init__(self): super().__post_init__() @@ -42,6 +69,16 @@ def __post_init__(self): self.report_to = [] self.save_strategy = IntervalStrategy.NO self.evaluation_strategy = IntervalStrategy.NO + if self.benchmark: + self.do_train = True + self.do_export = False + self.do_predict = False + self.do_eval = False + self.overwrite_output_dir = True + self.load_best_model_at_end = False + self.report_to = [] + self.save_strategy = IntervalStrategy.NO + self.evaluation_strategy = IntervalStrategy.NO @dataclass diff --git a/llm/data.py b/llm/utils/data.py similarity index 99% rename from llm/data.py rename to llm/utils/data.py index 8000bd598455..eabac7456cbe 100644 --- a/llm/data.py +++ b/llm/utils/data.py @@ -53,6 +53,7 @@ def get_convert_example(model): "gemma", "qwen2", "qwen2_moe", + "gpt", ]: return convert_example_common else: diff --git a/llm/llama/fused_layers.py b/llm/utils/fused_layers.py similarity index 100% rename from llm/llama/fused_layers.py rename to llm/utils/fused_layers.py diff --git a/llm/quant.py b/llm/utils/quant.py similarity index 100% rename from llm/quant.py rename to llm/utils/quant.py diff --git a/llm/llama/register_reshard.py b/llm/utils/register_reshard.py similarity index 100% rename from llm/llama/register_reshard.py rename to llm/utils/register_reshard.py diff --git a/llm/utils.py b/llm/utils/utils.py similarity index 99% rename from llm/utils.py rename to llm/utils/utils.py index 
10c27a0b0594..2f51711b496b 100644 --- a/llm/utils.py +++ b/llm/utils/utils.py @@ -125,6 +125,16 @@ def get_lora_target_modules(model): ".*dense_h_to_4h.*", ".*dense_4h_to_h.*", ] + elif model.base_model_prefix == "gpt": + target_modules = [ + ".*qkv_proj.*", + ".*q_proj.*", + ".*k_proj.*", + ".*v_proj.*", + ".*linear1.*", + ".*linear2.*", + ".*out_proj.*", + ] elif model.base_model_prefix == "bloom": target_modules = [".*query_key_value.*", ".*dense.*", ".*dense_h_to_4h.*", ".*dense_4h_to_h.*"] elif model.base_model_prefix == "llama" or isinstance(model, LlamaForCausalLMPipe): diff --git a/scripts/ci_approval/run_ci_approval.sh b/scripts/ci_approval/run_ci_approval.sh index bc55cc58d5df..328834caba99 100644 --- a/scripts/ci_approval/run_ci_approval.sh +++ b/scripts/ci_approval/run_ci_approval.sh @@ -40,7 +40,7 @@ for file_name in `git diff --numstat upstream/${AGILE_COMPILE_BRANCH} |awk '{pri dir3=${arr_file_name[2]} dir4=${arr_file_name[3]} echo "file_name:"${file_name}, "dir1:"${dir1}, "dir2:"${dir2},"dir3:"${dir3},".xx:" ${file_name##*.} - if [[ ${file_name} =~ "paddlenlp/trainer/training_args.py" ]] || [[ ${file_name} =~ "paddlenlp/trainer/trainer.py" ]] || [[ ${file_name} =~ "llm/run_pretrain.py" ]] || [[ ${file_name} =~ "llm/finetune_generation.py" ]];then + if [[ ${file_name} =~ "paddlenlp/trainer/training_args.py" ]] || [[ ${file_name} =~ "paddlenlp/trainer/trainer.py" ]] || [[ ${file_name} =~ "llm/run_pretrain.py" ]] || [[ ${file_name} =~ "llm/run_finetune.py" ]];then echo_line="You must have two RD: one from(ZHUI, wawltor),one from(ForFishes,sneaxiy,zhiqiu) approval for the changes of training_args.py/trainer.py/run_pretrain.py " check_approval 2 ZHUI wawltor ForFishes sneaxiy zhiqiu elif [[ ${dir1} =~ "paddlenlp" ]];then diff --git a/scripts/distribute/ci_case_auto.sh b/scripts/distribute/ci_case_auto.sh index 167a86fc468e..89cffb71a578 100755 --- a/scripts/distribute/ci_case_auto.sh +++ b/scripts/distribute/ci_case_auto.sh @@ -22,9 +22,9 @@ export 
root_path=/workspace/PaddleNLP export gpt_case_path=$root_path/legacy/model_zoo/gpt-3 export gpt_data_path=/fleetx_data -export llama_case_path=$root_path/llm/llama/auto_parallel +export llama_case_path=$root_path/llm/auto_parallel/llama export llama_data_path=/llama_data -export llm_gpt_case_path=$root_path/llm/gpt-3/auto_parallel +export llm_gpt_case_path=$root_path/llm/auto_parallel/gpt-3 unset CUDA_VISIBLE_DEVICES diff --git a/tests/llm/test_finetune.py b/tests/llm/test_finetune.py index 016720ce8789..672f7e07e023 100644 --- a/tests/llm/test_finetune.py +++ b/tests/llm/test_finetune.py @@ -46,7 +46,7 @@ def test_finetune(self): finetune_config["output_dir"] = self.output_dir with argv_context_guard(finetune_config): - from finetune_generation import main + from run_finetune import main main() diff --git a/tests/llm/test_finetune_prefix_tuning.py b/tests/llm/test_finetune_prefix_tuning.py index 4f70a10d1c0e..b4324066e822 100644 --- a/tests/llm/test_finetune_prefix_tuning.py +++ b/tests/llm/test_finetune_prefix_tuning.py @@ -61,7 +61,7 @@ def test_prefix_tuning(self): prefix_tuning_config["dataset_name_or_path"] = self.data_dir prefix_tuning_config["output_dir"] = self.output_dir with argv_context_guard(prefix_tuning_config): - from finetune_generation import main + from run_finetune import main main() diff --git a/tests/llm/test_gradio.py b/tests/llm/test_gradio.py index 88661c6fbc74..731c5f9bf6d3 100644 --- a/tests/llm/test_gradio.py +++ b/tests/llm/test_gradio.py @@ -46,7 +46,7 @@ def setUp(self): self.model_path = "__internal_testing__/micro-random-llama" command = ( "cd ./llm && PYTHONPATH=../:$PYTHONPATH" - + ' {python} flask_server.py --model_name_or_path {model_path} --port {port} --flask_port {flask_port} --src_length 1024 --dtype "float16"'.format( + + ' {python} predict/flask_server.py --model_name_or_path {model_path} --port {port} --flask_port {flask_port} --src_length 1024 --dtype "float16"'.format( flask_port=self.flask_port, port=self.port, 
model_path=self.model_path, python=sys.executable ) ) diff --git a/tests/llm/test_long_sequence_strategies.py b/tests/llm/test_long_sequence_strategies.py index 687889c54178..169c329d274b 100644 --- a/tests/llm/test_long_sequence_strategies.py +++ b/tests/llm/test_long_sequence_strategies.py @@ -5079,6 +5079,6 @@ def test_dynamic_to_static_inference(self): config["model_name_or_path"] = save_path with argv_context_guard(config): - from export_model import main + from predict.export_model import main main() diff --git a/tests/llm/test_lora.py b/tests/llm/test_lora.py index bed84c39d96b..2e222e495688 100644 --- a/tests/llm/test_lora.py +++ b/tests/llm/test_lora.py @@ -61,17 +61,18 @@ def test_lora(self): lora_config["use_quick_lora"] = True with argv_context_guard(lora_config): - from finetune_generation import main + from run_finetune import main main() # merge weights merge_lora_weights_config = { "lora_path": lora_config["output_dir"], - "merge_lora_model_path": lora_config["output_dir"], + "model_name_or_path": lora_config["model_name_or_path"], + "output_path": lora_config["output_dir"], } with argv_context_guard(merge_lora_weights_config): - from merge_lora_params import merge + from tools.merge_lora_params import merge merge() @@ -90,17 +91,18 @@ def test_rslora_plus(self): lora_config["dataset_name_or_path"] = self.data_dir with argv_context_guard(lora_config): - from finetune_generation import main + from run_finetune import main main() # merge weights merge_lora_weights_config = { "lora_path": lora_config["output_dir"], - "merge_lora_model_path": lora_config["output_dir"], + "model_name_or_path": lora_config["model_name_or_path"], + "output_path": lora_config["output_dir"], } with argv_context_guard(merge_lora_weights_config): - from merge_lora_params import merge + from tools.merge_lora_params import merge merge() @@ -169,7 +171,7 @@ def test_rslora_plus(self): # lora_config["output_dir"] = self.output_dir # with argv_context_guard(lora_config): -# from 
finetune_generation import main +# from run_finetune import main # main() @@ -180,7 +182,7 @@ def test_rslora_plus(self): # "merge_model_path": lora_config["output_dir"], # } # with argv_context_guard(merge_lora_weights_config): -# from merge_lora_params import merge +# from tools.merge_lora_params import merge # merge() diff --git a/tests/llm/test_predictor.py b/tests/llm/test_predictor.py index c16d723375c1..0044f2ece476 100644 --- a/tests/llm/test_predictor.py +++ b/tests/llm/test_predictor.py @@ -185,7 +185,7 @@ def load_test_config(self): return config def test_create_predictor_with_unexpected_length(self): - from predictor import predict + from predict.predictor import predict config = self.load_test_config() config.pop("src_length", None) @@ -430,6 +430,6 @@ def test_export(self): config["model_type"] = "qwen-img2txt" with argv_context_guard(config): - from export_model import main + from predict.export_model import main main() diff --git a/tests/llm/test_ptq.py b/tests/llm/test_ptq.py index 2f41cead554d..43512dd7c4e2 100644 --- a/tests/llm/test_ptq.py +++ b/tests/llm/test_ptq.py @@ -46,7 +46,7 @@ def test_ptq(self): finetune_config["output_dir"] = self.output_dir with argv_context_guard(finetune_config): - from finetune_generation import main + from run_finetune import main main() @@ -59,7 +59,7 @@ def test_blha(self): finetune_config["output_dir"] = self.output_dir with argv_context_guard(finetune_config): - from finetune_generation import main + from run_finetune import main main() @@ -73,7 +73,7 @@ def test_ptq_smooth(self): finetune_config["smooth"] = True with argv_context_guard(finetune_config): - from finetune_generation import main + from run_finetune import main main() @@ -88,7 +88,7 @@ def test_ptq_shift(self): finetune_config["shift"] = True with argv_context_guard(finetune_config): - from finetune_generation import main + from run_finetune import main main() diff --git a/tests/llm/testing_utils.py b/tests/llm/testing_utils.py index 
583e5479549f..3684ec243576 100644 --- a/tests/llm/testing_utils.py +++ b/tests/llm/testing_utils.py @@ -68,7 +68,7 @@ def run_predictor(self, config_params=None): predict_config.update(config_params) with argv_context_guard(predict_config): - from predictor import predict + from predict.predictor import predict predict() @@ -83,7 +83,7 @@ def run_predictor(self, config_params=None): config["model_name_or_path"] = self.output_dir config.update(config_params) with argv_context_guard(config): - from export_model import main + from predict.export_model import main main() @@ -96,7 +96,7 @@ def run_predictor(self, config_params=None): config_params.pop("model_name_or_path", None) config.update(config_params) with argv_context_guard(config): - from predictor import predict + from predict.predictor import predict predict() diff --git a/tests/test_tipc/auto_tuner/llama_finetune/benchmark_common/run_benchmark.sh b/tests/test_tipc/auto_tuner/llama_finetune/benchmark_common/run_benchmark.sh index 785adab372df..c215c15351f1 100644 --- a/tests/test_tipc/auto_tuner/llama_finetune/benchmark_common/run_benchmark.sh +++ b/tests/test_tipc/auto_tuner/llama_finetune/benchmark_common/run_benchmark.sh @@ -81,20 +81,20 @@ function _train(){ case ${device_num} in N1C1) echo "Run with: device_num=${device_num} run_mode=${run_mode}" train_cmd="python -m paddle.distributed.launch --gpus=0 \ - --auto_tuner_json ${autoconfig_json_file} finetune_generation.py ${modle_json_file}" + --auto_tuner_json ${autoconfig_json_file} run_finetune.py ${modle_json_file}" ;; N1C8) echo "Run with: device_num=${device_num} run_mode=${run_mode}" train_cmd="python -m paddle.distributed.launch --gpus=0,1,2,3,4,5,6,7 \ - --auto_tuner_json ${autoconfig_json_file} finetune_generation.py ${modle_json_file}" + --auto_tuner_json ${autoconfig_json_file} run_finetune.py ${modle_json_file}" ;; N2C16) echo "Run with: device_num=${device_num} run_mode=${run_mode}" train_cmd="python -m paddle.distributed.launch 
--gpus=0,1,2,3,4,5,6,7 \ --auto_tuner_json ${autoconfig_json_file} --master etcd://$master_ip:2379 --nnodes $nnodes:$nnodes \ - finetune_generation.py ${modle_json_file}" + run_finetune.py ${modle_json_file}" ;; *) echo "Run with: device_num=${device_num}, run_mode=${run_mode}" train_cmd="python -m paddle.distributed.launch --gpus=0,1,2,3,4,5,6,7 \ - --auto_tuner_json ${autoconfig_json_file} finetune_generation.py ${modle_json_file}" + --auto_tuner_json ${autoconfig_json_file} run_finetune.py ${modle_json_file}" ;; esac cd ../llm/ diff --git a/tests/test_tipc/configs/llama/train_infer_python.txt b/tests/test_tipc/configs/llama/train_infer_python.txt index fd7488fa7c16..8242d0f0f744 100644 --- a/tests/test_tipc/configs/llama/train_infer_python.txt +++ b/tests/test_tipc/configs/llama/train_infer_python.txt @@ -13,7 +13,7 @@ null:null null:null ## trainer:norm_train -norm_train:../llm/llama/benchmark.py --model_name_or_path facebook/llama-7b-2l --do_train --max_steps 500 --recompute False --overwrite_output_dir --output_dir ./checkpoints/ --fp16_opt_level O2 --learning_rate 3e-5 --warmup_steps 0 --seed 23 --logging_steps 1 --dataloader_num_workers 1 +norm_train:../legacy/examples/benchmark/llm/llama_single_gpu/benchmark.py --model_name_or_path facebook/llama-7b-2l --do_train --max_steps 500 --recompute False --overwrite_output_dir --output_dir ./checkpoints/ --fp16_opt_level O2 --learning_rate 3e-5 --warmup_steps 0 --seed 23 --logging_steps 1 --dataloader_num_workers 1 pact_train:null fpgm_train:null distill_train:null diff --git a/tests/test_tipc/dygraph/ft/benchmark_common/run_benchmark.sh b/tests/test_tipc/dygraph/ft/benchmark_common/run_benchmark.sh index 6853ce963ad2..6c6413c08b86 100644 --- a/tests/test_tipc/dygraph/ft/benchmark_common/run_benchmark.sh +++ b/tests/test_tipc/dygraph/ft/benchmark_common/run_benchmark.sh @@ -118,11 +118,11 @@ function _train(){ cd ../llm/ echo "run run_mode: ${run_mode} device_num: ${device_num}" if [ "N1C1" = ${device_num} ]; then 
- train_cmd="python -u finetune_generation.py ${train_cmd}" + train_cmd="python -u run_finetune.py ${train_cmd}" else rm -rf ./mylog # 注意执行前删掉log目录 train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --gpus=$CUDA_VISIBLE_DEVICES \ - finetune_generation.py ${train_cmd}" + run_finetune.py ${train_cmd}" fi echo "train_cmd: ${train_cmd} log_file: ${log_file}" diff --git a/tests/trainer/test_lora_unified_checkpoint.py b/tests/trainer/test_lora_unified_checkpoint.py index 3a5533f65c1f..e04e8db907ad 100644 --- a/tests/trainer/test_lora_unified_checkpoint.py +++ b/tests/trainer/test_lora_unified_checkpoint.py @@ -119,7 +119,7 @@ def setUp(self): self.need_allclose = True self.rtol = 1e-7 - self.run_lora_file = "llm/finetune_generation.py" + self.run_lora_file = "llm/run_finetune.py" self.num_nodes = 1 def runfirst(self, train_args): @@ -169,7 +169,7 @@ def setUp(self): self.need_allclose = True self.rtol = 1e-7 - self.run_lora_file = "llm/finetune_generation.py" + self.run_lora_file = "llm/run_finetune.py" def runfirst(self, train_args): self.run_n1c8(self.run_lora_file, **train_args) diff --git a/tests/trainer/test_unified_checkpoint.py b/tests/trainer/test_unified_checkpoint.py index 1230dda9a2bc..5044eeaad5f5 100644 --- a/tests/trainer/test_unified_checkpoint.py +++ b/tests/trainer/test_unified_checkpoint.py @@ -194,7 +194,7 @@ def setUp(self): self.need_allclose = True self.rtol = 1e-7 - self.run_pretrain_file = "llm/llama/run_pretrain.py" + self.run_pretrain_file = "llm/run_pretrain.py" def runfirst(self, train_args): self.run_n1c8(self.run_pretrain_file, **train_args) diff --git a/tests/transformers/test_chat_template.py b/tests/transformers/test_chat_template.py index 216286df1678..4e443b54a2e2 100644 --- a/tests/transformers/test_chat_template.py +++ b/tests/transformers/test_chat_template.py @@ -218,7 +218,7 @@ def test_must_have_system(self): system = tokenizer.chat_template.render_system() system_ids = tokenizer.encode(system, 
add_special_tokens=False)["input_ids"] - from data import tokenize_rounds_example + from utils.data import tokenize_rounds_example fake_data_args = self.DataArg(len(system_ids) + 5, src_length=len(system_ids) + 5) @@ -244,7 +244,7 @@ def test_at_least_one_turn(self): all_sentence_ids = tokenizer(all_sentence, add_special_tokens=False)["input_ids"] # get the max_length of conversation - from data import tokenize_rounds_example + from utils.data import tokenize_rounds_example fake_data_args = self.DataArg(1024) example = {"src": ["你好", "今天吃啥"], "tgt": ["您好,我是个人人工智能助手", "你可以选择不同的菜系"]} @@ -342,7 +342,7 @@ def test_jinja_syntax_error(self): self.tokenizer.init_chat_template(error_jinja) def test_train_format(self): - from data import tokenize_rounds_example + from utils.data import tokenize_rounds_example fake_data_args = self.DataArg(50, src_length=50) example = {"src": ["你好"], "tgt": ["您好,我是个人人工智能助手"]} @@ -360,7 +360,7 @@ def test_train_format(self): self.assertNotEqual(tgt_id[tgt_idx], -100) def test_train_format_multi(self): - from data import tokenize_rounds_example + from utils.data import tokenize_rounds_example fake_data_args = self.DataArg(50, src_length=50) example = {"src": ["用户Round 1", "用户Round 2"], "tgt": ["回答Round 1", "回答Round 2"]} From 65e721e7887ec5f9d46b8a84d464972500033763 Mon Sep 17 00:00:00 2001 From: Difer <707065510@qq.com> Date: Fri, 21 Jun 2024 11:13:11 +0800 Subject: [PATCH 3/3] [LLM] Add sequence_parallel support for qwen (#8558) * add sequence_parallel for qwen * add sequence_parallel in qwen pp --- paddlenlp/transformers/qwen/configuration.py | 1 - paddlenlp/transformers/qwen/modeling.py | 128 +++++++++++++++---- paddlenlp/transformers/qwen/modeling_pp.py | 9 ++ 3 files changed, 110 insertions(+), 28 deletions(-) diff --git a/paddlenlp/transformers/qwen/configuration.py b/paddlenlp/transformers/qwen/configuration.py index 836c99027a6c..1841622ea225 100644 --- a/paddlenlp/transformers/qwen/configuration.py +++ 
b/paddlenlp/transformers/qwen/configuration.py @@ -70,7 +70,6 @@ def __init__( self.use_dynamic_ntk = use_dynamic_ntk self.use_logn_attn = use_logn_attn self.no_bias = no_bias - self.long_sequence_strategy_type = long_sequence_strategy_type self.long_sequence_strategy_name = long_sequence_strategy_name self.long_sequence_init_args = {} if long_sequence_init_args is None else long_sequence_init_args diff --git a/paddlenlp/transformers/qwen/modeling.py b/paddlenlp/transformers/qwen/modeling.py index 406e097651ee..91203a3b717c 100755 --- a/paddlenlp/transformers/qwen/modeling.py +++ b/paddlenlp/transformers/qwen/modeling.py @@ -49,6 +49,15 @@ def swiglu(x, y=None): from ..model_outputs import ModelOutput from .configuration import QWenConfig +try: + from paddle.distributed.fleet.utils.sequence_parallel_utils import ( + GatherOp, + ScatterOp, + mark_as_sequence_parallel_parameter, + ) +except: + pass + __all__ = [ "QWenBlock", "QWenForCausalLM", @@ -132,17 +141,26 @@ def __init__(self, config): assert self.projection_size % config.num_attention_heads == 0 self.hidden_size_per_attention_head = self.projection_size // config.num_attention_heads + self.sequence_parallel = config.sequence_parallel + + if config.sequence_parallel: + ColumnParallelLinear = linear_utils.ColumnSequenceParallelLinear + RowParallelLinear = linear_utils.RowSequenceParallelLinear + else: + ColumnParallelLinear = linear_utils.ColumnParallelLinear + RowParallelLinear = linear_utils.RowParallelLinear + if config.tensor_parallel_degree > 1: if config.num_attention_heads % config.tensor_parallel_degree != 0: raise ValueError("num_attention_heads has to be divisible by tensor_parallel_degree") self.num_heads = config.num_attention_heads // config.tensor_parallel_degree - self.c_attn = mpu.ColumnParallelLinear( + self.c_attn = ColumnParallelLinear( config.hidden_size, 3 * self.projection_size, has_bias=True, gather_output=False, ) - self.c_proj = mpu.RowParallelLinear( + self.c_proj = RowParallelLinear( 
config.hidden_size, self.projection_size, has_bias=not config.no_bias, @@ -150,7 +168,11 @@ def __init__(self, config): ) else: self.c_attn = nn.Linear(config.hidden_size, 3 * self.projection_size, bias_attr=True) - self.c_proj = nn.Linear(config.hidden_size, self.projection_size, bias_attr=not config.no_bias) + self.c_proj = nn.Linear( + config.hidden_size, + self.projection_size, + bias_attr=not config.no_bias, + ) if config.rotary_pct == 1.0: self.rotary_ndims = None @@ -205,6 +227,11 @@ def _attn(self, query, key, value, attention_mask=None): is_causal=attention_mask is None, ) attn_weights = None + + if self.sequence_parallel: + attn_output = attn_output.reshape([bsz * q_len, head_dim * num_heads]) + else: + attn_output = attn_output.reshape([bsz, q_len, head_dim * num_heads]) return attn_output, attn_weights else: # [bz, sql, nh, hid] ==> [bz, nh, sql hdim] @@ -230,6 +257,11 @@ def _attn(self, query, key, value, attention_mask=None): attn_weights = self.attn_dropout(attn_weights) attn_output = paddle.matmul(attn_weights, value) attn_output = attn_output.transpose([0, 2, 1, 3]) + + if self.sequence_parallel: + attn_output = attn_output.reshape([bsz * q_len, head_dim * num_heads]) + else: + attn_output = attn_output.reshape([bsz, q_len, head_dim * num_heads]) return attn_output, attn_weights def _split_heads(self, tensor, num_heads, attn_head_size): @@ -237,12 +269,6 @@ def _split_heads(self, tensor, num_heads, attn_head_size): tensor = tensor.reshape(new_shape) return tensor - def _merge_heads(self, tensor, num_heads, attn_head_size): - new_shape = tensor.shape[:-2] + [ - num_heads * attn_head_size, - ] - return tensor.reshape(new_shape) - def forward( self, hidden_states, @@ -256,14 +282,18 @@ def forward( ): # [bz, sql, hid] ==> [bz, sql, 3*hid] mixed_x_layer = self.c_attn(hidden_states) - # [bz, sql, 3*hid] ==> [bz, sql, hid] - query, key, value = paddle.split(mixed_x_layer, num_or_sections=3, axis=-1) + + if self.sequence_parallel: + target_shape = [-1, 
self.seq_length, self.num_heads * 3 * self.head_dim] + mixed_x_layer = paddle.reshape_(mixed_x_layer, target_shape) # [bz, sql, hid] ==> [bz, sql, nh, hdim] + query, key, value = paddle.split(mixed_x_layer, num_or_sections=3, axis=-1) query = self._split_heads(query, self.num_heads, self.head_dim) key = self._split_heads(key, self.num_heads, self.head_dim) value = self._split_heads(value, self.num_heads, self.head_dim) - kv_seq_len = hidden_states.shape[1] + + kv_seq_len = key.shape[-3] if layer_past: # layer past[0] shape: bs * seq_len * head_num * dim kv_seq_len += layer_past[0].shape[1] @@ -322,17 +352,22 @@ def forward( has_gradient = not (query.stop_gradient and key.stop_gradient and value.stop_gradient) if self.enable_recompute and self.training and has_gradient and self.recompute_granularity == "core_attn": attn_output, attn_weight = recompute( - self._attn, query, key, value, attention_mask, use_reentrant=self.config.recompute_use_reentrant + self._attn, + query, + key, + value, + attention_mask, + use_reentrant=self.config.recompute_use_reentrant, ) else: attn_output, attn_weight = self._attn(query, key, value, attention_mask) - context_layer = self._merge_heads(attn_output, self.num_heads, self.head_dim) - attn_output = self.c_proj(context_layer) + # if sequence_parallel is true, out shape are [q_len / n, bs, num_head * head_dim] + # else their shape are [bs, q_len, num_head * head_dim], n is mp parallelism. 
+ attn_output = self.c_proj(attn_output) outputs = (attn_output, present) if output_attentions: outputs += (attn_weight,) - return outputs @@ -401,6 +436,7 @@ def forward(self, hidden_states): class QWenBlock(nn.Layer): def __init__(self, config): super().__init__() + self.sequence_parallel = config.sequence_parallel self.ln_1 = QWenRMSNorm(config) self.attn = QWenAttention(config) self.ln_2 = QWenRMSNorm(config) @@ -417,6 +453,8 @@ def forward( use_cache=False, output_attentions=False, ): + # [bs * seq_len, embed_dim] -> [seq_len * bs / n, embed_dim] (sequence_parallel) + residual = hidden_states layernorm_output = self.ln_1(hidden_states) attn_outputs = self.attn( @@ -431,7 +469,6 @@ def forward( outputs = attn_outputs[1:] - residual = hidden_states layernorm_input = attn_output + residual layernorm_output = self.ln_2(layernorm_input) @@ -448,7 +485,6 @@ def forward( # remove empty tuple for pipeline parallel if type(outputs) is tuple and len(outputs) == 1: outputs = outputs[0] - return outputs @@ -476,8 +512,6 @@ def get_tensor_parallel_split_mappings(num_hidden_layers): base_actions = { # Column Linear "lm_head.weight": partial(fn, is_column=True), - "qwen.h.0.mlp.w2.weight": partial(fn, is_column=True), - "qwen.h.0.mlp.w1.weight": partial(fn, is_column=True), "qwen.h.0.attn.c_attn.weight": partial(fn, is_column=True, is_naive_3fuse=True), "qwen.h.0.attn.c_attn.bias": partial(fn, is_column=True, is_naive_3fuse=True), # Row Linear @@ -485,6 +519,15 @@ def get_tensor_parallel_split_mappings(num_hidden_layers): "qwen.h.0.mlp.c_proj.weight": partial(fn, is_column=False), "qwen.h.0.attn.c_proj.weight": partial(fn, is_column=False), } + + if config.fuse_attention_ffn: + base_actions["layers.0.mlp.gate_up_fused_proj.weight"] = partial( + fn, is_column=True, is_naive_2fuse=True + ) + else: + base_actions["qwen.h.0.mlp.w2.weight"] = partial(fn, is_column=True) + base_actions["qwen.h.0.mlp.w1.weight"] = partial(fn, is_column=True) + for key, action in 
base_actions.items(): if "h.0." in key: for i in range(num_hidden_layers): @@ -569,6 +612,8 @@ def _get_name_mappings(cls, config: QWenConfig) -> List[StateDictNameMapping]: def _init_weights(self, module): """Initialize the weights.""" + if self.config.tensor_parallel_degree > 1: + rng_tracker = get_rng_state_tracker().rng_state if isinstance( module, ( @@ -578,11 +623,24 @@ def _init_weights(self, module): mpu.RowParallelLinear, mpu.VocabParallelEmbedding, QWenLMHead, + linear_utils.ColumnSequenceParallelLinear, + linear_utils.RowSequenceParallelLinear, ), ): - module.weight.set_value( - paddle.tensor.normal(mean=0.0, std=self.config.initializer_range, shape=module.weight.shape) - ) + if isinstance(module.weight, paddle.Tensor): + if module.weight.is_distributed: + with rng_tracker(): + module.weight.set_value( + paddle.tensor.normal( + mean=0.0, + std=self.config.initializer_range, + shape=module.weight.shape, + ) + ) + else: + module.weight.set_value( + paddle.tensor.normal(mean=0.0, std=self.config.initializer_range, shape=module.weight.shape) + ) for name, p in module.named_parameters(): if name == "c_proj.weight": @@ -604,6 +662,7 @@ def __init__(self, config): self.embed_dim = config.hidden_size self.enable_recompute = False self.recompute_granularity = config.recompute_granularity + self.sequence_parallel = config.sequence_parallel if config.tensor_parallel_degree > 1: self.wte = mpu.VocabParallelEmbedding( @@ -705,6 +764,9 @@ def forward( output_hidden_states=None, return_dict=None, ): + if self.sequence_parallel and use_cache: + raise ValueError("We currently only support sequence parallel without cache.") + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -731,6 +793,14 @@ def forward( encoder_attention_mask = None if inputs_embeds is None: inputs_embeds = self.wte(input_ids) 
+ + if self.sequence_parallel: + # [bs, seq_len, num_head * head_dim] -> [bs * seq_len, num_head * head_dim] + bs, seq_len, hidden_size = inputs_embeds.shape + inputs_embeds = paddle.reshape_(inputs_embeds, [bs * seq_len, hidden_size]) + # [seq_len * bs / n, num_head * head_dim] (n is mp parallelism) + inputs_embeds = ScatterOp.apply(inputs_embeds) + hidden_states = inputs_embeds # bool 4D mask @@ -741,9 +811,6 @@ def forward( attention_mask = paddle.where(attention_mask, zero, neg_inf) hidden_states = self.drop(hidden_states) - output_shape = input_shape + [ - hidden_states.shape[-1], - ] if self.enable_recompute and self.training: if use_cache: @@ -794,7 +861,7 @@ def forward( all_self_attentions = all_self_attentions + (outputs[1],) hidden_states = self.ln_f(hidden_states) - hidden_states = hidden_states.reshape(output_shape) + # Add last hidden state if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) @@ -836,6 +903,11 @@ def __init__(self, config: QWenConfig): self.weight.split_axis = 1 def forward(self, hidden_states, tensor_parallel_output=None): + if self.config.sequence_parallel: + hidden_states = GatherOp.apply(hidden_states) + seq_length = self.config.seq_length + hidden_states = paddle.reshape_(hidden_states, [-1, seq_length, self.config.hidden_size]) + if tensor_parallel_output is None: tensor_parallel_output = self.config.tensor_parallel_output and self.config.tensor_parallel_degree > 1 @@ -1091,6 +1163,8 @@ def __init__(self, config): dtype=paddle.get_default_dtype(), default_initializer=nn.initializer.Constant(1.0), ) + if config.sequence_parallel: + mark_as_sequence_parallel_parameter(self.weight) def _norm(self, x): return x * paddle.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) diff --git a/paddlenlp/transformers/qwen/modeling_pp.py b/paddlenlp/transformers/qwen/modeling_pp.py index 238737ecda66..47357d6921e3 100644 --- a/paddlenlp/transformers/qwen/modeling_pp.py +++ b/paddlenlp/transformers/qwen/modeling_pp.py 
@@ -76,6 +76,7 @@ class QWenEmbeddingPipe(nn.Layer): def __init__(self, config): super(QWenEmbeddingPipe, self).__init__() self.hidden_size = config.hidden_size + self.sequence_parallel = config.sequence_parallel if config.tensor_parallel_degree > 1: self.wte = fleet.meta_parallel.VocabParallelEmbedding( config.vocab_size, @@ -96,6 +97,14 @@ def forward(self, args): """ input_ids, attention_mask, position_ids = parse_args(args) input_embeds = self.wte(input_ids) + if self.sequence_parallel: + from paddlenlp.transformers import ScatterOp + + # [bs, seq_len, num_head * head_dim] -> [bs * seq_len, num_head * head_dim] + bs, seq_len, hidden_size = input_embeds.shape + input_embeds = paddle.reshape_(input_embeds, [bs * seq_len, hidden_size]) + # [seq_len * bs / n, num_head * head_dim] (n is mp parallelism) + input_embeds = ScatterOp.apply(input_embeds) batch_size, seq_length = input_ids.shape if attention_mask is not None: