From 34fcced27f53934d872893c52690f92e817c84a2 Mon Sep 17 00:00:00 2001 From: gmm <38800877+mmglove@users.noreply.github.com> Date: Thu, 20 Jun 2024 19:51:27 +0800 Subject: [PATCH 1/3] =?UTF-8?q?=E3=80=90benchmark=E3=80=91=20fix=20model?= =?UTF-8?q?=5Fzoo=20path=20(#8643)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add llama-7b_auto_dp2mp2pp2 benchmark sh * add llama-7b_auto_dp2mp2pp2 benchmark script for cinn * update llama-7b_auto_dp2mp2pp2 benchmark script * Update run_benchmark.sh * Update run_benchmark.sh * fix llama-7b_auto_dp2mp2pp2/benchmark_common * Update run_benchmark.sh * Update prepare.sh * Update prepare.sh * Update prepare.sh * Update prepare.sh * Update prepare.sh --- .../auto_tuner/llama_pretrain/benchmark_common/prepare.sh | 4 ++-- .../hybrid_parallelism/llama/benchmark_common/prepare.sh | 6 +++--- .../hybrid_parallelism/llama2/benchmark_common/prepare.sh | 4 ++-- .../hybrid_parallelism/qwen/benchmark_common/prepare.sh | 6 +++--- .../static/auto_parallel/llama2/benchmark_common/prepare.sh | 4 ++-- 5 files changed, 12 insertions(+), 12 deletions(-) diff --git a/tests/test_tipc/auto_tuner/llama_pretrain/benchmark_common/prepare.sh b/tests/test_tipc/auto_tuner/llama_pretrain/benchmark_common/prepare.sh index 5472a36e94c4..3e51c5dc6fb7 100644 --- a/tests/test_tipc/auto_tuner/llama_pretrain/benchmark_common/prepare.sh +++ b/tests/test_tipc/auto_tuner/llama_pretrain/benchmark_common/prepare.sh @@ -16,11 +16,11 @@ python -m pip install -r ../requirements.txt python -m pip install -r ../requirements-dev.txt # install fused_ln custom ops -cd ../model_zoo/gpt-3/external_ops/ +cd ../legacy/model_zoo/gpt-3/external_ops/ python setup.py install # install tool_helpers -cd ../../../llm/llama +cd ../../../../llm/llama python -m pip install tool_helpers rm -rf data && mkdir data diff --git a/tests/test_tipc/dygraph/hybrid_parallelism/llama/benchmark_common/prepare.sh 
b/tests/test_tipc/dygraph/hybrid_parallelism/llama/benchmark_common/prepare.sh index 0563a1aaabac..388b179e6905 100644 --- a/tests/test_tipc/dygraph/hybrid_parallelism/llama/benchmark_common/prepare.sh +++ b/tests/test_tipc/dygraph/hybrid_parallelism/llama/benchmark_common/prepare.sh @@ -16,11 +16,11 @@ python -m pip install -r ../requirements.txt python -m pip install -r ../requirements-dev.txt # install fused_ln custom ops -cd ../model_zoo/gpt-3/external_ops/ +cd ../legacy/model_zoo/gpt-3/external_ops/ python setup.py install # install tool_helpers -cd ../../../llm/llama +cd ../../../../llm/llama python -m pip install tool_helpers wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_ids.npy @@ -28,4 +28,4 @@ wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwe mkdir data mv llama_openwebtext_100k_ids.npy ./data -mv llama_openwebtext_100k_idx.npz ./data \ No newline at end of file +mv llama_openwebtext_100k_idx.npz ./data diff --git a/tests/test_tipc/dygraph/hybrid_parallelism/llama2/benchmark_common/prepare.sh b/tests/test_tipc/dygraph/hybrid_parallelism/llama2/benchmark_common/prepare.sh index 45fd82fad914..9405521c7b3f 100644 --- a/tests/test_tipc/dygraph/hybrid_parallelism/llama2/benchmark_common/prepare.sh +++ b/tests/test_tipc/dygraph/hybrid_parallelism/llama2/benchmark_common/prepare.sh @@ -16,11 +16,11 @@ python -m pip install -r ../requirements.txt python -m pip install -r ../requirements-dev.txt # install fused_ln custom ops -cd ../model_zoo/gpt-3/external_ops/ +cd ../legacy/model_zoo/gpt-3/external_ops/ python setup.py install # install tool_helpers -cd ../../../llm/llama +cd ../../../../llm/llama python -m pip install tool_helpers # download data diff --git a/tests/test_tipc/dygraph/hybrid_parallelism/qwen/benchmark_common/prepare.sh b/tests/test_tipc/dygraph/hybrid_parallelism/qwen/benchmark_common/prepare.sh index 1d8a79cc2a0e..bf6952c135ca 100644 --- 
a/tests/test_tipc/dygraph/hybrid_parallelism/qwen/benchmark_common/prepare.sh +++ b/tests/test_tipc/dygraph/hybrid_parallelism/qwen/benchmark_common/prepare.sh @@ -18,11 +18,11 @@ python -m pip install -r ../requirements-dev.txt python -m pip install tiktoken # install fused_ln custom ops -cd ../model_zoo/gpt-3/external_ops/ +cd ../legacy/model_zoo/gpt-3/external_ops/ python setup.py install # install tool_helpers -cd ../../../llm/qwen +cd ../../../../llm/qwen python -m pip install tool_helpers wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_ids.npy @@ -30,4 +30,4 @@ wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwe mkdir data mv llama_openwebtext_100k_ids.npy ./data -mv llama_openwebtext_100k_idx.npz ./data \ No newline at end of file +mv llama_openwebtext_100k_idx.npz ./data diff --git a/tests/test_tipc/static/auto_parallel/llama2/benchmark_common/prepare.sh b/tests/test_tipc/static/auto_parallel/llama2/benchmark_common/prepare.sh index edb4590e2f15..697d5d1d92e0 100644 --- a/tests/test_tipc/static/auto_parallel/llama2/benchmark_common/prepare.sh +++ b/tests/test_tipc/static/auto_parallel/llama2/benchmark_common/prepare.sh @@ -16,11 +16,11 @@ python -m pip install -r ../requirements.txt python -m pip install -r ../requirements-dev.txt # install fused_ln custom ops -cd ../model_zoo/gpt-3/external_ops/ +cd ../legacy/model_zoo/gpt-3/external_ops/ python setup.py install # install tool_helpers -cd ../../../llm/llama +cd ../../../../llm/llama python -m pip install tool_helpers # download data From 498f70988431be278dac618411fbfb0287853cd9 Mon Sep 17 00:00:00 2001 From: lugimzzz <63761690+lugimzzz@users.noreply.github.com> Date: Fri, 21 Jun 2024 10:51:22 +0800 Subject: [PATCH 2/3] [LLM] change llm content (#8627) * change llm * fix * fix * fix ci * fix * fix * fix --------- Co-authored-by: zhangjunjun04 --- docs/llm/peft.md | 2 +- .../llm/llama_single_gpu}/benchmark.py | 0 
.../llm/llama_single_gpu}/benchmark_utils.py | 0 llm/.gitignore | 12 - llm/Alignment/RM/models | 1 - llm/README.md | 84 ++-- llm/{Alignment => alignment}/README.md | 28 +- llm/{ => alignment/dpo}/dpo_argument.py | 0 .../dpo/run_dpo.py} | 0 .../PPO => alignment/ppo}/comm_utils.py | 0 .../PPO => alignment/ppo}/data/__init__.py | 0 .../PPO => alignment/ppo}/data/alpaca.py | 0 .../PPO => alignment/ppo}/data/base.py | 0 .../PPO => alignment/ppo}/data/preference.py | 0 .../PPO => alignment/ppo}/data/prompt_only.py | 0 .../PPO => alignment/ppo}/data/safe_rlhf.py | 0 .../PPO => alignment/ppo}/data/supervised.py | 0 .../PPO => alignment/ppo}/infer_utils.py | 0 .../PPO => alignment/ppo}/models/__init__.py | 0 .../ppo}/models/infer_model_utils.py | 0 .../PPO => alignment/ppo}/models/model_pp.py | 0 .../ppo}/models/pp_model_utils.py | 0 .../PPO => alignment/ppo}/models/ppo_model.py | 0 .../ppo}/models/ppo_model_utils.py | 0 .../ppo}/models/score_model.py | 0 .../ppo}/models/score_model_utils.py | 0 .../PPO => alignment/ppo}/ppo_trainer.py | 0 .../ppo_main.py => alignment/ppo/run_ppo.py} | 0 .../PPO => alignment/ppo}/tests/run_model.py | 0 .../ppo}/tests/test_export.py | 0 .../PPO => alignment/ppo}/trainer_utils.py | 0 llm/alignment/rm/models | 1 + .../RM => alignment/rm}/reward_trainer.py | 0 .../rm/run_reward.py} | 0 .../gpt-3}/run_pretrain_auto.py | 237 ++++------- .../gpt-3}/run_pretrain_auto_dp2mp2pp2.sh | 14 + .../llama}/README.md | 0 .../llama}/run_llama3.sh | 0 .../llama}/run_pretrain_auto.py | 0 .../llama}/run_pretrain_auto.sh | 0 .../llama}/run_pretrain_auto_static.py | 0 .../llama}/run_pretrain_auto_static.sh | 0 .../llama}/run_pretrain_auto_static_sp.sh | 0 .../llama}/run_pretrain_hand.py | 0 .../llama}/run_pretrain_hand.sh | 0 .../pretrain_argument_auto_dp2tp2pp2.json | 0 .../qwen}/run_pretrain_3D_auto.py | 0 .../qwen}/run_pretrain_3D_auto.sh | 0 .../pretrain-baichuan2_13b-sd8_stage2.json | 40 -- llm/benchmark.sh | 36 -- llm/config/baichuan/README.md | 15 + 
llm/config/baichuan/awq_argument.json | 23 + llm/config/baichuan/dpo_argument.json | 38 ++ .../baichuan}/gptq_argument.json | 5 +- llm/config/baichuan/lora_argument.json | 35 ++ .../baichuan/pretrain_argument.json} | 4 +- llm/config/baichuan/ptq_argument.json | 23 + llm/config/baichuan/qlora_argument.json | 34 ++ llm/{ => config}/bloom/README.md | 3 - .../bloom}/gptq_argument.json | 5 +- llm/{ => config}/bloom/lora_argument.json | 3 +- llm/{ => config}/bloom/pt_argument.json | 3 +- .../bloom}/ptq_argument.json | 5 +- llm/{ => config}/bloom/sft_argument.json | 3 +- llm/{ => config}/chatglm/README.md | 3 - .../chatglm}/gptq_argument.json | 5 +- llm/{ => config}/chatglm/lora_argument.json | 3 +- llm/{ => config}/chatglm/pt_argument.json | 2 +- llm/{ => config}/chatglm/ptq_argument.json | 5 +- llm/{ => config}/chatglm/sft_argument.json | 3 +- llm/{ => config}/chatglm2/README.md | 3 - .../chatglm2}/gptq_argument.json | 5 +- llm/{ => config}/chatglm2/lora_argument.json | 3 +- llm/{ => config}/chatglm2/pt_argument.json | 2 +- .../chatglm2}/ptq_argument.json | 5 +- llm/{ => config}/chatglm2/sft_argument.json | 7 +- llm/{ => config}/gemma/README.md | 0 llm/{ => config}/gemma/sft_argument.json | 12 +- llm/config/gpt-3/README.md | 5 + .../gpt-3}/lora_argument.json | 5 +- .../gpt-3/pretrain_argument.json} | 1 + llm/config/gpt-3/sft_argument.json | 33 ++ llm/{ => config}/llama/README.md | 12 +- llm/{ => config}/llama/awq_argument.json | 7 +- llm/{ => config}/llama/dpo_argument.json | 3 +- llm/config/llama/gptq_argument.json | 17 + llm/config/llama/lora_argument.json | 35 ++ .../llama/{ppo.json => ppo_argument.json} | 0 .../llama/pretrain_argument.json} | 7 +- llm/{qwen => config/llama}/pt_argument.json | 6 +- llm/{ => config}/llama/ptq_argument.json | 5 +- llm/{ => config}/llama/qlora_argument.json | 7 +- .../llama/{rm.json => rm_argument.json} | 0 llm/{ => config}/llama/sft_argument.json | 15 +- .../llama/wint8_lora_argument.json | 5 +- llm/{ => 
config}/mixtral/lora_argument.json | 5 +- .../mixtral/pretrain_argument.json} | 9 +- llm/{ => config}/mixtral/sft_argument.json | 11 +- llm/{ => config}/opt/README.md | 3 - llm/{ => config}/opt/lora_argument.json | 2 +- llm/{ => config}/opt/sft_argument.json | 2 +- llm/{ => config}/qwen/README.md | 4 - llm/{ => config}/qwen/dpo_argument.json | 3 +- llm/{ => config}/qwen/lora_argument.json | 8 +- .../qwen/pretrain_argument.json} | 7 +- llm/{llama => config/qwen}/pt_argument.json | 9 +- llm/{ => config}/qwen/sft_argument.json | 10 +- llm/docs/chat_template.md | 6 +- llm/docs/finetune.md | 36 +- llm/docs/inference.md | 62 +-- llm/docs/pretrain.rst | 4 +- llm/docs/quantization.md | 6 +- llm/{ => experimental}/ernie-3.5-se/README.md | 0 .../ernie-3.5-se/configuration.py | 0 .../ernie-3.5-se/conversion_utils.py | 0 llm/{ => experimental}/ernie-3.5-se/data.py | 0 .../ernie-tokenizer/sentencepiece.bpe.model | Bin .../ernie-tokenizer/special_tokens_map.json | 0 .../ernie-tokenizer/tokenizer_config.json | 0 .../ernie-3.5-se/ernie_dataset.py | 0 .../ernie-3.5-se/finetune_generation.py | 0 .../ernie-3.5-se/modeling.py | 0 .../ernie-3.5-se/predict_generation.py | 0 .../ernie-3.5-se/run_pretrain.py | 0 .../ernie-3.5-se/run_trainer_stage2.sh | 0 .../ernie-3.5-se/tokenizer.py | 0 llm/{ => experimental}/ernie-3.5-se/utils.py | 0 .../scripts}/run_sharding_v2.sh | 0 .../scripts}/run_trainer.sh | 0 .../scripts}/run_trainer_tp2cp2.sh | 0 .../scripts}/run_trainer_tp4pp2.sh | 0 .../scripts}/run_trainer_tp4sep2.sh | 0 llm/fused_layers.py | 1 - llm/gemma/sft_argument_7b.json | 32 -- llm/gemma/sft_argument_7b_sharding.json | 33 -- llm/gemma/sft_argument_sharding.json | 31 -- llm/glm/README.md | 102 ----- llm/glm/data.py | 67 --- llm/glm/finetune_generation.py | 188 --------- llm/glm/predict_generation.py | 151 ------- llm/glm/utils.py | 79 ---- llm/gpt-3/README.md | 205 --------- llm/gpt-3/finetune_generation.py | 250 ----------- llm/gpt-3/predict_generation.py | 165 -------- 
llm/gpt-3/run_pretrain.py | 1 - llm/gpt-3/tests/test_sequence_parallel.py | 98 ----- llm/gpt-3/utils.py | 393 ------------------ llm/llama/lora_argument_pissa.json | 33 -- llm/llama/megre_tp_and_pp.py | 88 ---- .../pretrain-baichuan2_13b-tp2sd4_stage2.json | 40 -- ...in-flagalpha_llama2_13b-tp2sd4_stage2.json | 40 -- ...ain-flagalpha_llama2_7b-tp2sd4_stage2.json | 40 -- ...retrain-linly_llama2_7b-tp2sd4_stage2.json | 40 -- .../pretrain-llama2_7b-tp2sd4_stage2.json | 40 -- .../pretrain-llama_13b-tp2sd4_stage2.json | 40 -- .../pretrain-llama_7b-tp2sd4_stage2.json | 40 -- llm/llama/run_pretrain.py | 1 - llm/llama/sft_pp_argument.json | 31 -- llm/llama/tests/test_pipeline_parallel.py | 132 ------ llm/llama/tests/test_sequence_parallel.py | 118 ------ .../tests/unified-ckpt-llama-500m/config.json | 17 - llm/merge_lora_params.py | 147 ------- llm/{llama/npu => npu/llama}/export_utils.py | 16 +- .../npu => npu/llama}/llama_npu_opt_lora.sh | 4 +- .../npu => npu/llama}/llama_npu_opt_ppt.sh | 0 .../npu => npu/llama}/llama_npu_opt_sft.sh | 4 +- llm/{ => predict}/export_model.py | 4 +- llm/{ => predict}/flask_server.py | 7 +- llm/{ => predict}/gradio_ui.py | 0 llm/{ => predict}/predictor.py | 2 +- llm/{ => predict}/request_flask_server.py | 0 llm/qwen/lora_argument_pissa.json | 33 -- llm/qwen/lora_argument_qwen2_7b.json | 32 -- llm/qwen/lora_argument_qwen2moe.json | 32 -- .../pretrain-qwen1.5_7b-tp2sd4_stage2.json | 41 -- llm/qwen/pretrain-qwen2_7b-tp2sd4_stage2.json | 41 -- llm/qwen/pretrain-qwen_7b-tp2sd4_stage2.json | 40 -- llm/qwen/pretrain_argument_tp2pp4.json | 40 -- llm/qwen/pt_argument_qwen2_7b.json | 33 -- llm/qwen/sft_argument_qwen2_7b.json | 31 -- llm/qwen/sft_argument_qwen2moe.json | 30 -- ...finetune_generation.py => run_finetune.py} | 52 +-- llm/run_pretrain.py | 12 +- llm/tests/test_best_pretrain_speed.py | 266 ------------ llm/tools/merge_lora_params.py | 222 ++++++++++ llm/{ => tools}/merge_tp_and_pp_params.py | 0 llm/{ => utils}/argument.py | 37 ++ llm/{ => 
utils}/data.py | 1 + llm/{llama => utils}/fused_layers.py | 0 llm/{ => utils}/quant.py | 0 llm/{llama => utils}/register_reshard.py | 0 llm/{ => utils}/utils.py | 10 + scripts/ci_approval/run_ci_approval.sh | 2 +- scripts/distribute/ci_case_auto.sh | 4 +- tests/llm/test_finetune.py | 2 +- tests/llm/test_finetune_prefix_tuning.py | 2 +- tests/llm/test_gradio.py | 2 +- tests/llm/test_long_sequence_strategies.py | 2 +- tests/llm/test_lora.py | 18 +- tests/llm/test_predictor.py | 4 +- tests/llm/test_ptq.py | 8 +- tests/llm/testing_utils.py | 6 +- .../benchmark_common/run_benchmark.sh | 8 +- .../configs/llama/train_infer_python.txt | 2 +- .../ft/benchmark_common/run_benchmark.sh | 4 +- tests/trainer/test_lora_unified_checkpoint.py | 4 +- tests/trainer/test_unified_checkpoint.py | 2 +- tests/transformers/test_chat_template.py | 8 +- 208 files changed, 945 insertions(+), 3828 deletions(-) rename {llm/llama => legacy/examples/benchmark/llm/llama_single_gpu}/benchmark.py (100%) rename {llm/llama => legacy/examples/benchmark/llm/llama_single_gpu}/benchmark_utils.py (100%) delete mode 100644 llm/.gitignore delete mode 120000 llm/Alignment/RM/models rename llm/{Alignment => alignment}/README.md (87%) rename llm/{ => alignment/dpo}/dpo_argument.py (100%) rename llm/{dpo_train.py => alignment/dpo/run_dpo.py} (100%) rename llm/{Alignment/PPO => alignment/ppo}/comm_utils.py (100%) rename llm/{Alignment/PPO => alignment/ppo}/data/__init__.py (100%) rename llm/{Alignment/PPO => alignment/ppo}/data/alpaca.py (100%) rename llm/{Alignment/PPO => alignment/ppo}/data/base.py (100%) rename llm/{Alignment/PPO => alignment/ppo}/data/preference.py (100%) rename llm/{Alignment/PPO => alignment/ppo}/data/prompt_only.py (100%) rename llm/{Alignment/PPO => alignment/ppo}/data/safe_rlhf.py (100%) rename llm/{Alignment/PPO => alignment/ppo}/data/supervised.py (100%) rename llm/{Alignment/PPO => alignment/ppo}/infer_utils.py (100%) rename llm/{Alignment/PPO => alignment/ppo}/models/__init__.py 
(100%) rename llm/{Alignment/PPO => alignment/ppo}/models/infer_model_utils.py (100%) rename llm/{Alignment/PPO => alignment/ppo}/models/model_pp.py (100%) rename llm/{Alignment/PPO => alignment/ppo}/models/pp_model_utils.py (100%) rename llm/{Alignment/PPO => alignment/ppo}/models/ppo_model.py (100%) rename llm/{Alignment/PPO => alignment/ppo}/models/ppo_model_utils.py (100%) rename llm/{Alignment/PPO => alignment/ppo}/models/score_model.py (100%) rename llm/{Alignment/PPO => alignment/ppo}/models/score_model_utils.py (100%) rename llm/{Alignment/PPO => alignment/ppo}/ppo_trainer.py (100%) rename llm/{Alignment/PPO/ppo_main.py => alignment/ppo/run_ppo.py} (100%) rename llm/{Alignment/PPO => alignment/ppo}/tests/run_model.py (100%) rename llm/{Alignment/PPO => alignment/ppo}/tests/test_export.py (100%) rename llm/{Alignment/PPO => alignment/ppo}/trainer_utils.py (100%) create mode 120000 llm/alignment/rm/models rename llm/{Alignment/RM => alignment/rm}/reward_trainer.py (100%) rename llm/{Alignment/RM/reward_main.py => alignment/rm/run_reward.py} (100%) rename llm/{gpt-3/auto_parallel => auto_parallel/gpt-3}/run_pretrain_auto.py (70%) rename llm/{gpt-3/auto_parallel => auto_parallel/gpt-3}/run_pretrain_auto_dp2mp2pp2.sh (72%) rename llm/{llama/auto_parallel => auto_parallel/llama}/README.md (100%) rename llm/{llama/auto_parallel => auto_parallel/llama}/run_llama3.sh (100%) rename llm/{llama/auto_parallel => auto_parallel/llama}/run_pretrain_auto.py (100%) rename llm/{llama/auto_parallel => auto_parallel/llama}/run_pretrain_auto.sh (100%) rename llm/{llama/auto_parallel => auto_parallel/llama}/run_pretrain_auto_static.py (100%) rename llm/{llama/auto_parallel => auto_parallel/llama}/run_pretrain_auto_static.sh (100%) rename llm/{llama/auto_parallel => auto_parallel/llama}/run_pretrain_auto_static_sp.sh (100%) rename llm/{llama/auto_parallel => auto_parallel/llama}/run_pretrain_hand.py (100%) rename llm/{llama/auto_parallel => 
auto_parallel/llama}/run_pretrain_hand.sh (100%) rename llm/{qwen/auto_parallel => auto_parallel/qwen}/pretrain_argument_auto_dp2tp2pp2.json (100%) rename llm/{qwen/auto_parallel => auto_parallel/qwen}/run_pretrain_3D_auto.py (100%) rename llm/{qwen/auto_parallel => auto_parallel/qwen}/run_pretrain_3D_auto.sh (100%) delete mode 100644 llm/baichuan/pretrain-baichuan2_13b-sd8_stage2.json delete mode 100644 llm/benchmark.sh create mode 100644 llm/config/baichuan/README.md create mode 100644 llm/config/baichuan/awq_argument.json create mode 100644 llm/config/baichuan/dpo_argument.json rename llm/{chatglm2 => config/baichuan}/gptq_argument.json (71%) create mode 100644 llm/config/baichuan/lora_argument.json rename llm/{baichuan/pretrain-baichuan2_7b-tp2sd4_stage2.json => config/baichuan/pretrain_argument.json} (90%) create mode 100644 llm/config/baichuan/ptq_argument.json create mode 100644 llm/config/baichuan/qlora_argument.json rename llm/{ => config}/bloom/README.md (92%) rename llm/{llama => config/bloom}/gptq_argument.json (72%) rename llm/{ => config}/bloom/lora_argument.json (91%) rename llm/{ => config}/bloom/pt_argument.json (92%) rename llm/{chatglm2 => config/bloom}/ptq_argument.json (79%) rename llm/{ => config}/bloom/sft_argument.json (91%) rename llm/{ => config}/chatglm/README.md (92%) rename llm/{bloom => config/chatglm}/gptq_argument.json (73%) rename llm/{ => config}/chatglm/lora_argument.json (91%) rename llm/{ => config}/chatglm/pt_argument.json (94%) rename llm/{ => config}/chatglm/ptq_argument.json (73%) rename llm/{ => config}/chatglm/sft_argument.json (91%) rename llm/{ => config}/chatglm2/README.md (91%) rename llm/{chatglm => config/chatglm2}/gptq_argument.json (73%) rename llm/{ => config}/chatglm2/lora_argument.json (91%) rename llm/{ => config}/chatglm2/pt_argument.json (94%) rename llm/{bloom => config/chatglm2}/ptq_argument.json (79%) rename llm/{ => config}/chatglm2/sft_argument.json (85%) rename llm/{ => config}/gemma/README.md (100%) 
rename llm/{ => config}/gemma/sft_argument.json (71%) create mode 100644 llm/config/gpt-3/README.md rename llm/{llama => config/gpt-3}/lora_argument.json (86%) rename llm/{gpt-3/pretrain-gpt_medium_en-stage2.json => config/gpt-3/pretrain_argument.json} (97%) create mode 100644 llm/config/gpt-3/sft_argument.json rename llm/{ => config}/llama/README.md (92%) rename llm/{ => config}/llama/awq_argument.json (76%) rename llm/{ => config}/llama/dpo_argument.json (92%) create mode 100644 llm/config/llama/gptq_argument.json create mode 100644 llm/config/llama/lora_argument.json rename llm/config/llama/{ppo.json => ppo_argument.json} (100%) rename llm/{llama/pretrain-llama2_13b-tp2sd4_stage2.json => config/llama/pretrain_argument.json} (83%) rename llm/{qwen => config/llama}/pt_argument.json (85%) rename llm/{ => config}/llama/ptq_argument.json (83%) rename llm/{ => config}/llama/qlora_argument.json (84%) rename llm/config/llama/{rm.json => rm_argument.json} (100%) rename llm/{ => config}/llama/sft_argument.json (68%) rename llm/{ => config}/llama/wint8_lora_argument.json (89%) rename llm/{ => config}/mixtral/lora_argument.json (88%) rename llm/{llama/pretrain-ziya_llama_13b-tp2sd4_stage2.json => config/mixtral/pretrain_argument.json} (79%) rename llm/{ => config}/mixtral/sft_argument.json (74%) rename llm/{ => config}/opt/README.md (88%) rename llm/{ => config}/opt/lora_argument.json (94%) rename llm/{ => config}/opt/sft_argument.json (94%) rename llm/{ => config}/qwen/README.md (96%) rename llm/{ => config}/qwen/dpo_argument.json (93%) rename llm/{ => config}/qwen/lora_argument.json (82%) rename llm/{qwen/pretrain_argument_stage2.json => config/qwen/pretrain_argument.json} (84%) rename llm/{llama => config/qwen}/pt_argument.json (81%) rename llm/{ => config}/qwen/sft_argument.json (78%) rename llm/{ => experimental}/ernie-3.5-se/README.md (100%) rename llm/{ => experimental}/ernie-3.5-se/configuration.py (100%) rename llm/{ => 
experimental}/ernie-3.5-se/conversion_utils.py (100%) rename llm/{ => experimental}/ernie-3.5-se/data.py (100%) rename llm/{ => experimental}/ernie-3.5-se/ernie-tokenizer/sentencepiece.bpe.model (100%) rename llm/{ => experimental}/ernie-3.5-se/ernie-tokenizer/special_tokens_map.json (100%) rename llm/{ => experimental}/ernie-3.5-se/ernie-tokenizer/tokenizer_config.json (100%) rename llm/{ => experimental}/ernie-3.5-se/ernie_dataset.py (100%) rename llm/{ => experimental}/ernie-3.5-se/finetune_generation.py (100%) rename llm/{ => experimental}/ernie-3.5-se/modeling.py (100%) rename llm/{ => experimental}/ernie-3.5-se/predict_generation.py (100%) rename llm/{ => experimental}/ernie-3.5-se/run_pretrain.py (100%) rename llm/{ => experimental}/ernie-3.5-se/run_trainer_stage2.sh (100%) rename llm/{ => experimental}/ernie-3.5-se/tokenizer.py (100%) rename llm/{ => experimental}/ernie-3.5-se/utils.py (100%) rename llm/{llama => experimental/scripts}/run_sharding_v2.sh (100%) rename llm/{llama => experimental/scripts}/run_trainer.sh (100%) rename llm/{llama => experimental/scripts}/run_trainer_tp2cp2.sh (100%) rename llm/{llama => experimental/scripts}/run_trainer_tp4pp2.sh (100%) rename llm/{llama => experimental/scripts}/run_trainer_tp4sep2.sh (100%) delete mode 120000 llm/fused_layers.py delete mode 100644 llm/gemma/sft_argument_7b.json delete mode 100644 llm/gemma/sft_argument_7b_sharding.json delete mode 100644 llm/gemma/sft_argument_sharding.json delete mode 100644 llm/glm/README.md delete mode 100644 llm/glm/data.py delete mode 100644 llm/glm/finetune_generation.py delete mode 100644 llm/glm/predict_generation.py delete mode 100644 llm/glm/utils.py delete mode 100644 llm/gpt-3/README.md delete mode 100644 llm/gpt-3/finetune_generation.py delete mode 100644 llm/gpt-3/predict_generation.py delete mode 120000 llm/gpt-3/run_pretrain.py delete mode 100644 llm/gpt-3/tests/test_sequence_parallel.py delete mode 100644 llm/gpt-3/utils.py delete mode 100644 
llm/llama/lora_argument_pissa.json delete mode 100644 llm/llama/megre_tp_and_pp.py delete mode 100644 llm/llama/pretrain-baichuan2_13b-tp2sd4_stage2.json delete mode 100644 llm/llama/pretrain-flagalpha_llama2_13b-tp2sd4_stage2.json delete mode 100644 llm/llama/pretrain-flagalpha_llama2_7b-tp2sd4_stage2.json delete mode 100644 llm/llama/pretrain-linly_llama2_7b-tp2sd4_stage2.json delete mode 100644 llm/llama/pretrain-llama2_7b-tp2sd4_stage2.json delete mode 100644 llm/llama/pretrain-llama_13b-tp2sd4_stage2.json delete mode 100644 llm/llama/pretrain-llama_7b-tp2sd4_stage2.json delete mode 120000 llm/llama/run_pretrain.py delete mode 100644 llm/llama/sft_pp_argument.json delete mode 100644 llm/llama/tests/test_pipeline_parallel.py delete mode 100644 llm/llama/tests/test_sequence_parallel.py delete mode 100644 llm/llama/tests/unified-ckpt-llama-500m/config.json delete mode 100644 llm/merge_lora_params.py rename llm/{llama/npu => npu/llama}/export_utils.py (91%) rename llm/{llama/npu => npu/llama}/llama_npu_opt_lora.sh (94%) rename llm/{llama/npu => npu/llama}/llama_npu_opt_ppt.sh (100%) rename llm/{llama/npu => npu/llama}/llama_npu_opt_sft.sh (95%) rename llm/{ => predict}/export_model.py (96%) rename llm/{ => predict}/flask_server.py (98%) rename llm/{ => predict}/gradio_ui.py (100%) rename llm/{ => predict}/predictor.py (99%) rename llm/{ => predict}/request_flask_server.py (100%) delete mode 100644 llm/qwen/lora_argument_pissa.json delete mode 100644 llm/qwen/lora_argument_qwen2_7b.json delete mode 100644 llm/qwen/lora_argument_qwen2moe.json delete mode 100644 llm/qwen/pretrain-qwen1.5_7b-tp2sd4_stage2.json delete mode 100644 llm/qwen/pretrain-qwen2_7b-tp2sd4_stage2.json delete mode 100644 llm/qwen/pretrain-qwen_7b-tp2sd4_stage2.json delete mode 100644 llm/qwen/pretrain_argument_tp2pp4.json delete mode 100644 llm/qwen/pt_argument_qwen2_7b.json delete mode 100644 llm/qwen/sft_argument_qwen2_7b.json delete mode 100644 llm/qwen/sft_argument_qwen2moe.json rename 
llm/{finetune_generation.py => run_finetune.py} (95%) delete mode 100644 llm/tests/test_best_pretrain_speed.py create mode 100644 llm/tools/merge_lora_params.py rename llm/{ => tools}/merge_tp_and_pp_params.py (100%) rename llm/{ => utils}/argument.py (89%) rename llm/{ => utils}/data.py (99%) rename llm/{llama => utils}/fused_layers.py (100%) rename llm/{ => utils}/quant.py (100%) rename llm/{llama => utils}/register_reshard.py (100%) rename llm/{ => utils}/utils.py (99%) diff --git a/docs/llm/peft.md b/docs/llm/peft.md index 234756e0f71b..f720138c6d23 100644 --- a/docs/llm/peft.md +++ b/docs/llm/peft.md @@ -277,4 +277,4 @@ key function 该函数会遍历整个权重参数列表,对于每个权重参数weight,统计所有进行梯度更新的参数,最后将信息打印出来。 ``` -更详细的使用可以参考[finetuning 脚本](https://github.com/PaddlePaddle/PaddleNLP/blob/develop/llm/causallm/finetune_generation.py)版本, 以及对应的启动脚本编写方式(写在 [README.md](https://github.com/PaddlePaddle/PaddleNLP/blob/develop/llm/causallm/README.md)文件中)。 +更详细的使用可以参考[finetuning 脚本](https://github.com/PaddlePaddle/PaddleNLP/blob/develop/llm/run_finetune.py)版本, 以及对应的启动脚本编写方式(写在 [README.md](https://github.com/PaddlePaddle/PaddleNLP/blob/develop/llm/causallm/README.md)文件中)。 diff --git a/llm/llama/benchmark.py b/legacy/examples/benchmark/llm/llama_single_gpu/benchmark.py similarity index 100% rename from llm/llama/benchmark.py rename to legacy/examples/benchmark/llm/llama_single_gpu/benchmark.py diff --git a/llm/llama/benchmark_utils.py b/legacy/examples/benchmark/llm/llama_single_gpu/benchmark_utils.py similarity index 100% rename from llm/llama/benchmark_utils.py rename to legacy/examples/benchmark/llm/llama_single_gpu/benchmark_utils.py diff --git a/llm/.gitignore b/llm/.gitignore deleted file mode 100644 index d81fdef50031..000000000000 --- a/llm/.gitignore +++ /dev/null @@ -1,12 +0,0 @@ -# tmp files -infer.json -output.json - -# data -AdvertiseGen.tar.gz - -# checkpoints -checkpoints/ - -# inference_model -inference*/ \ No newline at end of file diff --git a/llm/Alignment/RM/models 
b/llm/Alignment/RM/models deleted file mode 120000 index 39963209bbb5..000000000000 --- a/llm/Alignment/RM/models +++ /dev/null @@ -1 +0,0 @@ -../PPO/models \ No newline at end of file diff --git a/llm/README.md b/llm/README.md index 36311c9980d1..c3009a1ceab2 100644 --- a/llm/README.md +++ b/llm/README.md @@ -19,17 +19,17 @@ ## 🛠️ 支持模型列表 🛠️ -| Model | Pretrain | SFT | LoRA | Prefix Tuning | Quantization | Weight convert | -| --- | --- | --- | --- | --- | --- | --- | -| [LLaMA/LLaMA2](./llama) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | -| [Baichuan/Baichuan2](./llama) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | -| [ChatGLM-6B](./chatglm) | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | -| [ChatGLM2/ChatGLM3](./chatglm2) | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | -| [Qwen](./qwen) | ✅ | ✅ | ✅ | ✅ | 🚧 | ✅ |j -| [Bloom](./bloom) | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | -| [GPT-3](./gpt-3) | ✅ | ✅ | 🚧 | 🚧 | 🚧 | ✅ | -| [OPT](./opt) | 🚧 | ✅ | ✅ | 🚧 | 🚧 | ✅ | -| [GLM](./glm) | ❌ | ✅ | ✅ | 🚧 | 🚧 | ✅ | +| Model | Pretrain | SFT | LoRA | Prefix Tuning | DPO | Quantization | Weight convert | +| --- | --- | --- | --- | --- | --- | --- | --- | +| [LLaMA](./llama) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| [Qwen](./qwen) | ✅ | ✅ | ✅ | ✅ | ✅ | 🚧 | ✅ | +| [Mixtral](./mixtral) | ✅ | ✅ | ✅ | ❌ | 🚧 |🚧 | 🚧 | +| [Baichuan/Baichuan2](./llama) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| [ChatGLM-6B](./chatglm) | ❌ | ✅ | ✅ | ✅ | 🚧 | ✅ | ❌ | +| [ChatGLM2/ChatGLM3](./chatglm2) | ❌ | ✅ | ✅ | ✅ | 🚧 | ✅ | ✅ | +| [Bloom](./bloom) | ❌ | ✅ | ✅ | ✅ |🚧 | ✅ | ✅ | +| [GPT-3](./gpt-3) | ✅ | ✅ | 🚧 | 🚧 |🚧 | 🚧 | ✅ | +| [OPT](./opt) | 🚧 | ✅ | ✅ | 🚧 | 🚧 |🚧 | ✅ | * ✅: Supported * 🚧: In Progress @@ -39,7 +39,7 @@ ## 🚀 快速开始 🚀 ### 1. 预训练 -PaddleNLP将飞桨4D并行策略加入到Trainer API中, 用户只需修改Trainer配置即可使用不同的分布式策略。目前工具链提供[LLaMA/LLaMA2](./llama)、[GPT-3](./gpt-3)、[Qwen](./qwen)、[Baichuan/Baichuan2](./llama) 等模型预训练功能,更多模型支持持续更新中。 +PaddleNLP将飞桨4D并行策略加入到Trainer API中, 用户只需修改Trainer配置即可使用不同的分布式策略。目前工具链提供[LLaMA/LLaMA2](./llama)、[GPT-3](./gpt-3)、[Qwen](./qwen)、[Baichuan/Baichuan2](./llama)、[Mixtral](./mixtral) 等模型预训练功能,更多模型支持持续更新中。
llm @@ -54,7 +54,7 @@ PaddleNLP将飞桨4D并行策略加入到Trainer API中, 用户只需修改Tra 我们在此处提供了更详细的[预训练数据制作](),[分布式策略支持情况]( https://paddlenlp.readthedocs.io/zh/latest/llm/pretraining/index.html#model-capability),[性能测试报告文档](https://paddlenlp.readthedocs.io/zh/latest/llm/pretraining/index.html#model-performance),参见: https://paddlenlp.readthedocs.io/zh/latest/llm/pretraining/index.html. 大模型权重列表参见[此处](https://paddlenlp.readthedocs.io/zh/latest/llm/pretraining/index.html#model-weight) -此项目支持了LLaMA、GPT-3、BaiChuan、Qwen 等大模型的预训练。用户切换配置config文件,即可一键运行。 +此项目支持了LLaMA、GPT-3、BaiChuan、Qwen、Mixtral 等大模型的预训练。用户切换配置config文件,即可一键运行。 数据详细制作流程可参考[此处](https://paddlenlp.readthedocs.io/zh/latest/llm/pretraining/dataset.html) : https://paddlenlp.readthedocs.io/zh/latest/llm/pretraining/dataset.html @@ -79,30 +79,26 @@ mv llama_openwebtext_100k.idx ./data ```shell # 编译自定义算子,可选 -cd ../model_zoo/gpt-3/external_ops/ && python3 setup.py install && cd - +cd ../legacy/model_zoo/gpt-3/external_ops/ && python3 setup.py install && cd - -# llama 模型预训练 -python -u -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" run_pretrain.py ./llama/pretrain-llama2_7b-tp2sd4_stage2.json - -# Qwen 模型预训练 -python -u -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" run_pretrain.py ./qwen/pretrain_argument_stage2.json +# 模型预训练参考 +python -u -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" run_pretrain.py ./config/llama/pretrain_argument.json ``` 注意: 1. 建议使用paddle develop版本训练,需要安装`pip install tool_helpers visualdl==2.5.3`等相关缺失whl包 2. `use_flash_attention` 需要在A100机器开启,建议使用cuda11.8环境。 -3. `use_fused_rms_norm` 需要安装[此目录](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/model_zoo/gpt-3/external_ops)下的自定义OP, `python setup.py install`。如果安装后仍然找不到算子,需要额外设置PYTHONPATH +3. `use_fused_rms_norm` 需要安装自定义算子。如果安装后仍然找不到算子,需要额外设置PYTHONPATH 4. `continue_training` 表示从现有的预训练模型加载训练。7b模型初始loss大概为2.xx, 随机初始化模型loss从11.x左右下降。 -5. 当前脚本为sharding版本,需要4D并行训练(数据、sharding、张量、流水线并行)的用户,请参考 `run_trainer_tp4pp2.sh`脚本。 -6. 
多机训练时,若各机器使用的训练数据文件位置相同(例如挂载共享硬盘情况),请指定`--share_folder true`使全局0号卡制作缓存数据。否则默认各台机器的0号卡独立制作缓存数据, -7. 若数据集文件夹中存在默认缓存文件夹`index-cache/`,则额外指定的`--data_cache`不生效,训练时优先加载默认缓存文件夹中的内容。 +5. 多机训练时,若各机器使用的训练数据文件位置相同(例如挂载共享硬盘情况),请指定`--share_folder true`使全局0号卡制作缓存数据。否则默认各台机器的0号卡独立制作缓存数据, +6. 若数据集文件夹中存在默认缓存文件夹`index-cache/`,则额外指定的`--data_cache`不生效,训练时优先加载默认缓存文件夹中的内容。 ### 2. 精调 PaddleNLP支持多个主流大模型的SFT、LoRA、Prefix Tuning等精调策略,提供统一、高效精调方案: - **统一训练入口**。飞桨大模型套件精调方案可适配业界主流大模型,用户只需修改配置文件,即能在单卡或多卡(支持4D并行分布式策略)进行多种大模型精调。 -- **高效数据和分布式策略**。Zero Padding零填充优化策略有效减少了pad token的占比,提高模型训练效率高达100%。独创PEFT结合低比特和分布式并行策略,大幅降低大模型精调硬件门槛,支持单卡(A100 80G)百亿模型微调、单机(A100 80G * 8)千亿模型微调。 +- **高效数据和分布式策略**。Zero Padding零填充优化策略结合FlashMask策略有效提升模型训练效率。独创PEFT结合低比特和分布式并行策略,大幅降低大模型精调硬件门槛,支持单卡(A100 80G)百亿模型微调、单机(A100 80G * 8)千亿模型微调。 - **支持多轮对话**。支持统一对话模板,支持多轮对话高效训练,详参[多轮对话文档](./docs/chat_template.md)。 @@ -137,26 +133,26 @@ tar -zxvf AdvertiseGen.tar.gz **全参精调:SFT** ```bash -# 四卡llama SFT启动命令参考 -python -u -m paddle.distributed.launch --gpus "0,1,2,3" finetune_generation.py ./llama/sft_argument.json +# SFT启动命令参考 +python -u -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" run_finetune.py ./config/llama/sft_argument.json ``` **LoRA** ```bash -# 单卡llama LoRA启动命令参考 -python finetune_generation.py ./llama/lora_argument.json +# LoRA启动命令参考 +python run_finetune.py ./config/llama/lora_argument.json ``` **Prefix Tuning** ```bash -# 单卡llama Prefix Tuning启动命令参考 -python finetune_generation.py ./llama/pt_argument.json +# Prefix Tuning启动命令参考 +python run_finetune.py ./config/llama/pt_argument.json ``` 更多大模型精调分布式使用文档、训练细节和效果请参见[大模型精调教程](./docs/finetune.md)。 ### 3. 
对齐 -我们支持DPO等偏好对齐策略。 +我们支持DPO等偏好对齐策略。DPO策略采用zero_padding策略,结合FlashMask策略,有效提升模型训练效率。 **数据准备**: @@ -189,10 +185,10 @@ wget https://bj.bcebos.com/paddlenlp/datasets/examples/ultrafeedback_binarized.t tar -zxvf ultrafeedback_binarized.tar.gz ``` -**全参精调:SFT** +**全参DPO** ```bash -# 四卡llama SFT启动命令参考 -python -u -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" dpo_train.py ./llama/dpo_argument.json +# DPO启动命令参考 +python -u -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" ./dpo/run_dpo.py ./config/llama/dpo_argument.json ``` ### 4. 量化 @@ -215,10 +211,10 @@ python -u -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" dpo_train.py ./ ``` # PTQ 量化启动命令参考 -python finetune_generation.py ./llama/ptq_argument.json +python run_finetune.py ./config/llama/ptq_argument.json # GPTQ 量化启动命令参考 -python finetune_generation.py ./llama/ptq_argument.json +python run_finetune.py ./config/llama/ptq_argument.json ``` 更多技术细节和模型量化使用详见[量化文档](./docs/quantization.md)。 @@ -231,13 +227,13 @@ PaddleNLP除了提供常用模型推理外,还提供了高性能推理,内 ```shell # 动态图模型推理命令参考 -python predictor.py --model_name_or_path meta-llama/Llama-2-7b-chat --data_file ./data/dev.json --dtype float16 +python ./predict/predictor.py --model_name_or_path meta-llama/Llama-2-7b-chat --data_file ./data/dev.json --dtype float16 # 静态图模型推理命令参考 # step1 : 静态图导出 -python export_model.py --model_name_or_path meta-llama/Llama-2-7b-chat --output_path ./inference --dtype float16 +python ./predict/export_model.py --model_name_or_path meta-llama/Llama-2-7b-chat --output_path ./inference --dtype float16 # step2: 静态图推理 -python predictor.py --model_name_or_path ./inference --data_file ./data/dev.json --dtype float16 --mode static +python ./predict/predictor.py --model_name_or_path ./inference --data_file ./data/dev.json --dtype float16 --mode static ``` - **InferenceModel 高性能推理**:PaddleNLP 还提供了高性能推理模型加快并行推理的速度,同时支持FP16、Prefix Tuning、WINT8、A8W8多种推理方式。 @@ -253,13 +249,13 @@ python predictor.py --model_name_or_path ./inference --data_file ./data/dev.json 
```shell # 高性能动态图模型推理命令参考 -python predictor.py --model_name_or_path meta-llama/Llama-2-7b-chat --inference_model --dtype float16 +python ./predict/predictor.py --model_name_or_path meta-llama/Llama-2-7b-chat --inference_model --dtype float16 # 高性能静态图模型推理命令参考 # step1 : 静态图导出 -python export_model.py --model_name_or_path meta-llama/Llama-2-7b-chat --inference_model --output_path ./inference --dtype float16 +python ./predict/export_model.py --model_name_or_path meta-llama/Llama-2-7b-chat --inference_model --output_path ./inference --dtype float16 # step2: 静态图推理 -python predictor.py --model_name_or_path ./inference --inference_model --dtype "float16" --mode "static" +python ./predict/predictor.py --model_name_or_path ./inference --inference_model --dtype "float16" --mode "static" ``` 更多常用模型推理和高性能模型使用方法详见[大模型推理文档](./docs/inference.md)。 @@ -277,7 +273,7 @@ python predictor.py --model_name_or_path ./inference --inference_model --dtype " 我们提供了一套基于动态图推理的简单易用UI服务化部署脚本,用户可以快速部署服务化推理。 ``` -python -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" flask_server.py \ +python -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" ./predict/flask_server.py \ --model_name_or_path meta-llama/Llama-2-7b-chat \ --port 8010 \ --flask_port 8011 \ @@ -287,7 +283,7 @@ python -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" flask_server.py \ - `flask_port`: Flask服务端口号,默认8010。 - 其他参数请参见[推理文档](./docs/inference.md)中推理参数配置。 -此外,如果想通过API脚本的方式跑推理,可参考:`./request_flask_server.py` 文件。 +此外,如果想通过API脚本的方式跑推理,可参考:`./predict/request_flask_server.py` 文件。
diff --git a/llm/Alignment/README.md b/llm/alignment/README.md similarity index 87% rename from llm/Alignment/README.md rename to llm/alignment/README.md index fbf978dc208c..7a9a54408d92 100644 --- a/llm/Alignment/README.md +++ b/llm/alignment/README.md @@ -8,7 +8,7 @@ ``` . -├── PPO # PPO 训练相关目录 +├── ppo # PPO 训练相关目录 │ ├── comm_utils.py # 通信相关工具py文件 │ ├── data # 数据集相关目录 │ │ ├── alpaca.py # alpaca(raw)数据集py文件 @@ -28,16 +28,16 @@ │ │ ├── ppo_model_utils.py # PPO loss等模型策略py文件 │ │ ├── score_model.py # score model模型定义py文件 │ │ └── score_model_utils.py # score model基类及工具py文件 -│ ├── ppo_main.py # RLHF训练脚本 +│ ├── run_ppo.py # RLHF训练脚本 │ ├── ppo_trainer.py # RLHF训练执行器py脚本 │ ├── tests # 测试相关目录 │ │ ├── run_model.py │ │ └── test_export.py │ └── trainer_utils.py # Trainer补丁及工具py脚本 ├── README.md -└── RM # Reward Model 训练相关目录 - ├── models -> ../PPO/models - ├── reward_main.py # reward model训练脚本 +└── rm # Reward Model 训练相关目录 + ├── models -> ../ppo/models + ├── run_reward.py # reward model训练脚本 └── reward_trainer.py # reward训练执行器py脚本 ``` @@ -179,14 +179,14 @@ PPO 完整的训练过程包括以下 3 个阶段,如下图所示(来自[Dee 2. 
Reward Model Fine-Tuning -使用 `reward_main.py` 脚本根据 `rm.json` 训练奖励模型 +使用 `run_reward.py` 脚本根据 `rm_argument.json` 训练奖励模型 ``` -cd RM -python -u -m paddle.distributed.launch reward_main.py ../../config/llama/rm.json +cd rm +python -u -m paddle.distributed.launch run_reward.py ../../config/llama/rm_argument.json ``` -`rm.json` 中的绝大部分参数释义同[LLM 精调](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/llm#2-%E7%B2%BE%E8%B0%83),不再赘述;稍有区别的是 `train_datasets`/`eval_datasets` 分别使用数据集定义注册时的`NAME`属性给出训练和验证集。另外对于奖励模型训练有以下特殊参数配置及释义(使用 PKU-Alignment/PKU-SafeRLHF 中的默认值): +`rm_argument.json` 中的绝大部分参数释义同[LLM 精调](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/llm#2-%E7%B2%BE%E8%B0%83),不再赘述;稍有区别的是 `train_datasets`/`eval_datasets` 分别使用数据集定义注册时的`NAME`属性给出训练和验证集。另外对于奖励模型训练有以下特殊参数配置及释义(使用 PKU-Alignment/PKU-SafeRLHF 中的默认值): - `normalize_score_during_training`:是否在训练过程中对奖励进行 normalize,默认为 `False`。 - `normalizer_type`:使用 normalizer 时计算 mean、var 的方式,可选`"RunningMeanStd", "ExponentialMovingAverage"`。 @@ -196,15 +196,15 @@ python -u -m paddle.distributed.launch reward_main.py ../../config/llama/rm.json 3. 
RLHF: -RLHF 阶段需要 actor model、reference model、critic model、reward model 四个模型;actor-model/reference-model 使用 SFT 模型进行 initialize/frozen;critic-model/reward-model 使用 reward 模型进行 initialize/frozen (另外注意若 SFT 使用 LoRA 请先将 LoRA 权重合并)。这里使用 PKU-Alignment/PKU-SafeRLHF 提供的 SFT 模型([PKU-Alignment/alpaca-7b-reproduced](https://huggingface.co/PKU-Alignment/alpaca-7b-reproduced))和 reward 模型([PKU-Alignment/beaver-7b-v1.0-reward](https://huggingface.co/PKU-Alignment/beaver-7b-v1.0-reward),注意该模型只关注 helpful 未考量 harmless)作为示例,使用 `ppo_main.py` 脚本根据 `ppo.json` 进行 RLHF 训练。 +RLHF 阶段需要 actor model、reference model、critic model、reward model 四个模型;actor-model/reference-model 使用 SFT 模型进行 initialize/frozen;critic-model/reward-model 使用 reward 模型进行 initialize/frozen (另外注意若 SFT 使用 LoRA 请先将 LoRA 权重合并)。这里使用 PKU-Alignment/PKU-SafeRLHF 提供的 SFT 模型([PKU-Alignment/alpaca-7b-reproduced](https://huggingface.co/PKU-Alignment/alpaca-7b-reproduced))和 reward 模型([PKU-Alignment/beaver-7b-v1.0-reward](https://huggingface.co/PKU-Alignment/beaver-7b-v1.0-reward),注意该模型只关注 helpful 未考量 harmless)作为示例,使用 `run_ppo.py` 脚本根据 `ppo_argument.json` 进行 RLHF 训练。 ``` # 类型提升 warning 暂时通过 loglevel 屏蔽,待后续修复 -cd PPO -PYTHONPATH=../../ GLOG_minloglevel=2 python -u -m paddle.distributed.launch ppo_main.py ../../config/llama/ppo.json +cd ppo +PYTHONPATH=../../ GLOG_minloglevel=2 python -u -m paddle.distributed.launch run_ppo.py ../../config/llama/ppo_argument.json ``` -`ppo.json` 中的绝大部分参数释义同[LLM 精调](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/llm#2-%E7%B2%BE%E8%B0%83),不再赘述,重点给出以下参数配置及释义(使用 PKU-Alignment/PKU-SafeRLHF 中的默认值): +`ppo_argument.json` 中的绝大部分参数释义同[LLM 精调](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/llm#2-%E7%B2%BE%E8%B0%83),不再赘述,重点给出以下参数配置及释义(使用 PKU-Alignment/PKU-SafeRLHF 中的默认值): - `train_datasets`:使用数据集定义注册时的`NAME`属性给出训练集。 - `eval_datasets`:使用数据集定义注册时的`NAME`属性给出验证集。 @@ -230,7 +230,7 @@ PYTHONPATH=../../ GLOG_minloglevel=2 python -u -m paddle.distributed.launch ppo_ 此外为了支持更高性、更大规模的 RLHF 训练提供了以下特殊参数配置,可以按需使用: 
- `use_fusemt`:安装 paddlenlp_ops 后将在 rollout 生成时开启生成加速(开启流水线并行时不支持生成加速),通过此设置可以禁用生成加速。 - `eval_mode`:支持为空或者设置为 "single"、"tensor_parallel";通常可以在使用流水线并行训练时设置为"tensor_parallel",以此在 rollout 生成阶段使用非流水线并行模型并进行生成加速。 -- `offload_level`:支持设置为"freeze_model"、"optimizer"、"train_model"或者同时使用(空格分隔),分别指示 reward+reference 两个冻结模型、actor+critic 两个训练模型的优化器状态和模型参数的 offload/reload,用于在不同阶段 model/optimizer 使用结束后及时 offload 并在下次使用时 reload 相应参数权重以节省显存。 +- `offload_level`:支持设置为"freeze_model"、"optimizer"、"train_model"或者同时使用(空格分隔),分别指示 reward+reference 两个冻结模型、actor+critic 两个训练模型的优化器状态和模型参数的 offload/reload,用于在不同阶段 model/optimizer 使用结束后及时 offload 并在下次使用时 reload 相应参数权重以节省显存。 另外注意,在使用流水线并行时(pipeline_parallel_degree大于1)建议将 `dataloader_drop_last` 设置为 true, 以此避免不同batch size带来的问题。 diff --git a/llm/dpo_argument.py b/llm/alignment/dpo/dpo_argument.py similarity index 100% rename from llm/dpo_argument.py rename to llm/alignment/dpo/dpo_argument.py diff --git a/llm/dpo_train.py b/llm/alignment/dpo/run_dpo.py similarity index 100% rename from llm/dpo_train.py rename to llm/alignment/dpo/run_dpo.py diff --git a/llm/Alignment/PPO/comm_utils.py b/llm/alignment/ppo/comm_utils.py similarity index 100% rename from llm/Alignment/PPO/comm_utils.py rename to llm/alignment/ppo/comm_utils.py diff --git a/llm/Alignment/PPO/data/__init__.py b/llm/alignment/ppo/data/__init__.py similarity index 100% rename from llm/Alignment/PPO/data/__init__.py rename to llm/alignment/ppo/data/__init__.py diff --git a/llm/Alignment/PPO/data/alpaca.py b/llm/alignment/ppo/data/alpaca.py similarity index 100% rename from llm/Alignment/PPO/data/alpaca.py rename to llm/alignment/ppo/data/alpaca.py diff --git a/llm/Alignment/PPO/data/base.py b/llm/alignment/ppo/data/base.py similarity index 100% rename from llm/Alignment/PPO/data/base.py rename to llm/alignment/ppo/data/base.py diff --git a/llm/Alignment/PPO/data/preference.py b/llm/alignment/ppo/data/preference.py similarity index 100% rename from llm/Alignment/PPO/data/preference.py rename 
to llm/alignment/ppo/data/preference.py diff --git a/llm/Alignment/PPO/data/prompt_only.py b/llm/alignment/ppo/data/prompt_only.py similarity index 100% rename from llm/Alignment/PPO/data/prompt_only.py rename to llm/alignment/ppo/data/prompt_only.py diff --git a/llm/Alignment/PPO/data/safe_rlhf.py b/llm/alignment/ppo/data/safe_rlhf.py similarity index 100% rename from llm/Alignment/PPO/data/safe_rlhf.py rename to llm/alignment/ppo/data/safe_rlhf.py diff --git a/llm/Alignment/PPO/data/supervised.py b/llm/alignment/ppo/data/supervised.py similarity index 100% rename from llm/Alignment/PPO/data/supervised.py rename to llm/alignment/ppo/data/supervised.py diff --git a/llm/Alignment/PPO/infer_utils.py b/llm/alignment/ppo/infer_utils.py similarity index 100% rename from llm/Alignment/PPO/infer_utils.py rename to llm/alignment/ppo/infer_utils.py diff --git a/llm/Alignment/PPO/models/__init__.py b/llm/alignment/ppo/models/__init__.py similarity index 100% rename from llm/Alignment/PPO/models/__init__.py rename to llm/alignment/ppo/models/__init__.py diff --git a/llm/Alignment/PPO/models/infer_model_utils.py b/llm/alignment/ppo/models/infer_model_utils.py similarity index 100% rename from llm/Alignment/PPO/models/infer_model_utils.py rename to llm/alignment/ppo/models/infer_model_utils.py diff --git a/llm/Alignment/PPO/models/model_pp.py b/llm/alignment/ppo/models/model_pp.py similarity index 100% rename from llm/Alignment/PPO/models/model_pp.py rename to llm/alignment/ppo/models/model_pp.py diff --git a/llm/Alignment/PPO/models/pp_model_utils.py b/llm/alignment/ppo/models/pp_model_utils.py similarity index 100% rename from llm/Alignment/PPO/models/pp_model_utils.py rename to llm/alignment/ppo/models/pp_model_utils.py diff --git a/llm/Alignment/PPO/models/ppo_model.py b/llm/alignment/ppo/models/ppo_model.py similarity index 100% rename from llm/Alignment/PPO/models/ppo_model.py rename to llm/alignment/ppo/models/ppo_model.py diff --git 
a/llm/Alignment/PPO/models/ppo_model_utils.py b/llm/alignment/ppo/models/ppo_model_utils.py similarity index 100% rename from llm/Alignment/PPO/models/ppo_model_utils.py rename to llm/alignment/ppo/models/ppo_model_utils.py diff --git a/llm/Alignment/PPO/models/score_model.py b/llm/alignment/ppo/models/score_model.py similarity index 100% rename from llm/Alignment/PPO/models/score_model.py rename to llm/alignment/ppo/models/score_model.py diff --git a/llm/Alignment/PPO/models/score_model_utils.py b/llm/alignment/ppo/models/score_model_utils.py similarity index 100% rename from llm/Alignment/PPO/models/score_model_utils.py rename to llm/alignment/ppo/models/score_model_utils.py diff --git a/llm/Alignment/PPO/ppo_trainer.py b/llm/alignment/ppo/ppo_trainer.py similarity index 100% rename from llm/Alignment/PPO/ppo_trainer.py rename to llm/alignment/ppo/ppo_trainer.py diff --git a/llm/Alignment/PPO/ppo_main.py b/llm/alignment/ppo/run_ppo.py similarity index 100% rename from llm/Alignment/PPO/ppo_main.py rename to llm/alignment/ppo/run_ppo.py diff --git a/llm/Alignment/PPO/tests/run_model.py b/llm/alignment/ppo/tests/run_model.py similarity index 100% rename from llm/Alignment/PPO/tests/run_model.py rename to llm/alignment/ppo/tests/run_model.py diff --git a/llm/Alignment/PPO/tests/test_export.py b/llm/alignment/ppo/tests/test_export.py similarity index 100% rename from llm/Alignment/PPO/tests/test_export.py rename to llm/alignment/ppo/tests/test_export.py diff --git a/llm/Alignment/PPO/trainer_utils.py b/llm/alignment/ppo/trainer_utils.py similarity index 100% rename from llm/Alignment/PPO/trainer_utils.py rename to llm/alignment/ppo/trainer_utils.py diff --git a/llm/alignment/rm/models b/llm/alignment/rm/models new file mode 120000 index 000000000000..46643733d940 --- /dev/null +++ b/llm/alignment/rm/models @@ -0,0 +1 @@ +../ppo/models \ No newline at end of file diff --git a/llm/Alignment/RM/reward_trainer.py b/llm/alignment/rm/reward_trainer.py similarity index 100% 
rename from llm/Alignment/RM/reward_trainer.py rename to llm/alignment/rm/reward_trainer.py diff --git a/llm/Alignment/RM/reward_main.py b/llm/alignment/rm/run_reward.py similarity index 100% rename from llm/Alignment/RM/reward_main.py rename to llm/alignment/rm/run_reward.py diff --git a/llm/gpt-3/auto_parallel/run_pretrain_auto.py b/llm/auto_parallel/gpt-3/run_pretrain_auto.py similarity index 70% rename from llm/gpt-3/auto_parallel/run_pretrain_auto.py rename to llm/auto_parallel/gpt-3/run_pretrain_auto.py index 0ee470d37255..5afb828d0e2f 100644 --- a/llm/gpt-3/auto_parallel/run_pretrain_auto.py +++ b/llm/auto_parallel/gpt-3/run_pretrain_auto.py @@ -18,7 +18,6 @@ import random import sys import types -from collections import OrderedDict from dataclasses import dataclass, field from typing import List, Optional @@ -33,10 +32,10 @@ from paddlenlp.transformers import ( AutoTokenizer, CosineAnnealingWithWarmupDecay, - LinearAnnealingWithWarmupDecay, GPTConfig, GPTForCausalLMAuto, GPTPretrainingCriterionAuto, + LinearAnnealingWithWarmupDecay, ) from paddlenlp.utils.log import logger @@ -50,11 +49,10 @@ print_rank_0, ) -def add_start_docstrings(*docstr): +def add_start_docstrings(*docstr): def docstring_decorator(fn): - fn.__doc__ = "".join(docstr) + (fn.__doc__ - if fn.__doc__ is not None else "") + fn.__doc__ = "".join(docstr) + (fn.__doc__ if fn.__doc__ is not None else "") return fn return docstring_decorator @@ -70,22 +68,19 @@ class PreTrainingArguments(TrainingArguments): decay_steps: float = field( default=None, metadata={ - "help": - "The steps use to control the learing rate. If the step > decay_steps, will use the min_learning_rate." + "help": "The steps use to control the learing rate. If the step > decay_steps, will use the min_learning_rate." }, ) enable_linear_fused_grad_add: bool = field( default=False, metadata={ - "help": - "Enable fused linear grad add strategy, which will reduce elementwise add for grad accumulation in the backward of nn.Linear ." 
+ "help": "Enable fused linear grad add strategy, which will reduce elementwise add for grad accumulation in the backward of nn.Linear ." }, ) fused_linear_param_grad_add: bool = field( default=False, metadata={ - "help": - "Enable fused_linear_param_grad pass, which should replace add_n_op with add_op for gradients accumulation." + "help": "Enable fused_linear_param_grad pass, which should replace add_n_op with add_op for gradients accumulation." }, ) job_schedule_profiler_start: int = field( @@ -97,27 +92,19 @@ class PreTrainingArguments(TrainingArguments): metadata={"help": "The step to end job_schedule_profiler."}, ) pipeline_schedule_mode: str = field( - default="1F1B", - metadata={ - "help": - "The pipeline schedule mode, support FThenB, 1F1B, VPP and Eager-1F1B." - }) - sr: Optional[int] = field( - default=0, metadata={"help": "The count of chunks without recompute."}) + default="1F1B", metadata={"help": "The pipeline schedule mode, support FThenB, 1F1B, VPP and Eager-1F1B."} + ) + sr: Optional[int] = field(default=0, metadata={"help": "The count of chunks without recompute."}) refined_ops_patterns: Optional[List[str]] = field( - default=None, metadata={"help": "The pattern of refined recompute."}) + default=None, metadata={"help": "The pattern of refined recompute."} + ) virtual_pipeline_seg_method: str = field( - default="LlamaDecoderLayerAuto", - metadata={ - "help": "The seg method of spliting pp layer for virtual pipeline." - }) + default="LlamaDecoderLayerAuto", metadata={"help": "The seg method of spliting pp layer for virtual pipeline."} + ) # NOTE(gongenlei): new add autotuner_benchmark autotuner_benchmark: bool = field( default=False, - metadata={ - "help": - "Weather to run benchmark by autotuner. True for from_scratch and pad_max_length." - }, + metadata={"help": "Weather to run benchmark by autotuner. 
True for from_scratch and pad_max_length."}, ) def __post_init__(self): @@ -140,8 +127,7 @@ def __post_init__(self): if self.fused_linear_param_grad_add: fused_passes = self.strategy.fused_passes fused_passes.enable = True - fused_passes.fused_passes_list.append( - "fused_linear_param_grad_add_pass") + fused_passes.fused_passes_list.append("fused_linear_param_grad_add_pass") logger.info(self.strategy) @@ -155,39 +141,28 @@ class DataArguments: """ input_dir: str = field( - default=None, - metadata={ - "help": - "The name of the dataset to use (via the datasets library)." - }) - split: str = field(default="949,50,1", - metadata={"help": "Train/valid/test data split."}) + default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} + ) + split: str = field(default="949,50,1", metadata={"help": "Train/valid/test data split."}) max_seq_length: int = field( default=1024, metadata={ - "help": - "The maximum total input sequence length after tokenization. Sequences longer " + "help": "The maximum total input sequence length after tokenization. Sequences longer " "than this will be truncated, sequences shorter will be padded." }, ) share_folder: bool = field( default=False, - metadata={ - "help": - "Use share folder for data dir and output dir on multi machine." 
- }, + metadata={"help": "Use share folder for data dir and output dir on multi machine."}, ) - data_impl: str = field( - default="mmap", - metadata={"help": "The format of the preprocessed data."}) + data_impl: str = field(default="mmap", metadata={"help": "The format of the preprocessed data."}) skip_warmup: bool = field( default=True, metadata={"help": "Whether to skip the warmup process of mmap files."}, ) - data_cache: str = field( - default=None, metadata={"help": "The path of the cached dataset."}) + data_cache: str = field(default=None, metadata={"help": "The path of the cached dataset."}) @dataclass @@ -197,52 +172,35 @@ class ModelArguments: """ model_type: Optional[str] = field( - default="llama", - metadata={"help": "Only support for llama pre-training for now."}) + default="llama", metadata={"help": "Only support for llama pre-training for now."} + ) model_name_or_path: str = field( default="__internal_testing__/tiny-random-llama", metadata={ - "help": - "Path to pretrained model or model identifier from https://paddlenlp.readthedocs.io/zh/latest/model_zoo/transformers.html" + "help": "Path to pretrained model or model identifier from https://paddlenlp.readthedocs.io/zh/latest/model_zoo/transformers.html" }, ) tokenizer_name_or_path: Optional[str] = field( - default=None, - metadata={ - "help": - "Pretrained tokenizer name or path if not the same as model_name" - }) + default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} + ) config_name: Optional[str] = field( - default=None, - metadata={ - "help": - "Pretrained config name or path if not the same as model_name" - }) + default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} + ) vocab_size: Optional[int] = field( default=None, metadata={ - "help": - ".Vocabulary size of the Llama model. Defines the number of different tokens that can be represented by the `inputs_ids`" + "help": ".Vocabulary size of the Llama model. 
Defines the number of different tokens that can be represented by the `inputs_ids`" }, ) - hidden_size: Optional[int] = field( - default=None, - metadata={"help": "Dimension of the hidden representations."}) - intermediate_size: Optional[int] = field( - default=None, - metadata={"help": "Dimension of the MLP representations."}) + hidden_size: Optional[int] = field(default=None, metadata={"help": "Dimension of the hidden representations."}) + intermediate_size: Optional[int] = field(default=None, metadata={"help": "Dimension of the MLP representations."}) num_hidden_layers: Optional[int] = field( - default=None, - metadata={ - "help": "Number of hidden layers in the Transformer encoder." - }) + default=None, metadata={"help": "Number of hidden layers in the Transformer encoder."} + ) num_attention_heads: Optional[int] = field( default=None, - metadata={ - "help": - "Number of attention heads for each attention layer in the Transformer encoder." - }, + metadata={"help": "Number of attention heads for each attention layer in the Transformer encoder."}, ) use_flash_attention: bool = field( default=False, @@ -258,9 +216,7 @@ class ModelArguments: ) fuse_attention_ffn: bool = field( default=False, - metadata={ - "help": "whether to fuse first up and gate proj in mlp block" - }, + metadata={"help": "whether to fuse first up and gate proj in mlp block"}, ) recompute_granularity: str = field( default="full", @@ -273,15 +229,12 @@ class ModelArguments: continue_training: bool = field( default=False, metadata={ - "help": - "Pre-training from existing paddlenlp model weights. Default False and model will train from scratch. If set True, the model_name_or_path argument must exist in the paddlenlp models." + "help": "Pre-training from existing paddlenlp model weights. Default False and model will train from scratch. If set True, the model_name_or_path argument must exist in the paddlenlp models." 
}, ) - hidden_dropout_prob: float = field( - default=0.1, metadata={"help": "The hidden dropout prob."}) - attention_probs_dropout_prob: float = field( - default=0.1, metadata={"help": "The attention hidden dropout prob."}) + hidden_dropout_prob: float = field(default=0.1, metadata={"help": "The hidden dropout prob."}) + attention_probs_dropout_prob: float = field(default=0.1, metadata={"help": "The attention hidden dropout prob."}) sequence_parallel: bool = field( default=False, @@ -297,16 +250,12 @@ class ModelArguments: ) no_recompute_layers: Optional[List[int]] = field( default=None, - metadata={ - "help": - "Specify the full transformer layers that should not be recomputed." - }, + metadata={"help": "Specify the full transformer layers that should not be recomputed."}, ) pp_recompute_interval: int = field( default=1, metadata={ - "help": - "The interval for the number of layers at which recomputation occurs. A value of 0 indicates no recomputation. Default is 0." + "help": "The interval for the number of layers at which recomputation occurs. A value of 0 indicates no recomputation. Default is 0." 
}, ) recompute_use_reentrant: bool = field( @@ -323,30 +272,27 @@ def create_pretrained_dataset( need_data=True, ): - check_data_split(data_args.split, training_args.do_train, - training_args.do_eval, training_args.do_predict) + check_data_split(data_args.split, training_args.do_train, training_args.do_eval, training_args.do_predict) train_val_test_num_samples = [ - training_args.per_device_train_batch_size * - training_args.data_parallel_degree * training_args.max_steps * - training_args.gradient_accumulation_steps, - training_args.per_device_eval_batch_size * - training_args.data_parallel_degree * training_args.eval_iters * - (training_args.max_steps // training_args.eval_steps + 1), - training_args.per_device_eval_batch_size * - training_args.data_parallel_degree * training_args.test_iters, + training_args.per_device_train_batch_size + * training_args.data_parallel_degree + * training_args.max_steps + * training_args.gradient_accumulation_steps, + training_args.per_device_eval_batch_size + * training_args.data_parallel_degree + * training_args.eval_iters + * (training_args.max_steps // training_args.eval_steps + 1), + training_args.per_device_eval_batch_size * training_args.data_parallel_degree * training_args.test_iters, ] print_rank_0(" > datasets target sizes (minimum size):") if training_args.do_train: - print_rank_0(" train: {}".format( - train_val_test_num_samples[0])) + print_rank_0(" train: {}".format(train_val_test_num_samples[0])) if training_args.do_eval: - print_rank_0(" validation: {}".format( - train_val_test_num_samples[1])) + print_rank_0(" validation: {}".format(train_val_test_num_samples[1])) if training_args.do_predict: - print_rank_0(" test: {}".format( - train_val_test_num_samples[2])) + print_rank_0(" test: {}".format(train_val_test_num_samples[2])) # Build the datasets. 
train_dataset, valid_dataset, test_dataset = build_train_valid_test_datasets( @@ -399,9 +345,9 @@ def get_train_data_file(args): return args.input_dir.split() else: files = [ - os.path.join(args.input_dir, f) for f in os.listdir(args.input_dir) - if (os.path.isfile(os.path.join(args.input_dir, f)) and ( - "_idx.npz" in str(f) or ".idx" in str(f))) + os.path.join(args.input_dir, f) + for f in os.listdir(args.input_dir) + if (os.path.isfile(os.path.join(args.input_dir, f)) and ("_idx.npz" in str(f) or ".idx" in str(f))) ] files = [x.replace("_idx.npz", "") for x in files] files = [x.replace(".idx", "") for x in files] # add @@ -419,7 +365,6 @@ def get_train_data_file(args): class PretrainingTrainer(AutoTrainer): - def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -441,8 +386,7 @@ def print_config(args, key=""): logger.info("{:^40}".format("{} Configuration Arguments".format(key))) logger.info("{:30}: {}".format("paddle commit id", paddle.version.commit)) - logger.info("{:30}: {}".format("paddlenlp commit id", - paddlenlp.version.commit)) + logger.info("{:30}: {}".format("paddlenlp commit id", paddlenlp.version.commit)) for a in dir(args): if a[:2] != "__": # don't print double underscore methods @@ -467,12 +411,10 @@ def init_seed(seed: int = 1234, args=None): dp_degree=args.data_parallel_degree, pp_degree=args.pipeline_parallel_degree, mp_degree=args.tensor_parallel_degree, - sharding_degree= - 1, # auto_parallel's sharding is not orthogonal with dp, mp and pp + sharding_degree=1, # auto_parallel's sharding is not orthogonal with dp, mp and pp ) - global_seed, local_seed, random_seed = _get_distributed_seeds( - args.seed, topo) + global_seed, local_seed, random_seed = _get_distributed_seeds(args.seed, topo) paddle.seed(local_seed) random.seed(random_seed) @@ -480,8 +422,8 @@ def init_seed(seed: int = 1234, args=None): logger.info( "The global seed is set to {}, local seed is set to {} and " - "random seed is set to {}.".format(global_seed, 
local_seed, - random_seed)) + "random seed is set to {}.".format(global_seed, local_seed, random_seed) + ) else: random.seed(args.seed) np.random.seed(args.seed) @@ -489,14 +431,11 @@ def init_seed(seed: int = 1234, args=None): def main(): - parser = PdArgumentParser( - (ModelArguments, DataArguments, PreTrainingArguments)) + parser = PdArgumentParser((ModelArguments, DataArguments, PreTrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): - model_args, data_args, training_args = parser.parse_json_file( - json_file=os.path.abspath(sys.argv[1])) + model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) else: - model_args, data_args, training_args = parser.parse_args_into_dataclasses( - ) + model_args, data_args, training_args = parser.parse_args_into_dataclasses() if training_args.enable_linear_fused_grad_add: from fused_layers import mock_layers @@ -524,15 +463,12 @@ def main(): # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, world_size: {training_args.world_size}, " - + - f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16 or training_args.bf16}" + + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16 or training_args.bf16}" ) # Detecting last checkpoint. last_checkpoint = None - if os.path.isdir( - training_args.output_dir - ) and training_args.do_train and not training_args.overwrite_output_dir: + if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: last_checkpoint = get_last_checkpoint(training_args.output_dir) if last_checkpoint is not None and training_args.resume_from_checkpoint is None: logger.info( @@ -540,41 +476,35 @@ def main(): "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." 
) - config_class, model_class, criterion_class = MODEL_CLASSES[ - model_args.model_type] + config_class, model_class, criterion_class = MODEL_CLASSES[model_args.model_type] - tokenizer = AutoTokenizer.from_pretrained( - model_args.tokenizer_name_or_path) + tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name_or_path) config = config_class.from_pretrained(model_args.model_name_or_path) config.seq_length = data_args.max_seq_length # There are some technique extend RotaryEmbedding context. so don't change max_position_embeddings if not model_args.continue_training: - config.max_position_embeddings = max(config.max_position_embeddings, - data_args.max_seq_length) + config.max_position_embeddings = max(config.max_position_embeddings, data_args.max_seq_length) if not model_args.continue_training: - config.vocab_size = max(config.vocab_size, - ((tokenizer.vocab_size - 1) // 128 + 1) * 128) - logger.info( - f"Reset vocab size to {config.vocab_size} for batter amp peformance." - ) + config.vocab_size = max(config.vocab_size, ((tokenizer.vocab_size - 1) // 128 + 1) * 128) + logger.info(f"Reset vocab size to {config.vocab_size} for batter amp peformance.") if model_args.no_recompute_layers is not None: model_args.no_recompute_layers.sort() config.vocab_size = model_args.vocab_size if model_args.vocab_size is not None else config.vocab_size config.hidden_size = model_args.hidden_size if model_args.hidden_size is not None else config.hidden_size - config.intermediate_size = (model_args.intermediate_size - if model_args.intermediate_size is not None - else config.intermediate_size) - config.num_hidden_layers = (model_args.num_hidden_layers - if model_args.num_hidden_layers is not None - else config.num_hidden_layers) - config.num_attention_heads = (model_args.num_attention_heads - if model_args.num_attention_heads is not None - else config.num_attention_heads) + config.intermediate_size = ( + model_args.intermediate_size if model_args.intermediate_size is not None 
else config.intermediate_size + ) + config.num_hidden_layers = ( + model_args.num_hidden_layers if model_args.num_hidden_layers is not None else config.num_hidden_layers + ) + config.num_attention_heads = ( + model_args.num_attention_heads if model_args.num_attention_heads is not None else config.num_attention_heads + ) config.use_flash_attention = model_args.use_flash_attention config.use_fused_rms_norm = model_args.use_fused_rms_norm @@ -615,10 +545,7 @@ def main(): if training_args.recompute: def fn(layer): - if hasattr( - layer, - "enable_recompute") and (layer.enable_recompute is False - or layer.enable_recompute == 0): + if hasattr(layer, "enable_recompute") and (layer.enable_recompute is False or layer.enable_recompute == 0): layer.enable_recompute = True model.apply(fn) diff --git a/llm/gpt-3/auto_parallel/run_pretrain_auto_dp2mp2pp2.sh b/llm/auto_parallel/gpt-3/run_pretrain_auto_dp2mp2pp2.sh similarity index 72% rename from llm/gpt-3/auto_parallel/run_pretrain_auto_dp2mp2pp2.sh rename to llm/auto_parallel/gpt-3/run_pretrain_auto_dp2mp2pp2.sh index 9219cd27e3a3..71578bb81532 100755 --- a/llm/gpt-3/auto_parallel/run_pretrain_auto_dp2mp2pp2.sh +++ b/llm/auto_parallel/gpt-3/run_pretrain_auto_dp2mp2pp2.sh @@ -1,3 +1,17 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ export PYTHONPATH="../../../":$PYTHONPATH export FLAGS_cudnn_deterministic=1 export FLAGS_embedding_deterministic=1 diff --git a/llm/llama/auto_parallel/README.md b/llm/auto_parallel/llama/README.md similarity index 100% rename from llm/llama/auto_parallel/README.md rename to llm/auto_parallel/llama/README.md diff --git a/llm/llama/auto_parallel/run_llama3.sh b/llm/auto_parallel/llama/run_llama3.sh similarity index 100% rename from llm/llama/auto_parallel/run_llama3.sh rename to llm/auto_parallel/llama/run_llama3.sh diff --git a/llm/llama/auto_parallel/run_pretrain_auto.py b/llm/auto_parallel/llama/run_pretrain_auto.py similarity index 100% rename from llm/llama/auto_parallel/run_pretrain_auto.py rename to llm/auto_parallel/llama/run_pretrain_auto.py diff --git a/llm/llama/auto_parallel/run_pretrain_auto.sh b/llm/auto_parallel/llama/run_pretrain_auto.sh similarity index 100% rename from llm/llama/auto_parallel/run_pretrain_auto.sh rename to llm/auto_parallel/llama/run_pretrain_auto.sh diff --git a/llm/llama/auto_parallel/run_pretrain_auto_static.py b/llm/auto_parallel/llama/run_pretrain_auto_static.py similarity index 100% rename from llm/llama/auto_parallel/run_pretrain_auto_static.py rename to llm/auto_parallel/llama/run_pretrain_auto_static.py diff --git a/llm/llama/auto_parallel/run_pretrain_auto_static.sh b/llm/auto_parallel/llama/run_pretrain_auto_static.sh similarity index 100% rename from llm/llama/auto_parallel/run_pretrain_auto_static.sh rename to llm/auto_parallel/llama/run_pretrain_auto_static.sh diff --git a/llm/llama/auto_parallel/run_pretrain_auto_static_sp.sh b/llm/auto_parallel/llama/run_pretrain_auto_static_sp.sh similarity index 100% rename from llm/llama/auto_parallel/run_pretrain_auto_static_sp.sh rename to llm/auto_parallel/llama/run_pretrain_auto_static_sp.sh diff --git a/llm/llama/auto_parallel/run_pretrain_hand.py b/llm/auto_parallel/llama/run_pretrain_hand.py similarity index 100% rename from llm/llama/auto_parallel/run_pretrain_hand.py 
rename to llm/auto_parallel/llama/run_pretrain_hand.py diff --git a/llm/llama/auto_parallel/run_pretrain_hand.sh b/llm/auto_parallel/llama/run_pretrain_hand.sh similarity index 100% rename from llm/llama/auto_parallel/run_pretrain_hand.sh rename to llm/auto_parallel/llama/run_pretrain_hand.sh diff --git a/llm/qwen/auto_parallel/pretrain_argument_auto_dp2tp2pp2.json b/llm/auto_parallel/qwen/pretrain_argument_auto_dp2tp2pp2.json similarity index 100% rename from llm/qwen/auto_parallel/pretrain_argument_auto_dp2tp2pp2.json rename to llm/auto_parallel/qwen/pretrain_argument_auto_dp2tp2pp2.json diff --git a/llm/qwen/auto_parallel/run_pretrain_3D_auto.py b/llm/auto_parallel/qwen/run_pretrain_3D_auto.py similarity index 100% rename from llm/qwen/auto_parallel/run_pretrain_3D_auto.py rename to llm/auto_parallel/qwen/run_pretrain_3D_auto.py diff --git a/llm/qwen/auto_parallel/run_pretrain_3D_auto.sh b/llm/auto_parallel/qwen/run_pretrain_3D_auto.sh similarity index 100% rename from llm/qwen/auto_parallel/run_pretrain_3D_auto.sh rename to llm/auto_parallel/qwen/run_pretrain_3D_auto.sh diff --git a/llm/baichuan/pretrain-baichuan2_13b-sd8_stage2.json b/llm/baichuan/pretrain-baichuan2_13b-sd8_stage2.json deleted file mode 100644 index 51d55556a9c1..000000000000 --- a/llm/baichuan/pretrain-baichuan2_13b-sd8_stage2.json +++ /dev/null @@ -1,40 +0,0 @@ -{ - "model_name_or_path": "baichuan-inc/Baichuan2-13B-Base", - "tokenizer_name_or_path": "baichuan-inc/Baichuan2-13B-Base", - "input_dir": "./data", - "output_dir": "./checkpoints/baichuan_pretrain_ckpts", - "per_device_train_batch_size": 1, - "gradient_accumulation_steps": 8, - "per_device_eval_batch_size": 2, - "tensor_parallel_degree": 1, - "pipeline_parallel_degree": 1, - "sharding": "stage2", - "virtual_pp_degree": 1, - "sequence_parallel": 0, - "use_flash_attention": true, - "use_fused_rms_norm": true, - "use_fused_rope": true, - "max_seq_length": 4096, - "learning_rate": 3e-05, - "min_learning_rate": 3e-06, - "warmup_steps": 
1000, - "logging_steps": 1, - "max_steps": 10000, - "save_steps": 5000, - "eval_steps": 1000, - "weight_decay": 0.01, - "bf16": true, - "fp16_opt_level": "O2", - "warmup_ratio": 0.01, - "max_grad_norm": 1.0, - "dataloader_num_workers": 1, - "continue_training": 1, - "do_train": true, - "do_eval": true, - "do_predict": true, - "disable_tqdm": true, - "recompute": true, - "distributed_dataloader": 1, - "recompute_granularity": "full", - "save_total_limit": 2 - } diff --git a/llm/benchmark.sh b/llm/benchmark.sh deleted file mode 100644 index d49858b42b76..000000000000 --- a/llm/benchmark.sh +++ /dev/null @@ -1,36 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -export PYTHONPATH=$(dirname $(pwd)):$PYTHONPATH - -export FLAGS_control_flow_use_new_executor=1 -export FLAGS_new_executor_serial_run=1 -export FLAGS_allocator_strategy=naive_best_fit -export FLAGS_fraction_of_gpu_memory_to_use=0.92 - -export FLAGS_use_autotune=1 -export FLAGS_cublaslt_exhaustive_search_times=10 -export FLAGS_cache_inference_while_scope=1 - - -python predictor.py \ - --model_name_or_path ./llama7b-inference_model_fp16 \ - --dtype float16 \ - --src_length 300 \ - --max_length 100 \ - --output_file "infer.json" \ - --mode "static" \ - --batch_size 1 \ - --benchmark \ - --inference_model diff --git a/llm/config/baichuan/README.md b/llm/config/baichuan/README.md new file mode 100644 index 000000000000..98bf760a6caa --- /dev/null +++ b/llm/config/baichuan/README.md @@ -0,0 +1,15 @@ +# Baichuan + +## 1. 模型介绍 + +**支持模型权重:** + +| Model | +| ---------------------------------| +| baichuan-inc/Baichuan-7B | +| baichuan-inc/Baichuan-13B-Base | +| baichuan-inc/Baichuan-13B-Chat | +| baichuan-inc/Baichuan2-7B-Base | +| baichuan-inc/Baichuan2-7B-Chat | +| baichuan-inc/Baichuan2-13B-Base | +| baichuan-inc/Baichuan2-13B-Chat | diff --git a/llm/config/baichuan/awq_argument.json b/llm/config/baichuan/awq_argument.json new file mode 100644 index 000000000000..23c1884ed768 --- /dev/null +++ b/llm/config/baichuan/awq_argument.json @@ -0,0 +1,23 @@ +{ + "model_name_or_path": "baichuan-inc/Baichuan2-7B-Base", + "per_device_train_batch_size": 8, + "per_device_eval_batch_size": 8, + "eval_accumulation_steps":16, + "src_length": 1024, + "max_length": 2048, + "fp16": true, + "fp16_opt_level": "O2", + "dataset_name_or_path": "./data", + "output_dir": "./checkpoints/ptq_ckpts", + "do_eval": true, + "eval_with_do_generation": false, + "do_ptq": true, + "quant_type": "weight_only_int4", + "weight_quant_method": "groupwise", + "ptq_step": 16, + "smooth": true, + "auto_clip": true, + "autoclip_step": 1, + "do_awq": true, + "unified_checkpoint": true + } \ No newline at end of 
file diff --git a/llm/config/baichuan/dpo_argument.json b/llm/config/baichuan/dpo_argument.json new file mode 100644 index 000000000000..376caef0eda7 --- /dev/null +++ b/llm/config/baichuan/dpo_argument.json @@ -0,0 +1,38 @@ +{ + "model_name_or_path": "baichuan-inc/Baichuan2-7B-Base", + "train_dataset_path": "./data/train.jsonl", + "dev_dataset_path": "./data/dev.jsonl", + "output_dir": "./checkpoints/dpo_ckpts", + "per_device_train_batch_size": 1, + "gradient_accumulation_steps": 8, + "per_device_eval_batch_size": 1, + "num_train_epochs": 1, + "max_steps": 100, + "learning_rate": 1e-06, + "warmup_steps": 10, + "logging_steps": 1, + "evaluation_strategy": "steps", + "save_strategy": "steps", + "eval_steps": 100, + "save_steps": 500, + "max_seq_len": 4096, + "max_prompt_len": 2048, + "bf16": true, + "fp16_opt_level": "O2", + "do_train": true, + "do_eval": true, + "disable_tqdm": true, + "load_best_model_at_end": true, + "tensor_parallel_degree": 8, + "sharding_parallel_degree": 1, + "sharding": "stage1", + "use_flash_attention": true, + "recompute": false, + "recompute_granularity": "full", + "dpo_beta": 0.1, + "benchmark": false, + "dpo_loss_type": "sigmoid", + "dpo_label_smoothing": 0.0, + "unified_checkpoint": true, + "autotuner_benchmark":false + } diff --git a/llm/chatglm2/gptq_argument.json b/llm/config/baichuan/gptq_argument.json similarity index 71% rename from llm/chatglm2/gptq_argument.json rename to llm/config/baichuan/gptq_argument.json index 9285e8b628ad..593773a268e2 100644 --- a/llm/chatglm2/gptq_argument.json +++ b/llm/config/baichuan/gptq_argument.json @@ -1,5 +1,5 @@ { - "model_name_or_path": "./checkpoints/chatglm2_sft_ckpts", + "model_name_or_path": "baichuan-inc/Baichuan2-7B-Base", "per_device_train_batch_size": 8, "per_device_eval_batch_size": 8, "eval_accumulation_steps":16, @@ -8,9 +8,10 @@ "fp16": true, "fp16_opt_level": "O2", "dataset_name_or_path": "./data", - "output_dir": "./checkpoints/chatglm2_gptq_ckpts", + "output_dir": 
"./checkpoints/gptq_ckpts", "do_eval": true, "eval_with_do_generation": false, "do_gptq": true, + "unified_checkpoint": true, "gptq_step": 8 } \ No newline at end of file diff --git a/llm/config/baichuan/lora_argument.json b/llm/config/baichuan/lora_argument.json new file mode 100644 index 000000000000..8d2702551f4b --- /dev/null +++ b/llm/config/baichuan/lora_argument.json @@ -0,0 +1,35 @@ +{ + "model_name_or_path": "baichuan-inc/Baichuan2-7B-Base", + "dataset_name_or_path": "./data", + "output_dir": "./checkpoints/lora_ckpts", + "per_device_train_batch_size": 4, + "gradient_accumulation_steps": 4, + "per_device_eval_batch_size": 8, + "eval_accumulation_steps":16, + "num_train_epochs": 3, + "learning_rate": 3e-04, + "warmup_steps": 30, + "logging_steps": 1, + "evaluation_strategy": "epoch", + "save_strategy": "epoch", + "src_length": 1024, + "max_length": 2048, + "fp16": true, + "fp16_opt_level": "O2", + "do_train": true, + "do_eval": true, + "disable_tqdm": true, + "load_best_model_at_end": true, + "eval_with_do_generation": false, + "metric_for_best_model": "accuracy", + "recompute": true, + "save_total_limit": 1, + "tensor_parallel_degree": 1, + "pipeline_parallel_degree": 1, + "sharding_parallel_degree": 1, + "sharding": "stage1", + "lora": true, + "zero_padding": false, + "unified_checkpoint": true, + "use_flash_attention": true + } diff --git a/llm/baichuan/pretrain-baichuan2_7b-tp2sd4_stage2.json b/llm/config/baichuan/pretrain_argument.json similarity index 90% rename from llm/baichuan/pretrain-baichuan2_7b-tp2sd4_stage2.json rename to llm/config/baichuan/pretrain_argument.json index da31682d6949..aeb17cf475a4 100644 --- a/llm/baichuan/pretrain-baichuan2_7b-tp2sd4_stage2.json +++ b/llm/config/baichuan/pretrain_argument.json @@ -2,12 +2,13 @@ "model_name_or_path": "baichuan-inc/Baichuan2-7B-Base", "tokenizer_name_or_path": "baichuan-inc/Baichuan2-7B-Base", "input_dir": "./data", - "output_dir": "./checkpoints/baichuan_pretrain_ckpts", + "output_dir": 
"./checkpoints/pretrain_ckpts", "per_device_train_batch_size": 2, "gradient_accumulation_steps": 8, "per_device_eval_batch_size": 2, "tensor_parallel_degree": 2, "pipeline_parallel_degree": 1, + "sharding_parallel_degree": 4, "sharding": "stage2", "virtual_pp_degree": 1, "sequence_parallel": 0, @@ -36,5 +37,6 @@ "recompute": false, "distributed_dataloader": 1, "recompute_granularity": "full", + "unified_checkpoint": true, "save_total_limit": 2 } diff --git a/llm/config/baichuan/ptq_argument.json b/llm/config/baichuan/ptq_argument.json new file mode 100644 index 000000000000..f15164f44eef --- /dev/null +++ b/llm/config/baichuan/ptq_argument.json @@ -0,0 +1,23 @@ +{ + "model_name_or_path": "baichuan-inc/Baichuan2-7B-Base", + "per_device_train_batch_size": 8, + "per_device_eval_batch_size": 8, + "eval_accumulation_steps":16, + "src_length": 1024, + "max_length": 2048, + "fp16": true, + "fp16_opt_level": "O2", + "dataset_name_or_path": "./data", + "output_dir": "./checkpoints/ptq_ckpts", + "do_eval": true, + "eval_with_do_generation": false, + "do_ptq": true, + "ptq_step": 16, + "unified_checkpoint": true, + "smooth": true, + "smooth_step": 16, + "smooth_all_linears": true, + "smooth_piecewise_search": true, + "smooth_k_piece": 3, + "smooth_search_piece": true +} \ No newline at end of file diff --git a/llm/config/baichuan/qlora_argument.json b/llm/config/baichuan/qlora_argument.json new file mode 100644 index 000000000000..c820bcff63df --- /dev/null +++ b/llm/config/baichuan/qlora_argument.json @@ -0,0 +1,34 @@ +{ + "model_name_or_path": "baichuan-inc/Baichuan2-7B-Base", + "dataset_name_or_path": "./data", + "output_dir": "./checkpoints/qlora_ckpts", + "per_device_train_batch_size": 4, + "gradient_accumulation_steps": 4, + "per_device_eval_batch_size": 8, + "eval_accumulation_steps":16, + "num_train_epochs": 3, + "learning_rate": 3e-04, + "warmup_steps": 30, + "logging_steps": 1, + "evaluation_strategy": "epoch", + "save_strategy": "epoch", + "src_length": 1024, + 
"max_length": 2048, + "fp16": true, + "fp16_opt_level": "O2", + "do_train": true, + "do_eval": true, + "disable_tqdm": true, + "load_best_model_at_end": true, + "eval_with_do_generation": false, + "metric_for_best_model": "accuracy", + "recompute": true, + "save_total_limit": 1, + "tensor_parallel_degree": 1, + "pipeline_parallel_degree": 1, + "lora": true, + "zero_padding": false, + "use_flash_attention": true, + "unified_checkpoint": true, + "weight_quantize_algo": "nf4" + } \ No newline at end of file diff --git a/llm/bloom/README.md b/llm/config/bloom/README.md similarity index 92% rename from llm/bloom/README.md rename to llm/config/bloom/README.md index 2cdeafa66968..52311561818a 100644 --- a/llm/bloom/README.md +++ b/llm/config/bloom/README.md @@ -20,6 +20,3 @@ BLOOM是一种自回归大型语言模型(LLM),在大量文本数据上训练 | bigscience/bloomz-7b1-p3 | | bigscience/bloomz-7b1 | | bellegroup/belle-7b-2m | - -## 2. 模型精调 -请参考[LLM全流程工具介绍](../README.md) diff --git a/llm/llama/gptq_argument.json b/llm/config/bloom/gptq_argument.json similarity index 72% rename from llm/llama/gptq_argument.json rename to llm/config/bloom/gptq_argument.json index 75944f076c29..615286908be0 100644 --- a/llm/llama/gptq_argument.json +++ b/llm/config/bloom/gptq_argument.json @@ -1,5 +1,5 @@ { - "model_name_or_path": "./checkpoints/llama_sft_ckpts", + "model_name_or_path": "bigscience/bloomz-7b1-mt", "per_device_train_batch_size": 8, "per_device_eval_batch_size": 8, "eval_accumulation_steps":16, @@ -8,9 +8,10 @@ "fp16": true, "fp16_opt_level": "O2", "dataset_name_or_path": "./data", - "output_dir": "./checkpoints/llama_gptq_ckpts", + "output_dir": "./checkpoints/gptq_ckpts", "do_eval": true, "eval_with_do_generation": false, "do_gptq": true, + "unified_checkpoint": true, "gptq_step": 8 } \ No newline at end of file diff --git a/llm/bloom/lora_argument.json b/llm/config/bloom/lora_argument.json similarity index 91% rename from llm/bloom/lora_argument.json rename to llm/config/bloom/lora_argument.json index 
6867ecaeedf2..d36d821a35ce 100644 --- a/llm/bloom/lora_argument.json +++ b/llm/config/bloom/lora_argument.json @@ -1,7 +1,7 @@ { "model_name_or_path": "bigscience/bloomz-7b1-mt", "dataset_name_or_path": "./data", - "output_dir": "./checkpoints/bloom_lora_ckpts", + "output_dir": "./checkpoints/lora_ckpts", "per_device_train_batch_size": 4, "gradient_accumulation_steps": 4, "per_device_eval_batch_size": 8, @@ -28,5 +28,6 @@ "pipeline_parallel_degree": 1, "lora": true, "zero_padding": false, + "unified_checkpoint": true, "use_flash_attention": false } \ No newline at end of file diff --git a/llm/bloom/pt_argument.json b/llm/config/bloom/pt_argument.json similarity index 92% rename from llm/bloom/pt_argument.json rename to llm/config/bloom/pt_argument.json index 30d6839369cc..44801b6eb623 100644 --- a/llm/bloom/pt_argument.json +++ b/llm/config/bloom/pt_argument.json @@ -1,7 +1,7 @@ { "model_name_or_path": "bigscience/bloomz-7b1-mt", "dataset_name_or_path": "./data", - "output_dir": "./checkpoints/bloom_pt_ckpts", + "output_dir": "./checkpoints/pt_ckpts", "per_device_train_batch_size": 4, "gradient_accumulation_steps": 4, "per_device_eval_batch_size": 8, @@ -28,5 +28,6 @@ "pipeline_parallel_degree": 1, "prefix_tuning": true, "zero_padding": false, + "unified_checkpoint": true, "use_flash_attention": false } \ No newline at end of file diff --git a/llm/chatglm2/ptq_argument.json b/llm/config/bloom/ptq_argument.json similarity index 79% rename from llm/chatglm2/ptq_argument.json rename to llm/config/bloom/ptq_argument.json index 46a57083584a..fff4560700e7 100644 --- a/llm/chatglm2/ptq_argument.json +++ b/llm/config/bloom/ptq_argument.json @@ -1,5 +1,5 @@ { - "model_name_or_path": "./checkpoints/chatglm2_sft_ckpts", + "model_name_or_path": "bigscience/bloomz-7b1-mt", "per_device_train_batch_size": 8, "per_device_eval_batch_size": 8, "eval_accumulation_steps":16, @@ -8,7 +8,7 @@ "fp16": true, "fp16_opt_level": "O2", "dataset_name_or_path": "./data", - "output_dir": 
"./checkpoints/chatglm2_ptq_ckpts", + "output_dir": "./checkpoints/ptq_ckpts", "do_eval": true, "eval_with_do_generation": false, "do_ptq": true, @@ -18,5 +18,6 @@ "smooth_all_linears": true, "smooth_piecewise_search": true, "smooth_k_piece": 3, + "unified_checkpoint": true, "smooth_search_piece": true } \ No newline at end of file diff --git a/llm/bloom/sft_argument.json b/llm/config/bloom/sft_argument.json similarity index 91% rename from llm/bloom/sft_argument.json rename to llm/config/bloom/sft_argument.json index 2c793576b7e0..31b020da30a1 100644 --- a/llm/bloom/sft_argument.json +++ b/llm/config/bloom/sft_argument.json @@ -1,7 +1,7 @@ { "model_name_or_path": "bigscience/bloomz-7b1-mt", "dataset_name_or_path": "./data", - "output_dir": "./checkpoints/bloom_sft_ckpts", + "output_dir": "./checkpoints/sft_ckpts", "per_device_train_batch_size": 4, "gradient_accumulation_steps": 4, "per_device_eval_batch_size": 8, @@ -27,5 +27,6 @@ "tensor_parallel_degree": 4, "pipeline_parallel_degree": 1, "zero_padding": false, + "unified_checkpoint": true, "use_flash_attention": false } \ No newline at end of file diff --git a/llm/chatglm/README.md b/llm/config/chatglm/README.md similarity index 92% rename from llm/chatglm/README.md rename to llm/config/chatglm/README.md index 281a7ceea61f..c8cfb4f8b28b 100644 --- a/llm/chatglm/README.md +++ b/llm/config/chatglm/README.md @@ -14,6 +14,3 @@ ChatGLM-6B 是一个开源的、支持中英双语问答的对话语言模型, ## 2. 模型协议 ChatGLM-6B 模型的权重的使用需要遵循[License](../../paddlenlp/transformers/chatglm/LICENSE)。 - -## 3. 
模型精调 -请参考[LLM全流程工具介绍](../README.md) diff --git a/llm/bloom/gptq_argument.json b/llm/config/chatglm/gptq_argument.json similarity index 73% rename from llm/bloom/gptq_argument.json rename to llm/config/chatglm/gptq_argument.json index 6a5cb7e882a7..d509f6aed280 100644 --- a/llm/bloom/gptq_argument.json +++ b/llm/config/chatglm/gptq_argument.json @@ -1,5 +1,5 @@ { - "model_name_or_path": "./checkpoints/bloom_sft_ckpts", + "model_name_or_path": "THUDM/chatglm-6b", "per_device_train_batch_size": 8, "per_device_eval_batch_size": 8, "eval_accumulation_steps":16, @@ -8,9 +8,10 @@ "fp16": true, "fp16_opt_level": "O2", "dataset_name_or_path": "./data", - "output_dir": "./checkpoints/bloom_gptq_ckpts", + "output_dir": "./checkpoints/gptq_ckpts", "do_eval": true, "eval_with_do_generation": false, "do_gptq": true, + "unified_checkpoint": true, "gptq_step": 8 } \ No newline at end of file diff --git a/llm/chatglm/lora_argument.json b/llm/config/chatglm/lora_argument.json similarity index 91% rename from llm/chatglm/lora_argument.json rename to llm/config/chatglm/lora_argument.json index af49af041d72..11069e723f8f 100644 --- a/llm/chatglm/lora_argument.json +++ b/llm/config/chatglm/lora_argument.json @@ -1,7 +1,7 @@ { "model_name_or_path": "THUDM/chatglm-6b", "dataset_name_or_path": "./data", - "output_dir": "./checkpoints/chatglm_lora_ckpts", + "output_dir": "./checkpoints/lora_ckpts", "per_device_train_batch_size": 4, "gradient_accumulation_steps": 4, "per_device_eval_batch_size": 8, @@ -28,5 +28,6 @@ "pipeline_parallel_degree": 1, "lora": true, "zero_padding": false, + "unified_checkpoint": true, "use_flash_attention": false } \ No newline at end of file diff --git a/llm/chatglm/pt_argument.json b/llm/config/chatglm/pt_argument.json similarity index 94% rename from llm/chatglm/pt_argument.json rename to llm/config/chatglm/pt_argument.json index 03158f7f127f..54c95fd56744 100644 --- a/llm/chatglm/pt_argument.json +++ b/llm/config/chatglm/pt_argument.json @@ -1,7 +1,7 @@ { 
"model_name_or_path": "THUDM/chatglm-6b", "dataset_name_or_path": "./data", - "output_dir": "./checkpoints/chatglm_pt_ckpts", + "output_dir": "./checkpoints/pt_ckpts", "per_device_train_batch_size": 4, "gradient_accumulation_steps": 4, "per_device_eval_batch_size": 8, diff --git a/llm/chatglm/ptq_argument.json b/llm/config/chatglm/ptq_argument.json similarity index 73% rename from llm/chatglm/ptq_argument.json rename to llm/config/chatglm/ptq_argument.json index 63474a9e0a19..64b6e480776b 100644 --- a/llm/chatglm/ptq_argument.json +++ b/llm/config/chatglm/ptq_argument.json @@ -1,5 +1,5 @@ { - "model_name_or_path": "./checkpoints/llama_sft_ckpts", + "model_name_or_path": "THUDM/chatglm-6b", "per_device_train_batch_size": 8, "per_device_eval_batch_size": 8, "eval_accumulation_steps":16, @@ -8,9 +8,10 @@ "fp16": true, "fp16_opt_level": "O2", "dataset_name_or_path": "./data", - "output_dir": "./checkpoints/llama_ptq_ckpts", + "output_dir": "./checkpoints/ptq_ckpts", "do_eval": true, "eval_with_do_generation": false, "do_ptq": true, + "unified_checkpoint": true, "ptq_step": 16 } \ No newline at end of file diff --git a/llm/chatglm/sft_argument.json b/llm/config/chatglm/sft_argument.json similarity index 91% rename from llm/chatglm/sft_argument.json rename to llm/config/chatglm/sft_argument.json index 8309f28f1439..73286c3bb5c8 100644 --- a/llm/chatglm/sft_argument.json +++ b/llm/config/chatglm/sft_argument.json @@ -1,7 +1,7 @@ { "model_name_or_path": "THUDM/chatglm-6b", "dataset_name_or_path": "./data", - "output_dir": "./checkpoints/chatglm_sft_ckpts", + "output_dir": "./checkpoints/sft_ckpts", "per_device_train_batch_size": 4, "gradient_accumulation_steps": 4, "per_device_eval_batch_size": 8, @@ -27,5 +27,6 @@ "tensor_parallel_degree": 4, "pipeline_parallel_degree": 1, "zero_padding": false, + "unified_checkpoint": true, "use_flash_attention": false } \ No newline at end of file diff --git a/llm/chatglm2/README.md b/llm/config/chatglm2/README.md similarity index 91% 
rename from llm/chatglm2/README.md rename to llm/config/chatglm2/README.md index f04166f5bd50..0929e7b20fac 100644 --- a/llm/chatglm2/README.md +++ b/llm/config/chatglm2/README.md @@ -15,6 +15,3 @@ ChatGLM2-6B 是开源中英双语对话模型 [ChatGLM-6B](https://github.com/TH ChatGLM2-6B 模型的权重的使用需要遵循[License](../../paddlenlp/transformers/chatglm_v2/LICENSE)。 - -## 3. 模型精调 -请参考[LLM全流程工具介绍](../README.md) diff --git a/llm/chatglm/gptq_argument.json b/llm/config/chatglm2/gptq_argument.json similarity index 73% rename from llm/chatglm/gptq_argument.json rename to llm/config/chatglm2/gptq_argument.json index 8b1c07742ba8..137f036a0552 100644 --- a/llm/chatglm/gptq_argument.json +++ b/llm/config/chatglm2/gptq_argument.json @@ -1,5 +1,5 @@ { - "model_name_or_path": "./checkpoints/chatglm_sft_ckpts", + "model_name_or_path": "THUDM/chatglm2-6b", "per_device_train_batch_size": 8, "per_device_eval_batch_size": 8, "eval_accumulation_steps":16, @@ -8,9 +8,10 @@ "fp16": true, "fp16_opt_level": "O2", "dataset_name_or_path": "./data", - "output_dir": "./checkpoints/chatglm_gptq_ckpts", + "output_dir": "./checkpoints/gptq_ckpts", "do_eval": true, "eval_with_do_generation": false, "do_gptq": true, + "unified_checkpoint": true, "gptq_step": 8 } \ No newline at end of file diff --git a/llm/chatglm2/lora_argument.json b/llm/config/chatglm2/lora_argument.json similarity index 91% rename from llm/chatglm2/lora_argument.json rename to llm/config/chatglm2/lora_argument.json index c88636b9bd1d..6e734fc1f2a8 100644 --- a/llm/chatglm2/lora_argument.json +++ b/llm/config/chatglm2/lora_argument.json @@ -1,7 +1,7 @@ { "model_name_or_path": "THUDM/chatglm2-6b", "dataset_name_or_path": "./data", - "output_dir": "./checkpoints/chatglm2_lora_ckpts", + "output_dir": "./checkpoints/lora_ckpts", "per_device_train_batch_size": 4, "gradient_accumulation_steps": 4, "per_device_eval_batch_size": 8, @@ -28,5 +28,6 @@ "pipeline_parallel_degree": 1, "lora": true, "zero_padding": false, + "unified_checkpoint": true, 
"use_flash_attention": false } \ No newline at end of file diff --git a/llm/chatglm2/pt_argument.json b/llm/config/chatglm2/pt_argument.json similarity index 94% rename from llm/chatglm2/pt_argument.json rename to llm/config/chatglm2/pt_argument.json index a10f9b4d788c..52a80b837686 100644 --- a/llm/chatglm2/pt_argument.json +++ b/llm/config/chatglm2/pt_argument.json @@ -1,7 +1,7 @@ { "model_name_or_path": "THUDM/chatglm2-6b", "dataset_name_or_path": "./data", - "output_dir": "./checkpoints/chatglm2_pt_ckpts", + "output_dir": "./checkpoints/pt_ckpts", "per_device_train_batch_size": 4, "gradient_accumulation_steps": 4, "per_device_eval_batch_size": 8, diff --git a/llm/bloom/ptq_argument.json b/llm/config/chatglm2/ptq_argument.json similarity index 79% rename from llm/bloom/ptq_argument.json rename to llm/config/chatglm2/ptq_argument.json index 21a28735ecc1..806c80a3cf63 100644 --- a/llm/bloom/ptq_argument.json +++ b/llm/config/chatglm2/ptq_argument.json @@ -1,5 +1,5 @@ { - "model_name_or_path": "./checkpoints/bloom_sft_ckpts", + "model_name_or_path": "./checkpoints/sft_ckpts", "per_device_train_batch_size": 8, "per_device_eval_batch_size": 8, "eval_accumulation_steps":16, @@ -8,11 +8,12 @@ "fp16": true, "fp16_opt_level": "O2", "dataset_name_or_path": "./data", - "output_dir": "./checkpoints/bloom_ptq_ckpts", + "output_dir": "./checkpoints/ptq_ckpts", "do_eval": true, "eval_with_do_generation": false, "do_ptq": true, "ptq_step": 16, + "unified_checkpoint": true, "smooth": true, "smooth_step": 16, "smooth_all_linears": true, diff --git a/llm/chatglm2/sft_argument.json b/llm/config/chatglm2/sft_argument.json similarity index 85% rename from llm/chatglm2/sft_argument.json rename to llm/config/chatglm2/sft_argument.json index 8508d9676379..ee2ffb4ee7ae 100644 --- a/llm/chatglm2/sft_argument.json +++ b/llm/config/chatglm2/sft_argument.json @@ -1,7 +1,7 @@ { "model_name_or_path": "THUDM/chatglm2-6b", "dataset_name_or_path": "./data", - "output_dir": 
"./checkpoints/chatglm2_sft_ckpts", + "output_dir": "./checkpoints/sft_ckpts", "per_device_train_batch_size": 4, "gradient_accumulation_steps": 4, "per_device_eval_batch_size": 8, @@ -24,8 +24,9 @@ "metric_for_best_model": "accuracy", "recompute": true, "save_total_limit": 1, - "sharding_parallel_degree": 4, - "sharding": "stage3", + "sharding_parallel_degree": 8, + "sharding": "stage2", "zero_padding": false, + "unified_checkpoint": true, "use_flash_attention": false } \ No newline at end of file diff --git a/llm/gemma/README.md b/llm/config/gemma/README.md similarity index 100% rename from llm/gemma/README.md rename to llm/config/gemma/README.md diff --git a/llm/gemma/sft_argument.json b/llm/config/gemma/sft_argument.json similarity index 71% rename from llm/gemma/sft_argument.json rename to llm/config/gemma/sft_argument.json index 45a483d7e52a..15d9c3b93807 100644 --- a/llm/gemma/sft_argument.json +++ b/llm/config/gemma/sft_argument.json @@ -1,7 +1,7 @@ { - "model_name_or_path": "google/gemma-2b/", + "model_name_or_path": "google/gemma-2b", "dataset_name_or_path": "./data", - "output_dir": "./checkpoints/gemma_sft_ckpts", + "output_dir": "./checkpoints/sft_ckpts", "per_device_train_batch_size": 2, "gradient_accumulation_steps": 1, "per_device_eval_batch_size": 8, @@ -24,7 +24,11 @@ "metric_for_best_model": "accuracy", "recompute": true, "save_total_limit": 1, - "tensor_parallel_degree": 2, + "tensor_parallel_degree": 1, + "pipeline_parallel_degree": 1, + "sharding_parallel_degree": 8, + "sharding": "stage2", "zero_padding": false, - "use_flash_attention": false + "unified_checkpoint": true, + "use_flash_attention": true } \ No newline at end of file diff --git a/llm/config/gpt-3/README.md b/llm/config/gpt-3/README.md new file mode 100644 index 000000000000..472c2f74cd42 --- /dev/null +++ b/llm/config/gpt-3/README.md @@ -0,0 +1,5 @@ +# GPT + +## 1. 
模型介绍 + +GPT-3是一种预训练语言模型,它能够模拟人类语言思维和表达。GPT-3拥有巨大的参数,包含了1750亿个参数,这使得它具有强大的语言理解和生成能力。它可以完成的任务包括文本生成、文本摘要、回答问题、翻译、阅读理解等。GPT-3的预训练过程使用了大量的语料库,包括互联网上的大量文本。它通过分析这些文本,学习如何生成和理解人类语言。GPT-3在自然语言处理领域具有很高的影响力,它可以模拟人类对话和生成文本,这使得它在许多应用领域都有广泛的应用,比如智能客服、自然语言处理、游戏设计等。 diff --git a/llm/llama/lora_argument.json b/llm/config/gpt-3/lora_argument.json similarity index 86% rename from llm/llama/lora_argument.json rename to llm/config/gpt-3/lora_argument.json index 6817215e0c74..1ed0576d951b 100644 --- a/llm/llama/lora_argument.json +++ b/llm/config/gpt-3/lora_argument.json @@ -1,7 +1,7 @@ { - "model_name_or_path": "facebook/llama-7b", + "model_name_or_path": "gpt2-medium-en", "dataset_name_or_path": "./data", - "output_dir": "./checkpoints/llama_lora_ckpts", + "output_dir": "./checkpoints/gpt_lora_ckpts", "per_device_train_batch_size": 4, "gradient_accumulation_steps": 4, "per_device_eval_batch_size": 8, @@ -28,5 +28,6 @@ "pipeline_parallel_degree": 1, "lora": true, "zero_padding": false, + "unified_checkpoint": true, "use_flash_attention": false } diff --git a/llm/gpt-3/pretrain-gpt_medium_en-stage2.json b/llm/config/gpt-3/pretrain_argument.json similarity index 97% rename from llm/gpt-3/pretrain-gpt_medium_en-stage2.json rename to llm/config/gpt-3/pretrain_argument.json index 3d7685a9696d..3959956bd21d 100644 --- a/llm/gpt-3/pretrain-gpt_medium_en-stage2.json +++ b/llm/config/gpt-3/pretrain_argument.json @@ -33,6 +33,7 @@ "disable_tqdm": true, "recompute": false, "distributed_dataloader": 1, + "unified_checkpoint": true, "recompute_granularity": "full", "save_total_limit": 2 } diff --git a/llm/config/gpt-3/sft_argument.json b/llm/config/gpt-3/sft_argument.json new file mode 100644 index 000000000000..76d50ec28628 --- /dev/null +++ b/llm/config/gpt-3/sft_argument.json @@ -0,0 +1,33 @@ +{ + "model_name_or_path": "gpt2-medium-en", + "dataset_name_or_path": "./data", + "output_dir": "./checkpoints/sft_ckpts", + "per_device_train_batch_size": 4, + "gradient_accumulation_steps": 4, + 
"per_device_eval_batch_size": 8, + "eval_accumulation_steps":16, + "num_train_epochs": 3, + "learning_rate": 3e-05, + "warmup_steps": 30, + "logging_steps": 1, + "evaluation_strategy": "epoch", + "save_strategy": "epoch", + "src_length": 1024, + "max_length": 2048, + "fp16": true, + "fp16_opt_level": "O2", + "do_train": true, + "do_eval": true, + "disable_tqdm": true, + "load_best_model_at_end": true, + "eval_with_do_generation": false, + "metric_for_best_model": "accuracy", + "recompute": true, + "save_total_limit": 1, + "tensor_parallel_degree": 1, + "pipeline_parallel_degree": 1, + "lora": true, + "zero_padding": false, + "unified_checkpoint": true, + "use_flash_attention": false + } diff --git a/llm/llama/README.md b/llm/config/llama/README.md similarity index 92% rename from llm/llama/README.md rename to llm/config/llama/README.md index c707c0cd64ac..bda1959533d7 100644 --- a/llm/llama/README.md +++ b/llm/config/llama/README.md @@ -16,6 +16,10 @@ | meta-llama/Llama-2-13b-chat | | meta-llama/Llama-2-70b | | meta-llama/Llama-2-70b-chat | +|meta-llama/Meta-Llama-3-8B| +|meta-llama/Meta-Llama-3-8B-Instruct| +|meta-llama/Meta-Llama-3-70B| +|meta-llama/Meta-Llama-3-70B-Instruct| | ziqingyang/chinese-llama-7b | | ziqingyang/chinese-llama-13b | | ziqingyang/chinese-alpaca-7b | @@ -48,11 +52,3 @@ tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat") LLaMA 模型的权重的使用则需要遵循[License](../../paddlenlp/transformers/llama/LICENSE)。 Llama2 模型的权重的使用则需要遵循[License](../../paddlenlp/transformers/llama/Llama2.LICENSE)。 - - -## 3. 预训练 - -请参考[LLM全流程工具介绍](../README.md) - -## 4. 
模型精调 -请参考[LLM全流程工具介绍](../README.md) diff --git a/llm/llama/awq_argument.json b/llm/config/llama/awq_argument.json similarity index 76% rename from llm/llama/awq_argument.json rename to llm/config/llama/awq_argument.json index 21a9bcdb13b3..7ae7f55b678c 100644 --- a/llm/llama/awq_argument.json +++ b/llm/config/llama/awq_argument.json @@ -1,14 +1,14 @@ { - "model_name_or_path": "./checkpoints/llama_sft_ckpts", + "model_name_or_path": "meta-llama/Meta-Llama-3-8B", "per_device_train_batch_size": 8, "per_device_eval_batch_size": 8, "eval_accumulation_steps":16, "src_length": 1024, "max_length": 2048, - "fp16": true, + "bf16": true, "fp16_opt_level": "O2", "dataset_name_or_path": "./data", - "output_dir": "./checkpoints/llama_ptq_ckpts", + "output_dir": "./checkpoints/ptq_ckpts", "do_eval": true, "eval_with_do_generation": false, "do_ptq": true, @@ -18,5 +18,6 @@ "smooth": true, "auto_clip": true, "autoclip_step": 1, + "unified_checkpoint": true, "do_awq": true } \ No newline at end of file diff --git a/llm/llama/dpo_argument.json b/llm/config/llama/dpo_argument.json similarity index 92% rename from llm/llama/dpo_argument.json rename to llm/config/llama/dpo_argument.json index 7aa86b342128..b30fcc86478c 100644 --- a/llm/llama/dpo_argument.json +++ b/llm/config/llama/dpo_argument.json @@ -1,5 +1,5 @@ { - "model_name_or_path": "meta-llama/Llama-2-7b-chat", + "model_name_or_path": "meta-llama/Meta-Llama-3-8B", "train_dataset_path": "./data/train.jsonl", "dev_dataset_path": "./data/dev.jsonl", "output_dir": "./checkpoints/dpo_ckpts", @@ -34,5 +34,6 @@ "benchmark": false, "dpo_loss_type": "sigmoid", "dpo_label_smoothing": 0.0, + "unified_checkpoint": true, "autotuner_benchmark":false } diff --git a/llm/config/llama/gptq_argument.json b/llm/config/llama/gptq_argument.json new file mode 100644 index 000000000000..bbc2ac60d5a7 --- /dev/null +++ b/llm/config/llama/gptq_argument.json @@ -0,0 +1,17 @@ +{ + "model_name_or_path": "meta-llama/Meta-Llama-3-8B", + 
"per_device_train_batch_size": 8, + "per_device_eval_batch_size": 8, + "eval_accumulation_steps":16, + "src_length": 1024, + "max_length": 2048, + "bf16": true, + "fp16_opt_level": "O2", + "dataset_name_or_path": "./data", + "output_dir": "./checkpoints/gptq_ckpts", + "do_eval": true, + "eval_with_do_generation": false, + "do_gptq": true, + "unified_checkpoint": true, + "gptq_step": 8 + } \ No newline at end of file diff --git a/llm/config/llama/lora_argument.json b/llm/config/llama/lora_argument.json new file mode 100644 index 000000000000..3b4374529880 --- /dev/null +++ b/llm/config/llama/lora_argument.json @@ -0,0 +1,35 @@ +{ + "model_name_or_path": "meta-llama/Meta-Llama-3-8B", + "dataset_name_or_path": "./data", + "output_dir": "./checkpoints/lora_ckpts", + "per_device_train_batch_size": 4, + "gradient_accumulation_steps": 4, + "per_device_eval_batch_size": 8, + "eval_accumulation_steps":16, + "num_train_epochs": 3, + "learning_rate": 3e-04, + "warmup_steps": 30, + "logging_steps": 1, + "evaluation_strategy": "epoch", + "save_strategy": "epoch", + "src_length": 1024, + "max_length": 2048, + "bf16": true, + "fp16_opt_level": "O2", + "do_train": true, + "do_eval": true, + "disable_tqdm": true, + "load_best_model_at_end": true, + "eval_with_do_generation": false, + "metric_for_best_model": "accuracy", + "recompute": true, + "save_total_limit": 1, + "tensor_parallel_degree": 1, + "pipeline_parallel_degree": 1, + "sharding": "stage1", + "lora": true, + "zero_padding": false, + "use_flash_attention": true, + "unified_checkpoint": true, + "pissa": false + } diff --git a/llm/config/llama/ppo.json b/llm/config/llama/ppo_argument.json similarity index 100% rename from llm/config/llama/ppo.json rename to llm/config/llama/ppo_argument.json diff --git a/llm/llama/pretrain-llama2_13b-tp2sd4_stage2.json b/llm/config/llama/pretrain_argument.json similarity index 83% rename from llm/llama/pretrain-llama2_13b-tp2sd4_stage2.json rename to llm/config/llama/pretrain_argument.json 
index 3dbfd8c1e12c..dff5b322337e 100644 --- a/llm/llama/pretrain-llama2_13b-tp2sd4_stage2.json +++ b/llm/config/llama/pretrain_argument.json @@ -1,8 +1,8 @@ { - "model_name_or_path": "meta-llama/Llama-2-13b", - "tokenizer_name_or_path": "meta-llama/Llama-2-13b", + "model_name_or_path": "meta-llama/Meta-Llama-3-8B", + "tokenizer_name_or_path": "meta-llama/Meta-Llama-3-8B", "input_dir": "./data", - "output_dir": "./checkpoints/llama2_pretrain_ckpts", + "output_dir": "./checkpoints/pretrain_ckpts", "per_device_train_batch_size": 1, "gradient_accumulation_steps": 16, "per_device_eval_batch_size": 2, @@ -36,5 +36,6 @@ "recompute": false, "distributed_dataloader": 1, "recompute_granularity": "full", + "unified_checkpoint": true, "save_total_limit": 2 } diff --git a/llm/qwen/pt_argument.json b/llm/config/llama/pt_argument.json similarity index 85% rename from llm/qwen/pt_argument.json rename to llm/config/llama/pt_argument.json index 3500215eb3da..66c336cc4b87 100644 --- a/llm/qwen/pt_argument.json +++ b/llm/config/llama/pt_argument.json @@ -1,7 +1,7 @@ { - "model_name_or_path": "qwen/qwen-7b", + "model_name_or_path": "meta-llama/Meta-Llama-3-8B", "dataset_name_or_path": "./data", - "output_dir": "./checkpoints/qwen_pt_ckpts", + "output_dir": "./checkpoints/pt_ckpts", "per_device_train_batch_size": 4, "gradient_accumulation_steps": 4, "per_device_eval_batch_size": 8, @@ -28,5 +28,5 @@ "pipeline_parallel_degree": 1, "prefix_tuning": true, "zero_padding": false, - "use_flash_attention": false + "use_flash_attention": true } diff --git a/llm/llama/ptq_argument.json b/llm/config/llama/ptq_argument.json similarity index 83% rename from llm/llama/ptq_argument.json rename to llm/config/llama/ptq_argument.json index 0a64f3818834..79cc82e8d5d7 100644 --- a/llm/llama/ptq_argument.json +++ b/llm/config/llama/ptq_argument.json @@ -1,11 +1,11 @@ { - "model_name_or_path": "./checkpoints/llama_sft_ckpts", + "model_name_or_path": "meta-llama/Meta-Llama-3-8B", 
"per_device_train_batch_size": 8, "per_device_eval_batch_size": 8, "eval_accumulation_steps":16, "src_length": 1024, "max_length": 2048, - "fp16": true, + "bf16": true, "fp16_opt_level": "O2", "dataset_name_or_path": "./data", "output_dir": "./checkpoints/llama_ptq_ckpts", @@ -13,6 +13,7 @@ "eval_with_do_generation": false, "do_ptq": true, "ptq_step": 16, + "unified_checkpoint": true, "smooth": true, "smooth_step": 16, "smooth_all_linears": true, diff --git a/llm/llama/qlora_argument.json b/llm/config/llama/qlora_argument.json similarity index 84% rename from llm/llama/qlora_argument.json rename to llm/config/llama/qlora_argument.json index 38775ac03948..30963715d2af 100644 --- a/llm/llama/qlora_argument.json +++ b/llm/config/llama/qlora_argument.json @@ -1,7 +1,7 @@ { - "model_name_or_path": "facebook/llama-7b", + "model_name_or_path": "meta-llama/Meta-Llama-3-8B", "dataset_name_or_path": "./data", - "output_dir": "./checkpoints/llama_lora_ckpts", + "output_dir": "./checkpoints/lora_ckpts", "per_device_train_batch_size": 4, "gradient_accumulation_steps": 4, "per_device_eval_batch_size": 8, @@ -14,7 +14,7 @@ "save_strategy": "epoch", "src_length": 1024, "max_length": 2048, - "fp16": true, + "bf16": true, "fp16_opt_level": "O2", "do_train": true, "do_eval": true, @@ -29,5 +29,6 @@ "lora": true, "zero_padding": false, "use_flash_attention": false, + "unified_checkpoint": true, "weight_quantize_algo": "nf4" } \ No newline at end of file diff --git a/llm/config/llama/rm.json b/llm/config/llama/rm_argument.json similarity index 100% rename from llm/config/llama/rm.json rename to llm/config/llama/rm_argument.json diff --git a/llm/llama/sft_argument.json b/llm/config/llama/sft_argument.json similarity index 68% rename from llm/llama/sft_argument.json rename to llm/config/llama/sft_argument.json index 34b36a3bc023..9af167187555 100644 --- a/llm/llama/sft_argument.json +++ b/llm/config/llama/sft_argument.json @@ -1,9 +1,9 @@ { - "model_name_or_path": "facebook/llama-7b", + 
"model_name_or_path": "meta-llama/Meta-Llama-3-8B", "dataset_name_or_path": "./data", "output_dir": "./checkpoints/llama_sft_ckpts", - "per_device_train_batch_size": 4, - "gradient_accumulation_steps": 4, + "per_device_train_batch_size": 1, + "gradient_accumulation_steps": 2, "per_device_eval_batch_size": 8, "eval_accumulation_steps":16, "num_train_epochs": 3, @@ -14,7 +14,7 @@ "save_strategy": "epoch", "src_length": 1024, "max_length": 2048, - "fp16": true, + "bf16": true, "fp16_opt_level": "O2", "do_train": true, "do_eval": true, @@ -22,10 +22,13 @@ "load_best_model_at_end": true, "eval_with_do_generation": false, "metric_for_best_model": "accuracy", - "recompute": true, + "recompute": false, "save_total_limit": 1, - "tensor_parallel_degree": 4, + "tensor_parallel_degree": 1, "pipeline_parallel_degree": 1, + "pipeline_parallel_config": "disable_p2p_cache_shape", + "sharding": "stage2", "zero_padding": false, + "unified_checkpoint": true, "use_flash_attention": false } \ No newline at end of file diff --git a/llm/llama/wint8_lora_argument.json b/llm/config/llama/wint8_lora_argument.json similarity index 89% rename from llm/llama/wint8_lora_argument.json rename to llm/config/llama/wint8_lora_argument.json index 97d9f96d6419..fbce73a89e50 100644 --- a/llm/llama/wint8_lora_argument.json +++ b/llm/config/llama/wint8_lora_argument.json @@ -1,5 +1,5 @@ { - "model_name_or_path": "facebook/llama-7b", + "model_name_or_path": "meta-llama/Meta-Llama-3-8B", "dataset_name_or_path": "./data", "output_dir": "./checkpoints/llama_lora_ckpts", "per_device_train_batch_size": 4, @@ -14,7 +14,7 @@ "save_strategy": "epoch", "src_length": 1024, "max_length": 2048, - "fp16": true, + "bf16": true, "fp16_opt_level": "O2", "do_train": true, "do_eval": true, @@ -29,5 +29,6 @@ "lora": true, "zero_padding": false, "use_flash_attention": false, + "unified_checkpoint": true, "weight_quantize_algo": "weight_only_int8" } \ No newline at end of file diff --git a/llm/mixtral/lora_argument.json 
b/llm/config/mixtral/lora_argument.json similarity index 88% rename from llm/mixtral/lora_argument.json rename to llm/config/mixtral/lora_argument.json index 507c0f76e798..e70bd58a5eb7 100644 --- a/llm/mixtral/lora_argument.json +++ b/llm/config/mixtral/lora_argument.json @@ -1,7 +1,7 @@ { "model_name_or_path": "mistralai/Mixtral-8x7B-Instruct-v0.1", "dataset_name_or_path": "./data", - "output_dir": "./checkpoints/mixtral_lora_ckpts", + "output_dir": "./checkpoints/lora_ckpts", "per_device_train_batch_size": 4, "gradient_accumulation_steps": 4, "per_device_eval_batch_size": 8, @@ -28,5 +28,6 @@ "pipeline_parallel_degree": 1, "lora": true, "zero_padding": false, - "use_flash_attention": false + "unified_checkpoint": true, + "use_flash_attention": true } diff --git a/llm/llama/pretrain-ziya_llama_13b-tp2sd4_stage2.json b/llm/config/mixtral/pretrain_argument.json similarity index 79% rename from llm/llama/pretrain-ziya_llama_13b-tp2sd4_stage2.json rename to llm/config/mixtral/pretrain_argument.json index bd227877bfd2..efd3823fa988 100644 --- a/llm/llama/pretrain-ziya_llama_13b-tp2sd4_stage2.json +++ b/llm/config/mixtral/pretrain_argument.json @@ -1,12 +1,12 @@ { - "model_name_or_path": "idea-ccnl/ziya-llama-13b-v1", - "tokenizer_name_or_path": "idea-ccnl/ziya-llama-13b-v1", + "model_name_or_path": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "tokenizer_name_or_path": "mistralai/Mixtral-8x7B-Instruct-v0.1", "input_dir": "./data", - "output_dir": "./checkpoints/ziya_pretrain_ckpts", + "output_dir": "./checkpoints/pretrain_ckpts", "per_device_train_batch_size": 1, "gradient_accumulation_steps": 16, "per_device_eval_batch_size": 2, - "tensor_parallel_degree": 2, + "tensor_parallel_degree": 8, "pipeline_parallel_degree": 1, "sharding": "stage2", "virtual_pp_degree": 1, @@ -36,5 +36,6 @@ "recompute": false, "distributed_dataloader": 1, "recompute_granularity": "full", + "unified_checkpoint": true, "save_total_limit": 2 } diff --git a/llm/mixtral/sft_argument.json 
b/llm/config/mixtral/sft_argument.json similarity index 74% rename from llm/mixtral/sft_argument.json rename to llm/config/mixtral/sft_argument.json index 3e778b913ffc..b11bb80380a0 100644 --- a/llm/mixtral/sft_argument.json +++ b/llm/config/mixtral/sft_argument.json @@ -1,9 +1,9 @@ { "model_name_or_path": "mistralai/Mixtral-8x7B-Instruct-v0.1", "dataset_name_or_path": "./data", - "output_dir": "./checkpoints/mixtral_sft_ckpts", - "per_device_train_batch_size": 4, - "gradient_accumulation_steps": 4, + "output_dir": "./checkpoints/sft_ckpts", + "per_device_train_batch_size": 1, + "gradient_accumulation_steps": 16, "per_device_eval_batch_size": 8, "eval_accumulation_steps":16, "num_train_epochs": 3, @@ -26,5 +26,8 @@ "save_total_limit": 1, "tensor_parallel_degree": 8, "sharding": "stage2", - "pipeline_parallel_degree": 1 + "pipeline_parallel_degree": 1, + "zero_padding": false, + "unified_checkpoint": true, + "use_flash_attention": true } diff --git a/llm/opt/README.md b/llm/config/opt/README.md similarity index 88% rename from llm/opt/README.md rename to llm/config/opt/README.md index 98b3f140fbfb..3b77d6304b14 100644 --- a/llm/opt/README.md +++ b/llm/config/opt/README.md @@ -17,6 +17,3 @@ |facebook/opt-66b | |facebook/opt-iml-1.3b | |opt-iml-max-1.3b | - -## 2. 
模型精调 -请参考[LLM全流程工具介绍](../README.md) diff --git a/llm/opt/lora_argument.json b/llm/config/opt/lora_argument.json similarity index 94% rename from llm/opt/lora_argument.json rename to llm/config/opt/lora_argument.json index 75193e47238d..2ddeb5f2a9f8 100644 --- a/llm/opt/lora_argument.json +++ b/llm/config/opt/lora_argument.json @@ -1,7 +1,7 @@ { "model_name_or_path": "facebook/opt-125m", "dataset_name_or_path": "./data", - "output_dir": "./checkpoints/opt_lora_ckpts", + "output_dir": "./checkpoints/lora_ckpts", "per_device_train_batch_size": 4, "gradient_accumulation_steps": 4, "per_device_eval_batch_size": 8, diff --git a/llm/opt/sft_argument.json b/llm/config/opt/sft_argument.json similarity index 94% rename from llm/opt/sft_argument.json rename to llm/config/opt/sft_argument.json index 4eed122fa3cb..2b4f03b842bc 100644 --- a/llm/opt/sft_argument.json +++ b/llm/config/opt/sft_argument.json @@ -1,7 +1,7 @@ { "model_name_or_path": "facebook/opt-125m", "dataset_name_or_path": "./data", - "output_dir": "./checkpoints/opt_sft_ckpts", + "output_dir": "./checkpoints/sft_ckpts", "per_device_train_batch_size": 4, "gradient_accumulation_steps": 4, "per_device_eval_batch_size": 8, diff --git a/llm/qwen/README.md b/llm/config/qwen/README.md similarity index 96% rename from llm/qwen/README.md rename to llm/config/qwen/README.md index 22ac37c19e17..ce32fd88d5b5 100644 --- a/llm/qwen/README.md +++ b/llm/config/qwen/README.md @@ -55,7 +55,3 @@ | Qwen/Qwen2-72B-Instruct | | Qwen/Qwen2-57B-A14B | | Qwen/Qwen2-57B-A14B-Instruct | - - -## 2. 
模型精调 -请参考[LLM全流程工具介绍](../README.md) diff --git a/llm/qwen/dpo_argument.json b/llm/config/qwen/dpo_argument.json similarity index 93% rename from llm/qwen/dpo_argument.json rename to llm/config/qwen/dpo_argument.json index 19884cfaefc0..716cdba59da6 100644 --- a/llm/qwen/dpo_argument.json +++ b/llm/config/qwen/dpo_argument.json @@ -1,5 +1,5 @@ { - "model_name_or_path": "qwen/qwen-7b", + "model_name_or_path": "Qwen/Qwen2-7B", "train_dataset_path": "./data/train.jsonl", "dev_dataset_path": "./data/dev.jsonl", "output_dir": "./checkpoints/dpo_ckpts", @@ -32,6 +32,7 @@ "recompute_granularity": "full", "dpo_beta": 0.1, "benchmark": false, + "unified_checkpoint": true, "dpo_loss_type": "sigmoid", "dpo_label_smoothing": 0.0, "autotuner_benchmark":false diff --git a/llm/qwen/lora_argument.json b/llm/config/qwen/lora_argument.json similarity index 82% rename from llm/qwen/lora_argument.json rename to llm/config/qwen/lora_argument.json index 321a2ee3354f..aeb0d5d61f92 100644 --- a/llm/qwen/lora_argument.json +++ b/llm/config/qwen/lora_argument.json @@ -1,7 +1,7 @@ { - "model_name_or_path": "qwen/qwen-7b", + "model_name_or_path": "Qwen/Qwen2-7B", "dataset_name_or_path": "./data", - "output_dir": "./checkpoints/qwen_lora_ckpts", + "output_dir": "./checkpoints/lora_ckpts", "per_device_train_batch_size": 4, "gradient_accumulation_steps": 4, "per_device_eval_batch_size": 8, @@ -27,6 +27,8 @@ "tensor_parallel_degree": 1, "pipeline_parallel_degree": 1, "lora": true, + "unified_checkpoint": true, "zero_padding": false, - "use_flash_attention": false + "use_flash_attention": true, + "pissa": false } diff --git a/llm/qwen/pretrain_argument_stage2.json b/llm/config/qwen/pretrain_argument.json similarity index 84% rename from llm/qwen/pretrain_argument_stage2.json rename to llm/config/qwen/pretrain_argument.json index 1345021f3d19..99d37d832874 100644 --- a/llm/qwen/pretrain_argument_stage2.json +++ b/llm/config/qwen/pretrain_argument.json @@ -1,8 +1,8 @@ { - "model_name_or_path": 
"qwen/qwen-7b", - "tokenizer_name_or_path": "qwen/qwen-7b", + "model_name_or_path": "Qwen/Qwen2-7B", + "tokenizer_name_or_path": "Qwen/Qwen2-7B", "input_dir": "./data", - "output_dir": "./checkpoints/qwen_pretrain_ckpts", + "output_dir": "./checkpoints/pretrain_ckpts", "per_device_train_batch_size": 2, "gradient_accumulation_steps": 1, "per_device_eval_batch_size": 2, @@ -35,5 +35,6 @@ "recompute": true, "distributed_dataloader": 1, "recompute_granularity": "full", + "unified_checkpoint": true, "save_total_limit": 2 } diff --git a/llm/llama/pt_argument.json b/llm/config/qwen/pt_argument.json similarity index 81% rename from llm/llama/pt_argument.json rename to llm/config/qwen/pt_argument.json index 501e09c47160..b70e4a144c75 100644 --- a/llm/llama/pt_argument.json +++ b/llm/config/qwen/pt_argument.json @@ -1,7 +1,7 @@ { - "model_name_or_path": "facebook/llama-7b", + "model_name_or_path": "Qwen/Qwen2-7B", "dataset_name_or_path": "./data", - "output_dir": "./checkpoints/llama_pt_ckpts", + "output_dir": "./checkpoints/pt_ckpts", "per_device_train_batch_size": 4, "gradient_accumulation_steps": 4, "per_device_eval_batch_size": 8, @@ -14,7 +14,7 @@ "save_strategy": "epoch", "src_length": 1024, "max_length": 2048, - "fp16": true, + "bf16": true, "fp16_opt_level": "O2", "do_train": true, "do_eval": true, @@ -27,6 +27,5 @@ "tensor_parallel_degree": 1, "pipeline_parallel_degree": 1, "prefix_tuning": true, - "zero_padding": false, - "use_flash_attention": false + "use_flash_attention": true } diff --git a/llm/qwen/sft_argument.json b/llm/config/qwen/sft_argument.json similarity index 78% rename from llm/qwen/sft_argument.json rename to llm/config/qwen/sft_argument.json index 38daa1d0f293..21b1e0da7f74 100644 --- a/llm/qwen/sft_argument.json +++ b/llm/config/qwen/sft_argument.json @@ -1,7 +1,7 @@ { - "model_name_or_path": "qwen/qwen-7b", + "model_name_or_path": "Qwen/Qwen2-7B", "dataset_name_or_path": "./data", - "output_dir": "./checkpoints/qwen_sft_ckpts", + "output_dir": 
"./checkpoints/sft_ckpts", "per_device_train_batch_size": 1, "gradient_accumulation_steps": 4, "per_device_eval_batch_size": 8, @@ -24,8 +24,10 @@ "metric_for_best_model": "accuracy", "recompute": true, "save_total_limit": 1, - "tensor_parallel_degree": 4, + "tensor_parallel_degree": 1, "pipeline_parallel_degree": 1, + "sharding": "stage2", "zero_padding": false, - "use_flash_attention": false + "unified_checkpoint": true, + "use_flash_attention": true } diff --git a/llm/docs/chat_template.md b/llm/docs/chat_template.md index 6c9e699c8468..e8ad37167f26 100644 --- a/llm/docs/chat_template.md +++ b/llm/docs/chat_template.md @@ -36,14 +36,14 @@ ... ``` -其次就是将构造好的`chat_template.json`文件传入到 `llm/finetune_generation.py` 模块当中: +其次就是将构造好的`chat_template.json`文件传入到 `llm/run_finetune.py` 模块当中: * 使用模型自带chat-template > 并不是所有的模型支持chat-template,PaddleNLP 正在全力支持,可根据是否有下载 `chat_template.json` 文件来判断该模型是否支持 chat-template。 ```shell -python finetune_generation.py ... --model_name_or_path qwen/qwen-7b-chat --chat_template qwen/qwen-7b-chat +python run_finetune.py ... --model_name_or_path qwen/qwen-7b-chat --chat_template qwen/qwen-7b-chat ``` 此时当 `chat_template` 参数和 `model_name_or_path` 参数一致时,此时将默认使用模型自带的chat_template.json` 文件。 @@ -51,7 +51,7 @@ python finetune_generation.py ... --model_name_or_path qwen/qwen-7b-chat --chat_ * 使用自定义 chat-template ```shell -python finetune_generation.py ... --chat_template ./qwen_14b_chat_template.json +python run_finetune.py ... --chat_template ./qwen_14b_chat_template.json ``` 1. 
当 `chat_template` 参数和 `model_name_or_path` 参数一致时,此时将默认使用模型自带的 `chat_template.json` 文件。 diff --git a/llm/docs/finetune.md b/llm/docs/finetune.md index 79bd7eb84dfe..b590a09739b7 100644 --- a/llm/docs/finetune.md +++ b/llm/docs/finetune.md @@ -70,28 +70,21 @@ git clone 代码到本地,即可开始。 SFT(Supervised Fine-Tuning)模型全参微调依托飞桨提出的[4D混合分布式并行](https://ai.baidu.com/forum/topic/show/987996)能力,支持使用Trainer API轻松切换数据并行(DP)、[张量并行(TP, Tensor Parallelism)](https://arxiv.org/abs/1909.08053)、[流水线并行(PP, Pipeline Parallelism)](https://arxiv.org/abs/1811.06965)(目前仅支持Llama)等多种分布式训练策略。 ``` -# 张量并行分布式训练(常用) -python -u -m paddle.distributed.launch --gpus "0,1,2,3" finetune_generation.py ./llama/sft_argument.json - -# 目前ChatGLM2、OPT不支持张量并行,默认使用Sharding策略 -python -u -m paddle.distributed.launch --gpus "0,1,2,3" finetune_generation.py ./chatglm2/sft_argument.json - -# 张量并行&流水线并行分布式训练(目前仅支持Llama) -python -u -m paddle.distributed.launch --gpus "0,1,2,3" finetune_generation.py ./llama/sft_pp_argument.json +python -u -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" run_finetune.py ./config/llama/sft_argument.json ``` 1. `zero_padding`设为True有助于提高训练效率。建议将`per_device_train_batch_size`设为1,使用`gradient_accumulation_steps`控制batch size,适当调整`max_length`取值。 2. 设置`use_flash_attention`为True使用FlashAttention。 +3. 
SFT API支持4D并行策略,可以通过控制`tensor_parallel_degree`、`pipeline_parallel_degree`、 `sharding`、`sharding_parallel_degree`调整并行训练策略。 ### 2.4 LoRA ``` # 单卡训练 -python finetune_generation.py ./llama/lora_argument.json +python run_finetune.py ./config/llama/lora_argument.json -# 张量并行分布式训练(ChatGLM2、OPT不支持张量并行) -# 将lora_argument.json中tensor_parallel_degree修改为2 -python -u -m paddle.distributed.launch --gpus "0,1" finetune_generation.py ./llama/lora_argument.json +# 张量并行分布式训练 +python -u -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" run_finetune.py ./config/llama/lora_argument.json ``` **Note:** @@ -107,11 +100,10 @@ python -u -m paddle.distributed.launch --gpus "0,1" finetune_generation.py ./ ``` # 单卡训练 -python finetune_generation.py ./llama/pt_argument.json +python run_finetune.py ./config/llama/pt_argument.json -# 张量并行分布式训练(ChatGLM2、OPT不支持张量并行) -# 将pt_argument.json中tensor_parallel_degree修改为2 -python -u -m paddle.distributed.launch --gpus "0,1" finetune_generation.py ./llama/pt_argument.json +# 张量并行分布式训练 +python -u -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" run_finetune.py ./config/llama/pt_argument.json ``` **Note:** @@ -198,7 +190,7 @@ python -u -m paddle.distributed.launch --gpus "0,1" finetune_generation.py ./ ## 4.分布式策略参数合并 -我们使用张量并行(TP,Tensor Parallelism)和 流水线并行(PP,Pipeline Parallelism)训练过程中,为了节省TP参数合并时间通常在中间checkpoint将参数存储为多个TP和PP参数分片,可以使用提供的分片合并参数脚本进行参数合并。 +**如果开启unified_checkpoint则不需要合参**。我们使用张量并行(TP,Tensor Parallelism)和 流水线并行(PP,Pipeline Parallelism)训练过程中,为了节省TP参数合并时间通常在中间checkpoint将参数存储为多个TP和PP参数分片,可以使用提供的分片合并参数脚本进行参数合并。 ``` python merge_tp_and_pp_params.py \ @@ -216,16 +208,18 @@ python merge_tp_and_pp_params.py \ 为了后续的**压缩**和**静态图推理**方便,我们提供LoRA参数合并脚本,可以将LoRA参数合并到主干模型并保存相应的权重。 ``` python merge_lora_params.py \ - --lora_path ./checkpoints/llama_lora_ckpts \ - --merge_lora_model_path ./checkpoints/llama_lora_merge \ + --model_name_or_path ./checkpoints/sft_ckpts \ + --lora_path ./checkpoints/lora_ckpts \ + --output_path ./checkpoints/lora_merge \ --device "gpu" \ - 
--low_gpu_mem True + --safe_serialization True ```   脚本参数介绍
- `lora_path`: LoRA参数和配置路径,对LoRA参数进行初始化,默认为None。 +- `model_name_or_path`: 必须,主干模型参数路径,默认为None。 - `output_path`: 必须,合并参数后保存路径,默认为None。 - `device`: 运行环境,默认为gpu。 -- `low_gpu_mem`:降低合参时候所需显存,默认为False。如果合参时显存不足,建议开启 +- `safe_serialization`: 是否保存为safetensor格式,默认为True。
diff --git a/llm/docs/inference.md b/llm/docs/inference.md index a20e3a32d614..9660778a22ef 100644 --- a/llm/docs/inference.md +++ b/llm/docs/inference.md @@ -17,7 +17,7 @@ PaddleNLP 提供了动态图推理和静态图推理两种方式,方便用户 ### 1.1 动态图推理 ```shell # 动态图模型推理命令参考 -python predictor.py --model_name_or_path meta-llama/Llama-2-7b-chat --data_file ./data/dev.json --dtype float16 +python ./predict/predictor.py --model_name_or_path meta-llama/Llama-2-7b-chat --data_file ./data/dev.json --dtype float16 ``` 对于LoRA、PrefixTuning 模型只需额外传入相应的lora_path或prefix_path即可,如:`--lora_path ./checkpoints/llama_lora_ckpts`或`--prefix_path ./checkpoints/llama_prefix_ckpts`,详见推理参数减少。 @@ -26,9 +26,9 @@ python predictor.py --model_name_or_path meta-llama/Llama-2-7b-chat --data_file ```shell # 静态图模型推理命令参考, LoRA需要先合并参数,Prefix Tuning暂不支持 # step1 : 静态图导出 -python export_model.py --model_name_or_path meta-llama/Llama-2-7b-chat --output_path ./inference --dtype float16 +python ./predict/export_model.py --model_name_or_path meta-llama/Llama-2-7b-chat --output_path ./inference --dtype float16 # step2: 静态图推理 -python predictor.py --model_name_or_path ./inference --data_file ./data/dev.json --dtype float16 --mode static +python ./predict/predictor.py --model_name_or_path ./inference --data_file ./data/dev.json --dtype float16 --mode static ``` ## 2. 
高性能模型推理 @@ -86,7 +86,7 @@ git clone https://github.com/PaddlePaddle/PaddleNLP #GPU设备安装自定义算子 cd ./paddlenlp/csrc && python setup_cuda.py install #XPU设备安装自定义算子 -cd ./paddlenlp/csrc/xpu/src && sh cmake_build.sh +cd ./paddlenlp/csrc/xpu/src && sh cmake_build.sh ``` ### 2.3 关闭BlockAttention的高性能推理 @@ -95,16 +95,16 @@ cd ./paddlenlp/csrc/xpu/src && sh cmake_build.sh ```shell # 动态图模型推理命令参考 -python predictor.py --model_name_or_path meta-llama/Llama-2-7b-chat --inference_model --dtype float16 +python ./predict/predictor.py --model_name_or_path meta-llama/Llama-2-7b-chat --inference_model --dtype float16 # PrefixTuning动态图推理参考 -python predictor.py --model_name_or_path meta-llama/Llama-2-7b-chat --inference_model --dtype float16 --export_precache true --prefix_path ./checkpoints/llama_prefix_ckpts +python ./predict/predictor.py --model_name_or_path meta-llama/Llama-2-7b-chat --inference_model --dtype float16 --export_precache true --prefix_path ./checkpoints/llama_prefix_ckpts # Weight Only Int8 动态图推理参考 -python predictor.py --model_name_or_path meta-llama/Llama-2-7b-chat --inference_model --dtype float16 --quant_type weight_only_int8 +python ./predict/predictor.py --model_name_or_path meta-llama/Llama-2-7b-chat --inference_model --dtype float16 --quant_type weight_only_int8 # PTQ-A8W8推理命令参考 -python predictor.py --model_name_or_path checkpoints/llama_ptq_ckpts --inference_model --dtype float16 +python ./predict/predictor.py --model_name_or_path checkpoints/llama_ptq_ckpts --inference_model --dtype float16 ``` **Note**: 1. 
LoRA 模型在推理之前是需要合并参数,详细可见:[合并 LoRA 参数](https://github.com/PaddlePaddle/PaddleNLP/blob/develop/llm/merge_lora_params.py)。 @@ -115,16 +115,16 @@ python predictor.py --model_name_or_path checkpoints/llama_ptq_ckpts --inference **step1:动转静** ```shell # 动转静命令参考 -python export_model.py --model_name_or_path meta-llama/Llama-2-7b-chat --inference_model --output_path ./inference --dtype float16 +python ./predict/export_model.py --model_name_or_path meta-llama/Llama-2-7b-chat --inference_model --output_path ./inference --dtype float16 # PrefixTuning动转静命令参考 -python export_model.py --model_name_or_path meta-llama/Llama-2-7b-chat --inference_model --output_path ./inference --dtype float16 --export_precache true +python ./predict/export_model.py --model_name_or_path meta-llama/Llama-2-7b-chat --inference_model --output_path ./inference --dtype float16 --export_precache true # Weight Only Int8 动转静命令参考 -python export_model.py --model_name_or_path meta-llama/Llama-2-7b-chat --inference_model --output_path ./inference --dtype float16 --quant_type weight_only_int8 +python ./predict/export_model.py --model_name_or_path meta-llama/Llama-2-7b-chat --inference_model --output_path ./inference --dtype float16 --quant_type weight_only_int8 # PTQ-A8W8动转静命令参考 -python export_model.py --model_name_or_path checkpoints/llama_ptq_ckpts --inference_model --output_path ./inference --dtype float16 +python ./predict/export_model.py --model_name_or_path checkpoints/llama_ptq_ckpts --inference_model --output_path ./inference --dtype float16 ``` **Note**: 1. 
LoRA 模型在推理之前是需要合并参数,详细可见:[合并 LoRA 参数](https://github.com/PaddlePaddle/PaddleNLP/blob/develop/llm/merge_lora_params.py)。 @@ -135,13 +135,13 @@ python export_model.py --model_name_or_path checkpoints/llama_ptq_ckpts --infere **step2:静态图推理** ```shell # 静态图推理命令参考 -python predictor.py --model_name_or_path ./inference --inference_model --quant_type weight_only_int8 --dtype "float16" --mode "static" +python ./predict/predictor.py --model_name_or_path ./inference --inference_model --quant_type weight_only_int8 --dtype "float16" --mode "static" # PrefixTuning静态图推理命令参考 -python predictor.py --model_name_or_path ./inference --inference_model --quant_type weight_only_int8 --dtype "float16" --mode "static" --export_precache true --prefix_path ./checkpoints/llama_prefix_ckpts +python ./predict/predictor.py --model_name_or_path ./inference --inference_model --quant_type weight_only_int8 --dtype "float16" --mode "static" --export_precache true --prefix_path ./checkpoints/llama_prefix_ckpts # Weight Only Int8 静态图推理命令参考 -python predictor.py --model_name_or_path ./inference --inference_model --quant_type weight_only_int8 --dtype "float16" --mode "static" --quant_type weight_only_int8 +python ./predict/predictor.py --model_name_or_path ./inference --inference_model --quant_type weight_only_int8 --dtype "float16" --mode "static" --quant_type weight_only_int8 # PTQ-A8W8静态图推理命令参考 # 以下环境变量用于开启int8矩阵乘的算法选择以获得更快的推理速度,打开之后第一次执行会执行算法选择从而导致速度较慢。 @@ -149,7 +149,7 @@ export FLAGS_use_autotune=1 export FLAGS_cublaslt_exhaustive_search_times=10 export FLAGS_cache_inference_while_scope=1 -python predictor.py --model_name_or_path ./inference --inference_model --quant_type weight_only_int8 --dtype "float16" --mode "static" +python ./predict/predictor.py --model_name_or_path ./inference --inference_model --quant_type weight_only_int8 --dtype "float16" --mode "static" ``` **Note**: 1. 
LoRA 模型在推理之前是需要合并参数,详细可见:[合并 LoRA 参数](https://github.com/PaddlePaddle/PaddleNLP/blob/develop/llm/merge_lora_params.py)。 @@ -164,50 +164,50 @@ python predictor.py --model_name_or_path ./inference --inference_model --quant_ ```shell # 动态图模型推理命令参考 -python predictor.py --model_name_or_path meta-llama/Llama-2-7b-chat --inference_model --dtype float16 --block_attn +python ./predict/predictor.py --model_name_or_path meta-llama/Llama-2-7b-chat --inference_model --dtype float16 --block_attn # XPU设备动态图模型推理命令参考 -python predictor.py --model_name_or_path meta-llama/Llama-2-7b-chat --inference_model --dtype float16 --block_attn --device xpu +python ./predict/predictor.py --model_name_or_path meta-llama/Llama-2-7b-chat --inference_model --dtype float16 --block_attn --device xpu # Weight Only Int8 动态图推理参考 -python predictor.py --model_name_or_path meta-llama/Llama-2-7b-chat --inference_model --dtype float16 --quant_type weight_only_int8 --block_attn +python ./predict/predictor.py --model_name_or_path meta-llama/Llama-2-7b-chat --inference_model --dtype float16 --quant_type weight_only_int8 --block_attn # PTQ-A8W8推理命令参考 -python predictor.py --model_name_or_path checkpoints/llama_ptq_ckpts --inference_model --dtype float16 --block_attn +python ./predict/predictor.py --model_name_or_path checkpoints/llama_ptq_ckpts --inference_model --dtype float16 --block_attn # CacheKV 动态量化推理命令参考 -python predictor.py --model_name_or_path meta-llama/Llama-2-7b-chat --inference_model --dtype float16 --block_attn --cachekv_int8 +python ./predict/predictor.py --model_name_or_path meta-llama/Llama-2-7b-chat --inference_model --dtype float16 --block_attn --cachekv_int8 ``` #### 2.4.2 静态图推理 **step1:动转静** ```shell # 动转静命令参考 -python export_model.py --model_name_or_path meta-llama/Llama-2-7b-chat --inference_model --output_path ./inference --dtype float16 --block_attn +python ./predict/export_model.py --model_name_or_path meta-llama/Llama-2-7b-chat --inference_model --output_path ./inference --dtype float16 
--block_attn # XPU设备动转静命令参考 -python export_model.py --model_name_or_path meta-llama/Llama-2-7b-chat --inference_model --output_path ./inference --dtype float16 --block_attn --device xpu +python ./predict/export_model.py --model_name_or_path meta-llama/Llama-2-7b-chat --inference_model --output_path ./inference --dtype float16 --block_attn --device xpu # Weight Only Int8 动转静命令参考 -python export_model.py --model_name_or_path meta-llama/Llama-2-7b-chat --inference_model --output_path ./inference --dtype float16 --quant_type weight_only_int8 --block_attn +python ./predict/export_model.py --model_name_or_path meta-llama/Llama-2-7b-chat --inference_model --output_path ./inference --dtype float16 --quant_type weight_only_int8 --block_attn # PTQ-A8W8动转静命令参考 -python export_model.py --model_name_or_path checkpoints/llama_ptq_ckpts --inference_model --output_path ./inference --dtype float16 --block_attn +python ./predict/export_model.py --model_name_or_path checkpoints/llama_ptq_ckpts --inference_model --output_path ./inference --dtype float16 --block_attn # CacheKV 动态量化动转静命令参考 -python export_model.py --model_name_or_path meta-llama/Llama-2-7b-chat --inference_model --output_path ./inference --dtype float16 --block_attn --cachekv_int8 +python ./predict/export_model.py --model_name_or_path meta-llama/Llama-2-7b-chat --inference_model --output_path ./inference --dtype float16 --block_attn --cachekv_int8 ``` **step2:静态图推理** ```shell # 静态图推理命令参考 -python predictor.py --model_name_or_path ./inference --inference_model --dtype "float16" --mode "static" --block_attn +python ./predict/predictor.py --model_name_or_path ./inference --inference_model --dtype "float16" --mode "static" --block_attn # XPU设备静态图推理命令参考 -python predictor.py --model_name_or_path ./inference --inference_model --dtype "float16" --mode "static" --block_attn --device xpu +python ./predict/predictor.py --model_name_or_path ./inference --inference_model --dtype "float16" --mode "static" --block_attn --device xpu # 
Weight Only Int8 静态图推理命令参考 -python predictor.py --model_name_or_path ./inference --inference_model --dtype "float16" --mode "static" --quant_type weight_only_int8 --block_attn +python ./predict/predictor.py --model_name_or_path ./inference --inference_model --dtype "float16" --mode "static" --quant_type weight_only_int8 --block_attn # PTQ-A8W8静态图推理命令参考 # 以下环境变量用于开启int8矩阵乘的算法选择以获得更快的推理速度,打开之后第一次执行会执行算法选择从而导致速度较慢。 @@ -215,10 +215,10 @@ export FLAGS_use_autotune=1 export FLAGS_cublaslt_exhaustive_search_times=10 export FLAGS_cache_inference_while_scope=1 -python predictor.py --model_name_or_path ./inference --inference_model --dtype "float16" --mode "static" --block_attn +python ./predict/predictor.py --model_name_or_path ./inference --inference_model --dtype "float16" --mode "static" --block_attn # CacheKV 动态量化8静态图推理命令参考 -python predictor.py --model_name_or_path ./inference --inference_model --dtype "float16" --mode "static" --cachekv_int8 --block_attn +python ./predict/predictor.py --model_name_or_path ./inference --inference_model --dtype "float16" --mode "static" --cachekv_int8 --block_attn ``` **Note**: 1. 
使用Weight Only Int8 推理需要额外传入 `quant_type`。 diff --git a/llm/docs/pretrain.rst b/llm/docs/pretrain.rst index 987e6c53f90d..d0fd203b97e3 100644 --- a/llm/docs/pretrain.rst +++ b/llm/docs/pretrain.rst @@ -68,10 +68,10 @@ git clone 代码到本地,即可开始。 cd ../model_zoo/gpt-3/external_ops/ && python3 setup.py install && cd - # llama 模型预训练 - python -u -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" run_pretrain.py ./llama/pretrain-llama2_7b-tp2sd4_stage2.json + python -u -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" run_pretrain.py ./config/llama/pretrain_argument.json # Qwen 模型预训练 - python -u -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" run_pretrain.py ./qwen/pretrain_argument_stage2.json + python -u -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" run_pretrain.py ./config/qwen/pretrain_argument.json 注意: diff --git a/llm/docs/quantization.md b/llm/docs/quantization.md index 101c18f4441a..eadaa77397a2 100644 --- a/llm/docs/quantization.md +++ b/llm/docs/quantization.md @@ -58,19 +58,19 @@ git clone 代码到本地,即可开始。 ### 2.3 PTQ 量化 ``` -python finetune_generation.py ./llama/ptq_argument.json +python run_finetune.py ./config/llama/ptq_argument.json ``` ### 2.4 GPTQ 量化 ``` -python finetune_generation.py ./llama/gptq_argument.json +python run_finetune.py ./config/llama/gptq_argument.json ``` ### 2.5 AWQ 量化 ``` -python finetune_generation.py ./llama/awq_argument.json +python run_finetune.py ./config/llama/awq_argument.json ``` ### 2.6 量化参数介绍 diff --git a/llm/ernie-3.5-se/README.md b/llm/experimental/ernie-3.5-se/README.md similarity index 100% rename from llm/ernie-3.5-se/README.md rename to llm/experimental/ernie-3.5-se/README.md diff --git a/llm/ernie-3.5-se/configuration.py b/llm/experimental/ernie-3.5-se/configuration.py similarity index 100% rename from llm/ernie-3.5-se/configuration.py rename to llm/experimental/ernie-3.5-se/configuration.py diff --git a/llm/ernie-3.5-se/conversion_utils.py b/llm/experimental/ernie-3.5-se/conversion_utils.py similarity index 
100% rename from llm/ernie-3.5-se/conversion_utils.py rename to llm/experimental/ernie-3.5-se/conversion_utils.py diff --git a/llm/ernie-3.5-se/data.py b/llm/experimental/ernie-3.5-se/data.py similarity index 100% rename from llm/ernie-3.5-se/data.py rename to llm/experimental/ernie-3.5-se/data.py diff --git a/llm/ernie-3.5-se/ernie-tokenizer/sentencepiece.bpe.model b/llm/experimental/ernie-3.5-se/ernie-tokenizer/sentencepiece.bpe.model similarity index 100% rename from llm/ernie-3.5-se/ernie-tokenizer/sentencepiece.bpe.model rename to llm/experimental/ernie-3.5-se/ernie-tokenizer/sentencepiece.bpe.model diff --git a/llm/ernie-3.5-se/ernie-tokenizer/special_tokens_map.json b/llm/experimental/ernie-3.5-se/ernie-tokenizer/special_tokens_map.json similarity index 100% rename from llm/ernie-3.5-se/ernie-tokenizer/special_tokens_map.json rename to llm/experimental/ernie-3.5-se/ernie-tokenizer/special_tokens_map.json diff --git a/llm/ernie-3.5-se/ernie-tokenizer/tokenizer_config.json b/llm/experimental/ernie-3.5-se/ernie-tokenizer/tokenizer_config.json similarity index 100% rename from llm/ernie-3.5-se/ernie-tokenizer/tokenizer_config.json rename to llm/experimental/ernie-3.5-se/ernie-tokenizer/tokenizer_config.json diff --git a/llm/ernie-3.5-se/ernie_dataset.py b/llm/experimental/ernie-3.5-se/ernie_dataset.py similarity index 100% rename from llm/ernie-3.5-se/ernie_dataset.py rename to llm/experimental/ernie-3.5-se/ernie_dataset.py diff --git a/llm/ernie-3.5-se/finetune_generation.py b/llm/experimental/ernie-3.5-se/finetune_generation.py similarity index 100% rename from llm/ernie-3.5-se/finetune_generation.py rename to llm/experimental/ernie-3.5-se/finetune_generation.py diff --git a/llm/ernie-3.5-se/modeling.py b/llm/experimental/ernie-3.5-se/modeling.py similarity index 100% rename from llm/ernie-3.5-se/modeling.py rename to llm/experimental/ernie-3.5-se/modeling.py diff --git a/llm/ernie-3.5-se/predict_generation.py 
b/llm/experimental/ernie-3.5-se/predict_generation.py similarity index 100% rename from llm/ernie-3.5-se/predict_generation.py rename to llm/experimental/ernie-3.5-se/predict_generation.py diff --git a/llm/ernie-3.5-se/run_pretrain.py b/llm/experimental/ernie-3.5-se/run_pretrain.py similarity index 100% rename from llm/ernie-3.5-se/run_pretrain.py rename to llm/experimental/ernie-3.5-se/run_pretrain.py diff --git a/llm/ernie-3.5-se/run_trainer_stage2.sh b/llm/experimental/ernie-3.5-se/run_trainer_stage2.sh similarity index 100% rename from llm/ernie-3.5-se/run_trainer_stage2.sh rename to llm/experimental/ernie-3.5-se/run_trainer_stage2.sh diff --git a/llm/ernie-3.5-se/tokenizer.py b/llm/experimental/ernie-3.5-se/tokenizer.py similarity index 100% rename from llm/ernie-3.5-se/tokenizer.py rename to llm/experimental/ernie-3.5-se/tokenizer.py diff --git a/llm/ernie-3.5-se/utils.py b/llm/experimental/ernie-3.5-se/utils.py similarity index 100% rename from llm/ernie-3.5-se/utils.py rename to llm/experimental/ernie-3.5-se/utils.py diff --git a/llm/llama/run_sharding_v2.sh b/llm/experimental/scripts/run_sharding_v2.sh similarity index 100% rename from llm/llama/run_sharding_v2.sh rename to llm/experimental/scripts/run_sharding_v2.sh diff --git a/llm/llama/run_trainer.sh b/llm/experimental/scripts/run_trainer.sh similarity index 100% rename from llm/llama/run_trainer.sh rename to llm/experimental/scripts/run_trainer.sh diff --git a/llm/llama/run_trainer_tp2cp2.sh b/llm/experimental/scripts/run_trainer_tp2cp2.sh similarity index 100% rename from llm/llama/run_trainer_tp2cp2.sh rename to llm/experimental/scripts/run_trainer_tp2cp2.sh diff --git a/llm/llama/run_trainer_tp4pp2.sh b/llm/experimental/scripts/run_trainer_tp4pp2.sh similarity index 100% rename from llm/llama/run_trainer_tp4pp2.sh rename to llm/experimental/scripts/run_trainer_tp4pp2.sh diff --git a/llm/llama/run_trainer_tp4sep2.sh b/llm/experimental/scripts/run_trainer_tp4sep2.sh similarity index 100% rename from 
llm/llama/run_trainer_tp4sep2.sh rename to llm/experimental/scripts/run_trainer_tp4sep2.sh diff --git a/llm/fused_layers.py b/llm/fused_layers.py deleted file mode 120000 index b183f45159cc..000000000000 --- a/llm/fused_layers.py +++ /dev/null @@ -1 +0,0 @@ -llama/fused_layers.py \ No newline at end of file diff --git a/llm/gemma/sft_argument_7b.json b/llm/gemma/sft_argument_7b.json deleted file mode 100644 index 16eba55bed9e..000000000000 --- a/llm/gemma/sft_argument_7b.json +++ /dev/null @@ -1,32 +0,0 @@ -{ - "model_name_or_path": "google/gemma-7b", - "dataset_name_or_path": "./data", - "output_dir": "./checkpoints/gemma_sft_ckpts", - "per_device_train_batch_size": 8, - "gradient_accumulation_steps": 1, - "per_device_eval_batch_size": 8, - "eval_accumulation_steps":1, - "num_train_epochs": 3, - "learning_rate": 3e-06, - "warmup_steps": 30, - "logging_steps": 1, - "evaluation_strategy": "epoch", - "save_strategy": "epoch", - "src_length": 512, - "max_length": 1024, - "bf16": true, - "fp16_opt_level": "O2", - "do_train": true, - "do_eval": true, - "do_predict": true, - "disable_tqdm": true, - "load_best_model_at_end": true, - "eval_with_do_generation": false, - "metric_for_best_model": "accuracy", - "recompute": true, - "save_total_limit": 1, - "tensor_parallel_degree": 8, - "pipeline_parallel_degree": 1, - "zero_padding": false, - "use_flash_attention": false -} \ No newline at end of file diff --git a/llm/gemma/sft_argument_7b_sharding.json b/llm/gemma/sft_argument_7b_sharding.json deleted file mode 100644 index ca04affdb243..000000000000 --- a/llm/gemma/sft_argument_7b_sharding.json +++ /dev/null @@ -1,33 +0,0 @@ -{ - "model_name_or_path": "google/gemma-7b", - "dataset_name_or_path": "./data", - "output_dir": "./checkpoints/llama_sft_ckpts", - "per_device_train_batch_size": 1, - "gradient_accumulation_steps": 1, - "per_device_eval_batch_size": 8, - "eval_accumulation_steps":1, - "num_train_epochs": 3, - "learning_rate": 3e-06, - "warmup_steps": 30, - 
"logging_steps": 1, - "evaluation_strategy": "epoch", - "save_strategy": "epoch", - "src_length": 1024, - "max_length": 2048, - "fp16": true, - "fp16_opt_level": "O2", - "do_train": true, - "do_eval": true, - "do_predict": true, - "disable_tqdm": true, - "load_best_model_at_end": true, - "eval_with_do_generation": false, - "metric_for_best_model": "accuracy", - "recompute": true, - "save_total_limit": 1, - "sharding_parallel_degree": 8, - "sharding": "stage3", - "pipeline_parallel_degree": 1, - "zero_padding": false, - "use_flash_attention": false -} \ No newline at end of file diff --git a/llm/gemma/sft_argument_sharding.json b/llm/gemma/sft_argument_sharding.json deleted file mode 100644 index d462645e2235..000000000000 --- a/llm/gemma/sft_argument_sharding.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "model_name_or_path": "google/gemma-2b/", - "dataset_name_or_path": "./data", - "output_dir": "./checkpoints/chatglm2_sft_ckpts", - "per_device_train_batch_size": 1, - "gradient_accumulation_steps": 1, - "per_device_eval_batch_size": 1, - "eval_accumulation_steps":1, - "num_train_epochs": 3, - "learning_rate": 3e-05, - "warmup_steps": 30, - "logging_steps": 1, - "evaluation_strategy": "epoch", - "save_strategy": "epoch", - "src_length": 512, - "max_length": 1024, - "fp16": true, - "fp16_opt_level": "O2", - "do_train": true, - "do_eval": true, - "disable_tqdm": true, - "load_best_model_at_end": true, - "eval_with_do_generation": false, - "metric_for_best_model": "accuracy", - "recompute": true, - "save_total_limit": 1, - "sharding_parallel_degree": 2, - "sharding": "stage3", - "zero_padding": false, - "use_flash_attention": false - } \ No newline at end of file diff --git a/llm/glm/README.md b/llm/glm/README.md deleted file mode 100644 index 86bc69d571e6..000000000000 --- a/llm/glm/README.md +++ /dev/null @@ -1,102 +0,0 @@ -# GLM - -## 1. 
模型介绍 - -[General Language Model (GLM)](https://arxiv.org/abs/2103.10360) 是以自回归填空作为训练目标的通用语言模型,可用于各类理解和生成任务。 - -现有预训练框架包括以 BERT 为代表的自编码模型,以 GPT 为代表的自回归模型和以 T5 为代表的编码-解码模型。但这些框架均不能完全支持自然语言理解、无条件生成和条件生成这三类主要任务。为了解决这一问题,我们提出了基于自回归填空任务的通用语言模型(GLM)。GLM 使用 2D 位置编码和任意顺序预测改进了填空预训练过程,在自然语言理解任务上超越了 BERT 和 T5。同时,GLM 的预训练过程基于多种任务,填空长度和数量各不相同。在自然语言理解、无条件生成和条件生成任务上,GLM 均超过了具有相同参数规模和训练数据量的 BERT、T5 和 GPT 模型。除此之外,GLM 还以 BERT Large 1.25 倍参数量的规模取得了当前最优的效果,证明了其在不同下游任务上良好的泛化能力。 - - -**支持模型权重:** - -| Model | -|----------------------------------| -| THUDM/glm-large-chinese | -| THUDM/glm-10b-chinese | - -## 3. 模型精调 - -### SFT - -``` -python -m paddle.distributed.launch --gpus "0,1,2,3" finetune_generation.py \ ---model_name_or_path THUDM/glm-large-chinese \ ---num_train_epochs 4 \ ---learning_rate 3e-5 \ ---warmup_ratio 0.06 \ ---weight_decay 0.1 \ ---label_smoothing 0.1 \ ---save_steps 100 \ ---logging_steps 1 \ ---eval_steps 100 \ ---output_dir ./checkpoints/glm-large-chinese \ ---src_length 608 \ ---tgt_length 160 \ ---min_tgt_length 55 \ ---length_penalty 0.7 \ ---no_repeat_ngram_size 3 \ ---num_beams 5 \ ---select_topk True \ ---per_device_eval_batch_size 2 \ ---per_device_train_batch_size 2 \ ---max_grad_norm 1.0 \ ---lr_scheduler_type linear \ ---fp16 \ ---fp16_opt_level O2 \ ---recompute \ ---do_train \ ---do_eval -``` - -### 单卡LoRA微调 - -``` -python finetune_generation.py \ ---model_name_or_path THUDM/glm-large-chinese \ ---num_train_epochs 4 \ ---learning_rate 3e-5 \ ---warmup_ratio 0.06 \ ---weight_decay 0.1 \ ---label_smoothing 0.1 \ ---save_steps 100 \ ---logging_steps 1 \ ---eval_steps 100 \ ---output_dir ./checkpoints/glm-large-chinese \ ---src_length 608 \ ---tgt_length 160 \ ---min_tgt_length 55 \ ---length_penalty 0.7 \ ---no_repeat_ngram_size 3 \ ---num_beams 5 \ ---select_topk True \ ---per_device_eval_batch_size 2 \ ---per_device_train_batch_size 2 \ ---max_grad_norm 1.0 \ ---lr_scheduler_type linear \ ---fp16 \ ---fp16_opt_level O2 \ ---recompute \ ---do_train \ 
---do_eval \ ---lora True -``` - -其中参数释义如下: - -- `model_name_or_path`: 预训练模型内置名称或者模型所在目录,默认为`THUDM/glm-large-chinese`。 -- `src_length`: 上下文的最大输入长度,默认为608. -- `tgt_length`: 生成文本的最大长度,默认为160. -- `min_tgt_length`: 生成文本的最小长度,默认为55. -- `length_penalty`: 生成解码时的长度惩罚因子,默认为0.7. -- `num_beams`: 搜索方向数量,默认为5。 -- `label_smoothing`: 标签平滑因子,默认为0.1. -- `lr_decay_ratio`: 学习率衰减因子,默认为0.1. -- `lora`: 是否使用LoRA技术. - - -## 3.4 动态图推理 - -``` -python predict_generation.py \ - --model_name_or_path THUDM/glm-large-chinese -``` diff --git a/llm/glm/data.py b/llm/glm/data.py deleted file mode 100644 index 40f5f3320a64..000000000000 --- a/llm/glm/data.py +++ /dev/null @@ -1,67 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import numpy as np - - -def custom_convert_example(example, tokenizer, data_args, is_test=True): - source = None - title = None - target = None - if "source" in example and "title" in example: - source = example["source"] - if "title" in example.keys(): - title = example["title"] - elif "context" in example and "answer" in example: - source = example["context"] - if "answer" in example.keys(): - title = example["answer"] - else: - assert False, "Source and title are not in the input dictionary, nor are context and answer." 
- if "target" in example.keys(): - target = example["target"] - elif "question" in example.keys(): - target = example["question"] - example["text_a"] = "答案:" + title + "," + "上下文:" + source - example["text_b"] = "在已知答案的前提下,问题:" + target - inputs = tokenizer.encode(example["text_a"], max_length=data_args.src_length - 1, truncation=True) - inputs["input_ids"] = inputs["input_ids"][:-1] + [tokenizer.gmask_token_id] + inputs["input_ids"][-1:] - pad_length = data_args.src_length - len(inputs["input_ids"]) - inputs["input_ids"] = np.array([inputs["input_ids"] + [tokenizer.pad_token_id] * pad_length]) - inputs["attention_mask"] = np.array([inputs["attention_mask"] + [1] + [0] * pad_length]) - sep = inputs["input_ids"].shape[1] - inputs = tokenizer.build_inputs_for_generation( - inputs, - max_gen_length=data_args.tgt_length, - targets=" " + example["text_b"] if not is_test else None, - padding="max_length", - ) - - for input_name in inputs.keys(): - inputs[input_name] = inputs[input_name].squeeze(0) - if is_test: - inputs["position_ids"] = inputs["position_ids"][:, : inputs["input_ids"].shape[-1]] - labels = tokenizer.encode( - " " + example["text_b"], add_special_tokens=False, max_length=data_args.tgt_length - 1 - )["input_ids"] - loss_mask = [0] * sep + [1] * len(labels) + [0] * (data_args.tgt_length - len(labels)) - labels = ( - [0] * sep - + labels - + [tokenizer.eop_token_id] - + [tokenizer.pad_token_id] * (data_args.tgt_length - len(labels) - 1) - ) - inputs["label_ids"] = labels - inputs["loss_mask"] = loss_mask - return inputs diff --git a/llm/glm/finetune_generation.py b/llm/glm/finetune_generation.py deleted file mode 100644 index e8779d68f3ee..000000000000 --- a/llm/glm/finetune_generation.py +++ /dev/null @@ -1,188 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import sys -from dataclasses import dataclass, field -from functools import partial - -import paddle -from data import custom_convert_example -from utils import GLMTrainer - -from paddlenlp.data import DefaultDataCollator -from paddlenlp.datasets import load_dataset -from paddlenlp.metrics import BLEU, Rouge1, Rouge2, RougeL -from paddlenlp.peft import LoRAConfig, LoRAModel -from paddlenlp.trainer import PdArgumentParser, TrainingArguments, get_last_checkpoint -from paddlenlp.transformers import AutoModelForConditionalGeneration, AutoTokenizer -from paddlenlp.utils.log import logger - - -@dataclass -class DataArgument: - task_name: str = field(default="dureader_qg", metadata={"help": "The name of task."}) - src_length: int = field(default=608, metadata={"help": "The max length of source text."}) - tgt_length: int = field(default=160, metadata={"help": "The max length of target text."}) - min_tgt_length: int = field(default=55, metadata={"help": "The min length of target text."}) - length_penalty: float = field(default=0.7, metadata={"help": "The length penalty."}) - no_repeat_ngram_size: int = field(default=3, metadata={"help": "The no repeat ngram size."}) - num_beams: int = field(default=5, metadata={"help": "The number of beams."}) - select_topk: bool = field(default=True, metadata={"help": "Whether to select top k tokens for generation."}) - top_p: float = field( - default=0.0, metadata={"help": "The cumulative probability for top-p-filtering in the 'sampling' strategy."} - ) - top_k: int = field( - default=0, - metadata={ - "help": "The 
number of highest probability tokens to keep for top-k-filtering in the 'sampling' strategy." - }, - ) - no_block_position: bool = field(default=False) - - -@dataclass -class ModelArgument: - model_name_or_path: str = field( - default="THUDM/glm-2b", metadata={"help": "Build-in pretrained model name or the path to local model."} - ) - label_smoothing: float = field(default=0.1, metadata={"help": "The label smoothing parameter."}) - lr_decay_ratio: float = field(default=0.1, metadata={"help": "The ratio for learning rate decrease"}) - lora: bool = field(default=False, metadata={"help": "Whether to use LoRA technique"}) - - -def main(): - parser = PdArgumentParser((ModelArgument, DataArgument, TrainingArguments)) - if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): - model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) - else: - model_args, data_args, training_args = parser.parse_args_into_dataclasses() - - training_args.print_config(model_args, "Model") - training_args.print_config(data_args, "Data") - setattr(training_args, "label_smoothing", model_args.label_smoothing) - setattr(training_args, "lr_decay_ratio", model_args.lr_decay_ratio) - - paddle.set_device(training_args.device) - - # Log on each process the small summary: - logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, world_size: {training_args.world_size}, " - + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16 or training_args.bf16}" - ) - - # Detecting last checkpoint. - last_checkpoint = None - if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: - last_checkpoint = get_last_checkpoint(training_args.output_dir) - if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 1: - raise ValueError( - f"Output directory ({training_args.output_dir}) already exists and is not empty. 
" - "Use --overwrite_output_dir to overcome." - ) - elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: - logger.info( - f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " - "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." - ) - - dtype = None - if training_args.fp16_opt_level == "O2": - if training_args.fp16: - dtype = "float16" - if training_args.bf16: - dtype = "bfloat16" - - # Load the pretrained language model. - model = AutoModelForConditionalGeneration.from_pretrained( - model_args.model_name_or_path, - output_predict=True, - parallel_output=True, - dtype=dtype, # todo enable set dtype to avoid additional mem usage - tensor_parallel_degree=training_args.tensor_parallel_degree, - tensor_parallel_rank=training_args.tensor_parallel_rank, - ) - if model_args.lora: - # TODO: hardcode parameters for now. Change after MergedLoRA is introduced - lora_config = LoRAConfig( - target_modules=[".*query_key_value.*"], - r=4, - lora_alpha=8, - merge_weights=True, - tensor_parallel_degree=training_args.tensor_parallel_degree, - dtype=dtype, - ) - model = LoRAModel(model, lora_config) - model.mark_only_lora_as_trainable() - model.print_trainable_parameters() - - tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path) - - # Load the dataset. 
- train_ds, dev_ds = load_dataset(data_args.task_name, splits=["train", "dev"]) - trans_func = partial(custom_convert_example, tokenizer=tokenizer, data_args=data_args) - train_ds = train_ds.map(partial(trans_func, is_test=False)) - test_ds = dev_ds.map(trans_func) - - collate_fn = DefaultDataCollator() - - def compute_metrics(eval_preds): - rouge1 = Rouge1() - rouge2 = Rouge2() - rougel = RougeL() - bleu4 = BLEU(n_size=4) - predictions = [x[x != -100] for x in eval_preds.predictions] - references = [x[x != -100] for x in eval_preds.label_ids] - - # for pred in predictions: - - rouge1_score = rouge1.score(predictions, references) - rouge2_score = rouge2.score(predictions, references) - for pred, ref in zip(predictions, references): - rougel.add_inst(pred, [ref]) - bleu4.add_inst(pred, [ref]) - return { - "rouge1": rouge1_score, - "rouge2": rouge2_score, - "rougel": rougel.score(), - "bleu4": bleu4.score(), - } - - trainer = GLMTrainer( - model=model, - args=training_args, - train_dataset=train_ds, - eval_dataset=dev_ds, - tokenizer=tokenizer, - compute_metrics=compute_metrics, - do_generation=True, - data_collator=collate_fn, - ) - if training_args.fp16_opt_level == "O2": - trainer.disable_autocast_context_manager() - - if training_args.do_train: - train_result = trainer.train(resume_from_checkpoint=last_checkpoint) - trainer.save_model(merge_tensor_parallel=training_args.tensor_parallel_degree > 1) - trainer.log_metrics("train", train_result.metrics) - trainer.save_metrics("train", train_result.metrics) - trainer.save_state() - - if training_args.do_eval: - eval_result = trainer.evaluate(test_ds) - trainer.log_metrics("test", eval_result) - - -if __name__ == "__main__": - main() diff --git a/llm/glm/predict_generation.py b/llm/glm/predict_generation.py deleted file mode 100644 index 41dd6b3459af..000000000000 --- a/llm/glm/predict_generation.py +++ /dev/null @@ -1,151 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle -from paddle.distributed import fleet - -from paddlenlp.peft import LoRAConfig, LoRAModel -from paddlenlp.transformers import ( - AutoConfig, - AutoModelForConditionalGeneration, - AutoTokenizer, -) - - -def parse_arguments(): - import argparse - - parser = argparse.ArgumentParser() - parser.add_argument( - "--model_name_or_path", default="THUDM/glm-large-chinese", required=True, help="The directory of model." - ) - parser.add_argument("--lora_path", default=None, help="The directory of LoRA parameters. Default to None") - parser.add_argument( - "--merge_tensor_parallel_path", default=None, help="The directory of model to merge tensor parallel parts." 
- ) - parser.add_argument("--batch_size", type=int, default=2, help="The batch size of data.") - parser.add_argument("--src_length", type=int, default=200, help="The batch size of data.") - parser.add_argument("--tgt_length", type=int, default=20, help="The batch size of data.") - return parser.parse_args() - - -def batchfy_text(texts, batch_size): - batch_texts = [] - batch_start = 0 - while batch_start < len(texts): - batch_texts += [texts[batch_start : min(batch_start + batch_size, len(texts))]] - batch_start += batch_size - return batch_texts - - -class Predictor(object): - def __init__(self, args): - self.tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path) - self.batch_size = args.batch_size - self.args = args - - tensor_parallel_degree = paddle.distributed.get_world_size() - tensor_parallel_rank = 0 - if tensor_parallel_degree > 1: - strategy = fleet.DistributedStrategy() - strategy.hybrid_configs = { - "dp_degree": 1, - "mp_degree": tensor_parallel_degree, - "pp_degree": 1, - "sharding_degree": 1, - } - fleet.init(is_collective=True, strategy=strategy) - hcg = fleet.get_hybrid_communicate_group() - tensor_parallel_rank = hcg.get_model_parallel_rank() - - if self.args.lora_path is not None: - lora_config = LoRAConfig.from_pretrained(self.args.lora_path) - dtype = lora_config.dtype - else: - config = AutoConfig.from_pretrained(args.model_name_or_path) - dtype = config.dtype if config.dtype is not None else "float32" - - self.model = AutoModelForConditionalGeneration.from_pretrained( - args.model_name_or_path, - tensor_parallel_degree=tensor_parallel_degree, - tensor_parallel_rank=tensor_parallel_rank, - dtype=dtype, - ) - if self.args.lora_path is not None: - self.model = LoRAModel.from_pretrained(self.model, self.args.lora_path) - self.model.eval() - - def preprocess(self, input_text): - input_text = [text.strip() + "[gMASK]" for text in input_text] - inputs = self.tokenizer( - input_text, - return_tensors="np", - add_special_tokens=True, - 
padding=True, - max_length=self.args.src_length, - truncation=True, - truncation_side="left", - ) - inputs = self.tokenizer.build_inputs_for_generation(inputs, max_gen_length=self.args.tgt_length) - inputs_tensor = {} - for key, value in inputs.items(): - inputs_tensor[key] = paddle.to_tensor(value) - return inputs_tensor - - def infer(self, inputs): - result = self.model.generate( - **inputs, - decode_strategy="sampling", - top_k=1, - max_length=self.args.tgt_length, - eos_token_id=self.tokenizer.eop_token_id, - pad_token_id=self.tokenizer.pad_token_id, - ) - result = result[0] - return result - - def postprocess(self, infer_data): - result = [] - for x in infer_data.tolist(): - res = self.tokenizer.decode(x, skip_special_tokens=True) - result.append(res) - out_dict = {"result": result} - return out_dict - - def predict(self, texts): - input_map = self.preprocess(texts) - infer_result = self.infer(input_map) - output = self.postprocess(infer_result) - return output - - -if __name__ == "__main__": - args = parse_arguments() - predictor = Predictor(args) - all_texts = [ - "答案:年基准利率4.35%,上下文:从实际看,贷款的基本条件是: 一是中国大陆居民,年龄在60岁以下; 二是有稳定的住址和工作或经营地点; 三是有稳定的收入来源; 四是无不良信用记录,贷款用途不能作为炒股,赌博等行为; 五是具有完全民事行为能力。在已知答案的前提下,问题:", - "答案:U系列,上下文:U系列是最好的,采用国际顶尖技术(由格力自主研发)双级变频压缩机,提高压缩机运转效率,制冷制热能力更强劲;1赫兹变频技术,使空调相当于一个15 W电灯泡,更加节能省电;送风面积广,风力大;生态风,净化空气。非常不错,现在国美在做活动,可以了解一下。在已知答案的前提下,问题:", - ] - batch_texts = batchfy_text(all_texts, args.batch_size) - for bs, texts in enumerate(batch_texts): - outputs = predictor.predict(texts) - for text, result in zip(texts, outputs["result"]): - print("{}\n{}".format(text, result)) - - if args.merge_tensor_parallel_path is not None: - predictor.model.save_pretrained( - save_dir=args.merge_tensor_parallel_path, - merge_tensor_parallel=True, - ) - predictor.tokenizer.save_pretrained(args.merge_tensor_parallel_path) diff --git a/llm/glm/utils.py b/llm/glm/utils.py deleted file mode 100644 index d3b9e8919aa7..000000000000 --- a/llm/glm/utils.py +++ /dev/null @@ 
-1,79 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from typing import Any, Dict, List, Optional, Tuple, Union - -import numpy as np -import paddle -import paddle.nn as nn - -from paddlenlp.trainer import Trainer - - -class GLMTrainer(Trainer): - def __init__(self, do_generation: bool, **kwargs): - super().__init__(**kwargs) - self.do_generation = do_generation - - def prediction_step( - self, - model: nn.Layer, - inputs: Dict[str, Union[paddle.Tensor, Any]], - prediction_loss_only: bool, - ignore_keys: Optional[List[str]] = None, - ) -> Tuple[Optional[paddle.Tensor], Optional[paddle.Tensor], Optional[paddle.Tensor]]: - - if not self.do_generation: - return super().prediction_step(model, inputs, prediction_loss_only, ignore_keys) - - model.eval() - with paddle.no_grad(): - tokens = model.generate( - input_ids=inputs["input_ids"], - position_ids=inputs["position_ids"], - attention_mask=inputs["attention_mask"], - decode_strategy="sampling", - top_k=1, - repetition_penalty=2.0, - bos_token_id=self.tokenizer.sop_token_id, - eos_token_id=self.tokenizer.eop_token_id, - pad_token_id=self.tokenizer.pad_token_id, - )[0] - all_preds = [] - for pred_tokens in tokens: - all_preds.append(pred_tokens[pred_tokens != self.tokenizer.pad_token_id].tolist()) - max_pred_length = max([len(x) for x in all_preds]) - for index, preds in enumerate(all_preds): - all_preds[index] = preds + [-100] * (max_pred_length - 
len(preds)) - - all_labels = [] - for label, mask in zip(inputs["labels"].numpy(), inputs["loss_mask"].numpy()): - label = label[mask.astype("bool")] - label = [x for x in label[label != self.tokenizer.pad_token_id]] - all_labels.append(label) - max_label_length = max([len(x) for x in all_labels]) - for index, labels in enumerate(all_labels): - all_labels[index] = labels + [-100] * (max_label_length - len(labels)) - - return (None, paddle.to_tensor(all_preds), paddle.to_tensor(all_labels)) - - def log(self, logs: Dict[str, float], **kwargs) -> None: - - if self.state.epoch is not None: - logs["epoch"] = round(self.state.epoch, 4) - - if "eval_loss" in logs: - logs["eval_ppl"] = np.exp(logs["eval_loss"]) - output = {**logs, **{"step": self.state.global_step}} - self.state.log_history.append(output) - self.control = self.callback_handler.on_log(self.args, self.state, self.control, logs, **kwargs) diff --git a/llm/gpt-3/README.md b/llm/gpt-3/README.md deleted file mode 100644 index a0c387158d43..000000000000 --- a/llm/gpt-3/README.md +++ /dev/null @@ -1,205 +0,0 @@ -# GPT - -## 1. 模型介绍 - -GPT-3是一种预训练语言模型,它能够模拟人类语言思维和表达。GPT-3拥有巨大的参数,包含了1750亿个参数,这使得它具有强大的语言理解和生成能力。它可以完成的任务包括文本生成、文本摘要、回答问题、翻译、阅读理解等。GPT-3的预训练过程使用了大量的语料库,包括互联网上的大量文本。它通过分析这些文本,学习如何生成和理解人类语言。GPT-3在自然语言处理领域具有很高的影响力,它可以模拟人类对话和生成文本,这使得它在许多应用领域都有广泛的应用,比如智能客服、自然语言处理、游戏设计等。 - -## 2. 预训练 - -预训练数据制作参考[此处](../../model_zoo/ernie-1.0/preprocess/docs/OpenWebText2.md) - -为了方便用户运行测试本模型,本项目提供了处理好的100k条doc的训练样本: -```shell -wget https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_ids.npy -wget https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_idx.npz -``` - -将所有预处理得到的文件统一放入一个文件夹中,以备训练使用: - -``` -mkdir data -mv gpt_en_dataset_300m_ids.npy ./data -mv gpt_en_dataset_300m_idx.npz ./data -``` - -注意: -1. 需要paddle develop版本训练,需要安装`pip install tool_helpers visualdl==2.5.3`等相关缺失whl包 -2. `use_flash_attention` 需要在A100机器开启。建议使用cuda11.8环境。 - -使用下面脚本,即可在gpt2-medium-en的基础上,继续训练. 
-```shell -task_name="gpt3_hybrid" -export PYTHONPATH="../../PaddleNLP/" -export FLAGS_cudnn_deterministic=True -log_dir="log" -rm -rf $log_dir - -python -u -m paddle.distributed.launch \ - --gpus "0,1,2,3,4,5,6,7" \ - --log_dir ${log_dir} \ - run_pretrain.py \ - --model_name_or_path gpt2-medium-en \ - --tokenizer_name_or_path gpt2-medium-en \ - --input_dir "./data" \ - --output_dir "output/$task_name" \ - --split 949,50,1 \ - --max_seq_length 1024 \ - --per_device_train_batch_size 1 \ - --per_device_eval_batch_size 1 \ - --tensor_parallel_degree 1 \ - --pipeline_parallel_degree 1 \ - --sequence_parallel 0 \ - --fuse_attention_qkv 0 \ - --use_flash_attention 0 \ - --fp16 \ - --fp16_opt_level "O2" \ - --scale_loss 1024 \ - --learning_rate 0.00001 \ - --min_learning_rate 0.000005 \ - --max_steps 10000 \ - --save_steps 5000 \ - --weight_decay 0.01 \ - --warmup_ratio 0.01 \ - --max_grad_norm 1.0 \ - --logging_steps 1\ - --continue_training \ - --dataloader_num_workers 1 \ - --sharding "stage2" \ - --eval_steps 1000 \ - --report_to "visualdl" \ - --disable_tqdm true \ - --recompute 1 \ - --gradient_accumulation_steps 2 \ - --do_train \ - --do_eval \ - --device "gpu" -``` - -其中参数释义如下: - -- `model_name_or_path`: 预训练模型内置名称或者模型所在目录,默认为`gpt2-medium-en`。 -- `tokenizer_name_or_path`: tokenizer名称或者tokenizer所在目录,默认为`gpt2-medium-en`。 -- `input_dir`: 预训练数据所在目录。 -- `output_dir`: 模型参数及日志保存目录。 -- `split`: 预训练数据切分比例,默认为949,50,1。 -- `max_seq_length`: 预训练最大序列长度,默认为1024。 -- `per_device_train_batch_size`: 单卡训练batch_size大小,默认为1。 -- `per_device_eval_batch_size`: 单卡评估batch_size大小,默认为1。 -- `tensor_parallel_degree`: 模型并行数量。 -- `pipeline_parallel_degree`: 流水线并行数量。 -- `sequence_parallel`: 序列并行数量。需要当`tensor_parallel_degree>1`时,使用序列并行。注意:当模型规模较小、batch_size较小、sequence_length较小时,不建议使用序列并行。 -- `fuse_attention_qkv`:在MultiHeadAttention中使用qkv线性层融合 -- `use_flash_attention`:使用flash attention技术,注意此处需要在A100机器开启, 建议使用cuda11.8环境。 -- `fp16`: 使用 float16 精度进行模型训练和推理。 -- `fp16_opt_level`: float16 精度训练模式,`O2`表示纯 
float16 训练。 -- `scale_loss`: float16 精度训练时,损失值的缩放比例。微调时建议使用1024,预训练时建议调大。 -- `learning_rate`: 参数更新的学习率。 -- `min_learning_rate`: 最小学习率。 -- `max_steps`: 模型训练步数。 -- `save_steps`: 模型参数保存的间隔步数。 -- `weight_decay`: 权重衰减系数。 -- `warmup_ratio`: warmup比例。 -- `max_grad_norm`: 梯度裁剪系数。 -- `logging_steps`: 训练日志打印的间隔步数。 -- `continue_training`: 是否继续训练模型。 -- `dataloader_num_workers`: dataloader进程数。 -- `sharding`: sharding切分策略,包含stage1、stage2、stage3。 -- `eval_steps`: 模型评估的间隔步数。 -- `recompute`: 使用重计算策略,开启后可节省训练显存。 -- `gradient_accumulation_steps`: 模型参数梯度累积的步数,可用于扩大 batch size。实际的 batch_size = per_device_train_batch_size * gradient_accumulation_steps。 -- `do_train`: 是否训练模型。 -- `do_eval`: 是否评估模型。 -- `lora`: 是否使用LoRA技术。 - - - - -## 3. 微调 -### SFT - -```shell -task_name="gpt3_hybrid" -export PYTHONPATH="../../PaddleNLP/" -export FLAGS_cudnn_deterministic=True -log_dir="log" -rm -rf $log_dir - -python -u -m paddle.distributed.launch \ - --gpus "0" \ - --log_dir ${log_dir} \ - finetune_generation.py \ - --model_name_or_path gpt2-medium-en \ - --output_dir "output/$task_name" \ - --per_device_train_batch_size 2 \ - --per_device_eval_batch_size 1 \ - --tensor_parallel_degree 1 \ - --pipeline_parallel_degree 1 \ - --sequence_parallel 0 \ - --fp16 \ - --fp16_opt_level "O2" \ - --scale_loss 1024 \ - --learning_rate 0.00001 \ - --max_steps 10000 \ - --save_steps 5000 \ - --weight_decay 0.01 \ - --warmup_ratio 0.01 \ - --max_grad_norm 1.0 \ - --logging_steps 1\ - --dataloader_num_workers 1 \ - --sharding "stage2" \ - --eval_steps 1000 \ - --report_to "visualdl" \ - --disable_tqdm true \ - --recompute 1 \ - --gradient_accumulation_steps 2 \ - --do_train \ - --do_eval \ - --device "gpu" -``` - -### LoRA - -```shell -export PYTHONPATH="../../PaddleNLP/" -export FLAGS_cudnn_deterministic=True -log_dir="log" -rm -rf $log_dir - -python finetune_generation.py \ - --model_name_or_path gpt2-medium-en \ - --output_dir "output/$task_name" \ - --per_device_train_batch_size 2 \ - --per_device_eval_batch_size 1 
\ - --tensor_parallel_degree 1 \ - --pipeline_parallel_degree 1 \ - --sequence_parallel 0 \ - --fp16 \ - --fp16_opt_level "O2" \ - --scale_loss 1024 \ - --learning_rate 3e-4 \ - --max_steps 10000 \ - --save_steps 5000 \ - --weight_decay 0.01 \ - --warmup_ratio 0.01 \ - --max_grad_norm 1.0 \ - --logging_steps 1\ - --dataloader_num_workers 1 \ - --sharding "stage2" \ - --eval_steps 1000 \ - --report_to "visualdl" \ - --disable_tqdm true \ - --recompute 1 \ - --gradient_accumulation_steps 2 \ - --do_train \ - --do_eval \ - --device "gpu" \ - --lora -``` - - -## 3. 动态图推理 - -```shell -python predict_generation.py - -``` diff --git a/llm/gpt-3/finetune_generation.py b/llm/gpt-3/finetune_generation.py deleted file mode 100644 index 0d0df71d8100..000000000000 --- a/llm/gpt-3/finetune_generation.py +++ /dev/null @@ -1,250 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -import sys -from dataclasses import dataclass, field -from functools import partial - -import paddle -from utils import ( - DataCollatorForSupervisedDataset, - GPTTrainer, - compute_metrics, - convert_example, -) - -from paddlenlp.datasets import load_dataset -from paddlenlp.peft import LoRAConfig, LoRAModel -from paddlenlp.trainer import ( - PdArgumentParser, - TrainingArguments, - get_last_checkpoint, - set_seed, -) -from paddlenlp.transformers import ( - AutoTokenizer, - GPTConfig, - GPTForCausalLM, - GPTForCausalLMPipe, -) -from paddlenlp.utils.log import logger - -MODEL_CLASSES = { - "gpt": (GPTConfig, GPTForCausalLM), -} - - -@dataclass -class DataArgument: - task_name: str = field(default="squad", metadata={"help": "The name of task."}) - src_length: int = field(default=1024, metadata={"help": "The max length of source text."}) - tgt_length: int = field(default=142, metadata={"help": "The max length of target text."}) - generate_num: int = field(default=0, metadata={"help": "Save first k examples generation result in dev dataset"}) - - -@dataclass -class ModelArgument: - model_type: str = field( - default="gpt-cn", metadata={"help": "Build-in pretrained model from the different model type."} - ) - model_name_or_path: str = field( - default="gpt-cpm-large-cn", metadata={"help": "Build-in pretrained model name or the path to local model."} - ) - use_flash_attn: bool = field(default=False, metadata={"help": "Whether to use flash attention"}) - enable_fuse_transformer: bool = field( - default=False, - metadata={"help": "gpt, enable_fuse_transformer"}, - ) - - fuse_attention_qkv: bool = field( - default=False, - metadata={"help": "gpt, fuse_attention_qkv"}, - ) - eval_with_do_generation: bool = field( - default=True, metadata={"help": "Evaluate with generation, instead for calc loss."} - ) - lr_decay_ratio: float = field(default=0.1, metadata={"help": "The ratio for learning rate decrease"}) - # lora - lora: bool = field(default=False, 
metadata={"help": "Whether to use LoRA technique"}) - lora_path: str = field(default=None, metadata={"help": "Initialize lora state dict."}) - lora_rank: int = field(default=8, metadata={"help": "Lora attention dimension"}) - merge_weights: bool = field( - default=False, metadata={"help": "Merge weights of the original model and the Lora model"} - ) - - -def main(): - parser = PdArgumentParser((ModelArgument, DataArgument, TrainingArguments)) - if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): - model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) - else: - model_args, data_args, training_args = parser.parse_args_into_dataclasses() - # data_args.always_pad_to_max_length = False - data_args.always_pad_to_max_length = training_args.pipeline_parallel_degree > 1 - setattr(training_args, "lr_decay_ratio", model_args.lr_decay_ratio) - - training_args.print_config(model_args, "Model") - training_args.print_config(data_args, "Data") - training_args.tgt_length = data_args.tgt_length - paddle.set_device(training_args.device) - - set_seed(seed=training_args.seed) - - # Log on each process the small summary: - logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, world_size: {training_args.world_size}, " - + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16 or training_args.bf16}" - ) - - # Detecting last checkpoint. - last_checkpoint = None - if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: - last_checkpoint = get_last_checkpoint(training_args.output_dir) - if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 1: - raise ValueError( - f"Output directory ({training_args.output_dir}) already exists and is not empty. " - "Use --overwrite_output_dir to overcome." 
- ) - elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: - logger.info( - f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " - "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." - ) - - # Set the dtype for loading model - dtype = "float32" - if training_args.fp16_opt_level == "O2": - if training_args.fp16: - dtype = "float16" - if training_args.bf16: - dtype = "bfloat16" - - config_class, model_class = MODEL_CLASSES[model_args.model_type] - if training_args.pipeline_parallel_degree > 1: - model_class = GPTForCausalLMPipe - # Load the tokenizer - tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path) - tokenizer.padding_side = "left" - - # Load and set the pretrained configuration - config = config_class.from_pretrained(model_args.model_name_or_path) - config.enable_fuse_transformer = model_args.enable_fuse_transformer - config.fuse_attention_qkv = model_args.fuse_attention_qkv - config.use_flash_attn = model_args.use_flash_attn - config.use_recompute = training_args.recompute - - config.tensor_parallel_degree = training_args.tensor_parallel_degree - config.tensor_parallel_rank = training_args.tensor_parallel_rank - config.ignore_index = tokenizer.pad_token_id - - model = model_class.from_pretrained( - model_args.model_name_or_path, - config=config, - dtype=dtype, - ) - if model_args.lora: - if model_args.lora_path is None: - target_modules = [ - ".*qkv_proj.*", - ".*q_proj.*", - ".*k_proj.*", - ".*v_proj.*", - ".*linear1.*", - ".*linear2.*", - ".*out_proj.*", - ] - lora_config = LoRAConfig( - target_modules=target_modules, - r=model_args.lora_rank, - lora_alpha=2 * model_args.lora_rank, - merge_weights=model_args.merge_weights, - tensor_parallel_degree=training_args.tensor_parallel_degree, - dtype=dtype, - ) - model = LoRAModel(model, lora_config) - else: - model = LoRAModel.from_pretrained(model=model, lora_path=model_args.lora_path) - 
model.mark_only_lora_as_trainable() - model.print_trainable_parameters() - - # Load the dataset. - if training_args.do_train or training_args.do_eval: - train_ds, dev_ds = load_dataset(data_args.task_name, splits=["train_v1", "dev_v1"]) - trans_func = partial( - convert_example, - tokenizer=tokenizer, - max_source_length=data_args.src_length, - max_target_length=data_args.tgt_length, - ) - - if training_args.do_train: - train_ds = train_ds.map(partial(trans_func)) - if training_args.do_eval: - is_test = model_args.eval_with_do_generation - dev_ds = dev_ds.map(partial(trans_func, is_test=is_test)) - - collate_fn = DataCollatorForSupervisedDataset( - tokenizer, max_length=1024 if data_args.always_pad_to_max_length else 0 - ) - - def compute_metrics_trainer(eval_preds, tokenizer): - all_preds = [] - all_labels = [] - preds = eval_preds.predictions - preds = [x[x != -100] for x in preds] - all_preds.extend(tokenizer.batch_decode(preds, skip_special_tokens=True, clean_up_tokenization_spaces=False)) - labels = [x[x != -100] for x in eval_preds.label_ids] - all_labels.extend(tokenizer.batch_decode(labels, skip_special_tokens=True, clean_up_tokenization_spaces=False)) - - all_preds = [pred.strip() for pred in all_preds] - all_labels = [label.strip() for label in all_labels] - all_preds = [pred.strip("question:") for pred in all_preds] - all_labels = [label.strip("question:") for label in all_labels] - - eval_result = compute_metrics(all_preds, all_labels) - return eval_result - - compute_metrics_func = partial( - compute_metrics_trainer, - tokenizer=tokenizer, - ) - - trainer = GPTTrainer( - model=model, - args=training_args, - train_dataset=train_ds if training_args.do_train else None, - eval_dataset=dev_ds if training_args.do_eval else None, - tokenizer=tokenizer, - compute_metrics=compute_metrics_func - if (model_args.eval_with_do_generation and training_args.do_eval) - else None, - do_generation=model_args.eval_with_do_generation, - data_collator=collate_fn, - ) - - if 
training_args.do_train: - train_result = trainer.train(resume_from_checkpoint=last_checkpoint) - trainer.save_model(merge_tensor_parallel=training_args.tensor_parallel_degree > 1) - trainer.log_metrics("train", train_result.metrics) - trainer.save_metrics("train", train_result.metrics) - trainer.save_state() - - if training_args.do_eval: - eval_result = trainer.evaluate() - trainer.log_metrics("test", eval_result) - - -if __name__ == "__main__": - main() diff --git a/llm/gpt-3/predict_generation.py b/llm/gpt-3/predict_generation.py deleted file mode 100644 index 060bcb9f8cf1..000000000000 --- a/llm/gpt-3/predict_generation.py +++ /dev/null @@ -1,165 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-from __future__ import annotations - -import paddle -from utils import get_hcg, init_dist_env, set_seed - -from paddlenlp.transformers import ( - GPTChineseTokenizer, - GPTConfig, - GPTForCausalLM, - GPTTokenizer, -) - -MODEL_CLASSES = { - "gpt2": (GPTForCausalLM, GPTTokenizer), - "gpt2-cn": (GPTForCausalLM, GPTChineseTokenizer), -} - - -def parse_arguments(): - import argparse - - parser = argparse.ArgumentParser() - parser.add_argument("--model_type", default="gpt2-cn", help="The directory of model.") - parser.add_argument("--model_name_or_path", default="gpt-cpm-large-cn", help="The directory of model.") - parser.add_argument("--save_onepiece_model_path", default=None, help="The directory of model.") - parser.add_argument("--batch_size", type=int, default=1, help="The batch size of data.") - parser.add_argument("--src_length", type=int, default=200, help="The batch size of data.") - parser.add_argument("--tgt_length", type=int, default=200, help="The batch size of data.") - parser.add_argument("--seed", type=int, default=20, help="the seed of parameter initialization") - return parser.parse_args() - - -def batchfy_text(texts, batch_size): - batch_texts = [] - batch_start = 0 - while batch_start < len(texts): - batch_texts += [texts[batch_start : min(batch_start + batch_size, len(texts))]] - batch_start += batch_size - return batch_texts - - -class Predictor(object): - def __init__(self, args=None, tokenizer=None, model=None, **kwargs): - model_class, tokenizer_class = MODEL_CLASSES[args.model_type] - self.tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path) - self.tokenizer.padding_side = "left" - self.batch_size = args.batch_size - self.args = args - self.src_length = self.args.src_length - self.tgt_length = self.args.tgt_length - - tensor_parallel_degree = paddle.distributed.get_world_size() - tensor_parallel_rank = 0 - if tensor_parallel_degree > 1: - hcg = get_hcg() - tensor_parallel_rank = hcg.get_model_parallel_rank() - - config = 
GPTConfig.from_pretrained(args.model_name_or_path) - dtype = config.dtype if config.dtype is not None else "float16" - - self.model = GPTForCausalLM.from_pretrained( - args.model_name_or_path, - dtype=dtype, - tensor_parallel_degree=tensor_parallel_degree, - tensor_parallel_rank=tensor_parallel_rank, - ) - if self.tokenizer.pad_token_id is None: - self.tokenizer.pad_token_id = self.model.config.pad_token_id - self.model.eval() - - def preprocess(self, input_text): - inputs = self.tokenizer( - input_text, - return_tensors="np", - padding=True, - max_length=self.src_length, - ) - inputs_tensor = {} - for key, value in inputs.items(): - inputs_tensor[key] = paddle.to_tensor(value) - return inputs_tensor - - def infer(self, inputs): - if self.model.config.dtype == "float32" or self.model.config.dtype is None: - with paddle.no_grad(): - result = self.model.generate( - **inputs, - max_length=self.tgt_length, - bos_token_id=self.tokenizer.bos_token_id, - eos_token_id=self.tokenizer.eol_token_id, - pad_token_id=self.tokenizer.pad_token_id, - decode_strategy="sampling", - top_k=1, - ) - else: - with paddle.no_grad(): - with paddle.amp.auto_cast(False, level="O2", dtype=self.model.config.dtype): - result = self.model.generate( - **inputs, - max_length=self.tgt_length, - bos_token_id=self.tokenizer.bos_token_id, - eos_token_id=self.tokenizer.eol_token_id, - pad_token_id=self.tokenizer.pad_token_id, - decode_strategy="sampling", - top_k=1, - ) - result = result[0] - return result - - def postprocess(self, infer_data): - result = [] - for x in infer_data.tolist(): - res = self.tokenizer.convert_ids_to_string(x) - result.append(res) - out_dict = {"result": result} - return out_dict - - def predict(self, texts): - input_map = self.preprocess(texts) - infer_result = self.infer(input_map) - output = self.postprocess(infer_result) - return output - - def save_onepiece_model(self, save_onepiece_model_path): - self.model.save_pretrained(save_dir=save_onepiece_model_path, 
merge_tensor_parallel=True) - paddle.distributed.barrier() - self.tokenizer.save_pretrained(save_onepiece_model_path) - paddle.distributed.barrier() - - -def predict(): - args = parse_arguments() - - # Init the fleet config - tensor_parallel_degree = paddle.distributed.get_world_size() - if tensor_parallel_degree > 1: - init_dist_env(tensor_parallel_degree=tensor_parallel_degree, seed=args.seed) - set_seed(args.seed) - - predictor = Predictor(args) - all_texts = ["问题:中国的首都是哪里?答案:北京。\n问题:苹果的CEO是谁? 答案:", "问题:中国的首都是哪里?答案:北京。\n问题:广东的省会是哪个城市? 答案:"] - batch_texts = batchfy_text(all_texts, args.batch_size) - for bs, texts in enumerate(batch_texts): - outputs = predictor.predict(texts) - for text, result in zip(texts, outputs["result"]): - print(result) - if args.save_onepiece_model_path is not None: - predictor.save_onepiece_model(args.save_onepiece_model_path) - - -if __name__ == "__main__": - predict() diff --git a/llm/gpt-3/run_pretrain.py b/llm/gpt-3/run_pretrain.py deleted file mode 120000 index f4873c94b357..000000000000 --- a/llm/gpt-3/run_pretrain.py +++ /dev/null @@ -1 +0,0 @@ -../run_pretrain.py \ No newline at end of file diff --git a/llm/gpt-3/tests/test_sequence_parallel.py b/llm/gpt-3/tests/test_sequence_parallel.py deleted file mode 100644 index b8284695d652..000000000000 --- a/llm/gpt-3/tests/test_sequence_parallel.py +++ /dev/null @@ -1,98 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import unittest - -import numpy as np -import paddle -import paddle.distributed.fleet as fleet -from paddle.distributed.fleet.meta_parallel.pipeline_parallel import PipelineParallel - -from paddlenlp.transformers import GPTConfig, GPTForCausalLM, GPTForCausalLMPipe - - -class TestGPT(unittest.TestCase): - def test_sequence_model(self): - model_name_or_path = "gpt2-medium-en" - seq_len = 1024 - batch_size = 2 - input_ids = paddle.arange(100, 100 + batch_size * seq_len, dtype="int64").reshape([batch_size, seq_len]) - labels = paddle.arange(101, 101 + batch_size * seq_len, dtype="int64").reshape([batch_size, seq_len]) - - world_size = paddle.distributed.get_world_size() - pp_degree = 2 - tp_degree = world_size // pp_degree - strategy = fleet.DistributedStrategy() - strategy.hybrid_configs = { - "dp_degree": 1, - "mp_degree": tp_degree, - "pp_degree": pp_degree, - "sharding_degree": 1, - } - strategy.pipeline_configs = {"enable_partial_send_recv": False if pp_degree > 1 else True} - fleet.init(is_collective=True, strategy=strategy) - hcg = fleet.get_hybrid_communicate_group() - mp_group = hcg.get_model_parallel_group() - tensor_parallel_rank = mp_group.rank - - if pp_degree > 1: - model_class = GPTForCausalLMPipe - else: - model_class = GPTForCausalLM - - config = GPTConfig.from_pretrained(model_name_or_path) - config.seq_length = seq_len - config.use_flash_attention = False - config.fuse_attention_qkv = False - config.recompute_granularity = "full" - config.virtual_pp_degree = 1 - config.use_recompute = False - - config.tensor_parallel_degree = tp_degree - config.tensor_parallel_rank = tensor_parallel_rank - config.tensor_parallel_output = False - # when tp_degree > 1, sequence_parallel can be set to True - config.sequence_parallel = True - config.fuse_sequence_parallel_allreduce = False - - model = model_class.from_pretrained(model_name_or_path, config=config, dtype="float32") - model.eval() - - if pp_degree > 1: - pp_model = PipelineParallel(layers=model, 
hcg=hcg, strategy=strategy) - pp_model.accumulate_steps = batch_size # for micro_batch_size * acc_steps == batch_size - ret_mp_pp = pp_model.eval_batch(data=[input_ids, labels], compute_loss=True) - else: - ret_mp_pp = model(input_ids=input_ids, labels=labels)[0] - - # run model for single device - config.tensor_parallel_degree = 1 - config.tensor_parallel_rank = -1 - config.sequence_parallel = False - single_model = GPTForCausalLM.from_pretrained(model_name_or_path, config=config, dtype="float32") - single_model.eval() - ret_single = single_model(input_ids=input_ids, labels=labels)[0] - - # output all results - print(f"ret mp{tp_degree} pp{pp_degree}", float(ret_mp_pp)) - print("ret single", float(ret_single)) - - diff = (ret_single - ret_mp_pp) / ret_single - print(f"diff: {float(diff)}") - np.testing.assert_allclose(float(ret_single), ret_mp_pp, rtol=1.5e-7) - - -if __name__ == "__main__": - TestGPT().test_sequence_model() -# python -m paddle.distributed.launch --gpus 0,1,2,3 tests/test_pipeline_parallel.py diff --git a/llm/gpt-3/utils.py b/llm/gpt-3/utils.py deleted file mode 100644 index 647b9d05356f..000000000000 --- a/llm/gpt-3/utils.py +++ /dev/null @@ -1,393 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import copy -import random -import re -from typing import Any, Dict, List, Optional, Tuple, Union - -import numpy as np -import paddle -import paddle.distributed as dist -import paddle.nn as nn -from paddle.distributed import fleet -from paddle.distributed.fleet.meta_parallel import get_rng_state_tracker -from paddle.optimizer.lr import LambdaDecay -from rouge import Rouge - -from paddlenlp.data import DataCollatorForSeq2Seq -from paddlenlp.metrics import BLEU -from paddlenlp.trainer import Trainer -from paddlenlp.utils.log import logger - -PREFIX_CHECKPOINT_DIR = "model_state" -_re_checkpoint = re.compile(r"^" + PREFIX_CHECKPOINT_DIR + r"\.tp(\d+)" + ".pdparams$") - - -_hcg = None - - -def set_hcg(hcg): - global _hcg - _hcg = hcg - - -def get_hcg(): - global _hcg - return _hcg - - -def set_seed(seed): - # NOTE(shenliang03): For parameter init seed: - # seed: dp/mp_undistributed_paramter/sharding is same; others is different - # For compute seed(dropout): - # global seed: only mp group is same. 
- # local seed: all groups are different - - hcg = get_hcg() - if paddle.distributed.get_world_size() > 1: - # obtain rank message of hybrid parallel - - mp_rank = hcg.get_model_parallel_rank() - mp_size = hcg.get_model_parallel_world_size() - - pp_rank = hcg.get_stage_id() - pp_size = hcg.get_pipe_parallel_world_size() - - dp_rank = hcg.get_data_parallel_rank() - dp_size = hcg.get_data_parallel_world_size() - - sharding_rank = hcg.get_sharding_parallel_rank() - # sharding_size = hcg.get_sharding_parallel_world_size() - else: - mp_rank, mp_size = 0, 1 - pp_rank, pp_size = 0, 1 - dp_rank, dp_size = 0, 1 - sharding_rank, _ = 0, 1 - - # NOTE: the commented seeds are set only for precision validation - # seed += 100 * pp_rank - random_seed = seed + 100 * pp_rank - random.seed(random_seed) - np.random.seed(random_seed) - - # seed = mp_rank + - # pp_rank * (mp_size) + - # dp_rank * (mp_size * pp_size) + - # sharding_rank * (mp_size * pp_size * dp_size) - # seed offset is order to avoid conflicts with the parameter initialization seed - - seed_offset = seed + 1024 + paddle.distributed.get_world_size() - global_seed = ( - seed_offset - + pp_rank * (mp_size) - + dp_rank * (mp_size * pp_size) - + sharding_rank * (mp_size * pp_size * dp_size) - ) - - seed_offset += paddle.distributed.get_world_size() - local_seed = ( - seed_offset - + mp_rank - + pp_rank * (mp_size) - + dp_rank * (mp_size * pp_size) - + sharding_rank * (mp_size * pp_size * dp_size) - ) - - tracker = get_rng_state_tracker() - tracker.add("global_seed", global_seed) - tracker.add("local_seed", local_seed) - - paddle.seed(global_seed) - - logger.info( - "The global seed is set to {}, local seed is set to {} and " - "random seed is set to {}.".format(global_seed, local_seed, random_seed) - ) - - -def create_hcg(strategy, hcg_name="HybridCommunicateGroup"): - if hcg_name == "HybridCommunicateGroup": - fleet.init(is_collective=True, strategy=strategy) - hcg = fleet.get_hybrid_communicate_group() - else: - 
dist.init_parallel_env() - hcg = eval("{}".format(hcg_name))(strategy) - - return hcg - - -def init_dist_env( - tensor_parallel_degree=1, sharding_parallel_degree=1, pipeline_parallel_degree=1, data_parallel_degree=1, seed=1 -): - - strategy = fleet.DistributedStrategy() - - def is_segment_parallel_supported(): - import inspect - - members = [name for (name, date) in inspect.getmembers(fleet.HybridCommunicateGroup)] - return "get_sep_parallel_world_size" in members - - if tensor_parallel_degree == 1 and sharding_parallel_degree == 1: - if is_segment_parallel_supported(): - order = ["pp", "dp", "sharding", "sep", "mp"] - else: - order = ["pp", "dp", "sharding", "mp"] - else: - if is_segment_parallel_supported(): - order = ["dp", "pp", "sharding", "sep", "mp"] - else: - order = ["dp", "pp", "sharding", "mp"] - - strategy.hybrid_configs = { - "dp_degree": data_parallel_degree, - "mp_degree": tensor_parallel_degree, - "pp_degree": pipeline_parallel_degree, - "sharding_degree": sharding_parallel_degree, - "order": order, - } - - # TODO(wawltor) The inference parallel do not support the pipeline mode - - """ - if pipeline_parallel_degree > 1: - if "sequence_parallel" in config.Model: - if config.Model.sequence_parallel: - assert config.Global.enable_partial_send_recv is False, ( - "if config.Distributed.pp_degree > 1 and config.Model.sequence_parallel is True, " - "config.Global.enable_partial_send_recv should be set False." 
- ) - - strategy.pipeline_configs = { - "accumulate_steps": config.Global.local_batch_size // config.Global.micro_batch_size, - "micro_batch_size": config.Global.micro_batch_size, - "enable_partial_send_recv": config.Global.enable_partial_send_recv, - } - """ - - # set control in tensor parallel - strategy.tensor_parallel_configs = {"tensor_init_seed": seed} - - hcg = create_hcg(strategy) - set_hcg(hcg) - - -def convert_example( - example, - tokenizer, - max_source_length, - max_target_length, - is_test=False, -): - """ - Convert an example into necessary features. - """ - # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results - # in one example possible giving several features when a context is long, each of those features having a - # context that overlaps a bit the context of the previous feature. - # NOTE: Almost the same functionality as HuggingFace's prepare_train_features function. The main difference is - # that HugggingFace uses ArrowTable as basic data structure, while we use list of dictionary instead. 
- context = example["context"] - question = example["question"] - try: - answer = example["answers"][0] - except Exception: - print(example["context"]) - print(example["question"]) - print(example["answers"]) - print(example["answer_starts"]) - print(example["is_impossible"]) - - input_seq = f"answer: {answer} context: {context} " - output_seq = f"question: {question} " - - outputs = tokenizer( - output_seq, - max_length=max_target_length, - # pad_to_max_seq_len=True, - truncation_strategy="longest_first", - return_attention_mask=False, - return_token_type_ids=False, - ) - inputs = tokenizer( - input_seq, - max_length=max_source_length, - # pad_to_max_seq_len=True, - truncation_strategy="longest_first", - return_attention_mask=False, - return_length=False, - ) - - final = {} - for k in outputs.keys(): - final[k] = inputs[k] + outputs[k] - if k == "input_ids": - final["labels"] = [tokenizer.pad_token_id] * len(inputs["input_ids"]) + outputs[k] - if is_test: - return dict(input_ids=inputs["input_ids"], labels=outputs["input_ids"]) - - # shift inputs and labels - final["input_ids"] = final["input_ids"][:-1] - final["labels"] = final["labels"][1:] - return final - - -def compute_metrics(preds, targets): - assert len(preds) == len(targets), ( - "The length of pred_responses should be equal to the length of " - "target_responses. 
But received {} and {}.".format(len(preds), len(targets)) - ) - rouge = Rouge() - bleu4 = BLEU(n_size=4) - scores = [] - for pred, target in zip(preds, targets): - try: - score = rouge.get_scores(" ".join(pred), " ".join(target)) - scores.append([score[0]["rouge-1"]["f"], score[0]["rouge-2"]["f"], score[0]["rouge-l"]["f"]]) - except ValueError: - scores.append([0, 0, 0]) - bleu4.add_inst(pred, [target]) - rouge1 = np.mean([i[0] for i in scores]) - rouge2 = np.mean([i[1] for i in scores]) - rougel = np.mean([i[2] for i in scores]) - - rouge1 = round(rouge1, 4) - rouge2 = round(rouge2, 4) - rougel = round(rougel, 4) - bleu4 = round(bleu4.score(), 4) - return dict( - rouge1=rouge1, - rouge2=rouge2, - rougel=rougel, - bleu4=bleu4, - ) - - -class DataCollatorForSupervisedDataset(DataCollatorForSeq2Seq): - """Collate examples for supervised fine-tuning.""" - - def __call__(self, features, return_tensors=None): - # Deep copy to avoid modifying features in-place - batch = copy.deepcopy(features) - if return_tensors is None: - return_tensors = self.return_tensors - labels = [feature["labels"] for feature in batch] if "labels" in batch[0].keys() else None - # We have to pad the labels before calling `tokenizer.pad` as this method won't pad them and needs them of the - # same length to return tensors. 
- if labels is not None: - # Note(gongenlei): In pipeline, max_label_length = self.max_length - if self.padding == "max_length" and self.max_length is not None: - max_label_length = self.max_length - else: - max_label_length = max(len(l) for l in labels) - if self.pad_to_multiple_of is not None: - max_label_length = ( - (max_label_length + self.pad_to_multiple_of - 1) - // self.pad_to_multiple_of - * self.pad_to_multiple_of - ) - - padding_side = self.tokenizer.padding_side - for feature in batch: - remainder = [self.tokenizer.pad_token_id] * (max_label_length - len(feature["labels"])) - if isinstance(feature["labels"], list): - feature["labels"] = ( - feature["labels"] + remainder if padding_side == "right" else remainder + feature["labels"] - ) - elif padding_side == "right": - feature["labels"] = np.concatenate([feature["labels"], remainder]).astype(np.int64) - else: - feature["labels"] = np.concatenate([remainder, feature["labels"]]).astype(np.int64) - - batch = self.tokenizer.pad( - batch, - padding=self.padding, - max_length=self.max_length, - pad_to_multiple_of=self.pad_to_multiple_of, - return_tensors=return_tensors, - return_attention_mask=self.return_attention_mask, - ) - - return batch - - -class GPTTrainer(Trainer): - def __init__(self, do_generation: bool, **kwargs): - super().__init__(**kwargs) - self.do_generation = do_generation - - def prediction_step( - self, - model: nn.Layer, - inputs: Dict[str, Union[paddle.Tensor, Any]], - prediction_loss_only: bool, - ignore_keys: Optional[List[str]] = None, - ) -> Tuple[Optional[paddle.Tensor], Optional[paddle.Tensor], Optional[paddle.Tensor]]: - - if prediction_loss_only: - return super().prediction_step(model, inputs, prediction_loss_only, ignore_keys) - elif not self.do_generation: - loss, logits, labels = super().prediction_step(model, inputs, prediction_loss_only, ignore_keys) - # argmax here to avoid gather all logits, which is too memory-consuming. 
- # keepdim in order to maintain the same shape as logits - return (loss, logits.argmax(axis=-1, keepdim=True), labels) - - model.eval() - - preds = model.generate( - input_ids=inputs["input_ids"], - attention_mask=inputs["attention_mask"] if "attention_mask" in inputs else None, - max_length=self.args.tgt_length, - min_length=0, - use_cache=True, - temperature=1.0, - top_k=1, - top_p=1.0, - repetition_penalty=1.0, - decode_strategy="sampling", - )[0] - all_labels = [] - for label in inputs["labels"].numpy(): - label = [x for x in label[label != self.tokenizer.pad_token_id]] - all_labels.append(label) - max_label_length = max([len(x) for x in all_labels]) - for index, labels in enumerate(all_labels): - all_labels[index] = labels + [-100] * (max_label_length - len(labels)) - - return (None, paddle.to_tensor(preds), paddle.to_tensor(all_labels)) - - def create_scheduler(self, num_training_steps: int): - num_warmup_steps = ( - self.args.warmup_steps if self.args.warmup_steps > 0 else self.args.warmup_ratio * num_training_steps - ) - - def lr_lambda(current_step: int): - if current_step < num_warmup_steps: - return float(current_step) / float(max(1, num_warmup_steps)) - else: - decay_step_ratio = (current_step - num_warmup_steps) / (num_training_steps - num_warmup_steps) - return 1.0 - (1.0 - self.args.lr_decay_ratio) * decay_step_ratio - - if self.lr_scheduler is None: - self.lr_scheduler = LambdaDecay(self.args.learning_rate, lr_lambda, last_epoch=-1) - return self.lr_scheduler - - def log(self, logs: Dict[str, float], **kwargs) -> None: - if "loss" in logs: - logs["ppl"] = np.exp(logs["loss"]) - if "eval_loss" in logs: - logs["eval_ppl"] = np.exp(logs["eval_loss"]) - - super(GPTTrainer, self).log(logs, **kwargs) diff --git a/llm/llama/lora_argument_pissa.json b/llm/llama/lora_argument_pissa.json deleted file mode 100644 index ba9e770add54..000000000000 --- a/llm/llama/lora_argument_pissa.json +++ /dev/null @@ -1,33 +0,0 @@ -{ - "model_name_or_path": 
"facebook/llama-7b", - "dataset_name_or_path": "./data", - "output_dir": "./checkpoints/llama_lora_ckpts", - "per_device_train_batch_size": 4, - "gradient_accumulation_steps": 32, - "per_device_eval_batch_size": 8, - "eval_accumulation_steps":16, - "num_train_epochs": 3, - "learning_rate": 2e-05, - "warmup_steps": 10, - "logging_steps": 1, - "evaluation_strategy": "epoch", - "save_strategy": "epoch", - "src_length": 1024, - "max_length": 2048, - "fp16": true, - "fp16_opt_level": "O2", - "do_train": true, - "do_eval": true, - "disable_tqdm": true, - "load_best_model_at_end": true, - "eval_with_do_generation": false, - "metric_for_best_model": "accuracy", - "recompute": true, - "save_total_limit": 1, - "tensor_parallel_degree": 1, - "pipeline_parallel_degree": 1, - "lora": true, - "pissa": false, - "zero_padding": false, - "use_flash_attention": false - } diff --git a/llm/llama/megre_tp_and_pp.py b/llm/llama/megre_tp_and_pp.py deleted file mode 100644 index 1758ecf59710..000000000000 --- a/llm/llama/megre_tp_and_pp.py +++ /dev/null @@ -1,88 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os - -import paddle - -from paddlenlp.transformers import LlamaConfig, LlamaForCausalLM -from paddlenlp.utils.log import logger - - -def merge_pipeline_parallel(tp_degree, pp_degree, path): - tp_state_dict_list = [] - for tp in range(tp_degree): - tp_state_dict = {} - for pp in range(pp_degree): - tmp = paddle.load(os.path.join(path, f"model_state.tp{tp:0>2d}_pp{pp:0>2d}.pdparams"), return_numpy=True) - for k, v in tmp.items(): - tp_state_dict[k] = v - - tp_state_dict_list.append(tp_state_dict) - - return tp_state_dict_list - - -def merge_tensor_parallel(cls, state_dict_list, config) -> None: - """the entry of converting config and converting model file - - Args: - input_dir (str | None): the input dir which contains `pytorch_model.bin` and `config.json` file - config (PretrainedConfig): the PretrainedConfig instance of model - """ - name_action_mappings = cls._get_tensor_parallel_mappings(config, is_split=False) - state_keys_map = cls._resolve_prefix_keys(name_action_mappings.keys(), state_dict_list[0].keys()) - - for k, v in state_keys_map.items(): - name_action_mappings[v] = name_action_mappings.pop(k) - - state_dict_to_save = {} - for key in state_dict_list[0].keys(): - tensor = state_dict_list[0][key] - if key in name_action_mappings: - ret = [x[key] for x in state_dict_list] - action = name_action_mappings.pop(key) - tensor = action(ret) - - state_dict_to_save[key] = tensor - - if len(name_action_mappings) > 0: - for x in name_action_mappings.keys(): - logger.warning(f"key <{x}> need to merge tensor parallel but we can't find in model state.") - - print("Finally, we merging state dict to fellowing tensors.") - for k, v in state_dict_to_save.items(): - print(k, v.shape, v.dtype) - - return state_dict_to_save - - -def main(): - tp_degree = 2 - pp_degree = 2 - model_name_or_path = "temp_dir_to_your_ckpt" - - assert tp_degree > 1 - assert pp_degree > 1 - config = LlamaConfig.from_pretrained(model_name_or_path) - cls = LlamaForCausalLM - - 
tp_state_dict_list = merge_pipeline_parallel(tp_degree, pp_degree, model_name_or_path) - state_dict_to_save = merge_tensor_parallel(cls=cls, state_dict_list=tp_state_dict_list, config=config) - print("saving") - paddle.save(state_dict_to_save, os.path.join(model_name_or_path, "model_state.pdparams")) - - -if __name__ == "__main__": - main() diff --git a/llm/llama/pretrain-baichuan2_13b-tp2sd4_stage2.json b/llm/llama/pretrain-baichuan2_13b-tp2sd4_stage2.json deleted file mode 100644 index 928ef5510687..000000000000 --- a/llm/llama/pretrain-baichuan2_13b-tp2sd4_stage2.json +++ /dev/null @@ -1,40 +0,0 @@ -{ - "model_name_or_path": "baichuan-inc/Baichuan2-13B-Base", - "tokenizer_name_or_path": "baichuan-inc/Baichuan2-13B-Base", - "input_dir": "./data", - "output_dir": "./checkpoints/baichuan_pretrain_ckpts", - "per_device_train_batch_size": 1, - "gradient_accumulation_steps": 16, - "per_device_eval_batch_size": 2, - "tensor_parallel_degree": 2, - "pipeline_parallel_degree": 1, - "sharding": "stage2", - "virtual_pp_degree": 1, - "sequence_parallel": 0, - "use_flash_attention": true, - "use_fused_rms_norm": true, - "use_fused_rope": true, - "max_seq_length": 4096, - "learning_rate": 3e-05, - "min_learning_rate": 3e-06, - "warmup_steps": 30, - "logging_steps": 1, - "max_steps": 10000, - "save_steps": 5000, - "eval_steps": 1000, - "weight_decay": 0.01, - "bf16": true, - "fp16_opt_level": "O2", - "warmup_ratio": 0.01, - "max_grad_norm": 1.0, - "dataloader_num_workers": 1, - "continue_training": 1, - "do_train": true, - "do_eval": true, - "do_predict": true, - "disable_tqdm": true, - "recompute": true, - "distributed_dataloader": 1, - "recompute_granularity": "full", - "save_total_limit": 2 - } diff --git a/llm/llama/pretrain-flagalpha_llama2_13b-tp2sd4_stage2.json b/llm/llama/pretrain-flagalpha_llama2_13b-tp2sd4_stage2.json deleted file mode 100644 index 6840fc73b24b..000000000000 --- a/llm/llama/pretrain-flagalpha_llama2_13b-tp2sd4_stage2.json +++ /dev/null @@ -1,40 +0,0 
@@ -{ - "model_name_or_path": "FlagAlpha/Llama2-Chinese-13b-Chat", - "tokenizer_name_or_path": "FlagAlpha/Llama2-Chinese-13b-Chat", - "input_dir": "./data", - "output_dir": "./checkpoints/flagalpha_pretrain_ckpts", - "per_device_train_batch_size": 1, - "gradient_accumulation_steps": 16, - "per_device_eval_batch_size": 2, - "tensor_parallel_degree": 2, - "pipeline_parallel_degree": 1, - "sharding": "stage2", - "virtual_pp_degree": 1, - "sequence_parallel": 0, - "use_flash_attention": true, - "use_fused_rms_norm": true, - "use_fused_rope": true, - "max_seq_length": 4096, - "learning_rate": 3e-05, - "min_learning_rate": 3e-06, - "warmup_steps": 30, - "logging_steps": 1, - "max_steps": 10000, - "save_steps": 5000, - "eval_steps": 1000, - "weight_decay": 0.01, - "bf16": true, - "fp16_opt_level": "O2", - "warmup_ratio": 0.01, - "max_grad_norm": 1.0, - "dataloader_num_workers": 1, - "continue_training": 1, - "do_train": true, - "do_eval": true, - "do_predict": true, - "disable_tqdm": true, - "recompute": false, - "distributed_dataloader": 1, - "recompute_granularity": "full", - "save_total_limit": 2 - } diff --git a/llm/llama/pretrain-flagalpha_llama2_7b-tp2sd4_stage2.json b/llm/llama/pretrain-flagalpha_llama2_7b-tp2sd4_stage2.json deleted file mode 100644 index f2edb150e011..000000000000 --- a/llm/llama/pretrain-flagalpha_llama2_7b-tp2sd4_stage2.json +++ /dev/null @@ -1,40 +0,0 @@ -{ - "model_name_or_path": "FlagAlpha/Llama2-Chinese-7b-Chat", - "tokenizer_name_or_path": "FlagAlpha/Llama2-Chinese-7b-Chat", - "input_dir": "./data", - "output_dir": "./checkpoints/flagalpha_pretrain_ckpts", - "per_device_train_batch_size": 2, - "gradient_accumulation_steps": 8, - "per_device_eval_batch_size": 2, - "tensor_parallel_degree": 2, - "pipeline_parallel_degree": 1, - "sharding": "stage2", - "virtual_pp_degree": 1, - "sequence_parallel": 0, - "use_flash_attention": true, - "use_fused_rms_norm": true, - "use_fused_rope": true, - "max_seq_length": 4096, - "learning_rate": 3e-05, - 
"min_learning_rate": 3e-06, - "warmup_steps": 30, - "logging_steps": 1, - "max_steps": 10000, - "save_steps": 5000, - "eval_steps": 1000, - "weight_decay": 0.01, - "bf16": true, - "fp16_opt_level": "O2", - "warmup_ratio": 0.01, - "max_grad_norm": 1.0, - "dataloader_num_workers": 1, - "continue_training": 1, - "do_train": true, - "do_eval": true, - "do_predict": true, - "disable_tqdm": true, - "recompute": false, - "distributed_dataloader": 1, - "recompute_granularity": "full", - "save_total_limit": 2 - } diff --git a/llm/llama/pretrain-linly_llama2_7b-tp2sd4_stage2.json b/llm/llama/pretrain-linly_llama2_7b-tp2sd4_stage2.json deleted file mode 100644 index 4f6965a3bd3a..000000000000 --- a/llm/llama/pretrain-linly_llama2_7b-tp2sd4_stage2.json +++ /dev/null @@ -1,40 +0,0 @@ -{ - "model_name_or_path": "linly-ai/chinese-llama-2-7b", - "tokenizer_name_or_path": "linly-ai/chinese-llama-2-7b", - "input_dir": "./data", - "output_dir": "./checkpoints/linly_pretrain_ckpts", - "per_device_train_batch_size": 2, - "gradient_accumulation_steps": 8, - "per_device_eval_batch_size": 2, - "tensor_parallel_degree": 2, - "pipeline_parallel_degree": 1, - "sharding": "stage2", - "virtual_pp_degree": 1, - "sequence_parallel": 0, - "use_flash_attention": true, - "use_fused_rms_norm": true, - "use_fused_rope": true, - "max_seq_length": 2048, - "learning_rate": 3e-05, - "min_learning_rate": 3e-06, - "warmup_steps": 30, - "logging_steps": 1, - "max_steps": 10000, - "save_steps": 5000, - "eval_steps": 1000, - "weight_decay": 0.01, - "bf16": true, - "fp16_opt_level": "O2", - "warmup_ratio": 0.01, - "max_grad_norm": 1.0, - "dataloader_num_workers": 1, - "continue_training": 1, - "do_train": true, - "do_eval": true, - "do_predict": true, - "disable_tqdm": true, - "recompute": false, - "distributed_dataloader": 1, - "recompute_granularity": "full", - "save_total_limit": 2 - } diff --git a/llm/llama/pretrain-llama2_7b-tp2sd4_stage2.json b/llm/llama/pretrain-llama2_7b-tp2sd4_stage2.json deleted file 
mode 100644 index 33b20ec2b568..000000000000 --- a/llm/llama/pretrain-llama2_7b-tp2sd4_stage2.json +++ /dev/null @@ -1,40 +0,0 @@ -{ - "model_name_or_path": "meta-llama/Llama-2-7b", - "tokenizer_name_or_path": "meta-llama/Llama-2-7b", - "input_dir": "./data", - "output_dir": "./checkpoints/llama2_pretrain_ckpts", - "per_device_train_batch_size": 2, - "gradient_accumulation_steps": 8, - "per_device_eval_batch_size": 2, - "tensor_parallel_degree": 2, - "pipeline_parallel_degree": 1, - "sharding": "stage2", - "virtual_pp_degree": 1, - "sequence_parallel": 0, - "use_flash_attention": true, - "use_fused_rms_norm": true, - "use_fused_rope": true, - "max_seq_length": 4096, - "learning_rate": 3e-05, - "min_learning_rate": 3e-06, - "warmup_steps": 30, - "logging_steps": 1, - "max_steps": 10000, - "save_steps": 5000, - "eval_steps": 1000, - "weight_decay": 0.01, - "bf16": true, - "fp16_opt_level": "O2", - "warmup_ratio": 0.01, - "max_grad_norm": 1.0, - "dataloader_num_workers": 1, - "continue_training": 1, - "do_train": true, - "do_eval": true, - "do_predict": true, - "disable_tqdm": true, - "recompute": false, - "distributed_dataloader": 1, - "recompute_granularity": "full", - "save_total_limit": 2 - } diff --git a/llm/llama/pretrain-llama_13b-tp2sd4_stage2.json b/llm/llama/pretrain-llama_13b-tp2sd4_stage2.json deleted file mode 100644 index 545665f502d3..000000000000 --- a/llm/llama/pretrain-llama_13b-tp2sd4_stage2.json +++ /dev/null @@ -1,40 +0,0 @@ -{ - "model_name_or_path": "facebook/llama-13b", - "tokenizer_name_or_path": "facebook/llama-13b", - "input_dir": "./data", - "output_dir": "./checkpoints/llama_pretrain_ckpts", - "per_device_train_batch_size": 1, - "gradient_accumulation_steps": 16, - "per_device_eval_batch_size": 2, - "tensor_parallel_degree": 2, - "pipeline_parallel_degree": 1, - "sharding": "stage2", - "virtual_pp_degree": 1, - "sequence_parallel": 0, - "use_flash_attention": true, - "use_fused_rms_norm": true, - "use_fused_rope": true, - "max_seq_length": 
4096, - "learning_rate": 3e-05, - "min_learning_rate": 3e-06, - "warmup_steps": 30, - "logging_steps": 1, - "max_steps": 10000, - "save_steps": 5000, - "eval_steps": 1000, - "weight_decay": 0.01, - "bf16": true, - "fp16_opt_level": "O2", - "warmup_ratio": 0.01, - "max_grad_norm": 1.0, - "dataloader_num_workers": 1, - "continue_training": 1, - "do_train": true, - "do_eval": true, - "do_predict": true, - "disable_tqdm": true, - "recompute": false, - "distributed_dataloader": 1, - "recompute_granularity": "full", - "save_total_limit": 2 - } diff --git a/llm/llama/pretrain-llama_7b-tp2sd4_stage2.json b/llm/llama/pretrain-llama_7b-tp2sd4_stage2.json deleted file mode 100644 index 12e1029e09bc..000000000000 --- a/llm/llama/pretrain-llama_7b-tp2sd4_stage2.json +++ /dev/null @@ -1,40 +0,0 @@ -{ - "model_name_or_path": "facebook/llama-7b", - "tokenizer_name_or_path": "facebook/llama-7b", - "input_dir": "./data", - "output_dir": "./checkpoints/llama_pretrain_ckpts", - "per_device_train_batch_size": 2, - "gradient_accumulation_steps": 8, - "per_device_eval_batch_size": 2, - "tensor_parallel_degree": 2, - "pipeline_parallel_degree": 1, - "sharding": "stage2", - "virtual_pp_degree": 1, - "sequence_parallel": 0, - "use_flash_attention": true, - "use_fused_rms_norm": true, - "use_fused_rope": true, - "max_seq_length": 2048, - "learning_rate": 3e-05, - "min_learning_rate": 3e-06, - "warmup_steps": 30, - "logging_steps": 1, - "max_steps": 10000, - "save_steps": 5000, - "eval_steps": 1000, - "weight_decay": 0.01, - "bf16": true, - "fp16_opt_level": "O2", - "warmup_ratio": 0.01, - "max_grad_norm": 1.0, - "dataloader_num_workers": 1, - "continue_training": 1, - "do_train": true, - "do_eval": true, - "do_predict": true, - "disable_tqdm": true, - "recompute": false, - "distributed_dataloader": 1, - "recompute_granularity": "full", - "save_total_limit": 2 - } diff --git a/llm/llama/run_pretrain.py b/llm/llama/run_pretrain.py deleted file mode 120000 index f4873c94b357..000000000000 --- 
a/llm/llama/run_pretrain.py +++ /dev/null @@ -1 +0,0 @@ -../run_pretrain.py \ No newline at end of file diff --git a/llm/llama/sft_pp_argument.json b/llm/llama/sft_pp_argument.json deleted file mode 100644 index 8f03f20e97d7..000000000000 --- a/llm/llama/sft_pp_argument.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "model_name_or_path": "facebook/llama-7b", - "dataset_name_or_path": "./data", - "output_dir": "./checkpoints/llama_sft_ckpts", - "per_device_train_batch_size": 1, - "gradient_accumulation_steps": 8, - "per_device_eval_batch_size": 8, - "eval_accumulation_steps":16, - "num_train_epochs": 3, - "learning_rate": 3e-05, - "warmup_steps": 30, - "logging_steps": 1, - "evaluation_strategy": "epoch", - "save_strategy": "epoch", - "src_length": 256, - "max_length": 512, - "fp16": true, - "fp16_opt_level": "O2", - "do_train": true, - "do_eval": true, - "disable_tqdm": true, - "load_best_model_at_end": true, - "eval_with_do_generation": false, - "recompute": true, - "save_total_limit": 1, - "tensor_parallel_degree": 2, - "pipeline_parallel_degree": 2, - "pipeline_parallel_config": "disable_p2p_cache_shape", - "zero_padding": false, - "use_flash_attention": false - } \ No newline at end of file diff --git a/llm/llama/tests/test_pipeline_parallel.py b/llm/llama/tests/test_pipeline_parallel.py deleted file mode 100644 index a232642e2987..000000000000 --- a/llm/llama/tests/test_pipeline_parallel.py +++ /dev/null @@ -1,132 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import numpy as np -import paddle -import paddle.distributed.fleet as fleet -from paddle.distributed.fleet.meta_parallel.pipeline_parallel import PipelineParallel - -from paddlenlp.transformers import AutoModelForCausalLM, AutoModelForCausalLMPipe - - -class TestLlama(unittest.TestCase): - def test_pipeline_model(self): - world_size = paddle.distributed.get_world_size() - pp_degree = world_size - tp_degree = 1 - if world_size > 2: - pp_degree = 2 - assert world_size % pp_degree == 0 - tp_degree = world_size // pp_degree - - pp_degree = -1 - if pp_degree == -1: - tp_degree = world_size - pp_degree = 1 - - strategy = fleet.DistributedStrategy() - strategy.hybrid_configs = { - "dp_degree": 1, - "mp_degree": tp_degree, - "pp_degree": pp_degree, - "sharding_degree": 1, - } - fleet.init(is_collective=True, strategy=strategy) - hcg = fleet.get_hybrid_communicate_group() - - if pp_degree > 1: - model_class = AutoModelForCausalLMPipe - else: - model_class = AutoModelForCausalLM - - model_name_or_path = "./llama-7b-2l" - # model_name_or_path = "__internal_testing__/tiny-random-llama" - model = model_class.from_pretrained( - model_name_or_path, - tensor_parallel_degree=tp_degree, - tensor_parallel_rank=hcg.get_model_parallel_rank(), - tensor_parallel_output=False, - # use_flash_attention=True, - ) - - model.eval() - - input_ids = paddle.to_tensor([[x for x in range(100, 110)]], dtype="int64") - labels = paddle.to_tensor([[x for x in range(101, 111)]], dtype="int64") - attention_mask = None - - if pp_degree > 1: - pp_model = PipelineParallel(layers=model, hcg=hcg, strategy=strategy) - ret = pp_model.eval_batch(data=[input_ids, labels], compute_loss=True) - else: - ret = model(input_ids=input_ids, labels=labels, attention_mask=attention_mask) - ret = ret[0] - - # np.testing.assert_allclose(ret.item(), 10.49988270, atol=1e-7) - print(f"ret mp{tp_degree} 
pp", ret.item()) - ret_mp_pp = ret.item() - - single_model = AutoModelForCausalLM.from_pretrained( - model_name_or_path, - tensor_parallel_output=False, - ) - single_model.eval() - ret = single_model(input_ids=input_ids, labels=labels, attention_mask=attention_mask) - print("ret single", ret[0].item()) - print( - f"diff: {(ret[0].item()- ret_mp_pp)/ret[0].item()}", - ) - np.testing.assert_allclose(ret[0].item(), ret_mp_pp, rtol=1.5e-7) - # 15.526779174804688 - # 16.879518508911133 - - -if __name__ == "__main__": - TestLlama().test_pipeline_model() - -# 3 bugs to fix in paddlepaddle -# pp_layers.py -# def _construct_shared_comm(self): -# shared_comm = {} -# if self._topo.get_dim("pipe") == 1: -# return shared_comm - -# topology.py -# def _set_p2p_group(self): -# self.send_next_group = None -# self.send_prev_group = None -# self.recv_next_group = None -# self.recv_prev_group = None -# if self._pp_degree <= 1: -# return - -# pipeline_parallel.py -# def _load_micro_batch(self, cache_id, stage=None): -# inputs = self.data -# if stage == "fisrt": -# assert self.is_pipeline_first_stage() -# assert len(inputs) == 2, "length of input should be 2" -# return self._load_micro_batch_impl(inputs[0], cache_id) -# elif stage== "last": -# assert self.is_pipeline_last_stage() -# assert len(inputs) == 2, "length of input should be 2" -# return self._load_micro_batch_impl(inputs[1], cache_id) -# else: -# inputs = None -# -# -# CUDA_VISIBLE_DEVICES=2 PYTHONPATH=./ pytest -s -v tests/test_pipeline_parallel.py -# PYTHONPATH=/ssd2/zhonghui03/Datasets/PaddleNLP:$PYTHONPATH PYTHONPATH=$PYTHONPATH:./ python -m paddle.distributed.launch --gpus 0,1,2,3 tests/test_pipeline_parallel.py diff --git a/llm/llama/tests/test_sequence_parallel.py b/llm/llama/tests/test_sequence_parallel.py deleted file mode 100644 index f46330e85cd5..000000000000 --- a/llm/llama/tests/test_sequence_parallel.py +++ /dev/null @@ -1,118 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import numpy as np -import paddle -import paddle.distributed.fleet as fleet -from paddle.distributed.fleet.meta_parallel.pipeline_parallel import PipelineParallel - -from paddlenlp.transformers import LlamaConfig, LlamaForCausalLM, LlamaForCausalLMPipe - - -class TestLlama(unittest.TestCase): - def test_sequence_model(self): - world_size = paddle.distributed.get_world_size() - pp_degree = world_size - tp_degree = 1 - - if world_size > 2: - pp_degree = 2 - assert world_size % pp_degree == 0 - tp_degree = world_size // pp_degree - - strategy = fleet.DistributedStrategy() - strategy.hybrid_configs = { - "dp_degree": 1, - "mp_degree": tp_degree, - "pp_degree": pp_degree, - "sharding_degree": 1, - } - strategy.pipeline_configs = {"enable_partial_send_recv": False if pp_degree > 1 else True} - fleet.init(is_collective=True, strategy=strategy) - hcg = fleet.get_hybrid_communicate_group() - mp_group = hcg.get_model_parallel_group() - tensor_parallel_rank = mp_group.rank - - if pp_degree > 1: - model_class = LlamaForCausalLMPipe - else: - model_class = LlamaForCausalLM - - # model_name_or_path = "facebook/llama-7b" - model_name_or_path = "__internal_testing__/tiny-random-llama" - - seq_len = 2048 - batch_size = 2 - - config = LlamaConfig.from_pretrained(model_name_or_path) - config.seq_length = seq_len - config.use_flash_attention = False - config.use_fused_rms_norm = False - config.fuse_attention_qkv = False 
- config.recompute_granularity = "full" - config.virtual_pp_degree = 1 - config.use_recompute = False - - config.tensor_parallel_degree = tp_degree - config.tensor_parallel_rank = tensor_parallel_rank - config.tensor_parallel_output = False - config.sequence_parallel = True - - config.fuse_sequence_parallel_allreduce = False - - # hidden_size = 4096 - model = model_class.from_pretrained( - model_name_or_path, - config=config, - dtype="float32", - ) - - model.eval() - - input_ids = paddle.arange(100, 100 + batch_size * seq_len, dtype="int64").reshape([batch_size, seq_len]) - labels = paddle.arange(101, 101 + batch_size * seq_len, dtype="int64").reshape([batch_size, seq_len]) - - attention_mask = None - if pp_degree > 1: - pp_model = PipelineParallel(layers=model, hcg=hcg, strategy=strategy) - pp_model.accumulate_steps = batch_size # for micro_batch_size * acc_steps == batch_size - ret = pp_model.eval_batch(data=[input_ids, labels], compute_loss=True) - else: - # pp_model = PipelineParallel(layers=model, hcg=hcg, strategy=strategy) - # pp_model.data = [input_ids, labels] - # ret = pp_model._forward_step(None) - ret = model(input_ids=input_ids, labels=labels, attention_mask=attention_mask) - ret = ret[0] - - # np.testing.assert_allclose(ret.item(), 10.49988270, atol=1e-7) - print(f"ret mp{tp_degree} pp{pp_degree}", ret.item()) - ret_mp_pp = ret.item() - - single_model = LlamaForCausalLM.from_pretrained(model_name_or_path, config=config) - single_model.eval() - ret = single_model(input_ids=input_ids, labels=labels, attention_mask=attention_mask) - print("ret single", ret[0].item()) - print( - f"diff: {(ret[0].item()- ret_mp_pp)/ret[0].item()}", - ) - np.testing.assert_allclose(ret[0].item(), ret_mp_pp, rtol=1.5e-7) - - -if __name__ == "__main__": - TestLlama().test_sequence_model() - -# CUDA_VISIBLE_DEVICES=2 PYTHONPATH=./ pytest -s -v tests/test_pipeline_parallel.py -# PYTHONPATH=/ssd2/zhonghui03/Datasets/PaddleNLP:$PYTHONPATH PYTHONPATH=$PYTHONPATH:./ python -m 
paddle.distributed.launch --gpus 0,1,2,3 tests/test_pipeline_parallel.py diff --git a/llm/llama/tests/unified-ckpt-llama-500m/config.json b/llm/llama/tests/unified-ckpt-llama-500m/config.json deleted file mode 100644 index 470d93f73fa4..000000000000 --- a/llm/llama/tests/unified-ckpt-llama-500m/config.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "architectures": [ - "LlamaForCausalLM" - ], - "bos_token_id": 1, - "eos_token_id": 2, - "hidden_size": 2048, - "initializer_range": 0.02, - "intermediate_size": 5504, - "max_position_embeddings": 2048, - "model_type": "llama", - "num_attention_heads": 8, - "num_hidden_layers": 8, - "pad_token_id": 0, - "rms_norm_eps": 1e-06, - "vocab_size": 32000 -} diff --git a/llm/merge_lora_params.py b/llm/merge_lora_params.py deleted file mode 100644 index 065a2585ebc0..000000000000 --- a/llm/merge_lora_params.py +++ /dev/null @@ -1,147 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import argparse -import copy -import os - -import paddle - -from paddlenlp.peft import LoRAConfig, LoRAModel - -try: - from paddle.nn.quant import weight_dequantize, weight_quantize -except: - weight_dequantize = None - weight_quantize = None -try: - from paddlenlp.quantization.qlora import qlora_weight_quantize_dequantize -except: - qlora_weight_quantize_dequantize = None - -from paddlenlp.quantization.quantization_config import QuantizationConfig -from paddlenlp.transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer -from paddlenlp.transformers.utils import device_guard -from paddlenlp.utils.env import CONFIG_NAME - - -def parse_arguments(): - parser = argparse.ArgumentParser() - parser.add_argument("--model_name_or_path", default=None, help="The directory of pretrained model.") - parser.add_argument( - "--lora_path", default=None, required=True, help="The directory of LoRA parameters. Default to None" - ) - parser.add_argument( - "--merge_lora_model_path", - default=None, - required=True, - help="The directory of merged parameters. Default to None", - ) - parser.add_argument("--device", type=str, default="gpu", help="Device") - parser.add_argument( - "--low_gpu_mem", type=bool, default=False, help="Whether to use low gpu memory. 
Default to False" - ) - return parser.parse_args() - - -def weight_process(name, quant_config, lora_config, state_dict): - weight = state_dict.pop(name + ".weight").cuda() - if quant_config.weight_quantize_algo is None: - pass - elif quant_config.weight_quantize_algo in ["nf4", "fp4"]: - weight = qlora_weight_quantize_dequantize( - weight, - quant_algo=quant_config.weight_quantize_algo, - double_quant=quant_config.weight_double_quant, - block_size=quant_config.weight_blocksize, - double_quant_block_size=quant_config.weight_double_quant_block_size, - ) - elif quant_config.weight_quantize_algo in ["weight_only_int8"]: - out, scale = weight_quantize(weight, algo=quant_config.weight_quantize_algo) - weight = weight_dequantize(out, scale) - else: - raise ValueError(f"quant_config.weight_quantize_algo {quant_config.weight_quantize_algo} is not supported.") - lora_A = state_dict.pop(name + ".lora_A").cuda() - lora_B = state_dict.pop(name + ".lora_B").cuda() - scaling = lora_config.lora_alpha / lora_config.r - state_dict[name + ".weight"] = (weight + lora_A @ lora_B * scaling).cpu() - - -def merge(): - args = parse_arguments() - paddle.set_device(args.device) - - lora_config = LoRAConfig.from_pretrained(args.lora_path) - if lora_config.base_model_name_or_path is None: - if args.model_name_or_path is not None: - raise ValueError("We can not find a valid model_name_or_path.") - else: - lora_config.base_model_name_or_path = args.model_name_or_path - - if os.path.isfile(os.path.join(args.lora_path, CONFIG_NAME)): - config = AutoConfig.from_pretrained(args.lora_path) - elif args.model_name_or_path is not None: - config = AutoConfig.from_pretrained(args.model_name_or_path) - else: - raise ValueError( - f"We can not find config.json in lora_path: {args.lora_path} or find a valid model_name_or_path." 
- ) - config.dtype = lora_config.dtype - if ( - lora_config.dtype == "bfloat16" or config.quantization_config.weight_quantize_algo in ["nf4", "fp4"] - ) and args.device == "cpu": - raise ValueError("We can not apply bfloat16 or nf4/fp4 lora merge on cpu.") - - if args.low_gpu_mem and args.device == "gpu": - quant_config = copy.deepcopy(config.quantization_config) - config.quantization_config = QuantizationConfig() - lora_config.merge_weights = False - with device_guard(): - model = AutoModelForCausalLM.from_pretrained( - lora_config.base_model_name_or_path, - config=config, - low_cpu_mem_usage=True, - ) - model = LoRAModel.from_pretrained(model=model, lora_path=args.lora_path, lora_config=lora_config) - model.eval() - model_state_dict = model.model.state_dict() - lora_name_list = [] - for key in model_state_dict.keys(): - if "lora_A" in key: - lora_name_list.append(key[:-7]) - for name in lora_name_list: - weight_process(name, quant_config, lora_config, model_state_dict) - else: - model = AutoModelForCausalLM.from_pretrained( - lora_config.base_model_name_or_path, - config=config, - low_cpu_mem_usage=args.low_gpu_mem, - ) - lora_config.merge_weights = True - model = LoRAModel.from_pretrained(model=model, lora_path=args.lora_path, lora_config=lora_config) - model.eval() - model_state_dict = model.model.state_dict() - for key in list(model_state_dict): - if "lora" in key: - del model_state_dict[key] - if "quant" in key: - del model_state_dict[key] - model.model.config.quantization_config = QuantizationConfig() - model.model.save_pretrained(args.merge_lora_model_path, state_dict=model_state_dict) - - tokenizer = AutoTokenizer.from_pretrained(lora_config.base_model_name_or_path) - tokenizer.save_pretrained(args.merge_lora_model_path) - - -if __name__ == "__main__": - merge() diff --git a/llm/llama/npu/export_utils.py b/llm/npu/llama/export_utils.py similarity index 91% rename from llm/llama/npu/export_utils.py rename to llm/npu/llama/export_utils.py index 
db7a1f4ad27f..84bd0018a767 100644 --- a/llm/llama/npu/export_utils.py +++ b/llm/npu/llama/export_utils.py @@ -1,11 +1,11 @@ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -61,23 +61,23 @@ def process_params(model_path): for op in tqdm(block.ops, desc="processing the linear layer for NPU"): if op.type == "matmul_v2": w_name = op.input_arg_names[-1] - if w_name.endswith("qkv_weight") and op.attr("trans_y") == False: + if w_name.endswith("qkv_weight") and not op.attr("trans_y"): op._set_attr("trans_y", True) w = block.var(w_name) trans_weight(w) - elif w_name.endswith("out_proj_weight") and op.attr("trans_y") == False: + elif w_name.endswith("out_proj_weight") and not op.attr("trans_y"): op._set_attr("trans_y", True) w = block.var(w_name) trans_weight(w) - elif w_name.endswith("ffn1_weight") and op.attr("trans_y") == False: + elif w_name.endswith("ffn1_weight") and not op.attr("trans_y"): op._set_attr("trans_y", True) w = block.var(w_name) trans_weight(w) - elif w_name.endswith("ffn2_weight") and op.attr("trans_y") == False: + elif w_name.endswith("ffn2_weight") and not op.attr("trans_y"): op._set_attr("trans_y", True) w = block.var(w_name) trans_weight(w) - elif w_name == "llama_lm_head_0.w_0" and op.attr("trans_y") == False: + elif w_name == "llama_lm_head_0.w_0" and not op.attr("trans_y"): op._set_attr("trans_y", True) w = block.var(w_name) trans_weight(w) diff --git a/llm/llama/npu/llama_npu_opt_lora.sh b/llm/npu/llama/llama_npu_opt_lora.sh similarity index 94% rename from 
llm/llama/npu/llama_npu_opt_lora.sh rename to llm/npu/llama/llama_npu_opt_lora.sh index 6523dbae2321..65992492f587 100644 --- a/llm/llama/npu/llama_npu_opt_lora.sh +++ b/llm/npu/llama/llama_npu_opt_lora.sh @@ -27,12 +27,12 @@ source /usr/local/Ascend/ascend-toolkit/set_env.sh rm -rf lora_bf16_llama_N1C8 rm -rf output/lora_bf16_llama_N1C8 -ps aux | grep "finetune_generation.py" | grep -v grep | awk '{print $2}' | xargs kill -9 +ps aux | grep "run_finetune.py" | grep -v grep | awk '{print $2}' | xargs kill -9 export PYTHONPATH=../../../:$PYTHONPATH python -u -m paddle.distributed.launch \ --devices "0,1,2,3,4,5,6,7" \ --log_dir "./lora_bf16_llama_N1C8" \ - ../../finetune_generation.py \ + ../../run_finetune.py \ --device "npu" \ --model_name_or_path "meta-llama/Llama-2-13b-chat" \ --dataset_name_or_path "data/" \ diff --git a/llm/llama/npu/llama_npu_opt_ppt.sh b/llm/npu/llama/llama_npu_opt_ppt.sh similarity index 100% rename from llm/llama/npu/llama_npu_opt_ppt.sh rename to llm/npu/llama/llama_npu_opt_ppt.sh diff --git a/llm/llama/npu/llama_npu_opt_sft.sh b/llm/npu/llama/llama_npu_opt_sft.sh similarity index 95% rename from llm/llama/npu/llama_npu_opt_sft.sh rename to llm/npu/llama/llama_npu_opt_sft.sh index e0e7e5ccbaea..64ada00e420c 100644 --- a/llm/llama/npu/llama_npu_opt_sft.sh +++ b/llm/npu/llama/llama_npu_opt_sft.sh @@ -33,11 +33,11 @@ export MULTI_STREAM_MEMORY_REUSE=1 export PYTHONPATH=../../../:$PYTHONPATH rm -rf sft_bf16_llama_N1C8 rm -rf output/sft_bf16_llama_N1C8 -ps aux | grep "finetune_generation.py" | grep -v grep | awk '{print $2}' | xargs kill -9 +ps aux | grep "run_finetune.py" | grep -v grep | awk '{print $2}' | xargs kill -9 python -u -m paddle.distributed.launch \ --devices "0,1,2,3,4,5,6,7" \ --log_dir "./sft_bf16_llama_N1C8" \ - ../../finetune_generation.py \ + ../../run_finetune.py \ --device "npu" \ --model_name_or_path "meta-llama/Llama-2-13b" \ --dataset_name_or_path "data/" \ diff --git a/llm/export_model.py b/llm/predict/export_model.py 
similarity index 96% rename from llm/export_model.py rename to llm/predict/export_model.py index 5f5dd30bd97c..ce0a5ed76c02 100644 --- a/llm/export_model.py +++ b/llm/predict/export_model.py @@ -18,9 +18,9 @@ import paddle from paddle.distributed import fleet -from predictor import ModelArgument, PredictorArgument, create_predictor +from predict.predictor import ModelArgument, PredictorArgument, create_predictor from tqdm import tqdm -from utils import generate_rank_mapping, get_infer_model_path +from utils.utils import generate_rank_mapping, get_infer_model_path from paddlenlp.trainer import PdArgumentParser from paddlenlp.utils.log import logger diff --git a/llm/flask_server.py b/llm/predict/flask_server.py similarity index 98% rename from llm/flask_server.py rename to llm/predict/flask_server.py index aba727fa6311..d467d6dac688 100644 --- a/llm/flask_server.py +++ b/llm/predict/flask_server.py @@ -22,7 +22,12 @@ import requests from filelock import FileLock -from predictor import BasePredictor, ModelArgument, PredictorArgument, create_predictor +from predict.predictor import ( + BasePredictor, + ModelArgument, + PredictorArgument, + create_predictor, +) from paddlenlp.trainer import PdArgumentParser from paddlenlp.utils.log import logger diff --git a/llm/gradio_ui.py b/llm/predict/gradio_ui.py similarity index 100% rename from llm/gradio_ui.py rename to llm/predict/gradio_ui.py diff --git a/llm/predictor.py b/llm/predict/predictor.py similarity index 99% rename from llm/predictor.py rename to llm/predict/predictor.py index f8f39577cfb6..262a21fa6a0b 100644 --- a/llm/predictor.py +++ b/llm/predict/predictor.py @@ -28,7 +28,7 @@ import paddle.incubate.multiprocessing as mp from paddle.base.framework import in_cinn_mode, in_pir_executor_mode from paddle.distributed import fleet -from utils import ( +from utils.utils import ( dybatch_preprocess, get_alibi_slopes, get_default_max_decoding_length, diff --git a/llm/request_flask_server.py 
b/llm/predict/request_flask_server.py similarity index 100% rename from llm/request_flask_server.py rename to llm/predict/request_flask_server.py diff --git a/llm/qwen/lora_argument_pissa.json b/llm/qwen/lora_argument_pissa.json deleted file mode 100644 index e3e51eb1bee0..000000000000 --- a/llm/qwen/lora_argument_pissa.json +++ /dev/null @@ -1,33 +0,0 @@ -{ - "model_name_or_path": "qwen/qwen-7b", - "dataset_name_or_path": "./data", - "output_dir": "./checkpoints/qwen_lora_ckpts", - "per_device_train_batch_size": 4, - "gradient_accumulation_steps": 32, - "per_device_eval_batch_size": 8, - "eval_accumulation_steps":16, - "num_train_epochs": 3, - "learning_rate": 2e-05, - "warmup_steps": 10, - "logging_steps": 1, - "evaluation_strategy": "epoch", - "save_strategy": "epoch", - "src_length": 1024, - "max_length": 2048, - "bf16": true, - "fp16_opt_level": "O2", - "do_train": true, - "do_eval": true, - "disable_tqdm": true, - "load_best_model_at_end": true, - "eval_with_do_generation": false, - "metric_for_best_model": "accuracy", - "recompute": true, - "save_total_limit": 1, - "tensor_parallel_degree": 1, - "pipeline_parallel_degree": 1, - "lora": true, - "pissa": true, - "zero_padding": false, - "use_flash_attention": false - } diff --git a/llm/qwen/lora_argument_qwen2_7b.json b/llm/qwen/lora_argument_qwen2_7b.json deleted file mode 100644 index 7cf88075ab87..000000000000 --- a/llm/qwen/lora_argument_qwen2_7b.json +++ /dev/null @@ -1,32 +0,0 @@ -{ - "model_name_or_path": "Qwen/Qwen2-7B", - "dataset_name_or_path": "./data", - "output_dir": "./checkpoints/qwen2_7b__lora_ckpts", - "per_device_train_batch_size": 4, - "gradient_accumulation_steps": 4, - "per_device_eval_batch_size": 8, - "eval_accumulation_steps": 16, - "num_train_epochs": 3, - "learning_rate": 3e-04, - "warmup_steps": 30, - "logging_steps": 1, - "evaluation_strategy": "epoch", - "save_strategy": "epoch", - "src_length": 2048, - "max_length": 4096, - "bf16": true, - "fp16_opt_level": "O2", - "do_train": 
true, - "do_eval": true, - "disable_tqdm": true, - "load_best_model_at_end": true, - "eval_with_do_generation": false, - "metric_for_best_model": "accuracy", - "recompute": true, - "save_total_limit": 1, - "tensor_parallel_degree": 1, - "pipeline_parallel_degree": 1, - "lora": true, - "zero_padding": false, - "use_flash_attention": false -} \ No newline at end of file diff --git a/llm/qwen/lora_argument_qwen2moe.json b/llm/qwen/lora_argument_qwen2moe.json deleted file mode 100644 index 0344e3885ba0..000000000000 --- a/llm/qwen/lora_argument_qwen2moe.json +++ /dev/null @@ -1,32 +0,0 @@ -{ - "model_name_or_path": "Qwen/Qwen1.5-MoE-A2.7B", - "dataset_name_or_path": "./data", - "output_dir": "./checkpoints/qwen2moe_lora_ckpts", - "per_device_train_batch_size": 4, - "gradient_accumulation_steps": 4, - "per_device_eval_batch_size": 8, - "eval_accumulation_steps":16, - "num_train_epochs": 3, - "learning_rate": 3e-04, - "warmup_steps": 30, - "logging_steps": 1, - "evaluation_strategy": "epoch", - "save_strategy": "epoch", - "src_length": 1024, - "max_length": 32768, - "bf16": true, - "fp16_opt_level": "O2", - "do_train": true, - "do_eval": true, - "disable_tqdm": true, - "load_best_model_at_end": true, - "eval_with_do_generation": false, - "metric_for_best_model": "accuracy", - "recompute": true, - "save_total_limit": 1, - "tensor_parallel_degree": 8, - "pipeline_parallel_degree": 1, - "lora": true, - "zero_padding": false, - "use_flash_attention": false - } \ No newline at end of file diff --git a/llm/qwen/pretrain-qwen1.5_7b-tp2sd4_stage2.json b/llm/qwen/pretrain-qwen1.5_7b-tp2sd4_stage2.json deleted file mode 100644 index 5bcdc1158680..000000000000 --- a/llm/qwen/pretrain-qwen1.5_7b-tp2sd4_stage2.json +++ /dev/null @@ -1,41 +0,0 @@ -{ - "model_name_or_path": "Qwen/Qwen1.5-7B", - "tokenizer_name_or_path": "Qwen/Qwen1.5-7B", - "input_dir": "./data", - "output_dir": "./checkpoints/qwen1.5_7b_pretrain_ckpts", - "per_device_train_batch_size": 2, - 
"gradient_accumulation_steps": 8, - "per_device_eval_batch_size": 2, - "tensor_parallel_degree": 2, - "pipeline_parallel_degree": 1, - "sharding_parallel_degree": 4, - "sharding": "stage2", - "virtual_pp_degree": 1, - "sequence_parallel": 0, - "use_flash_attention": true, - "use_fused_rms_norm": true, - "use_fused_rope": true, - "max_seq_length": 4096, - "learning_rate": 3e-05, - "min_learning_rate": 3e-06, - "warmup_steps": 30, - "logging_steps": 1, - "max_steps": 10000, - "save_steps": 5000, - "eval_steps": 1000, - "weight_decay": 0.01, - "bf16": true, - "fp16_opt_level": "O2", - "warmup_ratio": 0.01, - "max_grad_norm": 1.0, - "dataloader_num_workers": 1, - "continue_training": 1, - "do_train": true, - "do_eval": true, - "do_predict": true, - "disable_tqdm": true, - "recompute": true, - "distributed_dataloader": 1, - "recompute_granularity": "full", - "save_total_limit": 2 - } diff --git a/llm/qwen/pretrain-qwen2_7b-tp2sd4_stage2.json b/llm/qwen/pretrain-qwen2_7b-tp2sd4_stage2.json deleted file mode 100644 index d67ddfc01c6a..000000000000 --- a/llm/qwen/pretrain-qwen2_7b-tp2sd4_stage2.json +++ /dev/null @@ -1,41 +0,0 @@ -{ - "model_name_or_path": "Qwen/Qwen2-7B", - "tokenizer_name_or_path": "Qwen/Qwen2-7B", - "input_dir": "./data", - "output_dir": "./checkpoints/qwen2_7b_pretrain_ckpts", - "per_device_train_batch_size": 2, - "gradient_accumulation_steps": 8, - "per_device_eval_batch_size": 2, - "tensor_parallel_degree": 2, - "pipeline_parallel_degree": 1, - "sharding_parallel_degree": 4, - "sharding": "stage2", - "virtual_pp_degree": 1, - "sequence_parallel": 0, - "use_flash_attention": true, - "use_fused_rms_norm": true, - "use_fused_rope": true, - "max_seq_length": 4096, - "learning_rate": 3e-05, - "min_learning_rate": 3e-06, - "warmup_steps": 30, - "logging_steps": 1, - "max_steps": 10000, - "save_steps": 5000, - "eval_steps": 1000, - "weight_decay": 0.01, - "bf16": true, - "fp16_opt_level": "O2", - "warmup_ratio": 0.01, - "max_grad_norm": 1.0, - 
"dataloader_num_workers": 1, - "continue_training": 1, - "do_train": true, - "do_eval": true, - "do_predict": true, - "disable_tqdm": true, - "recompute": false, - "distributed_dataloader": 1, - "recompute_granularity": "full", - "save_total_limit": 2 - } diff --git a/llm/qwen/pretrain-qwen_7b-tp2sd4_stage2.json b/llm/qwen/pretrain-qwen_7b-tp2sd4_stage2.json deleted file mode 100644 index ed2bdb9cf7f3..000000000000 --- a/llm/qwen/pretrain-qwen_7b-tp2sd4_stage2.json +++ /dev/null @@ -1,40 +0,0 @@ -{ - "model_name_or_path": "qwen/qwen-7b", - "tokenizer_name_or_path": "qwen/qwen-7b", - "input_dir": "./data", - "output_dir": "./checkpoints/qwen_pretrain_ckpts", - "per_device_train_batch_size": 2, - "gradient_accumulation_steps": 8, - "per_device_eval_batch_size": 2, - "tensor_parallel_degree": 2, - "pipeline_parallel_degree": 1, - "sharding": "stage2", - "virtual_pp_degree": 1, - "sequence_parallel": 0, - "use_flash_attention": true, - "use_fused_rms_norm": true, - "use_fused_rope": true, - "max_seq_length": 4096, - "learning_rate": 3e-05, - "min_learning_rate": 3e-06, - "warmup_steps": 30, - "logging_steps": 1, - "max_steps": 10000, - "save_steps": 5000, - "eval_steps": 1000, - "weight_decay": 0.01, - "bf16": true, - "fp16_opt_level": "O2", - "warmup_ratio": 0.01, - "max_grad_norm": 1.0, - "dataloader_num_workers": 1, - "continue_training": 1, - "do_train": true, - "do_eval": true, - "do_predict": true, - "disable_tqdm": true, - "recompute": false, - "distributed_dataloader": 1, - "recompute_granularity": "full", - "save_total_limit": 2 - } diff --git a/llm/qwen/pretrain_argument_tp2pp4.json b/llm/qwen/pretrain_argument_tp2pp4.json deleted file mode 100644 index f2272ca3b7c6..000000000000 --- a/llm/qwen/pretrain_argument_tp2pp4.json +++ /dev/null @@ -1,40 +0,0 @@ -{ - "model_name_or_path": "qwen/qwen-7b", - "tokenizer_name_or_path": "qwen/qwen-7b", - "input_dir": "./data", - "output_dir": "./checkpoints/qwen_pretrain_ckpts", - "per_device_train_batch_size": 1, - 
"gradient_accumulation_steps": 16, - "per_device_eval_batch_size": 16, - "tensor_parallel_degree": 2, - "pipeline_parallel_degree": 4, - "sharding": "stage1", - "virtual_pp_degree": 1, - "sequence_parallel": 0, - "use_flash_attention": true, - "use_fused_rms_norm": true, - "use_fused_rope": true, - "max_seq_length": 4096, - "learning_rate": 3e-05, - "min_learning_rate": 3e-06, - "warmup_steps": 30, - "logging_steps": 1, - "max_steps": 10000, - "save_steps": 5000, - "eval_steps": 1000, - "weight_decay": 0.01, - "bf16": true, - "fp16_opt_level": "O2", - "warmup_ratio": 0.01, - "max_grad_norm": 1.0, - "dataloader_num_workers": 1, - "continue_training": 1, - "do_train": true, - "do_eval": true, - "do_predict": true, - "disable_tqdm": true, - "recompute": true, - "distributed_dataloader": 1, - "recompute_granularity": "full", - "save_total_limit": 2 - } diff --git a/llm/qwen/pt_argument_qwen2_7b.json b/llm/qwen/pt_argument_qwen2_7b.json deleted file mode 100644 index 5d7c85c32503..000000000000 --- a/llm/qwen/pt_argument_qwen2_7b.json +++ /dev/null @@ -1,33 +0,0 @@ -{ - "model_name_or_path": "Qwen/Qwen2-7B", - "dataset_name_or_path": "./data", - "output_dir": "./checkpoints/qwen2_7b_pt_ckpts", - "per_device_train_batch_size": 4, - "gradient_accumulation_steps": 4, - "per_device_eval_batch_size": 8, - "eval_accumulation_steps": 16, - "num_train_epochs": 3, - "learning_rate": 3e-02, - "warmup_steps": 30, - "logging_steps": 1, - "evaluation_strategy": "epoch", - "save_strategy": "epoch", - "src_length": 2048, - "max_length": 4096, - "bf16": true, - "fp16_opt_level": "O2", - "do_train": true, - "do_eval": true, - "disable_tqdm": true, - "load_best_model_at_end": true, - "eval_with_do_generation": false, - "metric_for_best_model": "accuracy", - "recompute": true, - "save_total_limit": 1, - "tensor_parallel_degree": 1, - "pipeline_parallel_degree": 1, - "prefix_tuning": true, - "zero_padding": false, - "use_flash_attention": false - } - \ No newline at end of file diff --git 
a/llm/qwen/sft_argument_qwen2_7b.json b/llm/qwen/sft_argument_qwen2_7b.json deleted file mode 100644 index 70822bcc04f9..000000000000 --- a/llm/qwen/sft_argument_qwen2_7b.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "model_name_or_path": "Qwen/Qwen2-7B", - "dataset_name_or_path": "./data", - "output_dir": "./checkpoints/qwen2-7b_sft_ckpts", - "per_device_train_batch_size": 1, - "gradient_accumulation_steps": 4, - "per_device_eval_batch_size": 8, - "eval_accumulation_steps":16, - "num_train_epochs": 3, - "learning_rate": 3e-05, - "warmup_steps": 30, - "logging_steps": 1, - "evaluation_strategy": "epoch", - "save_strategy": "epoch", - "src_length": 2048, - "max_length": 4096, - "bf16": true, - "fp16_opt_level": "O2", - "do_train": true, - "do_eval": true, - "disable_tqdm": true, - "load_best_model_at_end": true, - "eval_with_do_generation": false, - "metric_for_best_model": "accuracy", - "recompute": true, - "save_total_limit": 1, - "tensor_parallel_degree": 4, - "pipeline_parallel_degree": 1, - "zero_padding": false, - "use_flash_attention": false - } diff --git a/llm/qwen/sft_argument_qwen2moe.json b/llm/qwen/sft_argument_qwen2moe.json deleted file mode 100644 index 75d3a93500f5..000000000000 --- a/llm/qwen/sft_argument_qwen2moe.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "model_name_or_path": "Qwen/Qwen1.5-MoE-A2.7B", - "dataset_name_or_path": "./data", - "output_dir": "./checkpoints/qwen2moe_sft_ckpts", - "per_device_train_batch_size": 4, - "gradient_accumulation_steps": 4, - "per_device_eval_batch_size": 8, - "eval_accumulation_steps":16, - "num_train_epochs": 3, - "learning_rate": 3e-05, - "warmup_steps": 30, - "logging_steps": 1, - "evaluation_strategy": "epoch", - "save_strategy": "epoch", - "src_length": 1024, - "max_length": 32768, - "bf16": true, - "fp16_opt_level": "O2", - "do_train": true, - "do_eval": true, - "disable_tqdm": true, - "load_best_model_at_end": true, - "eval_with_do_generation": false, - "metric_for_best_model": "accuracy", - "recompute": true, - 
"save_total_limit": 1, - "tensor_parallel_degree": 8, - "sharding": "stage2", - "pipeline_parallel_degree": 1 -} \ No newline at end of file diff --git a/llm/finetune_generation.py b/llm/run_finetune.py similarity index 95% rename from llm/finetune_generation.py rename to llm/run_finetune.py index 8c72c553fb70..683bd871946b 100644 --- a/llm/finetune_generation.py +++ b/llm/run_finetune.py @@ -14,20 +14,18 @@ import json import os import sys -from dataclasses import dataclass, field from functools import partial -from typing import Optional import paddle -from argument import ( +from utils.argument import ( DataArgument, GenerateArgument, ModelArgument, QuantArgument, TrainingArguments, ) -from data import get_convert_example -from utils import ( +from utils.data import get_convert_example +from utils.utils import ( CausalLMTrainer, ZeroPaddingIterDatasetCallback, compute_metrics, @@ -54,44 +52,16 @@ Llama3Tokenizer, LlamaTokenizer, ) -from paddlenlp.transformers.configuration_utils import LlmMetaConfig, llmmetaclass +from paddlenlp.transformers.configuration_utils import LlmMetaConfig from paddlenlp.utils.log import logger # Fine-tune Environment Variables to support sharding stage1 overlap optimization. 
os.environ["USE_CASUAL_MASK"] = "False" -def add_start_docstrings(*docstr): - def docstring_decorator(fn): - fn.__doc__ = "".join(docstr) + (fn.__doc__ if fn.__doc__ is not None else "") - return fn - - return docstring_decorator - - -@dataclass -@llmmetaclass -@add_start_docstrings(TrainingArguments.__doc__) -class FinetuneArguments(TrainingArguments): - decay_steps: int = field( - default=0, - metadata={"help": "The steps use to control the learing rate."}, - ) - tensor_parallel_output: Optional[bool] = field( - default=False, - metadata={"help": "whether to output logits in distributed status"}, - ) - - -def read_local_dataset(path): - with open(path, "r", encoding="utf-8") as fp: - for line in fp: - yield json.loads(line.strip()) - - def main(): # Arguments - parser = PdArgumentParser((GenerateArgument, QuantArgument, ModelArgument, DataArgument, FinetuneArguments)) + parser = PdArgumentParser((GenerateArgument, QuantArgument, ModelArgument, DataArgument, TrainingArguments)) # Support format as "args.json --arg1 value1 --arg2 value2.” # In case of conflict, command line arguments take precedence. 
if len(sys.argv) >= 2 and sys.argv[1].endswith(".json"): @@ -161,6 +131,8 @@ def main(): model_config.hidden_dropout_prob = model_args.hidden_dropout_prob if hasattr(model_config, "attention_probs_dropout_prob"): model_config.attention_probs_dropout_prob = model_args.attention_probs_dropout_prob + if hasattr(model_config, "ignore_index"): + model_config.ignore_index = -100 if model_args.fuse_attention_qkv is not None: model_config.fuse_attention_qkv = model_args.fuse_attention_qkv @@ -169,7 +141,7 @@ def main(): model_config.seq_length = data_args.max_length - print("Final model config:", model_config) + logger.info(f"Final model config: {model_config}") model_class = AutoModelForCausalLM if training_args.pipeline_parallel_degree > 1: @@ -342,7 +314,7 @@ def neft_post_hook(module, input, output): train_ds = train_ds.skip(consumed_samples) if training_args.pipeline_parallel_degree > 1: - from data import convert_example_common + from utils.data import convert_example_common trans_func = partial(convert_example_common, tokenizer=tokenizer, data_args=data_args) else: @@ -584,7 +556,7 @@ def compute_metrics_do_generation(eval_preds): # QAT if quant_args.do_qat: - from quant import create_qat_model + from utils.quant import create_qat_model trainer.model = create_qat_model(quant_args, trainer.model, dtype) train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint) @@ -599,7 +571,7 @@ def compute_metrics_do_generation(eval_preds): raise NotImplementedError( "PTQ strategy not supported for LoRA model. Please merge lora parameters to pretrain model first." ) - from quant import ( + from utils.quant import ( apply_autoclip, apply_ptq, apply_shift, @@ -635,7 +607,7 @@ def compute_metrics_do_generation(eval_preds): raise NotImplementedError( "PTQ strategy not supported for LoRA model. Please merge lora parameters to pretrain model first." 
) - from quant import apply_gptq + from utils.quant import apply_gptq ptq_dataloader = trainer.get_ptq_dataloader(ptq_ds) apply_gptq(quant_args, trainer, ptq_dataloader) diff --git a/llm/run_pretrain.py b/llm/run_pretrain.py index 04a6fc22dc24..12364e47118f 100644 --- a/llm/run_pretrain.py +++ b/llm/run_pretrain.py @@ -85,6 +85,14 @@ class PreTrainingArguments(TrainingArguments): default=False, metadata={"help": "Weather to run benchmark by autotuner. True for from_scratch and pad_max_length."}, ) + unified_checkpoint: bool = field( + default=False, + metadata={"help": "Enable fused linear grad add strategy."}, + ) + unified_checkpoint_config: Optional[str] = field( + default="", + metadata={"help": "Configs to unify hybrid parallel checkpoint.\n"}, + ) def __post_init__(self): super().__post_init__() @@ -360,7 +368,7 @@ def main(): training_args.no_recompute_layers.sort() if training_args.enable_linear_fused_grad_add: - from fused_layers import mock_layers + from utils.fused_layers import mock_layers mock_layers() @@ -473,7 +481,7 @@ def main(): model_class = AutoModelForCausalLMPipe if "LLama" in str(config.architectures): try: - from register_reshard import register_pp_reshard_information + from utils.register_reshard import register_pp_reshard_information register_pp_reshard_information(config.num_hidden_layers) except: diff --git a/llm/tests/test_best_pretrain_speed.py b/llm/tests/test_best_pretrain_speed.py deleted file mode 100644 index 79f6ea455a5c..000000000000 --- a/llm/tests/test_best_pretrain_speed.py +++ /dev/null @@ -1,266 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# import copy -import json -import os -import shutil - -# import numpy as np -from llama.tests.parallel_launch import TestMultipleGpus - -# export NVIDIA_TF32_OVERRIDE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=xgbe0 -# export NCCL_IB_TIMEOUT=22 -# export NCCL_DEBUG=INFO -# export NCCL_IB_DISABLE=1 -# export NCCL_IB_GDR_LEVEL=4 -# export NCCL_SOCKET_IFNAME=eth2 - - -environment_variables = { - # "NCCL_ALGO": "Tree", - # "NVIDIA_TF32_OVERRIDE": "0", - "NCCL_IB_TIMEOUT": "22", - # "NCCL_DEBUG": "INFO", - # "FLAGS_embedding_deterministic": "1", - # "FLAGS_cudnn_deterministic": "1", - # "Flags_mp_aysnc_allreduce": "1", - # "Flags_skip_mp_c_identity": "1", - # "FLAGS_shard_norm_align_dp": "0", - # "FLAGS_shard_use_reduce": "1", - "test_ci_no_save_model": "1", -} - -pretrain_arguments = { - "learning_rate": 1e-04, - "min_learning_rate": 1e-05, - "warmup_steps": 100, - "logging_steps": 1, - "max_steps": 10, - "save_steps": 2000, - "eval_steps": 1000, - "continue_training": 0, - "skip_memory_metrics": 0, - "do_train": "true", - "do_eval": "false", - "do_predict": "false", - "disable_tqdm": "true", - "save_total_limit": 2, -} - -best_pretrain_config_for_a100_80g = { - # "qwen/qwen-7b": "./qwen/pretrain_argument_stage2.json", - # "baichuan-inc/Baichuan2-13B-Base" "./llama/pretrain-baichuan2_13b-tp4sd2_stage2.json", - # "baichuan-inc/Baichuan2-13B-Base": "./llama/pretrain-baichuan2_13b-tp2sd4_stage2.json", - "facebook/llama-7b": "./llama/pretrain-llama_7b-tp2sd4_stage2.json", - "facebook/llama-13b": "./llama/pretrain-llama_13b-tp2sd4_stage2.json", 
- "meta-llama/Llama-2-7b": "./llama/pretrain-llama2_7b-tp2sd4_stage2.json", - "meta-llama/Llama-2-13b": "./llama/pretrain-llama2_13b-tp2sd4_stage2.json", - "qwen/qwen-7b": "./qwen/pretrain-qwen_7b-tp2sd4_stage2.json", - "baichuan-inc/Baichuan2-13B-Base": "./baichuan/pretrain-baichuan2_13b-sd8_stage2.json", - "baichuan-inc/Baichuan2-7B-Base": "./baichuan/pretrain-baichuan2_7b-tp2sd4_stage2.json", - "FlagAlpha/Llama2-Chinese-13b-Chat": "./llama/pretrain-flagalpha_llama2_13b-tp2sd4_stage2.json", - "FlagAlpha/Llama2-Chinese-7b-Chat": "./llama/pretrain-flagalpha_llama2_7b-tp2sd4_stage2.json", - "linly-ai/chinese-llama-2-7b": "./llama/pretrain-linly_llama2_7b-tp2sd4_stage2.json", - "idea-ccnl/ziya-llama-13b-v1": "./llama/pretrain-ziya_llama_13b-tp2sd4_stage2.json", -} - - -def log_test_result(model_name_or_path, config_name, config, log_dir="log"): - model_name_or_path = model_name_or_path - max_seq_len = config["max_seq_length"] - distribued_info = config_name.split("b-")[-1].split(".json")[0] - speed = "NA" - memory = "NA" - config_name = config_name - time = "NA" - - file_path = os.path.join(log_dir, "workerlog.n0.c0") - - get_memory_cmd = ( - "grep -aE 'gpu_mem_max_memory_reserved ' " + file_path + " | awk '{print $8}' | awk -F '\x1b' '{print $1}'" - ) - get_time_cmd = ( - "grep -aE 'gpu_mem_max_memory_reserved ' " - + file_path - + " | awk -F '[' '{print $3}' | awk -F ',' '{print $1}'" - ) - get_ips_cmd = "grep -aE 'global_step: ' " + file_path + " | awk -F ',' '{print $6}' | awk '{print $2}' " - - import subprocess - - res = subprocess.check_output(get_memory_cmd, shell=True, text=True) - if "MB" in res: - memory = res.strip() - - res = subprocess.check_output(get_time_cmd, shell=True, text=True) - if len(res) > 0: - time = res.strip() - - res = subprocess.check_output(get_ips_cmd, shell=True, text=True) - ips = [float(x) for x in res.strip().split()] - if len(ips) > 4: - ips = sum(ips[2:-2]) / (len(ips) - 4) - speed = round(ips * max_seq_len / 8, 2) - - 
write_result( - [ - f"`{model_name_or_path}`", - max_seq_len, - f"`{distribued_info}`", - speed, - memory, - f"`{config_name}`", - time, - ] - ) - - return res - - -result_title = r"""| 模型 | 序列长度 | 分布式策略 | 速度(`tokens/card/sec`) | 显存占用(`MB^1`) | 配置文件| 测试时间 |""" -result_file_name = "results_of_best_pretrain_config_for_a100_80g.md" - - -def write_result(res): - fileds_name = [x.strip() for x in result_title.split("|")[1:-1]] - assert len(fileds_name) == len(res) - - def format_list_to_str(lst): - content = "|".join([""] + ["{:10}".format(x) for x in lst] + [""]) - return content - - if not os.path.exists(result_file_name): - with open(result_file_name, "w") as f: - f.write(format_list_to_str(fileds_name) + "\n") - f.write(format_list_to_str([" :-: "] * len(fileds_name)) + "\n") - - with open(result_file_name, "a+") as f: - f.write(format_list_to_str(res) + "\n") - - -def remove_logs(log_dir="log"): - if os.path.exists(log_dir): - shutil.rmtree(log_dir) - - -def remove_ckpt(ckpt_dir): - if os.path.exists(ckpt_dir): - shutil.rmtree(ckpt_dir) - - -class TestModelOnN1C8(TestMultipleGpus): - def setUp(self): - os.environ.update(environment_variables) - - def test_facebook_llama_7b(self): - name = "facebook/llama-7b" - arguments = json.load(open(best_pretrain_config_for_a100_80g[name], "r")) - arguments.update(pretrain_arguments) - remove_logs() - remove_ckpt(arguments["output_dir"]) - self.run_n1c8("run_pretrain.py", **arguments) - log_test_result(name, best_pretrain_config_for_a100_80g[name], arguments, log_dir="log") - - def test_facebook_llama_13b(self): - name = "facebook/llama-13b" - arguments = json.load(open(best_pretrain_config_for_a100_80g[name], "r")) - arguments.update(pretrain_arguments) - remove_logs() - remove_ckpt(arguments["output_dir"]) - self.run_n1c8("run_pretrain.py", **arguments) - log_test_result(name, best_pretrain_config_for_a100_80g[name], arguments, log_dir="log") - - def test_metallama_Llama2_7b(self): - name = "meta-llama/Llama-2-7b" - arguments 
= json.load(open(best_pretrain_config_for_a100_80g[name], "r")) - arguments.update(pretrain_arguments) - remove_logs() - remove_ckpt(arguments["output_dir"]) - self.run_n1c8("run_pretrain.py", **arguments) - log_test_result(name, best_pretrain_config_for_a100_80g[name], arguments, log_dir="log") - - def test_metallama_Llama2_13b(self): - name = "meta-llama/Llama-2-13b" - arguments = json.load(open(best_pretrain_config_for_a100_80g[name], "r")) - arguments.update(pretrain_arguments) - remove_logs() - remove_ckpt(arguments["output_dir"]) - self.run_n1c8("run_pretrain.py", **arguments) - log_test_result(name, best_pretrain_config_for_a100_80g[name], arguments, log_dir="log") - - def test_qwen_qwen_7b(self): - name = "qwen/qwen-7b" - arguments = json.load(open(best_pretrain_config_for_a100_80g[name], "r")) - arguments.update(pretrain_arguments) - remove_logs() - remove_ckpt(arguments["output_dir"]) - self.run_n1c8("run_pretrain.py", **arguments) - log_test_result(name, best_pretrain_config_for_a100_80g[name], arguments, log_dir="log") - - def test_baichuan_Baichuan2_13B_Base(self): - name = "baichuan-inc/Baichuan2-13B-Base" - arguments = json.load(open(best_pretrain_config_for_a100_80g[name], "r")) - arguments.update(pretrain_arguments) - remove_logs() - remove_ckpt(arguments["output_dir"]) - self.run_n1c8("run_pretrain.py", **arguments) - log_test_result(name, best_pretrain_config_for_a100_80g[name], arguments, log_dir="log") - - def test_baichuan_Baichuan2_7B_Base(self): - name = "baichuan-inc/Baichuan2-7B-Base" - arguments = json.load(open(best_pretrain_config_for_a100_80g[name], "r")) - arguments.update(pretrain_arguments) - remove_logs() - remove_ckpt(arguments["output_dir"]) - self.run_n1c8("run_pretrain.py", **arguments) - log_test_result(name, best_pretrain_config_for_a100_80g[name], arguments, log_dir="log") - - def test_FlagAlpha_Llama2Chinese_13b_Chat(self): - name = "FlagAlpha/Llama2-Chinese-13b-Chat" - arguments = 
json.load(open(best_pretrain_config_for_a100_80g[name], "r")) - arguments.update(pretrain_arguments) - remove_logs() - remove_ckpt(arguments["output_dir"]) - self.run_n1c8("run_pretrain.py", **arguments) - log_test_result(name, best_pretrain_config_for_a100_80g[name], arguments, log_dir="log") - - def test_FlagAlpha_Llama2Chinese_7b_Chat(self): - name = "FlagAlpha/Llama2-Chinese-7b-Chat" - arguments = json.load(open(best_pretrain_config_for_a100_80g[name], "r")) - arguments.update(pretrain_arguments) - remove_logs() - remove_ckpt(arguments["output_dir"]) - self.run_n1c8("run_pretrain.py", **arguments) - log_test_result(name, best_pretrain_config_for_a100_80g[name], arguments, log_dir="log") - - def test_linlyai_chinesellama2_7b(self): - name = "linly-ai/chinese-llama-2-7b" - arguments = json.load(open(best_pretrain_config_for_a100_80g[name], "r")) - arguments.update(pretrain_arguments) - remove_logs() - remove_ckpt(arguments["output_dir"]) - self.run_n1c8("run_pretrain.py", **arguments) - log_test_result(name, best_pretrain_config_for_a100_80g[name], arguments, log_dir="log") - - def test_ideaccnl_ziyallama_13b(self): - name = "idea-ccnl/ziya-llama-13b-v1" - arguments = json.load(open(best_pretrain_config_for_a100_80g[name], "r")) - arguments.update(pretrain_arguments) - remove_logs() - remove_ckpt(arguments["output_dir"]) - self.run_n1c8("run_pretrain.py", **arguments) - log_test_result(name, best_pretrain_config_for_a100_80g[name], arguments, log_dir="log") diff --git a/llm/tools/merge_lora_params.py b/llm/tools/merge_lora_params.py new file mode 100644 index 000000000000..06d2e2d7a9bd --- /dev/null +++ b/llm/tools/merge_lora_params.py @@ -0,0 +1,222 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +import copy +import math +import os + +import numpy as np +import paddle +from paddle.nn.quant import weight_dequantize + +from paddlenlp.peft import LoRAConfig, LoRAModel + +try: + from paddlenlp.quantization.qlora import qlora_weight_quantize_dequantize + from paddlenlp.quantization.quantization_config import QuantizationConfig + from paddlenlp.quantization.quantization_linear import QuantizationLinear +except: + pass + +from paddlenlp.trainer.argparser import strtobool +from paddlenlp.transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer +from paddlenlp.transformers.utils import device_guard +from paddlenlp.utils.env import CONFIG_NAME +from paddlenlp.utils.log import logger + + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument("--model_name_or_path", default=None, required=True, type=str, help="The directory of model.") + parser.add_argument( + "--lora_path", default=None, type=str, required=True, help="The directory of LoRA parameters. 
Default to None" + ) + parser.add_argument("--output_path", default=None, type=str, required=True, help="The directory of saved model ") + parser.add_argument("--safe_serialization", default="False", type=strtobool, help="Whether save as safetensor.") + parser.add_argument( + "--device", + type=str, + default="gpu", + choices=["gpu", "npu", "cpu"], + help="Device for selecting for merging lora weights, currently only supports gpu/npu/cpu.", + ) + return parser.parse_args() + + +def weight_process(name, quant_config, lora_config, state_dict, device): + target_device = device if device == "cpu" else device + ":0" + + if (name + ".weight") not in state_dict.keys(): + return + + if quant_config.weight_quantize_algo is None: + return + elif quant_config.weight_quantize_algo in ["nf4", "fp4"]: + weight = state_dict.pop(name + ".weight").to(target_device) + state_dict[name + ".weight"] = qlora_weight_quantize_dequantize( + weight, + quant_algo=quant_config.weight_quantize_algo, + double_quant=quant_config.weight_double_quant, + block_size=quant_config.weight_blocksize, + double_quant_block_size=quant_config.weight_double_quant_block_size, + ).cpu() + elif quant_config.weight_quantize_algo in ["weight_only_int8"]: + quant_weight = state_dict.pop(name + ".quant_weight").to(target_device) + quant_scale = state_dict.pop(name + ".quant_scale").to(target_device) + state_dict[name + ".weight"] = weight_dequantize(quant_weight, quant_scale, out_dtype=lora_config.dtype).cpu() + else: + raise ValueError(f"quant_config.weight_quantize_algo {quant_config.weight_quantize_algo} is not supported.") + + +def lora_process(name, lora_config, state_dict, device, lora_state_dict=None): + target_device = device if device == "cpu" else device + ":0" + + if (name + ".weight") not in state_dict.keys(): + return + + weight = state_dict.pop(name + ".weight") + if lora_state_dict is None: + lora_A = state_dict.pop(name + ".lora_A") + lora_B = state_dict.pop(name + ".lora_B") + else: + lora_A = 
lora_state_dict.pop(name + ".lora_A") + lora_B = lora_state_dict.pop(name + ".lora_B") + if device != "cpu": + weight = weight.to(target_device) + lora_A = lora_A.to(target_device) + lora_B = lora_B.to(target_device) + if not lora_config.rslora: + scaling = lora_config.lora_alpha / lora_config.r + else: + scaling = lora_config.lora_alpha / math.sqrt(lora_config.r) + + if device == "cpu" and weight.dtype.name == "BF16": + weight = weight.astype("float32") + lora_A = lora_A.astype("float32") + lora_B = lora_B.astype("float32") + out = (weight + lora_A @ lora_B * scaling).astype("bfloat16") + else: + out = (weight + lora_A @ lora_B * scaling).cpu() + + state_dict[name + ".weight"] = out + + +def merge_old_lora(lora_config, args): + lora_config.merge_weight = True + model = AutoModelForCausalLM.from_pretrained( + args.model_name_or_path, + dtype=lora_config.dtype, + ) + model = LoRAModel.from_pretrained(model, args.lora_path) + model.eval() + model_state_dict = model.model.state_dict() + for key in list(model_state_dict): + if "lora" in key: + del model_state_dict[key] + return model, model_state_dict + + +def read_file(file_name): + if file_name.endswith("safetensors"): + try: + from paddlenlp.utils.safetensors import fast_load_file as load_file + except: + from safetensors.numpy import load_file + + read_tensors = load_file(file_name) + for key in list(read_tensors.keys()): + if isinstance(read_tensors[key], np.ndarray): + with device_guard("cpu"): + read_tensors[key] = paddle.Tensor(read_tensors.pop(key), zero_copy=True) + else: + with device_guard("cpu"): + read_tensors = paddle.load(file_name) + return read_tensors + + +def save_file(output_path, file_name, tensors, safe_serialization=True): + if safe_serialization: + from safetensors.numpy import save_file as _save_file + + if file_name == "model_state.pdparams": + file_name = "model.safetensors" + + for key in list(tensors.keys()): + if isinstance(tensors[key], paddle.Tensor): + tensors[key] = 
tensors.pop(key).cpu().numpy() + _save_file(tensors, os.path.join(output_path, file_name), metadata={"format": "np"}) + else: + paddle.save(tensors, os.path.join(output_path, file_name)) + + +def merge(): + args = parse_arguments() + paddle.set_device(args.device) + + lora_config = LoRAConfig.from_pretrained(args.lora_path) + if os.path.isfile(os.path.join(args.lora_path, CONFIG_NAME)): + config = AutoConfig.from_pretrained(args.lora_path) + elif args.model_name_or_path is not None: + config = AutoConfig.from_pretrained(args.model_name_or_path) + else: + raise ValueError( + f"We can not find config.json in lora_path: {args.lora_path} or find a valid model_name_or_path." + ) + config.dtype = lora_config.dtype + quant_config = copy.deepcopy(config.quantization_config) + lora_config.merge_weights = False + tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path) + tokenizer.save_pretrained(args.output_path) + + if lora_config.enable_lora_list is not None: + model, model_state_dict = merge_old_lora(lora_config, args) + else: + if quant_config.weight_quantize_algo in ["nf4", "fp4"]: + config.quantization_config = QuantizationConfig() + with device_guard(args.device): + model = AutoModelForCausalLM.from_pretrained( + args.model_name_or_path, + config=config, + low_cpu_mem_usage=True, + ) + logger.info("load model done") + model = LoRAModel.from_pretrained(model=model, lora_path=args.lora_path, lora_config=lora_config) + logger.info("load lora model done") + if quant_config.weight_quantize_algo in ["weight_only_int8"]: + model.config.quantization_config = QuantizationConfig() + model.eval() + model_state_dict = model.model.state_dict() + if quant_config.weight_quantize_algo in ["nf4", "fp4", "weight_only_int8"]: + for name, layer in model.model.named_sublayers(): + if isinstance(layer, paddle.nn.Linear) or isinstance(layer, QuantizationLinear): + weight_process(name, quant_config, lora_config, model_state_dict, args.device) + + lora_name_list = [] + for key in 
model_state_dict.keys(): + if "lora_A" in key: + lora_name_list.append(key[:-7]) + for name in lora_name_list: + lora_process(name, lora_config, model_state_dict, args.device) + + logger.info("Begin to save merged model") + if args.safe_serialization: + model.model.save_pretrained( + args.output_path, state_dict=model_state_dict, safe_serialization=args.safe_serialization + ) + else: + model.model.save_pretrained(args.output_path, state_dict=model_state_dict, max_shard_size="100GB") + + +if __name__ == "__main__": + merge() diff --git a/llm/merge_tp_and_pp_params.py b/llm/tools/merge_tp_and_pp_params.py similarity index 100% rename from llm/merge_tp_and_pp_params.py rename to llm/tools/merge_tp_and_pp_params.py diff --git a/llm/argument.py b/llm/utils/argument.py similarity index 89% rename from llm/argument.py rename to llm/utils/argument.py index 79ce3fe4df16..67ad7c5dbe2a 100644 --- a/llm/argument.py +++ b/llm/utils/argument.py @@ -16,10 +16,21 @@ from paddlenlp.trainer import TrainingArguments from paddlenlp.trainer.trainer_utils import IntervalStrategy +from paddlenlp.transformers.configuration_utils import llmmetaclass from paddlenlp.utils.log import logger +def add_start_docstrings(*docstr): + def docstring_decorator(fn): + fn.__doc__ = "".join(docstr) + (fn.__doc__ if fn.__doc__ is not None else "") + return fn + + return docstring_decorator + + @dataclass +@llmmetaclass +@add_start_docstrings(TrainingArguments.__doc__) class TrainingArguments(TrainingArguments): benchmark: bool = field(default=False, metadata={"help": "Whether runs benchmark"}) # NOTE(gongenlei): new add autotuner_benchmark @@ -27,6 +38,22 @@ class TrainingArguments(TrainingArguments): default=False, metadata={"help": "Weather to run benchmark by autotuner. 
True for from_scratch and pad_max_length."}, ) + decay_steps: int = field( + default=0, + metadata={"help": "The steps use to control the learing rate."}, + ) + tensor_parallel_output: Optional[bool] = field( + default=False, + metadata={"help": "whether to output logits in distributed status"}, + ) + unified_checkpoint: bool = field( + default=False, + metadata={"help": "Unify hybrid parallel checkpoint."}, + ) + unified_checkpoint_config: Optional[str] = field( + default="", + metadata={"help": "Configs to unify hybrid parallel checkpoint.\n"}, + ) def __post_init__(self): super().__post_init__() @@ -42,6 +69,16 @@ def __post_init__(self): self.report_to = [] self.save_strategy = IntervalStrategy.NO self.evaluation_strategy = IntervalStrategy.NO + if self.benchmark: + self.do_train = True + self.do_export = False + self.do_predict = False + self.do_eval = False + self.overwrite_output_dir = True + self.load_best_model_at_end = False + self.report_to = [] + self.save_strategy = IntervalStrategy.NO + self.evaluation_strategy = IntervalStrategy.NO @dataclass diff --git a/llm/data.py b/llm/utils/data.py similarity index 99% rename from llm/data.py rename to llm/utils/data.py index 8000bd598455..eabac7456cbe 100644 --- a/llm/data.py +++ b/llm/utils/data.py @@ -53,6 +53,7 @@ def get_convert_example(model): "gemma", "qwen2", "qwen2_moe", + "gpt", ]: return convert_example_common else: diff --git a/llm/llama/fused_layers.py b/llm/utils/fused_layers.py similarity index 100% rename from llm/llama/fused_layers.py rename to llm/utils/fused_layers.py diff --git a/llm/quant.py b/llm/utils/quant.py similarity index 100% rename from llm/quant.py rename to llm/utils/quant.py diff --git a/llm/llama/register_reshard.py b/llm/utils/register_reshard.py similarity index 100% rename from llm/llama/register_reshard.py rename to llm/utils/register_reshard.py diff --git a/llm/utils.py b/llm/utils/utils.py similarity index 99% rename from llm/utils.py rename to llm/utils/utils.py index 
10c27a0b0594..2f51711b496b 100644 --- a/llm/utils.py +++ b/llm/utils/utils.py @@ -125,6 +125,16 @@ def get_lora_target_modules(model): ".*dense_h_to_4h.*", ".*dense_4h_to_h.*", ] + elif model.base_model_prefix == "gpt": + target_modules = [ + ".*qkv_proj.*", + ".*q_proj.*", + ".*k_proj.*", + ".*v_proj.*", + ".*linear1.*", + ".*linear2.*", + ".*out_proj.*", + ] elif model.base_model_prefix == "bloom": target_modules = [".*query_key_value.*", ".*dense.*", ".*dense_h_to_4h.*", ".*dense_4h_to_h.*"] elif model.base_model_prefix == "llama" or isinstance(model, LlamaForCausalLMPipe): diff --git a/scripts/ci_approval/run_ci_approval.sh b/scripts/ci_approval/run_ci_approval.sh index bc55cc58d5df..328834caba99 100644 --- a/scripts/ci_approval/run_ci_approval.sh +++ b/scripts/ci_approval/run_ci_approval.sh @@ -40,7 +40,7 @@ for file_name in `git diff --numstat upstream/${AGILE_COMPILE_BRANCH} |awk '{pri dir3=${arr_file_name[2]} dir4=${arr_file_name[3]} echo "file_name:"${file_name}, "dir1:"${dir1}, "dir2:"${dir2},"dir3:"${dir3},".xx:" ${file_name##*.} - if [[ ${file_name} =~ "paddlenlp/trainer/training_args.py" ]] || [[ ${file_name} =~ "paddlenlp/trainer/trainer.py" ]] || [[ ${file_name} =~ "llm/run_pretrain.py" ]] || [[ ${file_name} =~ "llm/finetune_generation.py" ]];then + if [[ ${file_name} =~ "paddlenlp/trainer/training_args.py" ]] || [[ ${file_name} =~ "paddlenlp/trainer/trainer.py" ]] || [[ ${file_name} =~ "llm/run_pretrain.py" ]] || [[ ${file_name} =~ "llm/run_finetune.py" ]];then echo_line="You must have two RD: one from(ZHUI, wawltor),one from(ForFishes,sneaxiy,zhiqiu) approval for the changes of training_args.py/trainer.py/run_pretrain.py " check_approval 2 ZHUI wawltor ForFishes sneaxiy zhiqiu elif [[ ${dir1} =~ "paddlenlp" ]];then diff --git a/scripts/distribute/ci_case_auto.sh b/scripts/distribute/ci_case_auto.sh index 167a86fc468e..89cffb71a578 100755 --- a/scripts/distribute/ci_case_auto.sh +++ b/scripts/distribute/ci_case_auto.sh @@ -22,9 +22,9 @@ export 
root_path=/workspace/PaddleNLP export gpt_case_path=$root_path/legacy/model_zoo/gpt-3 export gpt_data_path=/fleetx_data -export llama_case_path=$root_path/llm/llama/auto_parallel +export llama_case_path=$root_path/llm/auto_parallel/llama export llama_data_path=/llama_data -export llm_gpt_case_path=$root_path/llm/gpt-3/auto_parallel +export llm_gpt_case_path=$root_path/llm/auto_parallel/gpt-3 unset CUDA_VISIBLE_DEVICES diff --git a/tests/llm/test_finetune.py b/tests/llm/test_finetune.py index 016720ce8789..672f7e07e023 100644 --- a/tests/llm/test_finetune.py +++ b/tests/llm/test_finetune.py @@ -46,7 +46,7 @@ def test_finetune(self): finetune_config["output_dir"] = self.output_dir with argv_context_guard(finetune_config): - from finetune_generation import main + from run_finetune import main main() diff --git a/tests/llm/test_finetune_prefix_tuning.py b/tests/llm/test_finetune_prefix_tuning.py index 4f70a10d1c0e..b4324066e822 100644 --- a/tests/llm/test_finetune_prefix_tuning.py +++ b/tests/llm/test_finetune_prefix_tuning.py @@ -61,7 +61,7 @@ def test_prefix_tuning(self): prefix_tuning_config["dataset_name_or_path"] = self.data_dir prefix_tuning_config["output_dir"] = self.output_dir with argv_context_guard(prefix_tuning_config): - from finetune_generation import main + from run_finetune import main main() diff --git a/tests/llm/test_gradio.py b/tests/llm/test_gradio.py index 88661c6fbc74..731c5f9bf6d3 100644 --- a/tests/llm/test_gradio.py +++ b/tests/llm/test_gradio.py @@ -46,7 +46,7 @@ def setUp(self): self.model_path = "__internal_testing__/micro-random-llama" command = ( "cd ./llm && PYTHONPATH=../:$PYTHONPATH" - + ' {python} flask_server.py --model_name_or_path {model_path} --port {port} --flask_port {flask_port} --src_length 1024 --dtype "float16"'.format( + + ' {python} predict/flask_server.py --model_name_or_path {model_path} --port {port} --flask_port {flask_port} --src_length 1024 --dtype "float16"'.format( flask_port=self.flask_port, port=self.port, 
model_path=self.model_path, python=sys.executable ) ) diff --git a/tests/llm/test_long_sequence_strategies.py b/tests/llm/test_long_sequence_strategies.py index 687889c54178..169c329d274b 100644 --- a/tests/llm/test_long_sequence_strategies.py +++ b/tests/llm/test_long_sequence_strategies.py @@ -5079,6 +5079,6 @@ def test_dynamic_to_static_inference(self): config["model_name_or_path"] = save_path with argv_context_guard(config): - from export_model import main + from predict.export_model import main main() diff --git a/tests/llm/test_lora.py b/tests/llm/test_lora.py index bed84c39d96b..2e222e495688 100644 --- a/tests/llm/test_lora.py +++ b/tests/llm/test_lora.py @@ -61,17 +61,18 @@ def test_lora(self): lora_config["use_quick_lora"] = True with argv_context_guard(lora_config): - from finetune_generation import main + from run_finetune import main main() # merge weights merge_lora_weights_config = { "lora_path": lora_config["output_dir"], - "merge_lora_model_path": lora_config["output_dir"], + "model_name_or_path": lora_config["model_name_or_path"], + "output_path": lora_config["output_dir"], } with argv_context_guard(merge_lora_weights_config): - from merge_lora_params import merge + from tools.merge_lora_params import merge merge() @@ -90,17 +91,18 @@ def test_rslora_plus(self): lora_config["dataset_name_or_path"] = self.data_dir with argv_context_guard(lora_config): - from finetune_generation import main + from run_finetune import main main() # merge weights merge_lora_weights_config = { "lora_path": lora_config["output_dir"], - "merge_lora_model_path": lora_config["output_dir"], + "model_name_or_path": lora_config["model_name_or_path"], + "output_path": lora_config["output_dir"], } with argv_context_guard(merge_lora_weights_config): - from merge_lora_params import merge + from tools.merge_lora_params import merge merge() @@ -169,7 +171,7 @@ def test_rslora_plus(self): # lora_config["output_dir"] = self.output_dir # with argv_context_guard(lora_config): -# from 
finetune_generation import main +# from run_finetune import main # main() @@ -180,7 +182,7 @@ def test_rslora_plus(self): # "merge_model_path": lora_config["output_dir"], # } # with argv_context_guard(merge_lora_weights_config): -# from merge_lora_params import merge +# from tools.merge_lora_params import merge # merge() diff --git a/tests/llm/test_predictor.py b/tests/llm/test_predictor.py index c16d723375c1..0044f2ece476 100644 --- a/tests/llm/test_predictor.py +++ b/tests/llm/test_predictor.py @@ -185,7 +185,7 @@ def load_test_config(self): return config def test_create_predictor_with_unexpected_length(self): - from predictor import predict + from predict.predictor import predict config = self.load_test_config() config.pop("src_length", None) @@ -430,6 +430,6 @@ def test_export(self): config["model_type"] = "qwen-img2txt" with argv_context_guard(config): - from export_model import main + from predict.export_model import main main() diff --git a/tests/llm/test_ptq.py b/tests/llm/test_ptq.py index 2f41cead554d..43512dd7c4e2 100644 --- a/tests/llm/test_ptq.py +++ b/tests/llm/test_ptq.py @@ -46,7 +46,7 @@ def test_ptq(self): finetune_config["output_dir"] = self.output_dir with argv_context_guard(finetune_config): - from finetune_generation import main + from run_finetune import main main() @@ -59,7 +59,7 @@ def test_blha(self): finetune_config["output_dir"] = self.output_dir with argv_context_guard(finetune_config): - from finetune_generation import main + from run_finetune import main main() @@ -73,7 +73,7 @@ def test_ptq_smooth(self): finetune_config["smooth"] = True with argv_context_guard(finetune_config): - from finetune_generation import main + from run_finetune import main main() @@ -88,7 +88,7 @@ def test_ptq_shift(self): finetune_config["shift"] = True with argv_context_guard(finetune_config): - from finetune_generation import main + from run_finetune import main main() diff --git a/tests/llm/testing_utils.py b/tests/llm/testing_utils.py index 
583e5479549f..3684ec243576 100644 --- a/tests/llm/testing_utils.py +++ b/tests/llm/testing_utils.py @@ -68,7 +68,7 @@ def run_predictor(self, config_params=None): predict_config.update(config_params) with argv_context_guard(predict_config): - from predictor import predict + from predict.predictor import predict predict() @@ -83,7 +83,7 @@ def run_predictor(self, config_params=None): config["model_name_or_path"] = self.output_dir config.update(config_params) with argv_context_guard(config): - from export_model import main + from predict.export_model import main main() @@ -96,7 +96,7 @@ def run_predictor(self, config_params=None): config_params.pop("model_name_or_path", None) config.update(config_params) with argv_context_guard(config): - from predictor import predict + from predict.predictor import predict predict() diff --git a/tests/test_tipc/auto_tuner/llama_finetune/benchmark_common/run_benchmark.sh b/tests/test_tipc/auto_tuner/llama_finetune/benchmark_common/run_benchmark.sh index 785adab372df..c215c15351f1 100644 --- a/tests/test_tipc/auto_tuner/llama_finetune/benchmark_common/run_benchmark.sh +++ b/tests/test_tipc/auto_tuner/llama_finetune/benchmark_common/run_benchmark.sh @@ -81,20 +81,20 @@ function _train(){ case ${device_num} in N1C1) echo "Run with: device_num=${device_num} run_mode=${run_mode}" train_cmd="python -m paddle.distributed.launch --gpus=0 \ - --auto_tuner_json ${autoconfig_json_file} finetune_generation.py ${modle_json_file}" + --auto_tuner_json ${autoconfig_json_file} run_finetune.py ${modle_json_file}" ;; N1C8) echo "Run with: device_num=${device_num} run_mode=${run_mode}" train_cmd="python -m paddle.distributed.launch --gpus=0,1,2,3,4,5,6,7 \ - --auto_tuner_json ${autoconfig_json_file} finetune_generation.py ${modle_json_file}" + --auto_tuner_json ${autoconfig_json_file} run_finetune.py ${modle_json_file}" ;; N2C16) echo "Run with: device_num=${device_num} run_mode=${run_mode}" train_cmd="python -m paddle.distributed.launch 
--gpus=0,1,2,3,4,5,6,7 \ --auto_tuner_json ${autoconfig_json_file} --master etcd://$master_ip:2379 --nnodes $nnodes:$nnodes \ - finetune_generation.py ${modle_json_file}" + run_finetune.py ${modle_json_file}" ;; *) echo "Run with: device_num=${device_num}, run_mode=${run_mode}" train_cmd="python -m paddle.distributed.launch --gpus=0,1,2,3,4,5,6,7 \ - --auto_tuner_json ${autoconfig_json_file} finetune_generation.py ${modle_json_file}" + --auto_tuner_json ${autoconfig_json_file} run_finetune.py ${modle_json_file}" ;; esac cd ../llm/ diff --git a/tests/test_tipc/configs/llama/train_infer_python.txt b/tests/test_tipc/configs/llama/train_infer_python.txt index fd7488fa7c16..8242d0f0f744 100644 --- a/tests/test_tipc/configs/llama/train_infer_python.txt +++ b/tests/test_tipc/configs/llama/train_infer_python.txt @@ -13,7 +13,7 @@ null:null null:null ## trainer:norm_train -norm_train:../llm/llama/benchmark.py --model_name_or_path facebook/llama-7b-2l --do_train --max_steps 500 --recompute False --overwrite_output_dir --output_dir ./checkpoints/ --fp16_opt_level O2 --learning_rate 3e-5 --warmup_steps 0 --seed 23 --logging_steps 1 --dataloader_num_workers 1 +norm_train:../legacy/examples/benchmark/llm/llama_single_gpu/benchmark.py --model_name_or_path facebook/llama-7b-2l --do_train --max_steps 500 --recompute False --overwrite_output_dir --output_dir ./checkpoints/ --fp16_opt_level O2 --learning_rate 3e-5 --warmup_steps 0 --seed 23 --logging_steps 1 --dataloader_num_workers 1 pact_train:null fpgm_train:null distill_train:null diff --git a/tests/test_tipc/dygraph/ft/benchmark_common/run_benchmark.sh b/tests/test_tipc/dygraph/ft/benchmark_common/run_benchmark.sh index 6853ce963ad2..6c6413c08b86 100644 --- a/tests/test_tipc/dygraph/ft/benchmark_common/run_benchmark.sh +++ b/tests/test_tipc/dygraph/ft/benchmark_common/run_benchmark.sh @@ -118,11 +118,11 @@ function _train(){ cd ../llm/ echo "run run_mode: ${run_mode} device_num: ${device_num}" if [ "N1C1" = ${device_num} ]; then 
- train_cmd="python -u finetune_generation.py ${train_cmd}" + train_cmd="python -u run_finetune.py ${train_cmd}" else rm -rf ./mylog # 注意执行前删掉log目录 train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --gpus=$CUDA_VISIBLE_DEVICES \ - finetune_generation.py ${train_cmd}" + run_finetune.py ${train_cmd}" fi echo "train_cmd: ${train_cmd} log_file: ${log_file}" diff --git a/tests/trainer/test_lora_unified_checkpoint.py b/tests/trainer/test_lora_unified_checkpoint.py index 3a5533f65c1f..e04e8db907ad 100644 --- a/tests/trainer/test_lora_unified_checkpoint.py +++ b/tests/trainer/test_lora_unified_checkpoint.py @@ -119,7 +119,7 @@ def setUp(self): self.need_allclose = True self.rtol = 1e-7 - self.run_lora_file = "llm/finetune_generation.py" + self.run_lora_file = "llm/run_finetune.py" self.num_nodes = 1 def runfirst(self, train_args): @@ -169,7 +169,7 @@ def setUp(self): self.need_allclose = True self.rtol = 1e-7 - self.run_lora_file = "llm/finetune_generation.py" + self.run_lora_file = "llm/run_finetune.py" def runfirst(self, train_args): self.run_n1c8(self.run_lora_file, **train_args) diff --git a/tests/trainer/test_unified_checkpoint.py b/tests/trainer/test_unified_checkpoint.py index 1230dda9a2bc..5044eeaad5f5 100644 --- a/tests/trainer/test_unified_checkpoint.py +++ b/tests/trainer/test_unified_checkpoint.py @@ -194,7 +194,7 @@ def setUp(self): self.need_allclose = True self.rtol = 1e-7 - self.run_pretrain_file = "llm/llama/run_pretrain.py" + self.run_pretrain_file = "llm/run_pretrain.py" def runfirst(self, train_args): self.run_n1c8(self.run_pretrain_file, **train_args) diff --git a/tests/transformers/test_chat_template.py b/tests/transformers/test_chat_template.py index 216286df1678..4e443b54a2e2 100644 --- a/tests/transformers/test_chat_template.py +++ b/tests/transformers/test_chat_template.py @@ -218,7 +218,7 @@ def test_must_have_system(self): system = tokenizer.chat_template.render_system() system_ids = tokenizer.encode(system, 
add_special_tokens=False)["input_ids"] - from data import tokenize_rounds_example + from utils.data import tokenize_rounds_example fake_data_args = self.DataArg(len(system_ids) + 5, src_length=len(system_ids) + 5) @@ -244,7 +244,7 @@ def test_at_least_one_turn(self): all_sentence_ids = tokenizer(all_sentence, add_special_tokens=False)["input_ids"] # get the max_length of conversation - from data import tokenize_rounds_example + from utils.data import tokenize_rounds_example fake_data_args = self.DataArg(1024) example = {"src": ["你好", "今天吃啥"], "tgt": ["您好,我是个人人工智能助手", "你可以选择不同的菜系"]} @@ -342,7 +342,7 @@ def test_jinja_syntax_error(self): self.tokenizer.init_chat_template(error_jinja) def test_train_format(self): - from data import tokenize_rounds_example + from utils.data import tokenize_rounds_example fake_data_args = self.DataArg(50, src_length=50) example = {"src": ["你好"], "tgt": ["您好,我是个人人工智能助手"]} @@ -360,7 +360,7 @@ def test_train_format(self): self.assertNotEqual(tgt_id[tgt_idx], -100) def test_train_format_multi(self): - from data import tokenize_rounds_example + from utils.data import tokenize_rounds_example fake_data_args = self.DataArg(50, src_length=50) example = {"src": ["用户Round 1", "用户Round 2"], "tgt": ["回答Round 1", "回答Round 2"]} From 65e721e7887ec5f9d46b8a84d464972500033763 Mon Sep 17 00:00:00 2001 From: Difer <707065510@qq.com> Date: Fri, 21 Jun 2024 11:13:11 +0800 Subject: [PATCH 3/3] [LLM] Add sequence_parallel support for qwen (#8558) * add sequence_parallel for qwen * add sequence_parallel in qwen pp --- paddlenlp/transformers/qwen/configuration.py | 1 - paddlenlp/transformers/qwen/modeling.py | 128 +++++++++++++++---- paddlenlp/transformers/qwen/modeling_pp.py | 9 ++ 3 files changed, 110 insertions(+), 28 deletions(-) diff --git a/paddlenlp/transformers/qwen/configuration.py b/paddlenlp/transformers/qwen/configuration.py index 836c99027a6c..1841622ea225 100644 --- a/paddlenlp/transformers/qwen/configuration.py +++ 
b/paddlenlp/transformers/qwen/configuration.py @@ -70,7 +70,6 @@ def __init__( self.use_dynamic_ntk = use_dynamic_ntk self.use_logn_attn = use_logn_attn self.no_bias = no_bias - self.long_sequence_strategy_type = long_sequence_strategy_type self.long_sequence_strategy_name = long_sequence_strategy_name self.long_sequence_init_args = {} if long_sequence_init_args is None else long_sequence_init_args diff --git a/paddlenlp/transformers/qwen/modeling.py b/paddlenlp/transformers/qwen/modeling.py index 406e097651ee..91203a3b717c 100755 --- a/paddlenlp/transformers/qwen/modeling.py +++ b/paddlenlp/transformers/qwen/modeling.py @@ -49,6 +49,15 @@ def swiglu(x, y=None): from ..model_outputs import ModelOutput from .configuration import QWenConfig +try: + from paddle.distributed.fleet.utils.sequence_parallel_utils import ( + GatherOp, + ScatterOp, + mark_as_sequence_parallel_parameter, + ) +except: + pass + __all__ = [ "QWenBlock", "QWenForCausalLM", @@ -132,17 +141,26 @@ def __init__(self, config): assert self.projection_size % config.num_attention_heads == 0 self.hidden_size_per_attention_head = self.projection_size // config.num_attention_heads + self.sequence_parallel = config.sequence_parallel + + if config.sequence_parallel: + ColumnParallelLinear = linear_utils.ColumnSequenceParallelLinear + RowParallelLinear = linear_utils.RowSequenceParallelLinear + else: + ColumnParallelLinear = linear_utils.ColumnParallelLinear + RowParallelLinear = linear_utils.RowParallelLinear + if config.tensor_parallel_degree > 1: if config.num_attention_heads % config.tensor_parallel_degree != 0: raise ValueError("num_attention_heads has to be divisible by tensor_parallel_degree") self.num_heads = config.num_attention_heads // config.tensor_parallel_degree - self.c_attn = mpu.ColumnParallelLinear( + self.c_attn = ColumnParallelLinear( config.hidden_size, 3 * self.projection_size, has_bias=True, gather_output=False, ) - self.c_proj = mpu.RowParallelLinear( + self.c_proj = RowParallelLinear( 
config.hidden_size, self.projection_size, has_bias=not config.no_bias, @@ -150,7 +168,11 @@ def __init__(self, config): ) else: self.c_attn = nn.Linear(config.hidden_size, 3 * self.projection_size, bias_attr=True) - self.c_proj = nn.Linear(config.hidden_size, self.projection_size, bias_attr=not config.no_bias) + self.c_proj = nn.Linear( + config.hidden_size, + self.projection_size, + bias_attr=not config.no_bias, + ) if config.rotary_pct == 1.0: self.rotary_ndims = None @@ -205,6 +227,11 @@ def _attn(self, query, key, value, attention_mask=None): is_causal=attention_mask is None, ) attn_weights = None + + if self.sequence_parallel: + attn_output = attn_output.reshape([bsz * q_len, head_dim * num_heads]) + else: + attn_output = attn_output.reshape([bsz, q_len, head_dim * num_heads]) return attn_output, attn_weights else: # [bz, sql, nh, hid] ==> [bz, nh, sql hdim] @@ -230,6 +257,11 @@ def _attn(self, query, key, value, attention_mask=None): attn_weights = self.attn_dropout(attn_weights) attn_output = paddle.matmul(attn_weights, value) attn_output = attn_output.transpose([0, 2, 1, 3]) + + if self.sequence_parallel: + attn_output = attn_output.reshape([bsz * q_len, head_dim * num_heads]) + else: + attn_output = attn_output.reshape([bsz, q_len, head_dim * num_heads]) return attn_output, attn_weights def _split_heads(self, tensor, num_heads, attn_head_size): @@ -237,12 +269,6 @@ def _split_heads(self, tensor, num_heads, attn_head_size): tensor = tensor.reshape(new_shape) return tensor - def _merge_heads(self, tensor, num_heads, attn_head_size): - new_shape = tensor.shape[:-2] + [ - num_heads * attn_head_size, - ] - return tensor.reshape(new_shape) - def forward( self, hidden_states, @@ -256,14 +282,18 @@ def forward( ): # [bz, sql, hid] ==> [bz, sql, 3*hid] mixed_x_layer = self.c_attn(hidden_states) - # [bz, sql, 3*hid] ==> [bz, sql, hid] - query, key, value = paddle.split(mixed_x_layer, num_or_sections=3, axis=-1) + + if self.sequence_parallel: + target_shape = [-1, 
self.seq_length, self.num_heads * 3 * self.head_dim] + mixed_x_layer = paddle.reshape_(mixed_x_layer, target_shape) # [bz, sql, hid] ==> [bz, sql, nh, hdim] + query, key, value = paddle.split(mixed_x_layer, num_or_sections=3, axis=-1) query = self._split_heads(query, self.num_heads, self.head_dim) key = self._split_heads(key, self.num_heads, self.head_dim) value = self._split_heads(value, self.num_heads, self.head_dim) - kv_seq_len = hidden_states.shape[1] + + kv_seq_len = key.shape[-3] if layer_past: # layer past[0] shape: bs * seq_len * head_num * dim kv_seq_len += layer_past[0].shape[1] @@ -322,17 +352,22 @@ def forward( has_gradient = not (query.stop_gradient and key.stop_gradient and value.stop_gradient) if self.enable_recompute and self.training and has_gradient and self.recompute_granularity == "core_attn": attn_output, attn_weight = recompute( - self._attn, query, key, value, attention_mask, use_reentrant=self.config.recompute_use_reentrant + self._attn, + query, + key, + value, + attention_mask, + use_reentrant=self.config.recompute_use_reentrant, ) else: attn_output, attn_weight = self._attn(query, key, value, attention_mask) - context_layer = self._merge_heads(attn_output, self.num_heads, self.head_dim) - attn_output = self.c_proj(context_layer) + # if sequence_parallel is true, out shape are [q_len / n, bs, num_head * head_dim] + # else their shape are [bs, q_len, num_head * head_dim], n is mp parallelism. 
+ attn_output = self.c_proj(attn_output) outputs = (attn_output, present) if output_attentions: outputs += (attn_weight,) - return outputs @@ -401,6 +436,7 @@ def forward(self, hidden_states): class QWenBlock(nn.Layer): def __init__(self, config): super().__init__() + self.sequence_parallel = config.sequence_parallel self.ln_1 = QWenRMSNorm(config) self.attn = QWenAttention(config) self.ln_2 = QWenRMSNorm(config) @@ -417,6 +453,8 @@ def forward( use_cache=False, output_attentions=False, ): + # [bs * seq_len, embed_dim] -> [seq_len * bs / n, embed_dim] (sequence_parallel) + residual = hidden_states layernorm_output = self.ln_1(hidden_states) attn_outputs = self.attn( @@ -431,7 +469,6 @@ def forward( outputs = attn_outputs[1:] - residual = hidden_states layernorm_input = attn_output + residual layernorm_output = self.ln_2(layernorm_input) @@ -448,7 +485,6 @@ def forward( # remove empty tuple for pipeline parallel if type(outputs) is tuple and len(outputs) == 1: outputs = outputs[0] - return outputs @@ -476,8 +512,6 @@ def get_tensor_parallel_split_mappings(num_hidden_layers): base_actions = { # Column Linear "lm_head.weight": partial(fn, is_column=True), - "qwen.h.0.mlp.w2.weight": partial(fn, is_column=True), - "qwen.h.0.mlp.w1.weight": partial(fn, is_column=True), "qwen.h.0.attn.c_attn.weight": partial(fn, is_column=True, is_naive_3fuse=True), "qwen.h.0.attn.c_attn.bias": partial(fn, is_column=True, is_naive_3fuse=True), # Row Linear @@ -485,6 +519,15 @@ def get_tensor_parallel_split_mappings(num_hidden_layers): "qwen.h.0.mlp.c_proj.weight": partial(fn, is_column=False), "qwen.h.0.attn.c_proj.weight": partial(fn, is_column=False), } + + if config.fuse_attention_ffn: + base_actions["layers.0.mlp.gate_up_fused_proj.weight"] = partial( + fn, is_column=True, is_naive_2fuse=True + ) + else: + base_actions["qwen.h.0.mlp.w2.weight"] = partial(fn, is_column=True) + base_actions["qwen.h.0.mlp.w1.weight"] = partial(fn, is_column=True) + for key, action in 
base_actions.items(): if "h.0." in key: for i in range(num_hidden_layers): @@ -569,6 +612,8 @@ def _get_name_mappings(cls, config: QWenConfig) -> List[StateDictNameMapping]: def _init_weights(self, module): """Initialize the weights.""" + if self.config.tensor_parallel_degree > 1: + rng_tracker = get_rng_state_tracker().rng_state if isinstance( module, ( @@ -578,11 +623,24 @@ def _init_weights(self, module): mpu.RowParallelLinear, mpu.VocabParallelEmbedding, QWenLMHead, + linear_utils.ColumnSequenceParallelLinear, + linear_utils.RowSequenceParallelLinear, ), ): - module.weight.set_value( - paddle.tensor.normal(mean=0.0, std=self.config.initializer_range, shape=module.weight.shape) - ) + if isinstance(module.weight, paddle.Tensor): + if module.weight.is_distributed: + with rng_tracker(): + module.weight.set_value( + paddle.tensor.normal( + mean=0.0, + std=self.config.initializer_range, + shape=module.weight.shape, + ) + ) + else: + module.weight.set_value( + paddle.tensor.normal(mean=0.0, std=self.config.initializer_range, shape=module.weight.shape) + ) for name, p in module.named_parameters(): if name == "c_proj.weight": @@ -604,6 +662,7 @@ def __init__(self, config): self.embed_dim = config.hidden_size self.enable_recompute = False self.recompute_granularity = config.recompute_granularity + self.sequence_parallel = config.sequence_parallel if config.tensor_parallel_degree > 1: self.wte = mpu.VocabParallelEmbedding( @@ -705,6 +764,9 @@ def forward( output_hidden_states=None, return_dict=None, ): + if self.sequence_parallel and use_cache: + raise ValueError("We currently only support sequence parallel without cache.") + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -731,6 +793,14 @@ def forward( encoder_attention_mask = None if inputs_embeds is None: inputs_embeds = self.wte(input_ids) 
+ + if self.sequence_parallel: + # [bs, seq_len, num_head * head_dim] -> [bs * seq_len, num_head * head_dim] + bs, seq_len, hidden_size = inputs_embeds.shape + inputs_embeds = paddle.reshape_(inputs_embeds, [bs * seq_len, hidden_size]) + # [seq_len * bs / n, num_head * head_dim] (n is mp parallelism) + inputs_embeds = ScatterOp.apply(inputs_embeds) + hidden_states = inputs_embeds # bool 4D mask @@ -741,9 +811,6 @@ def forward( attention_mask = paddle.where(attention_mask, zero, neg_inf) hidden_states = self.drop(hidden_states) - output_shape = input_shape + [ - hidden_states.shape[-1], - ] if self.enable_recompute and self.training: if use_cache: @@ -794,7 +861,7 @@ def forward( all_self_attentions = all_self_attentions + (outputs[1],) hidden_states = self.ln_f(hidden_states) - hidden_states = hidden_states.reshape(output_shape) + # Add last hidden state if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) @@ -836,6 +903,11 @@ def __init__(self, config: QWenConfig): self.weight.split_axis = 1 def forward(self, hidden_states, tensor_parallel_output=None): + if self.config.sequence_parallel: + hidden_states = GatherOp.apply(hidden_states) + seq_length = self.config.seq_length + hidden_states = paddle.reshape_(hidden_states, [-1, seq_length, self.config.hidden_size]) + if tensor_parallel_output is None: tensor_parallel_output = self.config.tensor_parallel_output and self.config.tensor_parallel_degree > 1 @@ -1091,6 +1163,8 @@ def __init__(self, config): dtype=paddle.get_default_dtype(), default_initializer=nn.initializer.Constant(1.0), ) + if config.sequence_parallel: + mark_as_sequence_parallel_parameter(self.weight) def _norm(self, x): return x * paddle.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) diff --git a/paddlenlp/transformers/qwen/modeling_pp.py b/paddlenlp/transformers/qwen/modeling_pp.py index 238737ecda66..47357d6921e3 100644 --- a/paddlenlp/transformers/qwen/modeling_pp.py +++ b/paddlenlp/transformers/qwen/modeling_pp.py 
@@ -76,6 +76,7 @@ class QWenEmbeddingPipe(nn.Layer): def __init__(self, config): super(QWenEmbeddingPipe, self).__init__() self.hidden_size = config.hidden_size + self.sequence_parallel = config.sequence_parallel if config.tensor_parallel_degree > 1: self.wte = fleet.meta_parallel.VocabParallelEmbedding( config.vocab_size, @@ -96,6 +97,14 @@ def forward(self, args): """ input_ids, attention_mask, position_ids = parse_args(args) input_embeds = self.wte(input_ids) + if self.sequence_parallel: + from paddlenlp.transformers import ScatterOp + + # [bs, seq_len, num_head * head_dim] -> [bs * seq_len, num_head * head_dim] + bs, seq_len, hidden_size = input_embeds.shape + input_embeds = paddle.reshape_(input_embeds, [bs * seq_len, hidden_size]) + # [seq_len * bs / n, num_head * head_dim] (n is mp parallelism) + input_embeds = ScatterOp.apply(input_embeds) batch_size, seq_length = input_ids.shape if attention_mask is not None: