Auto sft #9728

Open

wants to merge 91 commits into base: develop

Commits (91)
85d6511
add single_model network and use intermediate api
blacksheep-Aristotle Nov 12, 2024
e87135d
[AutoParallel]: fix llama_model_network run error
blacksheep-Aristotle Nov 15, 2024
8bb66c9
New version of auto config
FeixLiu Nov 19, 2024
0f0ad13
fix gpt_network to use intermediate_api
blacksheep-Aristotle Dec 3, 2024
802827c
fix gpt_network to use intermediate_api
blacksheep-Aristotle Dec 5, 2024
8427405
update api
FeixLiu Dec 5, 2024
f150efa
update plan
FeixLiu Dec 9, 2024
8e9f00d
qwen fit base api
FeixLiu Dec 9, 2024
0bed451
[AutoParallel]:gpt single network support tp to share_embedding
blacksheep-Aristotle Dec 12, 2024
f18b49d
add intermediate ci
blacksheep-Aristotle Dec 18, 2024
18dc01d
add single_model network and use intermediate api
blacksheep-Aristotle Nov 12, 2024
917d995
New version of auto config
FeixLiu Nov 19, 2024
573ccd1
fix sharding
FeixLiu Nov 27, 2024
9fab6f3
fix gpt_network to use intermediate_api
blacksheep-Aristotle Dec 3, 2024
1a3c7a0
fix gpt_network to use intermediate_api
blacksheep-Aristotle Dec 5, 2024
64ced7c
update gpt run_pretrain_py
blacksheep-Aristotle Dec 19, 2024
05b4845
fix sharding error
blacksheep-Aristotle Dec 19, 2024
dff5db5
fix gpt format error
blacksheep-Aristotle Dec 19, 2024
78d94e5
[AutoParallel]:fix llama vpp ci error
blacksheep-Aristotle Dec 19, 2024
57eb989
[AutoParallel]:fix ipp error
blacksheep-Aristotle Dec 24, 2024
a7dfee3
[AutoParallel]:fix a100 ci error
blacksheep-Aristotle Dec 24, 2024
64cdab4
[AutoParallel]:fix a100 ci error
blacksheep-Aristotle Dec 24, 2024
3ec357b
[AutoParallel]:add explanatory note
blacksheep-Aristotle Dec 27, 2024
c22e514
Delete =1.0.0
blacksheep-Aristotle Dec 27, 2024
f71d655
[AutoParallel]:add run_fintune scripts
blacksheep-Aristotle Dec 23, 2024
c015b35
[AutoParallel]:auto parallel support lora model
blacksheep-Aristotle Dec 27, 2024
af1af5c
update auto_lora_model
blacksheep-Aristotle Jan 2, 2025
aa841d4
update auto_lora_model
blacksheep-Aristotle Jan 2, 2025
8f8b0a4
[AutoParallel]:nlp support run lora model with intermediate
blacksheep-Aristotle Jan 2, 2025
ebde991
[AutoParallel]:update format
blacksheep-Aristotle Jan 6, 2025
3386a3e
[AutoParallel]:support input attentionmask
blacksheep-Aristotle Jan 7, 2025
3a73ad8
[AutoParallel]:shard dataloader support multi inputs
blacksheep-Aristotle Jan 7, 2025
1fd7d7e
[AutoParallel]:auto_sft rebase develop
blacksheep-Aristotle Jan 8, 2025
907f287
[AutoParallel]:auto_sft rebase develop
blacksheep-Aristotle Jan 8, 2025
fe9e56e
[AutoParallel]:fix lora parallel mode
blacksheep-Aristotle Jan 9, 2025
14a0fc4
add single_model network and use intermediate api
blacksheep-Aristotle Nov 12, 2024
80a676b
[AutoParallel]: fix llama_model_network run error
blacksheep-Aristotle Nov 15, 2024
576c5f1
New version of auto config
FeixLiu Nov 19, 2024
6dc3345
fix gpt_network to use intermediate_api
blacksheep-Aristotle Dec 3, 2024
caa8a50
fix gpt_network to use intermediate_api
blacksheep-Aristotle Dec 5, 2024
c3b3ee6
update api
FeixLiu Dec 5, 2024
2d38858
update plan
FeixLiu Dec 9, 2024
f869aa0
qwen fit base api
FeixLiu Dec 9, 2024
4c79de3
[AutoParallel]:gpt single network support tp to share_embedding
blacksheep-Aristotle Dec 12, 2024
2b8992b
add intermediate ci
blacksheep-Aristotle Dec 18, 2024
098f454
add single_model network and use intermediate api
blacksheep-Aristotle Nov 12, 2024
3ec33ec
New version of auto config
FeixLiu Nov 19, 2024
c867b31
fix sharding
FeixLiu Nov 27, 2024
e314a49
fix gpt_network to use intermediate_api
blacksheep-Aristotle Dec 3, 2024
4017c25
fix gpt_network to use intermediate_api
blacksheep-Aristotle Dec 5, 2024
63a423d
update gpt run_pretrain_py
blacksheep-Aristotle Dec 19, 2024
882fe73
fix sharding error
blacksheep-Aristotle Dec 19, 2024
255c1f1
fix gpt format error
blacksheep-Aristotle Dec 19, 2024
fcb0593
[AutoParallel]:fix llama vpp ci error
blacksheep-Aristotle Dec 19, 2024
906c892
[AutoParallel]:fix ipp error
blacksheep-Aristotle Dec 24, 2024
763c00f
[AutoParallel]:fix a100 ci error
blacksheep-Aristotle Dec 24, 2024
1e9fc7a
[AutoParallel]:fix a100 ci error
blacksheep-Aristotle Dec 24, 2024
7f5b872
[AutoParallel]:add explanatory note
blacksheep-Aristotle Dec 27, 2024
ba7a3e7
Delete =1.0.0
blacksheep-Aristotle Dec 27, 2024
142d07f
[AutoParallel]:add run_fintune scripts
blacksheep-Aristotle Dec 23, 2024
587a936
[AutoParallel]:auto parallel support lora model
blacksheep-Aristotle Dec 27, 2024
b108cc0
update auto_lora_model
blacksheep-Aristotle Jan 2, 2025
f535993
update auto_lora_model
blacksheep-Aristotle Jan 2, 2025
24ed043
[AutoParallel]:nlp support run lora model with intermediate
blacksheep-Aristotle Jan 2, 2025
2e540c6
[AutoParallel]:update format
blacksheep-Aristotle Jan 6, 2025
981c246
[AutoParallel]:support input attentionmask
blacksheep-Aristotle Jan 7, 2025
bf1c607
[AutoParallel]:shard dataloader support multi inputs
blacksheep-Aristotle Jan 7, 2025
c259ff3
[AutoParallel]:auto_sft rebase develop
blacksheep-Aristotle Jan 8, 2025
8a0823e
[AutoParallel]:auto_sft rebase develop
blacksheep-Aristotle Jan 8, 2025
b8977d4
[AutoParallel]:fix lora parallel mode
blacksheep-Aristotle Jan 9, 2025
1d0585b
[AutoParallel]: fix bug about file dep circular and lora config error
liufengwei0103 Feb 21, 2025
059ba83
[AutoParallel]: recover default base api config in launch script
liufengwei0103 Feb 21, 2025
b3d0fa5
[AutoParallel]: add to do
liufengwei0103 Feb 21, 2025
d917f46
[AutoParallel]: add to do
liufengwei0103 Feb 21, 2025
95f7925
Merge branch 'auto_sft' into auto_sft
liufengwei0103 Feb 21, 2025
211a493
rebase and fix lora config and file dep bug (#28)
liufengwei0103 Feb 21, 2025
e07595a
[AutoParallel]: add lora ci and fix bug
liufengwei0103 Feb 22, 2025
965a491
[AutoParallel]: add lora ci and fix bug (#29)
liufengwei0103 Feb 22, 2025
6a669f4
[AutoParallel]: fix lint error
liufengwei0103 Feb 22, 2025
57dc1f1
[AutoParallel]: fix lint error
liufengwei0103 Feb 24, 2025
5761457
[AutoParallel]: fix lint error
liufengwei0103 Feb 24, 2025
79c954e
[AutoParallel]: fix ci error about print model
liufengwei0103 Feb 24, 2025
92c0861
[AutoParallel]: fix typo
liufengwei0103 Feb 24, 2025
cf7bac0
[AutoParallel]: fix
liufengwei0103 Feb 24, 2025
be2cb7b
[AutoParallel]: fix llama_dygraph_auto_bs8_fp16_DP2-MP2-PP2_intermedi…
liufengwei0103 Feb 24, 2025
54a44a1
[AutoParallel]: delete typo
liufengwei0103 Feb 24, 2025
c77619b
[AutoParallel]: delete comment
liufengwei0103 Feb 24, 2025
4413a72
[AutoParallel]: fix ci error not on a100
liufengwei0103 Feb 24, 2025
f0352ad
[AutoParallel]: delete comment
liufengwei0103 Feb 25, 2025
641bcc7
[AutoParallel]: delete useless code
liufengwei0103 Feb 25, 2025
7309602
[AutoParallel]: ddelete launch script
liufengwei0103 Feb 25, 2025
94 changes: 94 additions & 0 deletions llm/auto_parallel/llama/llama_finetune_with_api.sh
@@ -0,0 +1,94 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# for debugging only

set -x

unset PADDLE_ELASTIC_JOB_ID
unset PADDLE_TRAINER_ENDPOINTS
unset DISTRIBUTED_TRAINER_ENDPOINTS
unset FLAGS_START_PORT
unset PADDLE_ELASTIC_TIMEOUT

export NNODES=1
export PADDLE_TRAINERS_NUM=1

export GLOG_v=0

export FLAGS_cudnn_deterministic=0
export FLAGS_embedding_deterministic=0
# export FLAGS_max_inplace_grad_add=65536
export FLAGS_enable_auto_parallel_align_mode=0

task_name="llama_3.1_sft_auto"
rm -rf output/$task_name/
rm -rf "log/$task_name""_log"

export SOT_LOG_LEVEL=4
# export PYTHONPATH=../:$PYTHONPATH
export PYTHONPATH=../../../:$PYTHONPATH
#ulimit -c unlimited
to_static=true

python -u -m paddle.distributed.launch \
--gpus "0,1,2,3,4,5,6,7" \
--log_dir "log/$task_name""_log" \
../run_finetune_auto.py \
--model_name_or_path "meta-llama/Meta-Llama-3.1-8B-Instruct" \
--dataset_name_or_path "../../finetune_data/data" \
--output_dir "output/$task_name/" \
--enable_auto_parallel true \
--lora false \
--use_mora false \
--model_type "llama" \
--use_intermediate_api false \
--to_static $to_static \
--per_device_train_batch_size 2 \
--gradient_accumulation_steps 2 \
--per_device_eval_batch_size 8 \
--eval_accumulation_steps 16 \
--num_train_epochs 1 \
--learning_rate 3e-05 \
--max_steps 10 \
--warmup_steps 30 \
--logging_steps 1 \
--evaluation_strategy "epoch" \
--save_strategy "epoch" \
--src_length 1024 \
--max_length 2048 \
--bf16 true \
--fp16_opt_level "O2" \
--amp_master_grad true \
--do_train true \
--do_eval false \
--disable_tqdm true \
--load_best_model_at_end true \
--eval_with_do_generation false \
--metric_for_best_model "accuracy" \
--recompute false \
--save_total_limit 1 \
--tensor_parallel_degree 2 \
--pipeline_parallel_degree 2 \
--zero_padding false \
--unified_checkpoint false \
--flash_mask false \
--use_flash_attention true \
--fuse_attention_qkv true \
--sharding "stage1" \
--auto_parallel_resume_form_hybrid_parallel true \
--num_hidden_layers 2 \
# --sharding_parallel_config "enable_stage1_tensor_fusion enable_stage1_overlap" \
# --tensor_parallel_config "enable_mp_async_allreduce" \
# --pipeline_parallel_config "enable_send_recv_overlap" \
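
For reference, a minimal, hedged usage sketch (not part of this PR's diff): the LoRA plus intermediate-API path that these commits add could presumably be exercised with the same launch script by flipping the --lora and --use_intermediate_api flags shown above. This assumes the PaddleNLP repository root as the starting directory, GNU sed, and that no other argument needs to change; it is illustrative, not an official recipe from the PR.

# Hypothetical sketch: enable the LoRA + intermediate-API path in the script
# above by flipping the two corresponding flags in place, then re-run it from
# the directory the script's relative paths expect.
cd llm/auto_parallel/llama
sed -i 's/--lora false/--lora true/; s/--use_intermediate_api false/--use_intermediate_api true/' \
    llama_finetune_with_api.sh
bash llama_finetune_with_api.sh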