From eff028977061ea58df26f7b4070c22a25687a72d Mon Sep 17 00:00:00 2001 From: Himangshu Lahkar <49579433+hlahkar@users.noreply.github.com> Date: Mon, 25 Mar 2024 08:46:13 +0530 Subject: [PATCH 1/3] Update README for FSDP This corrects the command for FSDP --- examples/language-modeling/README.md | 33 ++++++++++++++-------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/examples/language-modeling/README.md b/examples/language-modeling/README.md index abf19c457b..55486f3734 100644 --- a/examples/language-modeling/README.md +++ b/examples/language-modeling/README.md @@ -562,41 +562,42 @@ python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_lora_clm.py \ - Multi-card finetuning of Llama2-70B with FSDP and LoRA: ```bash -PT_HPU_MAX_COMPOUND_OP_SIZE=10 DEEPSPEED_HPU_ZERO3_SYNC_MARK_STEP_REQUIRED=1 \ -python3 ../gaudi_spawn.py --use_mpi --world_size 8 run_lora_clm.py \ +LOWER_LIST=ops_bf16.txt PT_HPU_LAZY_MODE=0 \ +python3 ../gaudi_spawn.py \ + --world_size 8 --use_mpi run_lora_clm.py \ --model_name_or_path meta-llama/Llama-2-70b-hf \ --dataset_name tatsu-lab/alpaca \ --bf16 True \ - --output_dir ./lora_out \ - --num_train_epochs 2 \ + --output_dir /tmp/lora_fsdp_out \ --max_seq_len 2048 \ - --per_device_train_batch_size 10 \ - --per_device_eval_batch_size 10 \ --gradient_checkpointing \ - --evaluation_strategy epoch \ - --eval_delay 2 \ + --per_device_train_batch_size 5 \ --save_strategy no \ --learning_rate 0.0004 \ --warmup_ratio 0.03 \ --lr_scheduler_type "constant" \ --logging_steps 1 \ --dataset_concatenation \ - --attn_softmax_bf16 True \ --do_train \ - --do_eval \ --use_habana \ - --use_lazy_mode False \ - --pipelining_fwd_bwd False \ --throughput_warmup_steps 3 \ --lora_rank 4 \ --lora_target_modules "q_proj" "v_proj" "k_proj" "o_proj" \ + --attn_softmax_bf16 True \ --validation_split_percentage 4 \ - --use_flash_attention True \ + --use_lazy_mode False \ --fsdp_config fsdp_config.json \ - --fsdp "auto_wrap" \ - 
--torch_compile_backend hpu_backend \ + --fsdp auto_wrap \ + --num_train_epochs 2 \ + --evaluation_strategy epoch \ + --per_device_eval_batch_size 1 \ + --eval_delay 2 \ + --do_eval \ + --pipelining_fwd_bwd False \ + --use_fused_rope False \ + --torch_compile_backend hpu_backend \ --torch_compile \ - --use_fused_rope False + --gradient_accumulation_steps 2 ``` - Multi-card finetuning of Falcon-180B: From b641283a6450f55dbfd6fb9a0223b66fb66f924a Mon Sep 17 00:00:00 2001 From: Himangshu Lahkar <49579433+hlahkar@users.noreply.github.com> Date: Mon, 25 Mar 2024 08:52:47 +0530 Subject: [PATCH 2/3] Update README.md --- examples/language-modeling/README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/examples/language-modeling/README.md b/examples/language-modeling/README.md index 55486f3734..de06f137aa 100644 --- a/examples/language-modeling/README.md +++ b/examples/language-modeling/README.md @@ -563,8 +563,7 @@ python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_lora_clm.py \ ```bash LOWER_LIST=ops_bf16.txt PT_HPU_LAZY_MODE=0 \ -python3 ../gaudi_spawn.py \ - --world_size 8 --use_mpi run_lora_clm.py \ +python3 ../gaudi_spawn.py --world_size 8 --use_mpi run_lora_clm.py \ --model_name_or_path meta-llama/Llama-2-70b-hf \ --dataset_name tatsu-lab/alpaca \ --bf16 True \ From 198deef9c4d730c1022b41d6e0905c5f34cc95eb Mon Sep 17 00:00:00 2001 From: Himangshu Lahkar <49579433+hlahkar@users.noreply.github.com> Date: Mon, 25 Mar 2024 08:53:45 +0530 Subject: [PATCH 3/3] Update README.md --- examples/language-modeling/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/language-modeling/README.md b/examples/language-modeling/README.md index de06f137aa..776993aca1 100644 --- a/examples/language-modeling/README.md +++ b/examples/language-modeling/README.md @@ -567,7 +567,7 @@ python3 ../gaudi_spawn.py --world_size 8 --use_mpi run_lora_clm.py \ --model_name_or_path meta-llama/Llama-2-70b-hf \ --dataset_name 
tatsu-lab/alpaca \ --bf16 True \ - --output_dir /tmp/lora_fsdp_out \ + --output_dir ./lora_out \ --max_seq_len 2048 \ --gradient_checkpointing \ --per_device_train_batch_size 5 \