#!/bin/bash
# run_meta_phi_uni_3.sh (forked from hiyouga/LLaMA-Factory)
TIME=$(date "+%m-%d-%H-%M")
DATASET=metamathQA
TEMPLATE=alpaca
# wandb
export WANDB_PROJECT=xukp20-$DATASET-e3
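# Optional first CLI argument: the wandb run name. If omitted, no --run_name flag is passed
# and wandb picks its own name for the run.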
RUN_NAME=$1
if [ -z "$RUN_NAME" ]; then
    RUN_NAME_PAR=""
else
    RUN_NAME_PAR="--run_name $RUN_NAME"
fi
# set HF_HOME env
# export HF_HOME=/lustre/cache/huggingface
OUTPUT_DIR=/cephfs/xukangping/root/models/$WANDB_PROJECT/$TEMPLATE-$DATASET-$TIME
# OUTPUT_DIR=~/models/llama-tuned/codellama-34b-$DATASET-$TIME
# mistral 7b
# MODEL_NAME_OR_PATH="/cephfs/shared/hf_cache/hub/models--mistralai--Mistral-7B-v0.1/snapshots/5e9c98b96d071dce59368012254c55b0ec6f8658"
# gemma 2b
# MODEL_NAME_OR_PATH="/cephfs/shared/hf_cache/hub/models--google--gemma-2b/snapshots/9d067f00def958594aaa16b39a65b07d69ca655b"
# phi-2
# MODEL_NAME_OR_PATH="/cephfs/shared/hf_cache/hub/models--microsoft--phi-2/snapshots/710686f446f02286c858c11f052acb87c306ddd2"
# phi-2 copied 48
# MODEL_NAME_OR_PATH=/cephfs/xukangping/root/models/phi-2-48
# llama-3
# MODEL_NAME_OR_PATH="/cephfs/shared/hf_cache/hub/models--meta-llama--Meta-Llama-3-8B/snapshots/b6887ce03ea47d068bf8502ba6ed27f8c5c12a6b"
# uni-phi
# MODEL_NAME_OR_PATH="/cephfs/xukangping/root/models/uni-phi-2"
# uni-phi v2
# MODEL_NAME_OR_PATH="/cephfs/xukangping/root/models/uni-phi-2-0512"
# MODEL_NAME_OR_PATH="/cephfs/xukangping/root/models/uni-phi-2-0512-tcr"
# uni-loop-phi
# MODEL_NAME_OR_PATH="/cephfs/xukangping/root/models/uni-coloop-phi-0514"
# uni-loop-phi cov
MODEL_NAME_OR_PATH=/cephfs/xukangping/root/models/uni-coloop-phi-cov-0516
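# Training hyperparameters.
# Effective global batch size = NUM_GPUS * BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS = 8 * 4 * 4 = 128.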
VAL_SIZE=0.005
NUM_GPUS=8
BATCH_SIZE=4
GRADIENT_ACCUMULATION_STEPS=4
# LR=5e-5
LR=1e-5 # for meta_math
EPOCHS=3
MAX_LEN=2048
SAVE_STEPS=3000
# for continuing the gate training
# VAL_SIZE=0.005
# NUM_GPUS=8
# BATCH_SIZE=4
# # LR=5e-5
# LR=2e-6 # for meta_math
# EPOCHS=4
# MAX_LEN=2048
# SAVE_STEPS=3000
# export WANDB_EVAL_CALLBACK=1
# accelerate launch src/train_bash.py \
# deepspeed --hostfile hostfile.txt src/train_bash.py \
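# Launch full-parameter SFT with the DeepSpeed launcher on $NUM_GPUS GPUs,
# using a ZeRO stage 1 config (per the ds_config_zero1.json file name).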
deepspeed --num_gpus=$NUM_GPUS --master_port=9901 src/train_bash.py \
--deepspeed "/cephfs/xukangping/code/LLaMA-Efficient-Tuning/ds_config_zero1.json" \
--stage sft \
--model_name_or_path $MODEL_NAME_OR_PATH \
--do_train True \
--overwrite_cache False \
--finetuning_type full \
--template $TEMPLATE \
--dataset_dir data \
--dataset $DATASET \
--cutoff_len $MAX_LEN \
--learning_rate $LR \
--num_train_epochs $EPOCHS \
--max_samples 10000000 \
--per_device_train_batch_size $BATCH_SIZE \
--per_device_eval_batch_size $BATCH_SIZE \
--gradient_accumulation_steps $GRADIENT_ACCUMULATION_STEPS \
--lr_scheduler_type cosine \
--max_grad_norm 1.0 \
--logging_steps 5 \
--save_steps $SAVE_STEPS \
--warmup_steps 0 \
--lora_rank 8 \
--output_dir $OUTPUT_DIR \
--fp16 \
--plot_loss True \
--val_size $VAL_SIZE \
--evaluation_strategy steps \
--eval_steps 100 \
--report_to wandb \
$RUN_NAME_PAR \
--preprocessing_num_workers 32 \
--flash_attn
# --load_best_model_at_end True
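# Example usage (the run name argument is arbitrary and optional):
#   bash run_meta_phi_uni_3.sh metamath-uni-coloop-phi-cov-run1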