@@ -3060,13 +3060,13 @@ jobs:
      AFTER_SCRIPT: |
        rm -rf /home/TestData/nlp/megatron_ir/working_dir

-  L2_Megatron_GPT_PEFT_Lora_PP2:
+  L2_Megatron_GPT_PEFT_Lora_PP2_O2:
    needs: [cicd-test-container-setup]
    uses: ./.github/workflows/_test_template.yml
    with:
      RUNNER: self-hosted-azure
      SCRIPT: |
-        rm -rf examples/nlp/language_modeling/gpt_peft_lora_results_pp2
+        rm -rf /home/TestData/nlp/lora_tuning_pp2

        python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \
            trainer.devices=2 \
@@ -3075,11 +3075,12 @@ jobs:
            trainer.max_steps=3 \
            trainer.val_check_interval=3 \
            ++trainer.limit_val_batches=2 \
-            trainer.precision=16 \
-            exp_manager.exp_dir=examples/nlp/language_modeling/gpt_peft_lora_results_pp2 \
+            trainer.precision=bf16 \
+            exp_manager.exp_dir=/home/TestData/nlp/lora_tuning_pp2 \
            model.pipeline_model_parallel_size=2 \
            model.tensor_model_parallel_size=1 \
-            model.restore_from_path=/home/TestData/nlp/megatron_gpt/PP2/gpt_pp2_tp1.nemo \
+            model.restore_from_path=/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo \
+            model.megatron_amp_O2=True \
            model.peft.peft_scheme=lora \
            model.answer_only_loss=True \
            model.micro_batch_size=1 \
@@ -3090,10 +3091,28 @@ jobs:
            model.data.validation_ds.num_workers=0 \
            model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \
            model.data.validation_ds.names=[quarel]
+
+        python examples/nlp/language_modeling/tuning/megatron_gpt_generate.py \
+            model.restore_from_path=/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo \
+            model.peft.restore_from_path=/home/TestData/nlp/lora_tuning_pp2/megatron_gpt_peft_lora_tuning/checkpoints/megatron_gpt_peft_lora_tuning.nemo \
+            model.pipeline_model_parallel_size=2 \
+            model.tensor_model_parallel_size=1 \
+            trainer.devices=2 \
+            model.megatron_amp_O2=True \
+            model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel_4.jsonl] \
+            model.data.test_ds.names=['quarel4'] \
+            model.global_batch_size=2 \
+            model.micro_batch_size=1 \
+            model.data.test_ds.tokens_to_generate=10 \
+            model.data.test_ds.write_predictions_to_file=True \
+            model.data.test_ds.output_file_path_prefix='/home/TestData/nlp/lora_tuning_pp2/out' \
+            inference.greedy=True \
+            inference.repetition_penalty=1.0 \
+            inference.outfile_path='/home/TestData/nlp/lora_tuning_pp2/out.jsonl'
      AFTER_SCRIPT: |
-        rm -rf examples/nlp/language_modeling/gpt_peft_lora_results_pp2
+        rm -rf /home/TestData/nlp/lora_tuning_pp2

-  L2_Megatron_GPT_PEFT_Lora_TP2:
+  L2_Megatron_GPT_PEFT_Lora_TP2_O1:
    needs: [cicd-test-container-setup]
    uses: ./.github/workflows/_test_template.yml
    with:
@@ -3108,11 +3127,11 @@ jobs:
            trainer.max_steps=3 \
            trainer.val_check_interval=3 \
            ++trainer.limit_val_batches=2 \
-            trainer.precision=16 \
+            trainer.precision=bf16 \
            exp_manager.exp_dir=/home/TestData/nlp/lora_tuning_tp2 \
            model.pipeline_model_parallel_size=1 \
            model.tensor_model_parallel_size=2 \
-            model.restore_from_path=/home/TestData/nlp/megatron_gpt/TP2/megatron_gpt_tp2.nemo \
+            model.restore_from_path=/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo \
            model.peft.peft_scheme='lora' \
            model.answer_only_loss=True \
            model.micro_batch_size=1 \
@@ -3125,7 +3144,7 @@ jobs:
            model.data.validation_ds.names=[quarel]

        python examples/nlp/language_modeling/tuning/megatron_gpt_generate.py \
-            model.restore_from_path=/home/TestData/nlp/megatron_gpt/TP2/megatron_gpt_tp2.nemo \
+            model.restore_from_path=/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo \
            model.peft.restore_from_path=/home/TestData/nlp/lora_tuning_tp2/megatron_gpt_peft_lora_tuning/checkpoints/megatron_gpt_peft_lora_tuning.nemo \
            model.tensor_model_parallel_size=2 \
            trainer.devices=2 \
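
Both renamed LoRA jobs above delegate execution to the repository's reusable workflow via uses: ./.github/workflows/_test_template.yml, passing RUNNER, SCRIPT, and AFTER_SCRIPT through their with: blocks. The template itself is not part of this diff; the following is a minimal sketch, assuming a standard workflow_call interface with those three string inputs, of how such a template could consume them (step names and the AFTER_SCRIPT default are assumptions, not the actual file):

# Hypothetical sketch of _test_template.yml -- illustrates the workflow_call
# interface implied by the `with:` blocks above; not taken from this PR.
name: ~test template
on:
  workflow_call:
    inputs:
      RUNNER:
        type: string
        description: Label of the (self-hosted) runner that executes the test
      SCRIPT:
        type: string
        description: Shell commands that run the actual test
      AFTER_SCRIPT:
        type: string
        default: ":"
        description: Cleanup commands, run even if the test fails

jobs:
  test:
    runs-on: ${{ inputs.RUNNER }}
    steps:
      - name: Run test script
        run: ${{ inputs.SCRIPT }}
      - name: Cleanup
        if: always()
        run: ${{ inputs.AFTER_SCRIPT }}
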
@@ -4234,8 +4253,8 @@ jobs:
      - L2_Megatron_GPT_Finetuning_PP2
      - L2_Megatron_GPT_Finetuning_StarCoder_PP1
      - L2_Megatron_GPT_Embedding
-      - L2_Megatron_GPT_PEFT_Lora_PP2
-      - L2_Megatron_GPT_PEFT_Lora_TP2
+      - L2_Megatron_GPT_PEFT_Lora_PP2_O2
+      - L2_Megatron_GPT_PEFT_Lora_TP2_O1
      - L2_Megatron_GPT_Eval
      - L2_Megatron_GPT_Eval_PP2
      - L2_Megatron_GPT_SFT_Eval_inference_seq_len_greaterThan_training_seq_len
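
The final hunk keeps the workflow's aggregate dependency list consistent with the renamed jobs: GitHub Actions matches needs: entries against job ids verbatim, so leaving the old L2_Megatron_GPT_PEFT_Lora_PP2 / _TP2 names in the list after the rename would make the workflow file invalid. A minimal sketch of such a gating job, with the job id and the result check assumed for illustration rather than taken from this PR:

# Hypothetical aggregate gate job -- the id "CICD_Result" and the result check
# are assumptions; only the two renamed needs entries come from the hunk above.
  CICD_Result:
    if: always()
    needs:
      - L2_Megatron_GPT_PEFT_Lora_PP2_O2
      - L2_Megatron_GPT_PEFT_Lora_TP2_O1
      # ...plus every other required test job...
    runs-on: ubuntu-latest
    steps:
      - name: Fail if any required job failed or was cancelled
        if: contains(needs.*.result, 'failure') || contains(needs.*.result, 'cancelled')
        run: exit 1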