
Commit f0a2a69

cuichenx authored and XuesongYang committed
Fix unwrap model (NVIDIA#9480)
* fix unwrap model
* add O2 to ci test
* fix ci test
* fix ci test
* fix ci test

---------

Signed-off-by: Chen Cui <[email protected]>
1 parent f1f1d97 commit f0a2a69

File tree

2 files changed (+38, -19 lines)

.github/workflows/cicd-main.yml (+31, -12)
@@ -3060,13 +3060,13 @@ jobs:
       AFTER_SCRIPT: |
         rm -rf /home/TestData/nlp/megatron_ir/working_dir

-  L2_Megatron_GPT_PEFT_Lora_PP2:
+  L2_Megatron_GPT_PEFT_Lora_PP2_O2:
     needs: [cicd-test-container-setup]
     uses: ./.github/workflows/_test_template.yml
     with:
       RUNNER: self-hosted-azure
       SCRIPT: |
-        rm -rf examples/nlp/language_modeling/gpt_peft_lora_results_pp2
+        rm -rf /home/TestData/nlp/lora_tuning_pp2

         python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \
             trainer.devices=2 \
@@ -3075,11 +3075,12 @@ jobs:
             trainer.max_steps=3 \
             trainer.val_check_interval=3 \
             ++trainer.limit_val_batches=2 \
-            trainer.precision=16 \
-            exp_manager.exp_dir=examples/nlp/language_modeling/gpt_peft_lora_results_pp2 \
+            trainer.precision=bf16 \
+            exp_manager.exp_dir=/home/TestData/nlp/lora_tuning_pp2 \
             model.pipeline_model_parallel_size=2 \
             model.tensor_model_parallel_size=1 \
-            model.restore_from_path=/home/TestData/nlp/megatron_gpt/PP2/gpt_pp2_tp1.nemo \
+            model.restore_from_path=/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo \
+            model.megatron_amp_O2=True \
             model.peft.peft_scheme=lora \
             model.answer_only_loss=True \
             model.micro_batch_size=1 \
@@ -3090,10 +3091,28 @@ jobs:
             model.data.validation_ds.num_workers=0 \
             model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \
             model.data.validation_ds.names=[quarel]
+
+        python examples/nlp/language_modeling/tuning/megatron_gpt_generate.py \
+            model.restore_from_path=/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo \
+            model.peft.restore_from_path=/home/TestData/nlp/lora_tuning_pp2/megatron_gpt_peft_lora_tuning/checkpoints/megatron_gpt_peft_lora_tuning.nemo \
+            model.pipeline_model_parallel_size=2 \
+            model.tensor_model_parallel_size=1 \
+            trainer.devices=2 \
+            model.megatron_amp_O2=True \
+            model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel_4.jsonl] \
+            model.data.test_ds.names=['quarel4'] \
+            model.global_batch_size=2 \
+            model.micro_batch_size=1 \
+            model.data.test_ds.tokens_to_generate=10 \
+            model.data.test_ds.write_predictions_to_file=True \
+            model.data.test_ds.output_file_path_prefix='/home/TestData/nlp/lora_tuning_pp2/out' \
+            inference.greedy=True \
+            inference.repetition_penalty=1.0 \
+            inference.outfile_path='/home/TestData/nlp/lora_tuning_pp2/out.jsonl'
       AFTER_SCRIPT: |
-        rm -rf examples/nlp/language_modeling/gpt_peft_lora_results_pp2
+        rm -rf /home/TestData/nlp/lora_tuning_pp2

-  L2_Megatron_GPT_PEFT_Lora_TP2:
+  L2_Megatron_GPT_PEFT_Lora_TP2_O1:
     needs: [cicd-test-container-setup]
     uses: ./.github/workflows/_test_template.yml
     with:
@@ -3108,11 +3127,11 @@ jobs:
             trainer.max_steps=3 \
             trainer.val_check_interval=3 \
             ++trainer.limit_val_batches=2 \
-            trainer.precision=16 \
+            trainer.precision=bf16 \
             exp_manager.exp_dir=/home/TestData/nlp/lora_tuning_tp2 \
             model.pipeline_model_parallel_size=1 \
             model.tensor_model_parallel_size=2 \
-            model.restore_from_path=/home/TestData/nlp/megatron_gpt/TP2/megatron_gpt_tp2.nemo \
+            model.restore_from_path=/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo \
             model.peft.peft_scheme='lora' \
             model.answer_only_loss=True \
             model.micro_batch_size=1 \
@@ -3125,7 +3144,7 @@ jobs:
             model.data.validation_ds.names=[quarel]

         python examples/nlp/language_modeling/tuning/megatron_gpt_generate.py \
-            model.restore_from_path=/home/TestData/nlp/megatron_gpt/TP2/megatron_gpt_tp2.nemo \
+            model.restore_from_path=/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo \
             model.peft.restore_from_path=/home/TestData/nlp/lora_tuning_tp2/megatron_gpt_peft_lora_tuning/checkpoints/megatron_gpt_peft_lora_tuning.nemo \
             model.tensor_model_parallel_size=2 \
             trainer.devices=2 \
@@ -4234,8 +4253,8 @@ jobs:
       - L2_Megatron_GPT_Finetuning_PP2
       - L2_Megatron_GPT_Finetuning_StarCoder_PP1
       - L2_Megatron_GPT_Embedding
-      - L2_Megatron_GPT_PEFT_Lora_PP2
-      - L2_Megatron_GPT_PEFT_Lora_TP2
+      - L2_Megatron_GPT_PEFT_Lora_PP2_O2
+      - L2_Megatron_GPT_PEFT_Lora_TP2_O1
       - L2_Megatron_GPT_Eval
       - L2_Megatron_GPT_Eval_PP2
       - L2_Megatron_GPT_SFT_Eval_inference_seq_len_greaterThan_training_seq_len
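
Note on the renamed jobs: the PP2 variant now runs with model.megatron_amp_O2=True while the TP2 variant keeps the default O1 path, so both module layouts are exercised in CI. Under the O2 path the model sits inside an extra half-precision wrapper module, which pushes every parameter one naming level deeper. The sketch below is plain PyTorch with a hypothetical Wrapper class standing in for that wrapper; it only illustrates the naming effect and is not NeMo code.

import torch.nn as nn

# Hypothetical stand-in for an O2-style wrapper: the wrapped model lives under the
# attribute "module", so every parameter name gains a "module." level.
class Wrapper(nn.Module):
    def __init__(self, module: nn.Module):
        super().__init__()
        self.module = module

base = nn.Linear(4, 4)
print([n for n, _ in base.named_parameters()])           # ['weight', 'bias']
print([n for n, _ in Wrapper(base).named_parameters()])  # ['module.weight', 'module.bias']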

nemo/collections/nlp/parts/mixins/nlp_adapter_mixins.py (+7, -7)
@@ -109,11 +109,11 @@ def _get_all_keys(
         """
         Returns all the keys in the model
         """
-        k = [n for n, p in self._unwrap_model().named_parameters()]
+        k = [n for n, p in self._unwrap_model().named_parameters(prefix="model")]
         b = [
             n
-            for n, p in self._unwrap_model().named_buffers()
-            if n.replace("model.module.", "model.", 1) in self._unwrap_model().state_dict().keys()
+            for n, p in self._unwrap_model().named_buffers(prefix="model")
+            if n.replace("model.module.", "model.", 1) in self._unwrap_model().state_dict(prefix="model.").keys()
         ]
         # we include buffers because ptuning representations are cached in a buffer and saved to state_dict for inference time use.
         return set(k + b)
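
For context on the hunk above: torch.nn.Module.named_parameters and named_buffers join their prefix argument to each key with a dot, while state_dict prepends its prefix verbatim, which is why the code passes "model" to the former and "model." to the latter. A small stand-alone sketch of that behaviour (plain PyTorch, not NeMo):

import torch.nn as nn

m = nn.Linear(2, 2)
print([n for n, _ in m.named_parameters(prefix="model")])  # ['model.weight', 'model.bias']
print(list(m.state_dict(prefix="model.").keys()))          # ['model.weight', 'model.bias']
# Under an O2-style wrapper the same calls yield 'model.module.*' keys, which the
# replace("model.module.", "model.", 1) above maps back to the 'model.*' convention.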
@@ -292,13 +292,13 @@ def setup_optimizer_param_groups(self):
         self.freeze(training=True)  # Freeze the entire model
         if not self.ptuning_only_and_non_first_stage:
             opt_params = []
-            for _, module in self._unwrap_model().named_modules():
+            for _, module in self._unwrap_model().named_modules(prefix="model"):
                 if isinstance(module, AdapterModuleMixin) and module.is_adapter_available():
                     module.set_enabled_adapters(enabled=True)
                     module.unfreeze_enabled_adapters()  # selectively unfreeze the adapter modules.
                     opt_params += [p for p in module.parameters() if p.requires_grad]

-            for name, param in self._unwrap_model().named_parameters():
+            for name, param in self._unwrap_model().named_parameters(prefix="model"):
                 if name in self.tunable_base_param_keys:
                     param.requires_grad = True
                     opt_params += [param]
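
The prefix added in this hunk changes only the reported names: named_modules(prefix="model") visits the same modules and named_parameters(prefix="model") yields the same tensors, but the names now line up with the keys in tunable_base_param_keys, which _get_all_keys collects under the same prefix. A minimal, self-contained illustration with a hypothetical key (plain PyTorch, not NeMo):

import torch.nn as nn

model = nn.Sequential(nn.Linear(2, 2))
tunable_base_param_keys = {"model.0.weight"}  # hypothetical key, collected with prefix="model"

# The lookup only matches when both sides use the same naming convention.
for name, param in model.named_parameters(prefix="model"):
    if name in tunable_base_param_keys:
        param.requires_grad = True
        print("unfrozen:", name)  # prints "unfrozen: model.0.weight"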
@@ -397,11 +397,11 @@ def get_peft_state_dict(self):
         """
         Gets the keys associated with the adapters only.
         """
-        state_dict = self._unwrap_model().state_dict()
+        state_dict = self._unwrap_model().state_dict(prefix="model.")
         peft_state_dict = {}
         for k in self.adapter_keys.union(self.tunable_base_param_keys):
             # state_dict keys needs to be in non-O2 format and will be corrected in PEFTSaveRestoreConnector if O2=True
-            new_k = k.replace("module.", "", 1)
+            new_k = k.replace("model.module.", "model.", 1)
             peft_state_dict[new_k] = state_dict[new_k]
         return peft_state_dict
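
For reference, the tightened replacement rewrites a leading "model.module." back to "model.", matching the normalization used in _get_all_keys, and leaves a key that is already in non-O2 form untouched. A tiny sketch on hypothetical key names:

# Hypothetical adapter keys, one in O2 (wrapped) form and one already in non-O2 form.
o2_key = "model.module.decoder.layers.0.adapter.linear_in.weight"
o1_key = "model.decoder.layers.0.adapter.linear_in.weight"

for k in (o2_key, o1_key):
    print(k.replace("model.module.", "model.", 1))
# Both lines print the same normalized key:
# model.decoder.layers.0.adapter.linear_in.weight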
