diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml
index c4350a42f59b..a0353a42fb5e 100644
--- a/.github/workflows/cicd-main.yml
+++ b/.github/workflows/cicd-main.yml
@@ -3690,6 +3690,75 @@ jobs:
         uses: actions/checkout@v2
       - run: |
           python examples/nlp/language_modeling/megatron_retro_pretraining.py \
+          trainer.num_nodes=1 \
+          trainer.devices=2 \
+          trainer.precision=bf16 \
+          trainer.accelerator=gpu \
+          model.data.data_prefix=['none'] \
+          exp_manager.exp_dir=examples/nlp/language_modeling/mcore_retro_results \
+          model.mcore_gpt=True \
+          model.tensor_model_parallel_size=1 \
+          model.pipeline_model_parallel_size=1 \
+          model.optim.name=distributed_fused_adam \
+          model.retro.retro_project_dir=/home/TestData/nlp/megatron_retro/mcore_retro/micro-wiki-core \
+          model.data.num_workers=4 \
+          model.micro_batch_size=1 \
+          model.data.shuffle_documents=False \
+          trainer.val_check_interval=30 \
+          +trainer.num_sanity_val_steps=0 \
+          model.init_method_std=0.023 \
+          model.optim.lr=6.0e-4 \
+          model.megatron_amp_O2=True \
+          model.data.splits_string=\'\"98,2,0\"\' \
+          model.data.dataloader_type=cyclic \
+          trainer.max_steps=10
+
+          python examples/nlp/language_modeling/megatron_retro_pretraining.py \
+          trainer.num_nodes=1 \
+          trainer.devices=2 \
+          trainer.precision=bf16 \
+          trainer.accelerator=gpu \
+          model.data.data_prefix=['none'] \
+          exp_manager.exp_dir=examples/nlp/language_modeling/mcore_retro_results \
+          model.mcore_gpt=True \
+          model.tensor_model_parallel_size=1 \
+          model.pipeline_model_parallel_size=1 \
+          model.optim.name=distributed_fused_adam \
+          model.retro.retro_project_dir=/home/TestData/nlp/megatron_retro/mcore_retro/micro-wiki-core \
+          model.data.num_workers=4 \
+          model.micro_batch_size=1 \
+          model.data.shuffle_documents=False \
+          trainer.val_check_interval=30 \
+          +trainer.num_sanity_val_steps=0 \
+          model.init_method_std=0.023 \
+          model.optim.lr=6.0e-4 \
+          model.megatron_amp_O2=True \
+          model.data.splits_string=\'\"98,2,0\"\' \
+          model.data.dataloader_type=cyclic \
+          trainer.max_steps=20
+
+          rm -rf examples/nlp/language_modeling/mcore_retro_results
+      - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
+        if: "failure()"
+
+  L2_Legacy_Megatron_RETRO_Pretraining_and_Resume_Training:
+    needs: [cicd-test-container-setup]
+    runs-on: self-hosted-azure
+    container:
+      image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
+      options:
+        # --user 0:128
+        --device=/dev/nvidia0
+        --gpus all
+        --shm-size=8g
+        --env TRANSFORMERS_OFFLINE=0
+        --env HYDRA_FULL_ERROR=1
+        --volume /mnt/datadrive/TestData:/home/TestData
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v2
+      - run: |
+          python examples/nlp/language_modeling/megatron_retro_pretraining_legacy.py \
           trainer.devices=2 \
           trainer.num_nodes=1 \
           trainer.accelerator=gpu \
@@ -3700,7 +3769,7 @@ jobs:
           trainer.precision=16 \
           trainer.gradient_clip_val=1.0 \
           trainer.val_check_interval=10 \
-          exp_manager.exp_dir=examples/nlp/language_modeling/retro_results \
+          exp_manager.exp_dir=examples/nlp/language_modeling/retro_legacy_results \
           model.data.data_prefix= \
           model.data.knn_index= \
           model.data.retrieval_prefix= \
@@ -3720,7 +3789,7 @@ jobs:
           model.dec_cross_attention=[1] \
           +model.data.mock=True

-          python examples/nlp/language_modeling/megatron_retro_pretraining.py \
+          python examples/nlp/language_modeling/megatron_retro_pretraining_legacy.py \
           trainer.devices=2 \
           trainer.num_nodes=1 \
           trainer.accelerator=gpu \
@@ -3731,7 +3800,7 @@ jobs:
           trainer.precision=16 \
           trainer.gradient_clip_val=1.0 \
           trainer.val_check_interval=10 \
-          exp_manager.exp_dir=examples/nlp/language_modeling/retro_results \
+          exp_manager.exp_dir=examples/nlp/language_modeling/retro_legacy_results \
           model.data.data_prefix= \
           model.data.knn_index= \
           model.data.retrieval_prefix= \
@@ -3751,7 +3820,7 @@ jobs:
           model.dec_cross_attention=[1] \
           +model.data.mock=True

-          rm -rf examples/nlp/language_modeling/retro_results
+          rm -rf examples/nlp/language_modeling/retro_legacy_results
       - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
         if: "failure()"
