diff --git a/.github/workflows/e2e_ppo_trainer_megatron_vllm.yml b/.github/workflows/e2e_ppo_trainer_megatron_vllm.yml index 35a50aae0ea..5dfaa4776b1 100644 --- a/.github/workflows/e2e_ppo_trainer_megatron_vllm.yml +++ b/.github/workflows/e2e_ppo_trainer_megatron_vllm.yml @@ -143,8 +143,8 @@ jobs: - name: clean up and install Megatron-Bridge run: | rm -rf checkpoints - pip3 install git+https://github.com/NVIDIA-NeMo/Megatron-Bridge.git@af21db0 --no-deps --no-build-isolation - pip3 install git+https://github.com/NVIDIA/Megatron-LM.git@3cbe5c6 --no-deps --no-build-isolation + pip3 install git+https://github.com/NVIDIA-NeMo/Megatron-Bridge.git@a489bed --no-deps --no-build-isolation + pip3 install git+https://github.com/NVIDIA/Megatron-LM.git@2d398b4 --no-deps --no-build-isolation pip3 install "nvidia-modelopt[torch]>=0.37.0" transformers==4.57.1 - name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron, use Megatron-Bridge LoRA e2e to pre-load and save (Deepseek) run: | diff --git a/.github/workflows/e2e_ppo_trainer_megatron_vllm_2.yml b/.github/workflows/e2e_ppo_trainer_megatron_vllm_2.yml index fb3e73ed02d..2d5e0821d48 100644 --- a/.github/workflows/e2e_ppo_trainer_megatron_vllm_2.yml +++ b/.github/workflows/e2e_ppo_trainer_megatron_vllm_2.yml @@ -122,8 +122,8 @@ jobs: - name: Install the current repository run: | pip3 install --no-deps -e .[test] - pip3 install git+https://github.com/NVIDIA-NeMo/Megatron-Bridge.git@af21db0 --no-deps --no-build-isolation - pip3 install git+https://github.com/NVIDIA/Megatron-LM.git@3cbe5c6 --no-deps --no-build-isolation + pip3 install git+https://github.com/NVIDIA-NeMo/Megatron-Bridge.git@a489bed --no-deps --no-build-isolation + pip3 install git+https://github.com/NVIDIA/Megatron-LM.git@2d398b4 --no-deps --no-build-isolation pip3 install "nvidia-modelopt[torch]>=0.37.0" transformers==4.57.1 - name: Prepare GSM8K dataset run: | diff --git a/examples/grpo_trainer/run_qwen2-7b_math_megatron_lora.sh 
b/examples/grpo_trainer/run_qwen2-7b_math_megatron_lora.sh index 5344cfd9aa6..890b719b349 100644 --- a/examples/grpo_trainer/run_qwen2-7b_math_megatron_lora.sh +++ b/examples/grpo_trainer/run_qwen2-7b_math_megatron_lora.sh @@ -1,6 +1,11 @@ #!/usr/bin/env bash set -xeuo pipefail +# Need to install Megatron-Bridge +# NOTE: Make sure you use Megatron-Bridge later than 0.2.0 +# (Recommended: https://github.com/NVIDIA-NeMo/Megatron-Bridge/commit/a489bed3a2410ed9b000ec13a3c90176fec7d99c or later) +# for proper MoE LoRA support. + # For Megatron communication/computation overlapping export CUDA_DEVICE_MAX_CONNECTIONS=1 @@ -41,8 +46,16 @@ DATA=( MODEL=( actor_rollout_ref.model.path=Qwen/Qwen2-7B-Instruct - actor_rollout_ref.model.lora.rank=16 - actor_rollout_ref.model.lora.alpha=32 + actor_rollout_ref.model.lora.rank=256 + actor_rollout_ref.model.lora.alpha=512 + actor_rollout_ref.model.lora.lora_A_init_method=kaiming + # # Optional: Use canonical LoRA + # actor_rollout_ref.model.lora.type="canonical_lora" + # actor_rollout_ref.model.lora.target_modules='["linear_q","linear_k","linear_v","linear_proj","linear_fc1_up","linear_fc1_gate","linear_fc2"]' + + # # Optional: Add dropout to LoRA layers + # actor_rollout_ref.model.lora.dropout=0.05 + # actor_rollout_ref.model.lora.dropout_position=pre ) ACTOR=( @@ -58,6 +71,9 @@ ACTOR=( actor_rollout_ref.actor.kl_loss_coef=0.001 actor_rollout_ref.actor.kl_loss_type=low_var_kl actor_rollout_ref.actor.entropy_coeff=0 + +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_method=uniform + +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=full + +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_num_layers=1 ) ROLLOUT=( diff --git a/examples/grpo_trainer/run_qwen3moe-30b_megatron_lora.sh b/examples/grpo_trainer/run_qwen3moe-30b_megatron_lora.sh index 77805cdfb3b..3a92171b6e2 100644 --- a/examples/grpo_trainer/run_qwen3moe-30b_megatron_lora.sh +++ 
b/examples/grpo_trainer/run_qwen3moe-30b_megatron_lora.sh @@ -3,9 +3,11 @@ set -xeuo pipefail # Need to install Megatron-Bridge # NOTE: Make sure you use Megatron-Bridge later than 0.2.0 -# (after https://github.com/NVIDIA-NeMo/Megatron-Bridge/commit/36302b7ca1305f0690e17cf4e4019ac822746872) -# for MoE LoRA When you want to set ETP and ETP!=TP. -# https://github.com/NVIDIA-NeMo/Megatron-Bridge/issues/1363 +# (Recommended: https://github.com/NVIDIA-NeMo/Megatron-Bridge/commit/a489bed3a2410ed9b000ec13a3c90176fec7d99c or later) +# for proper MoE LoRA support. + +# For Megatron communication/computation overlapping +export CUDA_DEVICE_MAX_CONNECTIONS=1 ########################### Quick Config ########################### @@ -41,9 +43,17 @@ DATA=( MODEL=( actor_rollout_ref.model.path=Qwen/Qwen3-30B-A3B-Instruct-2507 - actor_rollout_ref.model.lora.rank=16 - actor_rollout_ref.model.lora.alpha=32 actor_rollout_ref.model.use_fused_kernels=True + actor_rollout_ref.model.lora.rank=32 + actor_rollout_ref.model.lora.alpha=64 + actor_rollout_ref.model.lora.lora_A_init_method=kaiming + # # Optional: Use canonical LoRA + # actor_rollout_ref.model.lora.type="canonical_lora" + # actor_rollout_ref.model.lora.target_modules='["linear_q","linear_k","linear_v","linear_proj","linear_fc1_up","linear_fc1_gate","linear_fc2"]' + + # # Optional: Add dropout to LoRA layers + # actor_rollout_ref.model.lora.dropout=0.05 + # actor_rollout_ref.model.lora.dropout_position=pre ) ACTOR=(