Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/e2e_ppo_trainer_megatron_vllm.yml
Original file line number Diff line number Diff line change
Expand Up @@ -143,8 +143,8 @@ jobs:
- name: clean up and install Megatron-Bridge
run: |
rm -rf checkpoints
pip3 install git+https://github.com/NVIDIA-NeMo/Megatron-Bridge.git@af21db0 --no-deps --no-build-isolation
pip3 install git+https://github.com/NVIDIA/Megatron-LM.git@3cbe5c6 --no-deps --no-build-isolation
pip3 install git+https://github.com/NVIDIA-NeMo/Megatron-Bridge.git@a489bed --no-deps --no-build-isolation
pip3 install git+https://github.com/NVIDIA/Megatron-LM.git@2d398b4 --no-deps --no-build-isolation
pip3 install "nvidia-modelopt[torch]>=0.37.0" transformers==4.57.1
- name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron, use Megatron-Bridge LoRA e2e to pre-load and save (Deepseek)
run: |
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/e2e_ppo_trainer_megatron_vllm_2.yml
Original file line number Diff line number Diff line change
Expand Up @@ -122,8 +122,8 @@ jobs:
- name: Install the current repository
run: |
pip3 install --no-deps -e .[test]
pip3 install git+https://github.com/NVIDIA-NeMo/Megatron-Bridge.git@af21db0 --no-deps --no-build-isolation
pip3 install git+https://github.com/NVIDIA/Megatron-LM.git@3cbe5c6 --no-deps --no-build-isolation
pip3 install git+https://github.com/NVIDIA-NeMo/Megatron-Bridge.git@a489bed --no-deps --no-build-isolation
pip3 install git+https://github.com/NVIDIA/Megatron-LM.git@2d398b4 --no-deps --no-build-isolation
pip3 install "nvidia-modelopt[torch]>=0.37.0" transformers==4.57.1
- name: Prepare GSM8K dataset
run: |
Expand Down
20 changes: 18 additions & 2 deletions examples/grpo_trainer/run_qwen2-7b_math_megatron_lora.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
#!/usr/bin/env bash
set -xeuo pipefail

# Need to install Megatron-Bridge
# NOTE: Make sure you use a Megatron-Bridge version later than 0.2.0
# (Recommend https://github.com/NVIDIA-NeMo/Megatron-Bridge/commit/a489bed3a2410ed9b000ec13a3c90176fec7d99c or later)
# for proper MoE LoRA support.

# For Megatron communication/computation overlapping
export CUDA_DEVICE_MAX_CONNECTIONS=1

Expand Down Expand Up @@ -41,8 +46,16 @@ DATA=(

MODEL=(
actor_rollout_ref.model.path=Qwen/Qwen2-7B-Instruct
actor_rollout_ref.model.lora.rank=16
actor_rollout_ref.model.lora.alpha=32
actor_rollout_ref.model.lora.rank=256
actor_rollout_ref.model.lora.alpha=512
actor_rollout_ref.model.lora.lora_A_init_method=kaiming
# # Optional: Use canonical LoRA
# actor_rollout_ref.model.lora.type="canonical_lora"
# actor_rollout_ref.model.lora.target_modules='["linear_q","linear_k","linear_v","linear_proj","linear_fc1_up","linear_fc1_gate","linear_fc2"]'

# # Optional: Add dropout to LoRA layers
# actor_rollout_ref.model.lora.dropout=0.05
# actor_rollout_ref.model.lora.dropout_position=pre
)

ACTOR=(
Expand All @@ -58,6 +71,9 @@ ACTOR=(
actor_rollout_ref.actor.kl_loss_coef=0.001
actor_rollout_ref.actor.kl_loss_type=low_var_kl
actor_rollout_ref.actor.entropy_coeff=0
+actor_rollout_ref.actor.megatron.override_transformer_config.recompute_method=uniform
+actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=full
+actor_rollout_ref.actor.megatron.override_transformer_config.recompute_num_layers=1
)

ROLLOUT=(
Expand Down
20 changes: 15 additions & 5 deletions examples/grpo_trainer/run_qwen3moe-30b_megatron_lora.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,11 @@ set -xeuo pipefail

# Need to install Megatron-Bridge
# NOTE: Make sure you use a Megatron-Bridge version later than 0.2.0
# (after https://github.com/NVIDIA-NeMo/Megatron-Bridge/commit/36302b7ca1305f0690e17cf4e4019ac822746872)
# for MoE LoRA when you want to set ETP and ETP != TP.
# https://github.com/NVIDIA-NeMo/Megatron-Bridge/issues/1363
# (Recommend https://github.com/NVIDIA-NeMo/Megatron-Bridge/commit/a489bed3a2410ed9b000ec13a3c90176fec7d99c or later)
# for proper MoE LoRA support.

# For Megatron communication/computation overlapping
export CUDA_DEVICE_MAX_CONNECTIONS=1

########################### Quick Config ###########################

Expand Down Expand Up @@ -41,9 +43,17 @@ DATA=(

MODEL=(
actor_rollout_ref.model.path=Qwen/Qwen3-30B-A3B-Instruct-2507
actor_rollout_ref.model.lora.rank=16
actor_rollout_ref.model.lora.alpha=32
actor_rollout_ref.model.use_fused_kernels=True
actor_rollout_ref.model.lora.rank=32
actor_rollout_ref.model.lora.alpha=64
actor_rollout_ref.model.lora.lora_A_init_method=kaiming
# # Optional: Use canonical LoRA
# actor_rollout_ref.model.lora.type="canonical_lora"
# actor_rollout_ref.model.lora.target_modules='["linear_q","linear_k","linear_v","linear_proj","linear_fc1_up","linear_fc1_gate","linear_fc2"]'

# # Optional: Add dropout to LoRA layers
# actor_rollout_ref.model.lora.dropout=0.05
# actor_rollout_ref.model.lora.dropout_position=pre
)

ACTOR=(
Expand Down