zzong2006 · zzong2006 · May 23, 2025 · May 23, 2025
diff --git a/.github/workflows/e2e_ppo_trainer_megatron.yml b/.github/workflows/e2e_ppo_trainer_megatron.yml
@@ -65,7 +65,7 @@ jobs:
       - name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron (Qwen) with validation and saving
         run: |
           ray stop --force
-          VAL_BEFORE_TRAIN=True TEST_FREQ=1 SAVE_FREQ=1 bash tests/e2e/run_ppo_trainer_megatron.sh
+          ALL_OFFLOAD=True VAL_BEFORE_TRAIN=True TEST_FREQ=1 SAVE_FREQ=1 bash tests/e2e/run_ppo_trainer_megatron.sh
       - name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron (Qwen) after resuming
         run: |
           ray stop --force
@@ -107,7 +107,7 @@ jobs:
       - name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron (DeepSeek)
         run: |
           ray stop --force
-          SAVE_FREQ=1 MODEL_ID=deepseek-ai/deepseek-coder-1.3b-instruct bash tests/e2e/run_ppo_trainer_megatron.sh
+          ALL_OFFLOAD=True SAVE_FREQ=1 MODEL_ID=deepseek-ai/deepseek-coder-1.3b-instruct bash tests/e2e/run_ppo_trainer_megatron.sh
       - name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron (DeepSeek)
         run: |
           ray stop --force
@@ -149,7 +149,7 @@ jobs:
       - name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron (Qwen3) with validation and saving
         run: |
           ray stop --force
-          VAL_BEFORE_TRAIN=True TEST_FREQ=1 SAVE_FREQ=1 MODEL_ID=Qwen/Qwen3-0.6B bash tests/e2e/run_ppo_trainer_megatron.sh
+          ALL_OFFLOAD=True VAL_BEFORE_TRAIN=True TEST_FREQ=1 SAVE_FREQ=1 MODEL_ID=Qwen/Qwen3-0.6B bash tests/e2e/run_ppo_trainer_megatron.sh
       - name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron (Qwen3) after resuming
         run: |
           ray stop --force
@@ -305,34 +305,5 @@ jobs:
       - name: clean up
         run: |
           rm -rf checkpoints
-  e2e_ppo_trainer_megatron-offload-qwen3:
-    runs-on: [L20x8]
-    timeout-minutes: 30 # Increase this timeout value as needed
-    env:
-      HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
-      HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
-      NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
-      HF_ENDPOINT: "https://hf-mirror.com"
-      HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
-    container:
-      image: whatcanyousee/verl:ngc-cu124-vllm0.8.5-sglang0.4.6-mcore0.12.0-te2.3
-      options: --gpus all --shm-size=10g
-    steps:
-      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-        with:
-          fetch-depth: 0
-      - name: Install the current repository
-        run: |
-          pip3 install --no-deps -e .[test]
-      - name: Prepare GSM8K dataset
-        run: |
-          python3 examples/data_preprocess/gsm8k.py
-      - name: Running GRPO GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron and Offload
-        run: |
-          ray stop --force
-          COMMON_PARAM_OFFLOAD=True COMMON_GRAD_OFFLOAD=True COMMON_OPTIMIZER_OFFLOAD=True ADV_ESTIMATOR=grpo MODEL_ID=Qwen/Qwen3-0.6B bash tests/e2e/run_ppo_trainer_megatron.sh
-      - name: clean up
-        run: |
-          rm -rf checkpoints
 
 
diff --git a/tests/e2e/run_ppo_trainer_megatron.sh b/tests/e2e/run_ppo_trainer_megatron.sh
@@ -55,9 +55,10 @@ RM_VPP=${RM_VPP:-$COMMON_VPP}
 RM_CP=${RM_CP:-$COMMON_CP}
 RM_TP=${RM_TP:-$TRAIN_TP}
 
-COMMON_PARAM_OFFLOAD=${COMMON_PARAM_OFFLOAD:-False}
-COMMON_GRAD_OFFLOAD=${COMMON_GRAD_OFFLOAD:-False}
-COMMON_OPTIMIZER_OFFLOAD=${COMMON_OPTIMIZER_OFFLOAD:-False}
+ALL_OFFLOAD=${ALL_OFFLOAD:-False}  # single switch: supplies the fallback for the three COMMON_*_OFFLOAD flags below
+COMMON_PARAM_OFFLOAD=${COMMON_PARAM_OFFLOAD:-$ALL_OFFLOAD}  # a non-empty COMMON_* value from the environment still wins over ALL_OFFLOAD
+COMMON_GRAD_OFFLOAD=${COMMON_GRAD_OFFLOAD:-$ALL_OFFLOAD}  # same fallback rule as the param flag
+COMMON_OPTIMIZER_OFFLOAD=${COMMON_OPTIMIZER_OFFLOAD:-$ALL_OFFLOAD}  # same fallback rule as the param flag
 
 ACTOR_PARAM_OFFLOAD=${ACTOR_PARAM_OFFLOAD:-$COMMON_PARAM_OFFLOAD}
 ACTOR_GRAD_OFFLOAD=${ACTOR_GRAD_OFFLOAD:-$COMMON_GRAD_OFFLOAD}
@@ -66,9 +67,7 @@ REF_PARAM_OFFLOAD=${REF_PARAM_OFFLOAD:-$COMMON_PARAM_OFFLOAD}
 CRITIC_PARAM_OFFLOAD=${CRITIC_PARAM_OFFLOAD:-$COMMON_PARAM_OFFLOAD}
 CRITIC_GRAD_OFFLOAD=${CRITIC_GRAD_OFFLOAD:-$COMMON_GRAD_OFFLOAD}
 CRITIC_OPTIMIZER_OFFLOAD=${CRITIC_OPTIMIZER_OFFLOAD:-$COMMON_OPTIMIZER_OFFLOAD}
-RM_GRAD_OFFLOAD=${RM_GRAD_OFFLOAD:-$COMMON_GRAD_OFFLOAD}
 RM_PARAM_OFFLOAD=${RM_PARAM_OFFLOAD:-$COMMON_PARAM_OFFLOAD}
-RM_OPTIMIZER_OFFLOAD=${RM_OPTIMIZER_OFFLOAD:-$COMMON_OPTIMIZER_OFFLOAD}
 
 CHECKPOINT_CONTENTS=['model','hf_model','optimizer','extra']
 SKIP_SAVE_HF_MODEL=${SKIP_SAVE_HF_MODEL:-0}
@@ -134,8 +133,6 @@ python3 -m verl.trainer.main_ppo --config-path=config \
     reward_model.megatron.context_parallel_size=$RM_CP \
     reward_model.megatron.tensor_model_parallel_size=$RM_TP \
     reward_model.megatron.param_offload=${RM_PARAM_OFFLOAD} \
-    reward_model.megatron.optimizer_offload=${RM_OPTIMIZER_OFFLOAD} \
-    reward_model.megatron.grad_offload=${RM_GRAD_OFFLOAD} \
     algorithm.use_kl_in_reward=False \
     algorithm.kl_penalty=kl \
     algorithm.kl_ctrl.kl_coef=0.001 \

diff --git a/verl/trainer/config/ppo_megatron_trainer.yaml b/verl/trainer/config/ppo_megatron_trainer.yaml
@@ -227,8 +227,6 @@ reward_model:
   strategy: megatron
   megatron:
     param_offload: False
-    grad_offload: False
-    optimizer_offload: False
     tensor_model_parallel_size: 1
     expert_model_parallel_size: 1
     expert_tensor_parallel_size: null