175 changes: 0 additions & 175 deletions .github/workflows/checkpoint_converter.yml

This file was deleted.

10 changes: 0 additions & 10 deletions .github/workflows/e2e_ppo_trainer_megatron_sglang.yml
@@ -136,11 +136,6 @@ jobs:
export VLLM_USE_V1=1
ray start --head
ENGINE=sglang MODE=async RESUME_MODE=auto MODEL_ID=deepseek-ai/deepseek-coder-1.3b-instruct TOTAL_TRAIN_STEPS=2 bash tests/special_e2e/run_ppo_trainer_megatron.sh
- name: Test Megatron checkpoints merging function (DeepSeek Actor and Critic)
run: |
exp_name="deepseek-coder-1.3b-instruct-megatron-gsm8k-minimal"
python -m verl.model_merger test --backend megatron --local_dir checkpoints/verl-test/${exp_name}/global_step_1/actor --test_hf_dir checkpoints/verl-test/${exp_name}/global_step_1/actor/huggingface
python -m verl.model_merger test --backend megatron --is-value-model --local_dir checkpoints/verl-test/${exp_name}/global_step_1/critic --test_hf_dir checkpoints/verl-test/${exp_name}/global_step_1/critic/huggingface
- name: Profiling GRPO GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron (Deepseek)
run: |
ray stop --force
@@ -181,11 +176,6 @@ jobs:
run: |
ray stop --force
ALL_OFFLOAD=True VAL_BEFORE_TRAIN=True TEST_FREQ=1 SAVE_FREQ=1 LR_WARMUP_STEPS=1 TOTAL_TRAIN_STEPS=2 MODEL_ID=Qwen/Qwen3-0.6B bash tests/special_e2e/run_ppo_trainer_megatron.sh
- name: Test Megatron checkpoints merging function (Qwen3 Actor and Critic)
run: |
exp_name="qwen3-0.6b-megatron-gsm8k-minimal"
python -m verl.model_merger test --backend megatron --tie-word-embedding --local_dir checkpoints/verl-test/${exp_name}/global_step_1/actor --test_hf_dir checkpoints/verl-test/${exp_name}/global_step_1/actor/huggingface
python -m verl.model_merger test --backend megatron --is-value-model --local_dir checkpoints/verl-test/${exp_name}/global_step_1/critic --test_hf_dir checkpoints/verl-test/${exp_name}/global_step_1/critic/huggingface
- name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with FP8 rollout
run: |
ray stop --force
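For context on the removed steps: they invoked ``verl.model_merger`` in ``test`` mode to validate merged Megatron checkpoints against a HuggingFace reference. A minimal sketch of the corresponding real merge invocation, assuming the ``merge`` subcommand and ``--target_dir`` flag from verl's checkpoint docs (neither appears in this diff; paths are hypothetical):

    # Merge Megatron checkpoint shards into a HuggingFace-format model.
    # "merge" and --target_dir are assumed from verl's docs, not this PR.
    python -m verl.model_merger merge \
        --backend megatron \
        --local_dir checkpoints/verl-test/${exp_name}/global_step_1/actor \
        --target_dir /path/to/merged_hf_model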
32 changes: 0 additions & 32 deletions .github/workflows/e2e_ppo_trainer_megatron_sglang_2.yml
@@ -105,37 +105,6 @@ jobs:
faas-url: "${{ env.DYNAMIC_RUNNER_ENDPOINT }}"
mlp-image: "${{ env.IMAGE }}"

e2e_ppo_trainer_megatron-qwen2_5vl-3b:
needs: setup
runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"]
timeout-minutes: 60 # Increase this timeout value as needed
env:
HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
HF_ENDPOINT: "https://hf-mirror.com"
HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
fetch-depth: 0
- name: Install the current repository
run: |
pip3 install --no-deps -e .[test]
- name: Prepare Geo3k dataset
run: |
python3 examples/data_preprocess/geo3k.py --local_dataset_path ${HOME}/models/hf_data/hiyouga/geometry3k/
- name: Prepare dist_ckpt of Qwen2.5-VL-3B, only supports dist_ckpt
run: |
python3 scripts/converter_hf_to_mcore.py --hf_model_path ${HOME}/models/Qwen/Qwen2.5-VL-3B-Instruct --output_path checkpoints/verl-test/qwen2.5-vl-3b-megatron
- name: Running Geo3k E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron (Qwen)
run: |
ray stop --force
ENGINE=sglang ROLLOUT_MODE=async TRAIN_FILES=${HOME}/data/geo3k/train.parquet VAL_FILES=${HOME}/data/geo3k/test.parquet MAX_PROMPT_LENGTH=1024 MAX_RESPONSE_LENGTH=2048 MODEL_ID=Qwen/Qwen2.5-VL-3B-Instruct ADV_ESTIMATOR=grpo USE_DYNAMIC_BSZ=False SKIP_SAVE_HF_MODEL=1 COMMON_PP=4 COMMON_VPP=null COMMON_CP=1 COMMON_TP=2 USE_DIST_CKPT=true DIST_CKPT_PATH=checkpoints/verl-test/qwen2.5-vl-3b-megatron bash tests/special_e2e/run_ppo_trainer_megatron.sh
- name: clean up
run: |
rm -rf checkpoints

e2e_ppo_trainer_fsdp_sglang:
needs: setup
runs-on: [ "${{ needs.setup.outputs.runner-label || 'L20x8' }}" ]
@@ -221,7 +190,6 @@ jobs:
needs:
[
setup,
e2e_ppo_trainer_megatron-qwen2_5vl-3b,
e2e_ppo_trainer_fsdp-qwen2_5vl-3b,
e2e_ppo_trainer_fsdp_sglang,
]
5 changes: 0 additions & 5 deletions .github/workflows/e2e_ppo_trainer_megatron_vllm.yml
@@ -186,11 +186,6 @@ jobs:
run: |
ray stop --force
ALL_OFFLOAD=True VAL_BEFORE_TRAIN=True TEST_FREQ=1 SAVE_FREQ=1 LR_WARMUP_STEPS=1 TOTAL_TRAIN_STEPS=2 MODEL_ID=Qwen/Qwen3-0.6B bash tests/special_e2e/run_ppo_trainer_megatron.sh
- name: Test Megatron checkpoints merging function (Qwen3 Actor and Critic)
run: |
exp_name="qwen3-0.6b-megatron-gsm8k-minimal"
python -m verl.model_merger test --backend megatron --tie-word-embedding --local_dir checkpoints/verl-test/${exp_name}/global_step_1/actor --test_hf_dir checkpoints/verl-test/${exp_name}/global_step_1/actor/huggingface
python -m verl.model_merger test --backend megatron --is-value-model --local_dir checkpoints/verl-test/${exp_name}/global_step_1/critic --test_hf_dir checkpoints/verl-test/${exp_name}/global_step_1/critic/huggingface
- name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with FP8 rollout
run: |
ray stop --force
37 changes: 0 additions & 37 deletions .github/workflows/e2e_ppo_trainer_megatron_vllm_2.yml
@@ -153,42 +153,6 @@ jobs:
run: |
rm -rf checkpoints

e2e_ppo_trainer_megatron-qwen2_5vl-3b:
needs: setup
runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"]
timeout-minutes: 60 # Increase this timeout value as needed
env:
HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
HF_ENDPOINT: "https://hf-mirror.com"
HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
fetch-depth: 0
- name: Install the current repository
run: |
pip3 install --no-deps -e .[test]
pip3 install transformers==$TRANSFORMERS_VERSION
- name: Prepare Geo3k dataset
run: |
python3 examples/data_preprocess/geo3k.py --local_dataset_path ${HOME}/models/hf_data/hiyouga/geometry3k/
- name: Prepare dist_ckpt of Qwen2.5-VL-3B, only supports dist_ckpt
run: |
python3 scripts/converter_hf_to_mcore.py --hf_model_path ${HOME}/models/Qwen/Qwen2.5-VL-3B-Instruct --output_path checkpoints/verl-test/qwen2.5-vl-3b-megatron
- name: Running Geo3k E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron (Qwen)
run: |
ray stop --force
TRAIN_FILES=${HOME}/data/geo3k/train.parquet VAL_FILES=${HOME}/data/geo3k/test.parquet \
MAX_PROMPT_LENGTH=1024 MAX_RESPONSE_LENGTH=2048 MODEL_ID=Qwen/Qwen2.5-VL-3B-Instruct ADV_ESTIMATOR=grpo \
USE_DYNAMIC_BSZ=False USE_FUSED_KERNELS=True SKIP_SAVE_HF_MODEL=1 \
COMMON_PP=4 COMMON_VPP=null COMMON_CP=1 COMMON_TP=2 USE_DIST_CKPT=true \
DIST_CKPT_PATH=checkpoints/verl-test/qwen2.5-vl-3b-megatron bash tests/special_e2e/run_ppo_trainer_megatron.sh
- name: clean up
run: |
rm -rf checkpoints

e2e_ppo_trainer_fsdp_vllm:
needs: setup
runs-on: [ "${{ needs.setup.outputs.runner-label || 'L20x8' }}" ]
@@ -330,7 +294,6 @@ jobs:
[
setup,
e2e_ppo_trainer_megatron-moe-expert-parallel,
e2e_ppo_trainer_megatron-qwen2_5vl-3b,
e2e_ppo_trainer_fsdp-qwen2_5vl-3b,
e2e_ppo_trainer_fsdp_vllm,
]
30 changes: 0 additions & 30 deletions .github/workflows/model.yml
@@ -48,7 +48,6 @@ on:
# Entrypoints
- ".github/workflows/model.yml"
- "tests/special_distributed/test_fsdp_ckpt.py"
- "tests/special_distributed/test_mcore_config_converter.py"
- "tests/special_distributed/test_tensor_dict.py"
- "tests/models/**"
- "tests/special_distributed/run_all.sh"
@@ -144,34 +143,6 @@ jobs:
run: |
STRATEGY=fsdp2 torchrun --nproc_per_node=8 tests/special_distributed/test_fsdp_ckpt.py

mcore_config_converter:
needs: setup
runs-on: [ "${{ needs.setup.outputs.runner-label || 'L20x8' }}" ]
timeout-minutes: 20 # Increase this timeout value as needed
env:
HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
HF_ENDPOINT: "https://hf-mirror.com"
HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
fetch-depth: 0
- name: Install the current repository
run: |
pip3 install -e .[test]
# - name: Download model config files
# run: |
# hf download Qwen/Qwen2.5-7B config.json --local-dir $HOME/configs/Qwen/Qwen2.5-7B
# hf download Qwen/Qwen3-8B config.json --local-dir $HOME/configs/Qwen/Qwen3-8B
# hf download deepseek-ai/deepseek-coder-1.3b-instruct config.json --local-dir $HOME/configs/deepseek-ai/deepseek-coder-1.3b-instruct
# hf download Qwen/Qwen2-57B-A14B config.json --local-dir $HOME/configs/Qwen/Qwen2-57B-A14B
# hf download Qwen/Qwen3-30B-A3B config.json --local-dir $HOME/configs/Qwen/Qwen3-30B-A3B
# hf download deepseek-ai/DeepSeek-V3-Base config.json --local-dir $HOME/configs/deepseek-ai/DeepSeek-V3-Base
- name: Running mcore config converter tests on 8 L20 GPUs
run: |
torchrun --nproc_per_node=8 tests/special_distributed/test_mcore_config_converter.py

model_engine:
needs: setup
@@ -206,7 +177,6 @@
setup,
model_rmpad,
model_rmpad_fsdp2_unstable,
mcore_config_converter,
model_engine
]
if: always()
28 changes: 2 additions & 26 deletions docs/advance/checkpoint.rst
@@ -137,32 +137,8 @@ Current implementation use solution 2.
HuggingFace to Megatron DistCheckpoint details
----------------------------------------------

If your model is quite huge, we recommend you to use Megatron dist-checkpoint to load the model.
Megatron dist-checkpoint supports loading with different kinds of model parallelism,
and it is much faster than the original checkpoint loading.

To convert original HuggingFace model to Megatron dist-checkpoint,
you can use the ``scripts/converter_hf_to_mcore.py`` script. Large MoE models are temporarily supported with CPU initialization,
which is a little slower. While we are working on a better solution to support large models.

Example command to convert the model is as follows:

.. code:: bash

python scripts/converter_hf_to_mcore.py \
--hf_model_path Qwen/Qwen1.5-MoE-A2.7B-Chat \
--output_path /mnt/disk/Qwen/Qwen1.5-MoE-A2.7B-Chat \
--use_cpu_initialization # Only work for MoE models


Example command to distributed convert the huge model like deepseekv3 671B is as follows:

.. code:: bash

torchrun --nproc_per_node 1 --nnodes 8 --node_rank ${RANK} scripts/converter_hf_to_mcore.py \
--hf_model_path deepseek-ai/DeepSeek-V3 \
--output_path /mnt/disk/deepseek-ai/DeepSeek-V3 \
--use_cpu_initialization # Only work for MoE models
Through ``mbridge``, we can directly save the mcore model to huggingface format during training.
No need to convert the model to Megatron dist-checkpoint format.

Original Checkpoint Utils
-------------------------
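The replacement docs above state that, through ``mbridge``, the mcore model can be saved directly in HuggingFace format during training, with no dist-checkpoint conversion step. A minimal sketch of enabling this, assuming a ``use_mbridge`` flag under verl's Megatron worker config (the flag name is an assumption based on verl's config layout, not shown in this diff):

    # Enable mbridge so Megatron (mcore) weights are saved directly in HF format.
    # actor_rollout_ref.actor.megatron.use_mbridge is assumed, not confirmed by this PR.
    python3 -m verl.trainer.main_ppo \
        actor_rollout_ref.actor.megatron.use_mbridge=True  # plus the usual trainer options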