Skip to content
Merged
Show file tree
Hide file tree
Changes from 33 commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
cb75604
x
erictang000 Dec 4, 2025
4de19a8
basic weight sync test working!
erictang000 Dec 6, 2025
59e1899
x
erictang000 Dec 8, 2025
c1cd5e6
Merge branch 'main' of https://github.com/erictang000/SkyRL into mega…
erictang000 Dec 8, 2025
7621360
x
erictang000 Dec 9, 2025
3bf5e33
pyproject.toml solved?
erictang000 Dec 11, 2025
d4f7280
Merge branch 'megatron_lora' of https://github.com/erictang000/SkyRL …
erictang000 Dec 11, 2025
35f87ab
x
erictang000 Dec 11, 2025
0211323
lora weight merging broken but things otherwise running e2e
erictang000 Dec 15, 2025
c4438cf
figured out pp=1 is the problem
erictang000 Dec 16, 2025
9d07855
x
erictang000 Dec 18, 2025
3becd46
x
erictang000 Dec 19, 2025
6d83aca
x
erictang000 Dec 19, 2025
3383112
x
erictang000 Dec 20, 2025
2c84b1f
Merge branch 'main' of https://github.com/erictang000/SkyRL into mega…
erictang000 Dec 20, 2025
8a172d9
x
erictang000 Dec 20, 2025
d0f0939
Merge branch 'main' of https://github.com/erictang000/SkyRL into mega…
erictang000 Dec 20, 2025
e3ad7ee
Merge branch 'megatron_lora' of https://github.com/erictang000/SkyRL …
erictang000 Dec 20, 2025
3ab4924
lint
erictang000 Dec 20, 2025
cf06951
x
erictang000 Dec 21, 2025
a7f480e
x
erictang000 Dec 21, 2025
43cfba5
X
erictang000 Dec 21, 2025
c86381a
add temporary separate lora checkpointing path
erictang000 Dec 21, 2025
ce606d7
x
erictang000 Dec 21, 2025
52d4bbc
Merge branch 'megatron_lora' of https://github.com/erictang000/SkyRL …
erictang000 Dec 21, 2025
36f99de
x
erictang000 Dec 21, 2025
20f134d
Merge branch 'megatron_lora' of https://github.com/erictang000/SkyRL …
erictang000 Dec 21, 2025
09bee5a
thanks gemini
erictang000 Dec 21, 2025
2e1999a
x
erictang000 Dec 23, 2025
240174e
x
erictang000 Dec 23, 2025
c311bff
add canonical lora
erictang000 Dec 28, 2025
002be54
Merge branch 'main' of https://github.com/erictang000/SkyRL into mega…
erictang000 Dec 28, 2025
4a3a0b3
pin commit
erictang000 Dec 28, 2025
3ac51cd
x
erictang000 Dec 28, 2025
f9109a9
x
erictang000 Dec 28, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
120 changes: 120 additions & 0 deletions skyrl-train/examples/megatron/run_megatron_dapo_qwen3_4b.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
#!/usr/bin/env bash
set -x

# Colocated DAPO training+generation for Qwen3-4B-Base on DAPO training data with Megatron.
#
# Usage:
#   bash examples/algorithms/dapo/prepare_dapo_data.sh
#   bash examples/megatron/run_megatron_dapo_qwen3_4b.sh [extra key=value overrides ...]
#
# Extra command-line arguments are appended, each as its own word, after the
# overrides listed in main() below.

MODEL_NAME="Qwen/Qwen3-4B-Base"
DATA_DIR="$HOME/data/dapo"
TRAIN_FILE="$DATA_DIR/dapo-math-17k-cleaned.parquet"
TEST_FILE="$DATA_DIR/aime-2024-cleaned.parquet"
NUM_NODES=1
NUM_GPUS_PER_NODE=8
# 4 engines x tensor-parallel 2 = 8 GPUs, the same as NUM_NODES * NUM_GPUS_PER_NODE.
NUM_INFERENCE_ENGINES=4
INFERENCE_ENGINE_TENSOR_PARALLEL_SIZE=2
LOGGER="wandb" # change to "console" to print to stdout

CLIP_RATIO_LOW=0.2
CLIP_RATIO_HIGH=0.28
# use token mean loss reduction
LOSS_REDUCTION="token_mean"
# applies overlong filtering (but not soft overlong punishment)
APPLY_OVERLONG_FILTERING=true
# apply soft overlong punishment with custom trainer impl in main_dapo.py
OVERLONG_BUFFER_LEN=$((1024 * 4))
OVERLONG_BUFFER_PENALTY_FACTOR=1.0

# other DAPO parameters
USE_KL_LOSS=false
TEMPERATURE=1.0
TOP_P=1.0
EVAL_TOP_P=0.7
CLIP_RATIO_C=10.0
MAX_PROMPT_LENGTH=$((1024 * 2))
MAX_RESPONSE_LENGTH=$((1024 * 8))

# repro run parameters
TRAIN_BATCH_SIZE=512
MINI_BATCH_SIZE=32
N_SAMPLES_PER_PROMPT=16
EVAL_N_SAMPLES_PER_PROMPT=32
ENFORCE_EAGER=true # cuda graphs can cause some instability
LR=1e-6

# megatron config
MEGATRON_TP=4
MEGATRON_PP=2
MEGATRON_CP=1
MEGATRON_EP=1
MEGATRON_ETP=null

# TIS parameters
TIS_IMP_RATIO_CAP=2.0
USE_TIS=true

# Single source of truth for the run identifier; reused for the logger run
# name and for the export/checkpoint directories so they cannot drift apart.
RUN_NAME="dapo_qwen3_4b_base_megatron_tp${MEGATRON_TP}_pp${MEGATRON_PP}_cp${MEGATRON_CP}"

#######################################
# Launch the DAPO trainer through `uv run` with the configuration above.
# Globals:   every configuration variable defined above (read only)
# Arguments: optional extra overrides, forwarded verbatim as the last arguments
# Returns:   exit status of `uv run`
#######################################
main() {
  uv run --isolated --extra mcore -m examples.algorithms.dapo.main_dapo \
    data.train_data="['$TRAIN_FILE']" \
    data.val_data="['$TEST_FILE']" \
    trainer.algorithm.advantage_estimator="grpo" \
    trainer.algorithm.policy_loss_type="dual_clip" \
    +trainer.algorithm.overlong_buffer.len="$OVERLONG_BUFFER_LEN" \
    +trainer.algorithm.overlong_buffer.penalty_factor="$OVERLONG_BUFFER_PENALTY_FACTOR" \
    trainer.algorithm.loss_reduction="$LOSS_REDUCTION" \
    generator.enforce_eager="$ENFORCE_EAGER" \
    generator.apply_overlong_filtering="$APPLY_OVERLONG_FILTERING" \
    generator.sampling_params.temperature="$TEMPERATURE" \
    generator.sampling_params.top_p="$TOP_P" \
    generator.eval_sampling_params.top_p="$EVAL_TOP_P" \
    generator.eval_sampling_params.temperature="$TEMPERATURE" \
    trainer.algorithm.use_kl_loss="$USE_KL_LOSS" \
    trainer.algorithm.clip_ratio_c="$CLIP_RATIO_C" \
    trainer.policy.model.path="$MODEL_NAME" \
    trainer.placement.colocate_all=true \
    trainer.strategy=megatron \
    trainer.placement.policy_num_nodes="$NUM_NODES" \
    trainer.placement.policy_num_gpus_per_node="$NUM_GPUS_PER_NODE" \
    generator.num_inference_engines="$NUM_INFERENCE_ENGINES" \
    generator.inference_engine_tensor_parallel_size="$INFERENCE_ENGINE_TENSOR_PARALLEL_SIZE" \
    trainer.policy.megatron_config.tensor_model_parallel_size="$MEGATRON_TP" \
    trainer.policy.megatron_config.pipeline_model_parallel_size="$MEGATRON_PP" \
    trainer.policy.megatron_config.context_parallel_size="$MEGATRON_CP" \
    trainer.policy.megatron_config.expert_model_parallel_size="$MEGATRON_EP" \
    trainer.policy.megatron_config.expert_tensor_parallel_size="$MEGATRON_ETP" \
    trainer.algorithm.use_tis="$USE_TIS" \
    trainer.algorithm.tis_imp_ratio_cap="$TIS_IMP_RATIO_CAP" \
    trainer.epochs=20 \
    trainer.algorithm.eps_clip_low="$CLIP_RATIO_LOW" \
    trainer.algorithm.eps_clip_high="$CLIP_RATIO_HIGH" \
    trainer.eval_batch_size=1024 \
    trainer.eval_before_train=false \
    trainer.eval_interval=5 \
    trainer.update_epochs_per_batch=1 \
    trainer.train_batch_size="$TRAIN_BATCH_SIZE" \
    trainer.policy_mini_batch_size="$MINI_BATCH_SIZE" \
    trainer.micro_forward_batch_size_per_gpu=8 \
    trainer.micro_train_batch_size_per_gpu=8 \
    trainer.ckpt_interval=10 \
    trainer.max_prompt_length="$MAX_PROMPT_LENGTH" \
    generator.sampling_params.max_generate_length="$MAX_RESPONSE_LENGTH" \
    trainer.policy.optimizer_config.lr="$LR" \
    trainer.policy.optimizer_config.num_warmup_steps=160 \
    trainer.policy.optimizer_config.weight_decay=0.1 \
    trainer.policy.optimizer_config.max_grad_norm=1.0 \
    generator.backend=vllm \
    generator.run_engines_locally=true \
    generator.weight_sync_backend=nccl \
    generator.async_engine=false \
    generator.batched=true \
    environment.env_class=aime \
    generator.n_samples_per_prompt="$N_SAMPLES_PER_PROMPT" \
    generator.eval_n_samples_per_prompt="$EVAL_N_SAMPLES_PER_PROMPT" \
    generator.gpu_memory_utilization=0.8 \
    trainer.logger="$LOGGER" \
    trainer.project_name="dapo_aime" \
    trainer.run_name="$RUN_NAME" \
    trainer.export_path="$HOME/exports/$RUN_NAME" \
    trainer.hf_save_interval=25 \
    trainer.resume_mode=latest \
    trainer.max_ckpts_to_keep=3 \
    trainer.ckpt_path="$HOME/ckpts/$RUN_NAME" \
    "$@" # quoted: keeps each user override as exactly one argument
}

main "$@"
130 changes: 130 additions & 0 deletions skyrl-train/examples/megatron/run_megatron_dapo_qwen3_4b_lora.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
#!/usr/bin/env bash
set -x

# Colocated DAPO training+generation for Qwen3-4B-Base on DAPO training data with Megatron and LoRA.
#
# Usage:
#   bash examples/algorithms/dapo/prepare_dapo_data.sh
#   bash examples/megatron/run_megatron_dapo_qwen3_4b_lora.sh [extra key=value overrides ...]
#
# Extra command-line arguments are appended, each as its own word, after the
# overrides listed in main() below.

MODEL_NAME="Qwen/Qwen3-4B-Base"
DATA_DIR="$HOME/data/dapo"
TRAIN_FILE="$DATA_DIR/dapo-math-17k-cleaned.parquet"
TEST_FILE="$DATA_DIR/aime-2024-cleaned.parquet"
NUM_NODES=1
NUM_GPUS_PER_NODE=8
# 4 engines x tensor-parallel 2 = 8 GPUs, the same as NUM_NODES * NUM_GPUS_PER_NODE.
NUM_INFERENCE_ENGINES=4
INFERENCE_ENGINE_TENSOR_PARALLEL_SIZE=2
LOGGER="wandb" # change to "console" to print to stdout

CLIP_RATIO_LOW=0.2
CLIP_RATIO_HIGH=0.28
# use token mean loss reduction
LOSS_REDUCTION="token_mean"
# applies overlong filtering (but not soft overlong punishment)
APPLY_OVERLONG_FILTERING=true
# apply soft overlong punishment with custom trainer impl in main_dapo.py
OVERLONG_BUFFER_LEN=$((1024 * 4))
OVERLONG_BUFFER_PENALTY_FACTOR=1.0

# other DAPO parameters
USE_KL_LOSS=false
TEMPERATURE=1.0
TOP_P=1.0
EVAL_TOP_P=0.7
CLIP_RATIO_C=10.0
MAX_PROMPT_LENGTH=$((1024 * 2))
MAX_RESPONSE_LENGTH=$((1024 * 8))

# repro run parameters
TRAIN_BATCH_SIZE=512
MINI_BATCH_SIZE=32
N_SAMPLES_PER_PROMPT=16
EVAL_N_SAMPLES_PER_PROMPT=32
ENFORCE_EAGER=true # cuda graphs can cause some instability
LR=3e-5

# megatron config
MEGATRON_TP=4
MEGATRON_PP=1
MEGATRON_CP=1
MEGATRON_EP=1
MEGATRON_ETP=null

# lora config
LORA_RANK=32
LORA_ALPHA=64
LORA_A_INIT_METHOD="kaiming"
LORA_METHOD="canonical_lora"

# TIS parameters
TIS_IMP_RATIO_CAP=2.0
USE_TIS=true

# Single source of truth for the run identifier; reused for the logger run
# name and for the export/checkpoint directories. It is derived from
# LORA_METHOD (rather than a hard-coded "canonical_lora") so that switching
# the LoRA type does not reuse another configuration's name and directories.
RUN_NAME="dapo_qwen3_4b_base_megatron_tp${MEGATRON_TP}_pp${MEGATRON_PP}_cp${MEGATRON_CP}_lora_rank${LORA_RANK}_alpha${LORA_ALPHA}_${LORA_METHOD}_use_tis"

#######################################
# Launch the DAPO + LoRA trainer through `uv run` with the configuration above.
# Globals:   every configuration variable defined above (read only)
# Arguments: optional extra overrides, forwarded verbatim as the last arguments
# Returns:   exit status of `uv run`
#######################################
main() {
  uv run --isolated --extra mcore -m examples.algorithms.dapo.main_dapo \
    data.train_data="['$TRAIN_FILE']" \
    data.val_data="['$TEST_FILE']" \
    trainer.algorithm.advantage_estimator="grpo" \
    trainer.algorithm.policy_loss_type="dual_clip" \
    +trainer.algorithm.overlong_buffer.len="$OVERLONG_BUFFER_LEN" \
    +trainer.algorithm.overlong_buffer.penalty_factor="$OVERLONG_BUFFER_PENALTY_FACTOR" \
    trainer.algorithm.loss_reduction="$LOSS_REDUCTION" \
    generator.enforce_eager="$ENFORCE_EAGER" \
    generator.apply_overlong_filtering="$APPLY_OVERLONG_FILTERING" \
    generator.sampling_params.temperature="$TEMPERATURE" \
    generator.sampling_params.top_p="$TOP_P" \
    generator.eval_sampling_params.top_p="$EVAL_TOP_P" \
    generator.eval_sampling_params.temperature="$TEMPERATURE" \
    trainer.algorithm.use_kl_loss="$USE_KL_LOSS" \
    trainer.algorithm.clip_ratio_c="$CLIP_RATIO_C" \
    trainer.policy.model.path="$MODEL_NAME" \
    trainer.placement.colocate_all=true \
    trainer.strategy=megatron \
    trainer.placement.policy_num_nodes="$NUM_NODES" \
    trainer.placement.policy_num_gpus_per_node="$NUM_GPUS_PER_NODE" \
    generator.num_inference_engines="$NUM_INFERENCE_ENGINES" \
    generator.inference_engine_tensor_parallel_size="$INFERENCE_ENGINE_TENSOR_PARALLEL_SIZE" \
    trainer.policy.megatron_config.tensor_model_parallel_size="$MEGATRON_TP" \
    trainer.policy.megatron_config.pipeline_model_parallel_size="$MEGATRON_PP" \
    trainer.policy.megatron_config.context_parallel_size="$MEGATRON_CP" \
    trainer.policy.megatron_config.expert_model_parallel_size="$MEGATRON_EP" \
    trainer.policy.megatron_config.expert_tensor_parallel_size="$MEGATRON_ETP" \
    trainer.policy.megatron_config.lora_config.lora_type="$LORA_METHOD" \
    trainer.algorithm.use_tis="$USE_TIS" \
    trainer.algorithm.tis_imp_ratio_cap="$TIS_IMP_RATIO_CAP" \
    trainer.policy.model.lora.rank="$LORA_RANK" \
    trainer.policy.model.lora.alpha="$LORA_ALPHA" \
    trainer.policy.model.lora.init_method="$LORA_A_INIT_METHOD" \
    trainer.epochs=20 \
    trainer.algorithm.eps_clip_low="$CLIP_RATIO_LOW" \
    trainer.algorithm.eps_clip_high="$CLIP_RATIO_HIGH" \
    trainer.eval_batch_size=1024 \
    trainer.eval_before_train=true \
    trainer.eval_interval=5 \
    trainer.update_epochs_per_batch=1 \
    trainer.train_batch_size="$TRAIN_BATCH_SIZE" \
    trainer.policy_mini_batch_size="$MINI_BATCH_SIZE" \
    trainer.micro_forward_batch_size_per_gpu=8 \
    trainer.micro_train_batch_size_per_gpu=8 \
    trainer.ckpt_interval=10 \
    trainer.max_prompt_length="$MAX_PROMPT_LENGTH" \
    generator.sampling_params.max_generate_length="$MAX_RESPONSE_LENGTH" \
    trainer.policy.optimizer_config.lr="$LR" \
    trainer.policy.optimizer_config.num_warmup_steps=160 \
    trainer.policy.optimizer_config.weight_decay=0.1 \
    trainer.policy.optimizer_config.max_grad_norm=1.0 \
    generator.backend=vllm \
    generator.run_engines_locally=true \
    generator.weight_sync_backend=nccl \
    generator.async_engine=false \
    generator.batched=true \
    environment.env_class=aime \
    generator.n_samples_per_prompt="$N_SAMPLES_PER_PROMPT" \
    generator.eval_n_samples_per_prompt="$EVAL_N_SAMPLES_PER_PROMPT" \
    generator.gpu_memory_utilization=0.8 \
    trainer.logger="$LOGGER" \
    trainer.project_name="dapo_aime" \
    trainer.run_name="$RUN_NAME" \
    trainer.export_path="$HOME/exports/$RUN_NAME" \
    trainer.hf_save_interval=300 \
    trainer.resume_mode=latest \
    trainer.max_ckpts_to_keep=3 \
    trainer.ckpt_path="$HOME/ckpts/$RUN_NAME" \
    "$@" # quoted: keeps each user override as exactly one argument
}

main "$@"
78 changes: 78 additions & 0 deletions skyrl-train/examples/megatron/run_megatron_lora_qwen3-0.6b.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
#!/usr/bin/env bash
set -x

# Colocated GRPO training+generation for Qwen3-0.6B on GSM8K with Megatron and LoRA.
#
# Usage:
#   uv run examples/gsm8k/gsm8k_dataset.py --output_dir $HOME/data/gsm8k
#   export WANDB_API_KEY=<your_key_here>
#   bash examples/megatron/run_megatron_lora_qwen3-0.6b.sh [extra key=value overrides ...]
#
# Extra command-line arguments are appended, each as its own word, after the
# overrides listed in main() below.

DATA_DIR="$HOME/data/gsm8k"
NUM_GPUS=8
LOGGER="wandb" # change to "console" to print to stdout
MODEL_NAME="Qwen/Qwen3-0.6B"

INFERENCE_BACKEND="vllm" # currently only vllm is supported for megatron

MEGATRON_TP=1
MEGATRON_PP=1
MEGATRON_CP=1

# LoRA configuration
LORA_RANK=32
LORA_ALPHA=64
LORA_A_INIT_METHOD="kaiming"
LORA_METHOD="canonical_lora"

#######################################
# Launch the GRPO + LoRA trainer through `uv run` with the configuration above.
# Globals:   every configuration variable defined above (read only)
# Arguments: optional extra overrides, forwarded verbatim as the last arguments
# Returns:   exit status of `uv run`
#######################################
main() {
  uv run --isolated --extra mcore -m skyrl_train.entrypoints.main_base \
    data.train_data="['$DATA_DIR/train.parquet']" \
    data.val_data="['$DATA_DIR/validation.parquet']" \
    trainer.algorithm.advantage_estimator="grpo" \
    trainer.policy.model.path="$MODEL_NAME" \
    trainer.placement.colocate_all=true \
    trainer.strategy=megatron \
    trainer.placement.policy_num_gpus_per_node="$NUM_GPUS" \
    trainer.placement.ref_num_gpus_per_node="$NUM_GPUS" \
    generator.num_inference_engines="$NUM_GPUS" \
    generator.inference_engine_tensor_parallel_size=1 \
    trainer.policy.megatron_config.tensor_model_parallel_size="$MEGATRON_TP" \
    trainer.policy.megatron_config.pipeline_model_parallel_size="$MEGATRON_PP" \
    trainer.policy.megatron_config.context_parallel_size="$MEGATRON_CP" \
    trainer.ref.megatron_config.tensor_model_parallel_size="$MEGATRON_TP" \
    trainer.ref.megatron_config.context_parallel_size="$MEGATRON_CP" \
    trainer.ref.megatron_config.pipeline_model_parallel_size="$MEGATRON_PP" \
    trainer.policy.model.lora.rank="$LORA_RANK" \
    trainer.policy.model.lora.alpha="$LORA_ALPHA" \
    trainer.policy.model.lora.init_method="$LORA_A_INIT_METHOD" \
    trainer.policy.megatron_config.lora_config.lora_type="$LORA_METHOD" \
    trainer.gradient_checkpointing=true \
    trainer.policy.model.lora.target_modules="all-linear" \
    trainer.use_sample_packing=true \
    trainer.epochs=20 \
    trainer.eval_batch_size=1024 \
    trainer.eval_before_train=false \
    trainer.eval_interval=5 \
    trainer.update_epochs_per_batch=1 \
    trainer.train_batch_size=128 \
    trainer.policy_mini_batch_size=64 \
    trainer.micro_forward_batch_size_per_gpu=4 \
    trainer.micro_train_batch_size_per_gpu=4 \
    trainer.ckpt_interval=10 \
    trainer.max_prompt_length=512 \
    generator.sampling_params.max_generate_length=1024 \
    trainer.policy.optimizer_config.lr=1.0e-5 \
    trainer.algorithm.use_kl_loss=true \
    generator.backend="$INFERENCE_BACKEND" \
    generator.run_engines_locally=true \
    generator.weight_sync_backend=nccl \
    generator.async_engine=true \
    generator.batched=true \
    environment.env_class=gsm8k \
    generator.n_samples_per_prompt=5 \
    generator.gpu_memory_utilization=0.6 \
    trainer.logger="$LOGGER" \
    trainer.project_name="gsm8k_megatron" \
    trainer.run_name="gsm8k_megatron_tp${MEGATRON_TP}_pp${MEGATRON_PP}_cp${MEGATRON_CP}_${MODEL_NAME}_lora_r${LORA_RANK}_a${LORA_ALPHA}" \
    trainer.resume_mode=null \
    trainer.ckpt_path="$HOME/ckpts/gsm8k_megatron_ckpt" \
    "$@" # quoted: keeps each user override as exactly one argument
}

main "$@"
Loading