251 changes: 251 additions & 0 deletions recipe/dapo/test_dapo_gptoss_20b_megatron.sh
@@ -0,0 +1,251 @@
#!/usr/bin/env bash
set -xeuo pipefail

################################################### document for gptoss ###################################################

####################### running environment: #######################
# option 1: use a pre-built docker image dedicated to gpt-oss: `docker://iseekyan/verl:nemo.gptoss_vllm0.11.0`, which is
# built on top of NeMo's dedicated image; see the Dockerfile at https://github.com/volcengine/verl/blob/main/docker/verl0.6-cu128-torch2.8.0-fa2.7.4/Dockerfile.vllm011.mcore_gpt-oss
#
# option 2: build your own image with TE>=2.8, cuDNN>=9.13.1, Megatron on branch `core_dev_r0.15.0`, and the latest vLLM or SGLang;
# you can adapt the Dockerfiles at https://github.com/volcengine/verl/blob/main/docker/Dockerfile.stable.vllm or https://github.com/volcengine/verl/blob/main/docker/Dockerfile.stable.sglang
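#
# one way to enter the option-1 container (illustrative only; the mounts and shm size
# below are assumptions, adapt them to your cluster):
# docker run --gpus all --shm-size=32g --rm -it -v "$HOME:/root" iseekyan/verl:nemo.gptoss_vllm0.11.0 bash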

####################### before training: #######################
# # install the matching mbridge version
# pip uninstall -y mbridge && pip install git+https://github.com/ISEEKYAN/mbridge@gpt-oss

# # dequantize the MXFP4 gpt-oss checkpoint to bf16
# note: the unquoted EOF heredoc lets the shell expand $HOME into the Python source below
cat > get_model.py << EOF
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, Mxfp4Config

model_id = "openai/gpt-oss-20b"
output_dir = "$HOME/models/gpt-oss-20b-bf16"

# dequantize the MXFP4 weights to bf16 while loading
quantization_config = Mxfp4Config(dequantize=True)
model_kwargs = dict(
    attn_implementation="eager",
    torch_dtype=torch.bfloat16,
    quantization_config=quantization_config,
    use_cache=False,
    device_map="auto",
)

model = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs)

# Patch config with custom attribute before saving
model.config.attn_implementation = "eager"

model.save_pretrained(output_dir)
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.save_pretrained(output_dir)
EOF

python get_model.py
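
# # optional sanity check (a minimal sketch, not part of the recipe): confirm the converted
# # checkpoint reports bf16 before pointing MODEL_PATH at it
# python -c "from transformers import AutoConfig; print(AutoConfig.from_pretrained('$HOME/models/gpt-oss-20b-bf16').torch_dtype)"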

####################### specific training config: #######################

GPT_OSS_CONFIG=(
    # gpt-oss is currently supported only through mbridge
    actor_rollout_ref.actor.megatron.use_mbridge=True
    # as of the latest TE (2.10), gpt-oss's optimized attention kernel does not support the thd format, so we use bshd here
    # with bshd, input_ids are padded to the longest sequence in each micro batch,
    # so we recommend disabling dynamic batch sizing and setting the micro batch size to 1 to minimize padding
    # (micro_batch_size>1 is also fine to try if you accept the padding cost); see the worked example below
    actor_rollout_ref.actor.megatron.use_remove_padding=False
)
use_dynamic_bsz=False # recommended but not necessary
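# worked example of the padding cost under bshd (plain arithmetic; the numbers match
# max_prompt_length/max_response_length set in the quick config below):
# each micro batch is padded to its longest sequence, so micro_batch_size=1 avoids
# cross-sample padding, while larger micro batches pad every sample to the batch max
# echo $(( 1024*2 + 1024*8 ))  # 10240 = worst-case padded length per sample (prompt + response budget)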

################################################### quick config ###################################################

rollout_mode="sync"
rollout_name="vllm" # sglang or vllm
return_raw_chat="False"
if [ "$rollout_mode" = "async" ]; then
export VLLM_USE_V1=1
return_raw_chat="True"
fi
dtype="bfloat16" # ["bfloat16", "float16"]

project_name='DAPO'
exp_name='gptoss'

adv_estimator=grpo

use_kl_in_reward=False
kl_coef=0.0
use_kl_loss=False
kl_loss_coef=0.0

clip_ratio_low=0.2
clip_ratio_high=0.28

max_prompt_length=$((1024 * 2))
max_response_length=$((1024 * 8))
enable_overlong_buffer=True
overlong_buffer_len=$((1024 * 4))
overlong_penalty_factor=1.0
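# how the overlong knobs interact (soft overlong punishment from the DAPO paper; illustrative arithmetic):
# echo $(( 1024*8 - 1024*4 ))  # 4096: responses up to this length incur no length penalty;
# within the final overlong_buffer_len tokens the penalty grows linearly, reaching -overlong_penalty_factor at the cap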

loss_agg_mode="token-mean"

train_prompt_bsz=32
n_resp_per_prompt=16
train_prompt_mini_bsz=32
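# rollout volume per step (plain arithmetic from the values above):
# echo $(( 32 * 16 ))  # 512 sampled responses = train_prompt_bsz * n_resp_per_prompt
# train_prompt_mini_bsz == train_prompt_bsz, so each batch is consumed in a single PPO mini-batch pass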

# Ray
RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
WORKING_DIR=${WORKING_DIR:-"${PWD}"}
RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/verl/trainer/runtime_env.yaml"}
NNODES=${NNODES:-1}
# Paths
RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/gpt-oss-20b"}
Contributor comment (severity: high):

The default MODEL_PATH is inconsistent with the output_dir in the get_model.py script. The get_model.py script saves the model to "$HOME/models/gpt-oss-20b-bf16", but MODEL_PATH defaults to "${RAY_DATA_HOME}/models/gpt-oss-20b", which resolves to "${HOME}/verl/models/gpt-oss-20b". This will cause the script to fail with a "model not found" error unless MODEL_PATH is explicitly set. To ensure consistency, the default MODEL_PATH should point to the correct directory.

Suggested change
MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/gpt-oss-20b"}
MODEL_PATH=${MODEL_PATH:-"$HOME/models/gpt-oss-20b-bf16"}

CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"}
TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"}
TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"}
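# the parquet files above must exist locally; one way to fetch them, assuming the publicly
# released DAPO datasets on Hugging Face (the exact URLs are an assumption, verify before use):
# wget -O "${RAY_DATA_HOME}/data/dapo-math-17k.parquet" "https://huggingface.co/datasets/BytedTsinghua-SIA/DAPO-Math-17k/resolve/main/data/dapo-math-17k.parquet"
# wget -O "${RAY_DATA_HOME}/data/aime-2024.parquet" "https://huggingface.co/datasets/BytedTsinghua-SIA/AIME-2024/resolve/main/data/aime-2024.parquet"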

# Algorithm
temperature=1.0
top_p=1.0
top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
val_top_p=0.7

# Performance-related parameters
actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 1))
infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 1))
offload=True
gen_tp=4
train_tp=4
EP=8
ETP=1
train_pp=1
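# how the parallel sizes fit one 8-GPU node (illustrative; Megatron derives data-parallel size as world / (tp * pp)):
# echo $(( 8 / (4 * 1) ))  # DP=2 for dense/attention weights; EP=8 with ETP=1 shards the MoE experts across all 8 ranks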

################################################### start of config ###################################################


DATA=(
    data.train_files="${TRAIN_FILE}"
    data.val_files="${TEST_FILE}"
    data.prompt_key=prompt
    data.return_raw_chat=$return_raw_chat
    data.truncation='left'
    data.max_prompt_length=${max_prompt_length}
    data.max_response_length=${max_response_length}
    data.train_batch_size=${train_prompt_bsz}
)

REWARD_MODEL=(
    +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer}
    +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len}
    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor}
    +reward_model.reward_kwargs.overlong_buffer_cfg.log=False
    +reward_model.reward_kwargs.max_resp_len=${max_response_length}
    reward_model.reward_manager=dapo
)

PERF_OPT=(
    +actor_rollout_ref.actor.megatron.override_transformer_config.apply_rope_fusion=True
    actor_rollout_ref.model.use_fused_kernels=False
    +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_method=uniform
    +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=full
    +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_num_layers=1
    actor_rollout_ref.actor.megatron.override_transformer_config.attention_backend=auto
    +actor_rollout_ref.actor.optim.override_optimizer_config.optimizer_offload_fraction=1
    +actor_rollout_ref.actor.optim.override_optimizer_config.overlap_cpu_optimizer_d2h_h2d=True
    +actor_rollout_ref.actor.optim.override_optimizer_config.use_precision_aware_optimizer=True
    +actor_rollout_ref.actor.optim.override_optimizer_config.optimizer_cpu_offload=True
)

ACTOR=(
    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss}
    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef}
    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low}
    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high}
    actor_rollout_ref.actor.clip_ratio_c=10.0
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2
    actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz}
    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len}
    actor_rollout_ref.actor.optim.lr=1e-6
    actor_rollout_ref.actor.optim.lr_warmup_steps=10
    actor_rollout_ref.actor.optim.weight_decay=0.1
    actor_rollout_ref.actor.optim.clip_grad=1.0
    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz}
    actor_rollout_ref.actor.megatron.param_offload=${offload}
    actor_rollout_ref.actor.megatron.optimizer_offload=${offload}
    actor_rollout_ref.actor.megatron.grad_offload=${offload}
    actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${train_pp}
    actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${train_tp}
    actor_rollout_ref.actor.megatron.expert_model_parallel_size=${EP}
    actor_rollout_ref.actor.megatron.expert_tensor_parallel_size=${ETP}
    actor_rollout_ref.actor.entropy_coeff=0
    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode}
)

ROLLOUT=(
    actor_rollout_ref.rollout.name=${rollout_name}
    actor_rollout_ref.rollout.mode=${rollout_mode}
    actor_rollout_ref.rollout.dtype=${dtype}
    actor_rollout_ref.rollout.gpu_memory_utilization=0.70
    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp}
    actor_rollout_ref.rollout.enable_chunked_prefill=True
    actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length))
    actor_rollout_ref.rollout.temperature=${temperature}
    actor_rollout_ref.rollout.top_p=${top_p}
    actor_rollout_ref.rollout.top_k=${top_k}
    actor_rollout_ref.rollout.val_kwargs.temperature=${temperature}
    actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p}
    actor_rollout_ref.rollout.val_kwargs.top_k=${top_k}
    actor_rollout_ref.rollout.val_kwargs.do_sample=True
    actor_rollout_ref.rollout.val_kwargs.n=1
    actor_rollout_ref.rollout.calculate_log_probs=True
    actor_rollout_ref.rollout.n=${n_resp_per_prompt}
)

TRAINER=(
    trainer.logger=['console','wandb']
    trainer.project_name="${project_name}"
    trainer.experiment_name="${exp_name}"
    trainer.n_gpus_per_node=8
    trainer.nnodes="${NNODES}"
    trainer.val_before_train=False
    trainer.test_freq=10
    trainer.save_freq=-1
    trainer.total_epochs=10
    trainer.default_local_dir="${CKPTS_DIR}"
    trainer.resume_mode=auto
    trainer.log_val_generations=10
)

FORWARD_ONLY_SETS=(
    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4
    actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz}
    actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz}
    actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len}
    actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len}
)

MODEL=(
    actor_rollout_ref.model.path="${MODEL_PATH}"
)

ALGORITHM=(
    algorithm.adv_estimator=${adv_estimator}
    algorithm.use_kl_in_reward=${use_kl_in_reward}
    algorithm.kl_ctrl.kl_coef=${kl_coef}
)
################################################### start script ###################################################
ray job submit --no-wait --runtime-env="${RUNTIME_ENV}" \
    -- python3 -m verl.trainer.main_ppo \
    --config-path=config \
    --config-name='ppo_megatron_trainer.yaml' \
    "${DATA[@]}" \
    "${ALGORITHM[@]}" \
    "${MODEL[@]}" \
    "${ROLLOUT[@]}" \
    "${ACTOR[@]}" \
    "${REWARD_MODEL[@]}" \
    "${PERF_OPT[@]}" \
    "${TRAINER[@]}" \
    "${GPT_OSS_CONFIG[@]}" \
    "${FORWARD_ONLY_SETS[@]}"