diff --git a/docs/algo/baseline.md b/docs/algo/baseline.md index c85e8602551..4d23a9c15a8 100644 --- a/docs/algo/baseline.md +++ b/docs/algo/baseline.md @@ -28,7 +28,6 @@ Refer to the table below to reproduce RL training from different pre-trained che | NVIDIA GPU | Mixtral-8x22B-Instruct-v0.1 | Instruct model | 83.7 | [Qwen Blog](https://qwenlm.github.io/blog/qwen2.5-llm/) | | NVIDIA GPU | Mixtral-8x22B-Instruct-v0.1 | RLOO (Megatron) | 92.3 | [wandb](https://api.wandb.ai/links/ppo_dev/sbuiuf2d) | | NVIDIA GPU | Qwen/Qwen2.5-7B-Instruct | SPIN | 92 | [script](https://github.com/volcengine/verl/tree/main/recipe/spin/README.md) | -| NVIDIA GPU | Qwen/Qwen2.5-VL-7B-Instruct | GRPO (Megatron) | 65.4 (GEO3k) | [script](https://github.com/volcengine/verl/blob/main/examples/grpo_trainer/run_qwen2_5_vl-7b-megatron.sh), [wandb](https://api.wandb.ai/links/megatron-core-moe-dev/1yngvkek) | | AMD MI300 | deepseek-ai/deepseek-llm-7b-chat | PPO | 70.5 [1] | [log](https://github.com/yushengsu-thu/verl_training_log/blob/main/gsm8k/ppo_run_deepseek7b_llm.log) | | AMD MI300 | deepseek-ai/deepseek-llm-7b-chat | GRPO | 71.4 [1] | [log](https://github.com/yushengsu-thu/verl_training_log/blob/main/gsm8k/grpo_run_deepseek7b_llm.log) | diff --git a/examples/grpo_trainer/run_qwen2_5_vl-7b-megatron.sh b/examples/grpo_trainer/run_qwen2_5_vl-7b-megatron.sh index 1ad5141f943..63791481657 100644 --- a/examples/grpo_trainer/run_qwen2_5_vl-7b-megatron.sh +++ b/examples/grpo_trainer/run_qwen2_5_vl-7b-megatron.sh @@ -5,33 +5,11 @@ export CUDA_DEVICE_MAX_CONNECTIONS=1 # For megatron communication/computation ov HF_MODEL_PATH=Qwen/Qwen2.5-VL-7B-Instruct DIST_CKPT_PATH=${DIST_CKPT_PATH} -# convert HF model to meagatron format offlinely -# python scripts/converter_hf_to_mcore.py --hf_model_path $HF_MODEL_PATH --output_path $DIST_CKPT_PATH +# convert HF model to verl format +# python scripts/converter_hf_to_verl.py --hf_model_path $HF_MODEL_PATH --output_dir $DIST_CKPT_PATH - -# megatron tuning guide: -# 1. recommend to offload all states by setting ALL_OFFLOAD=True -# 2. enable dynamic batch size by setting actor_rollout_ref.actor.use_dynamic_bsz=True ref.log_prob_use_dynamic_bsz=True rollout.log_prob_use_dynamic_bsz=True -# 3. set ppo_max_token_len_per_gpu and log_prob_max_token_len_per_gpu as large as possible for better MFU (limited by GPU memory). assure ppo_max_token_len_per_gpu > max_prompt_length+max_response_length, if sequence length is too long, you can increase the TP/PP size -# 4. if memory is very limited, enable full recompute, but the mfu will be 30% lower -# full recompute settings: -# +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_method=uniform \ -# +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=full \ -# +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_num_layers=1 \ - -ALL_OFFLOAD=${ALL_OFFLOAD:-True} -COMMON_PARAM_OFFLOAD=${COMMON_PARAM_OFFLOAD:-$ALL_OFFLOAD} -COMMON_GRAD_OFFLOAD=${COMMON_GRAD_OFFLOAD:-$ALL_OFFLOAD} -COMMON_OPTIMIZER_OFFLOAD=${COMMON_OPTIMIZER_OFFLOAD:-$ALL_OFFLOAD} - -ACTOR_PARAM_OFFLOAD=${ACTOR_PARAM_OFFLOAD:-$COMMON_PARAM_OFFLOAD} -ACTOR_GRAD_OFFLOAD=${ACTOR_GRAD_OFFLOAD:-$COMMON_GRAD_OFFLOAD} -ACTOR_OPTIMIZER_OFFLOAD=${ACTOR_OPTIMIZER_OFFLOAD:-$COMMON_OPTIMIZER_OFFLOAD} -REF_PARAM_OFFLOAD=${REF_PARAM_OFFLOAD:-$COMMON_PARAM_OFFLOAD} - - -train_path=$HOME/data/geo3k/train.parquet -test_path=$HOME/data/geo3k/test.parquet +train_path=/data/geo3k/train.parquet +test_path=/data/geo3k/test.parquet python3 -m verl.trainer.main_ppo --config-path=config \ --config-name='ppo_megatron_trainer.yaml'\ @@ -53,16 +31,11 @@ python3 -m verl.trainer.main_ppo --config-path=config \ actor_rollout_ref.actor.kl_loss_coef=0.01 \ actor_rollout_ref.actor.kl_loss_type=low_var_kl \ actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.model.enable_gradient_checkpointing=True \ actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=20 \ - actor_rollout_ref.rollout.tensor_model_parallel_size=1 \ - actor_rollout_ref.actor.use_dynamic_bsz=True \ - actor_rollout_ref.actor.ppo_max_token_len_per_gpu=5120 \ - actor_rollout_ref.ref.log_prob_use_dynamic_bsz=True \ - actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=20480 \ - actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=True \ - actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=20480 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=2 \ actor_rollout_ref.rollout.name=$ENGINE \ - actor_rollout_ref.rollout.gpu_memory_utilization=0.8 \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \ actor_rollout_ref.rollout.n=5 \ actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=20 \ actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=1 \ @@ -71,10 +44,6 @@ python3 -m verl.trainer.main_ppo --config-path=config \ actor_rollout_ref.ref.megatron.use_dist_checkpointing=True \ actor_rollout_ref.actor.megatron.dist_checkpointing_path=$DIST_CKPT_PATH \ actor_rollout_ref.ref.megatron.dist_checkpointing_path=$DIST_CKPT_PATH \ - actor_rollout_ref.actor.megatron.param_offload=${ACTOR_PARAM_OFFLOAD} \ - actor_rollout_ref.actor.megatron.optimizer_offload=${ACTOR_OPTIMIZER_OFFLOAD} \ - actor_rollout_ref.actor.megatron.grad_offload=${ACTOR_GRAD_OFFLOAD} \ - actor_rollout_ref.ref.megatron.param_offload=${REF_PARAM_OFFLOAD} \ algorithm.use_kl_in_reward=False \ trainer.critic_warmup=0 \ trainer.logger=['console','wandb'] \