From 2f9b50cde40a5256c10be1713ff473ca3ac9b8ec Mon Sep 17 00:00:00 2001 From: FightingZhen <295632982@qq.com> Date: Sat, 29 Nov 2025 09:26:08 +0800 Subject: [PATCH] Switch ascend ci calculation resource --- .github/workflows/e2e_ascend.yml | 24 ++++--------------- tests/special_npu/run_qwen2_5_05b_dapo.sh | 23 +++++++----------- tests/special_npu/run_qwen2_5_05b_grpo.sh | 18 +++++++------- .../run_qwen2_5_05b_grpo_mindspeed.sh | 20 ++++++++-------- .../run_qwen2_5_05b_sft_peft_sp2.sh | 2 +- tests/special_npu/run_qwen2_5_vl_3b_npu.sh | 18 +++++++------- tests/special_npu/run_qwen3_06b_ppo.sh | 17 ++++++------- .../run_qwen3_30b_dapo_mindspeed.sh | 16 ++++++------- 8 files changed, 57 insertions(+), 81 deletions(-) diff --git a/.github/workflows/e2e_ascend.yml b/.github/workflows/e2e_ascend.yml index fa116972856..32ed62e5838 100644 --- a/.github/workflows/e2e_ascend.yml +++ b/.github/workflows/e2e_ascend.yml @@ -68,27 +68,11 @@ jobs: test: if: github.repository_owner == 'volcengine' name: verl Ascend test (self-host) - runs-on: [self-hosted, npu-0] + runs-on: linux-aarch64-a2-8 timeout-minutes: 60 # Increase this timeout value as needed container: - image: quay.io/ascend/verl:verl-8.3.rc1-910b-ubuntu22.04-py3.11-latest - volumes: - - /usr/local/dcmi:/usr/local/dcmi - - /usr/local/bin/npu-smi:/usr/local/bin/npu-smi - - /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ - - /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info - - /etc/ascend_install.info:/etc/ascend_install.info - - /data00/dataset:/github/home/dataset - - /data00/models:/github/home/models - # Use self-host cache speed up pip and model download - # - /home/action/actions-runner/_work/cache:/github/home/.cache/ + image: swr.ap-southeast-1.myhuaweicloud.com/base_image/ascend-ci/verl/verl:verl-8.3.rc1-910b-ubuntu22.04-py3.11-latest options: >- - --device /dev/davinci0 - --device /dev/davinci_manager - --device /dev/devmm_svm - --device /dev/hisi_hdc - --network host - --privileged --shm-size 16g env: HTTP_PROXY: ${{ secrets.PROXY_HTTP }} @@ -118,10 +102,10 @@ jobs: pip list - name: Preprocess gsm8k dataset run: | - python examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/dataset/openai/gsm8k + python examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/.cache/datasets/openai/gsm8k - name: Preprocess geo3k dataset run: | - python examples/data_preprocess/geo3k.py --local_dataset_path ${HOME}/dataset/hiyouga/geometry3k + python examples/data_preprocess/geo3k.py --local_dataset_path ${HOME}/.cache/datasets/hiyouga/geometry3k - name: Running gsm8k e2e qwen3 training tests with PPO on ASCEND NPU run: | ray stop --force diff --git a/tests/special_npu/run_qwen2_5_05b_dapo.sh b/tests/special_npu/run_qwen2_5_05b_dapo.sh index 0a2a90a2524..d90b63cb277 100644 --- a/tests/special_npu/run_qwen2_5_05b_dapo.sh +++ b/tests/special_npu/run_qwen2_5_05b_dapo.sh @@ -2,10 +2,10 @@ set -xeuo pipefail export VLLM_ASCEND_ENABLE_NZ=0 -NUM_GPUS=${NUM_GPUS:-16} +NUM_GPUS=${NUM_GPUS:-8} MODEL_ID=${MODEL_ID:-Qwen/Qwen2.5-0.5B-Instruct} -MODEL_PATH=${MODEL_PATH:-${HOME}/models/${MODEL_ID}} +MODEL_PATH=${MODEL_PATH:-${HOME}/.cache/models/${MODEL_ID}} adv_estimator=grpo @@ -25,19 +25,15 @@ overlong_penalty_factor=1.0 loss_agg_mode="token-mean" -enable_filter_groups=True +enable_filter_groups=False filter_groups_metric=seq_reward max_num_gen_batches=10 -train_traj_micro_bsz_per_gpu=2 # b -n_resp_per_prompt=4 # g +train_traj_micro_bsz_per_gpu=1 # b +n_resp_per_prompt=2 # g -train_traj_micro_bsz=$((train_traj_micro_bsz_per_gpu * NUM_GPUS)) # b * n -train_traj_mini_bsz=$((train_traj_micro_bsz * 2)) # 2 * b * n -train_prompt_mini_bsz=$((train_traj_mini_bsz * n_resp_per_prompt)) # 2 * b * n / g -train_prompt_bsz=$((train_prompt_mini_bsz * 2)) # 4 * b * n / g - -gen_prompt_bsz=$((train_prompt_bsz * 4)) +train_traj_micro_bsz=$((train_traj_micro_bsz_per_gpu * NUM_GPUS)) +train_prompt_mini_bsz=$((train_traj_micro_bsz * n_resp_per_prompt * 2)) exp_name="$(basename "${MODEL_ID,,}")-dapo-minimal" @@ -58,8 +54,7 @@ python3 -m recipe.dapo.main_dapo \ reward_model.overlong_buffer.len=${overlong_buffer_len} \ reward_model.overlong_buffer.penalty_factor=${overlong_penalty_factor} \ actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \ - data.train_batch_size=${train_prompt_bsz} \ - data.gen_batch_size=${gen_prompt_bsz} \ + data.gen_batch_size=${train_prompt_mini_bsz} \ algorithm.filter_groups.enable=${enable_filter_groups} \ algorithm.filter_groups.metric=${filter_groups_metric} \ algorithm.filter_groups.max_num_gen_batches=${max_num_gen_batches} \ @@ -96,5 +91,5 @@ python3 -m recipe.dapo.main_dapo \ trainer.total_epochs=1 \ trainer.resume_mode=disable \ trainer.val_before_train=False \ - trainer.total_training_steps=2 \ + trainer.total_training_steps=1 \ trainer.device=npu $@ diff --git a/tests/special_npu/run_qwen2_5_05b_grpo.sh b/tests/special_npu/run_qwen2_5_05b_grpo.sh index 1c72e0cf581..cd3edc1e30e 100644 --- a/tests/special_npu/run_qwen2_5_05b_grpo.sh +++ b/tests/special_npu/run_qwen2_5_05b_grpo.sh @@ -2,13 +2,13 @@ set -x export VLLM_ASCEND_ENABLE_NZ=0 MODEL_ID=${MODEL_ID:-Qwen/Qwen2.5-0.5B-Instruct} -MODEL_PATH=${MODEL_PATH:-${HOME}/models/${MODEL_ID}} +MODEL_PATH=${MODEL_PATH:-${HOME}/.cache/models/${MODEL_ID}} python3 -m verl.trainer.main_ppo \ algorithm.adv_estimator=grpo \ data.train_files=$HOME/data/gsm8k/train.parquet \ data.val_files=$HOME/data/gsm8k/test.parquet \ - data.train_batch_size=128 \ + data.train_batch_size=16 \ data.max_prompt_length=512 \ data.max_response_length=128 \ data.filter_overlong_prompts=True \ @@ -16,8 +16,8 @@ python3 -m verl.trainer.main_ppo \ actor_rollout_ref.model.path="${MODEL_PATH}" \ actor_rollout_ref.actor.optim.lr=5e-7 \ actor_rollout_ref.model.use_remove_padding=False \ - actor_rollout_ref.actor.ppo_mini_batch_size=64 \ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=20 \ + actor_rollout_ref.actor.ppo_mini_batch_size=8 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \ actor_rollout_ref.actor.use_kl_loss=True \ actor_rollout_ref.actor.kl_loss_coef=0.001 \ actor_rollout_ref.actor.kl_loss_type=low_var_kl \ @@ -26,23 +26,23 @@ python3 -m verl.trainer.main_ppo \ actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ actor_rollout_ref.actor.use_torch_compile=False \ actor_rollout_ref.ref.use_torch_compile=False \ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=40 \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 \ actor_rollout_ref.rollout.enable_chunked_prefill=False \ actor_rollout_ref.rollout.tensor_model_parallel_size=2 \ actor_rollout_ref.rollout.name=vllm \ actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \ - actor_rollout_ref.rollout.n=5 \ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=40 \ + actor_rollout_ref.rollout.n=2 \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \ actor_rollout_ref.ref.fsdp_config.param_offload=True \ algorithm.kl_ctrl.kl_coef=0.001 \ trainer.critic_warmup=0 \ trainer.logger=console \ trainer.project_name='verl_grpo_example_gsm8k' \ trainer.experiment_name='qwen2_7b_function_rm' \ - trainer.n_gpus_per_node=16 \ + trainer.n_gpus_per_node=8 \ trainer.nnodes=1 \ trainer.save_freq=-1 \ trainer.test_freq=-1 \ trainer.total_epochs=1 \ - trainer.total_training_steps=2 \ + trainer.total_training_steps=1 \ trainer.device=npu $@ diff --git a/tests/special_npu/run_qwen2_5_05b_grpo_mindspeed.sh b/tests/special_npu/run_qwen2_5_05b_grpo_mindspeed.sh index bb1bb1db45a..bdf225dc3a1 100644 --- a/tests/special_npu/run_qwen2_5_05b_grpo_mindspeed.sh +++ b/tests/special_npu/run_qwen2_5_05b_grpo_mindspeed.sh @@ -2,7 +2,7 @@ set -x export VLLM_ASCEND_ENABLE_NZ=0 MODEL_ID=${MODEL_ID:-Qwen/Qwen2.5-0.5B-Instruct} -MODEL_PATH=${MODEL_PATH:-${HOME}/models/${MODEL_ID}} +MODEL_PATH=${MODEL_PATH:-${HOME}/.cache/models/${MODEL_ID}} USE_DIST_CKPT=${USE_DIST_CKPT:-False} DIST_CKPT_PATH=${DIST_CKPT_PATH:-${HOME}/dist_ckpt/qwen2_5_05b_grpo_mindspeed} @@ -21,15 +21,15 @@ python3 -m verl.trainer.main_ppo --config-path=config \ algorithm.adv_estimator=grpo \ data.train_files=$HOME/data/gsm8k/train.parquet \ data.val_files=$HOME/data/gsm8k/test.parquet \ - data.train_batch_size=128 \ + data.train_batch_size=16 \ data.max_prompt_length=512 \ data.max_response_length=128 \ data.filter_overlong_prompts=True \ data.truncation='error' \ - actor_rollout_ref.model.path=${MODEL_ID} \ + actor_rollout_ref.model.path=${MODEL_PATH} \ actor_rollout_ref.actor.optim.lr=5e-7 \ - actor_rollout_ref.actor.ppo_mini_batch_size=64 \ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=20 \ + actor_rollout_ref.actor.ppo_mini_batch_size=8 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \ actor_rollout_ref.actor.strategy=megatron \ actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=2 \ actor_rollout_ref.actor.megatron.tensor_model_parallel_size=2 \ @@ -40,13 +40,13 @@ python3 -m verl.trainer.main_ppo --config-path=config \ actor_rollout_ref.actor.kl_loss_coef=0.001 \ actor_rollout_ref.actor.kl_loss_type=low_var_kl \ actor_rollout_ref.actor.use_torch_compile=False \ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=40 \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 \ actor_rollout_ref.rollout.enable_chunked_prefill=False \ actor_rollout_ref.rollout.tensor_model_parallel_size=2 \ actor_rollout_ref.rollout.name=vllm \ actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \ - actor_rollout_ref.rollout.n=5 \ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=40 \ + actor_rollout_ref.rollout.n=2 \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \ actor_rollout_ref.ref.strategy=megatron \ actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=2 \ actor_rollout_ref.ref.megatron.tensor_model_parallel_size=2 \ @@ -59,11 +59,11 @@ python3 -m verl.trainer.main_ppo --config-path=config \ trainer.logger=console \ trainer.project_name='verl_grpo_example_gsm8k' \ trainer.experiment_name='qwen2_7b_function_rm' \ - trainer.n_gpus_per_node=16 \ + trainer.n_gpus_per_node=8 \ trainer.nnodes=1 \ trainer.save_freq=-1 \ trainer.test_freq=-1 \ trainer.total_epochs=1 \ - trainer.total_training_steps=2 \ + trainer.total_training_steps=1 \ trainer.device=npu \ +actor_rollout_ref.actor.megatron.override_transformer_config.use_flash_attn=True $@ diff --git a/tests/special_npu/run_qwen2_5_05b_sft_peft_sp2.sh b/tests/special_npu/run_qwen2_5_05b_sft_peft_sp2.sh index 19c3ac033b6..5af44c9907a 100644 --- a/tests/special_npu/run_qwen2_5_05b_sft_peft_sp2.sh +++ b/tests/special_npu/run_qwen2_5_05b_sft_peft_sp2.sh @@ -3,7 +3,7 @@ set -x mkdir -p ./save_ckpts MODEL_ID=${MODEL_ID:-Qwen/Qwen2.5-0.5B-Instruct} -MODEL_PATH=${MODEL_PATH:-${HOME}/models/${MODEL_ID}} +MODEL_PATH=${MODEL_PATH:-${HOME}/.cache/models/${MODEL_ID}} torchrun --standalone --nnodes=1 --nproc_per_node=8 \ -m verl.trainer.fsdp_sft_trainer \ diff --git a/tests/special_npu/run_qwen2_5_vl_3b_npu.sh b/tests/special_npu/run_qwen2_5_vl_3b_npu.sh index 932bd776de7..10ffdf3747f 100644 --- a/tests/special_npu/run_qwen2_5_vl_3b_npu.sh +++ b/tests/special_npu/run_qwen2_5_vl_3b_npu.sh @@ -8,13 +8,13 @@ ENGINE=${1:-vllm} export USE_OPTIMIZED_MODEL=0 MODEL_ID=${MODEL_ID:-Qwen/Qwen2.5-VL-3B-Instruct} -MODEL_PATH=${MODEL_PATH:-${HOME}/models/${MODEL_ID}} +MODEL_PATH=${MODEL_PATH:-${HOME}/.cache/models/${MODEL_ID}} python3 -m verl.trainer.main_ppo \ algorithm.adv_estimator=grpo \ data.train_files=$HOME/data/geo3k/train.parquet \ data.val_files=$HOME/data/geo3k/test.parquet \ - data.train_batch_size=512 \ + data.train_batch_size=16 \ data.max_prompt_length=1024 \ data.max_response_length=2048 \ data.filter_overlong_prompts=True \ @@ -23,8 +23,8 @@ python3 -m verl.trainer.main_ppo \ actor_rollout_ref.model.path="${MODEL_PATH}" \ actor_rollout_ref.actor.optim.lr=1e-6 \ actor_rollout_ref.model.use_remove_padding=True \ - actor_rollout_ref.actor.ppo_mini_batch_size=32 \ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \ + actor_rollout_ref.actor.ppo_mini_batch_size=8 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \ actor_rollout_ref.actor.use_kl_loss=True \ actor_rollout_ref.actor.kl_loss_coef=0.01 \ actor_rollout_ref.actor.kl_loss_type=low_var_kl \ @@ -34,25 +34,25 @@ python3 -m verl.trainer.main_ppo \ actor_rollout_ref.actor.fsdp_config.param_offload=False \ actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ actor_rollout_ref.ref.use_torch_compile=False \ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 \ actor_rollout_ref.rollout.tensor_model_parallel_size=2 \ actor_rollout_ref.rollout.name=$ENGINE \ actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \ actor_rollout_ref.rollout.enable_chunked_prefill=False \ actor_rollout_ref.rollout.enforce_eager=True \ actor_rollout_ref.rollout.free_cache_engine=True \ - actor_rollout_ref.rollout.n=5 \ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \ + actor_rollout_ref.rollout.n=2 \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \ actor_rollout_ref.ref.fsdp_config.param_offload=True \ algorithm.use_kl_in_reward=False \ trainer.critic_warmup=0 \ trainer.logger=console \ trainer.project_name='verl_grpo_example_geo3k' \ trainer.experiment_name='qwen2_5_vl_3b_function_rm' \ - trainer.n_gpus_per_node=16 \ + trainer.n_gpus_per_node=8 \ trainer.nnodes=1 \ trainer.save_freq=-1 \ trainer.test_freq=-1 \ trainer.total_epochs=1 \ - trainer.total_training_steps=2 \ + trainer.total_training_steps=1 \ trainer.device=npu $@ \ No newline at end of file diff --git a/tests/special_npu/run_qwen3_06b_ppo.sh b/tests/special_npu/run_qwen3_06b_ppo.sh index 34c9f4a13ee..284ad091e84 100644 --- a/tests/special_npu/run_qwen3_06b_ppo.sh +++ b/tests/special_npu/run_qwen3_06b_ppo.sh @@ -1,17 +1,14 @@ set -x export VLLM_ASCEND_ENABLE_NZ=0 -# TODO (FightingZhen) Env VLLM_USE_V1=1 is not supported in vllm==0.7.3 -# export VLLM_USE_V1=1 - MODEL_ID=${MODEL_ID:-Qwen/Qwen2.5-0.5B-Instruct} # TODO: change to Qwen3-0.6B when CI server is ready -MODEL_PATH=${MODEL_PATH:-${HOME}/models/${MODEL_ID}} +MODEL_PATH=${MODEL_PATH:-${HOME}/.cache/models/${MODEL_ID}} python3 -m verl.trainer.main_ppo \ algorithm.adv_estimator=gae \ data.train_files=$HOME/data/gsm8k/train.parquet \ data.val_files=$HOME/data/gsm8k/test.parquet \ - data.train_batch_size=128 \ + data.train_batch_size=16 \ data.max_prompt_length=512 \ data.max_response_length=128 \ data.shuffle=False \ @@ -19,15 +16,15 @@ python3 -m verl.trainer.main_ppo \ actor_rollout_ref.model.use_remove_padding=True \ actor_rollout_ref.model.enable_gradient_checkpointing=True \ actor_rollout_ref.actor.optim.lr=1e-6 \ - actor_rollout_ref.actor.ppo_mini_batch_size=64 \ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=8 \ + actor_rollout_ref.actor.ppo_mini_batch_size=8 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \ actor_rollout_ref.actor.fsdp_config.param_offload=True \ actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \ actor_rollout_ref.actor.use_kl_loss=False \ actor_rollout_ref.actor.ulysses_sequence_parallel_size=2 \ actor_rollout_ref.actor.use_dynamic_bsz=True \ actor_rollout_ref.actor.use_torch_compile=False \ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8 \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 \ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \ actor_rollout_ref.rollout.name=vllm \ actor_rollout_ref.rollout.gpu_memory_utilization=0.8 \ @@ -38,7 +35,7 @@ python3 -m verl.trainer.main_ppo \ critic.model.use_remove_padding=True \ critic.model.path="${MODEL_PATH}" \ critic.model.enable_gradient_checkpointing=True \ - critic.ppo_micro_batch_size_per_gpu=8 \ + critic.ppo_micro_batch_size_per_gpu=1 \ critic.ulysses_sequence_parallel_size=2 \ critic.model.fsdp_config.param_offload=True \ critic.model.fsdp_config.optimizer_offload=True \ @@ -52,5 +49,5 @@ python3 -m verl.trainer.main_ppo \ trainer.save_freq=-1 \ trainer.test_freq=-1 \ trainer.total_epochs=1 \ - trainer.total_training_steps=2 \ + trainer.total_training_steps=1 \ trainer.device=npu $@ diff --git a/tests/special_npu/run_qwen3_30b_dapo_mindspeed.sh b/tests/special_npu/run_qwen3_30b_dapo_mindspeed.sh index 09e975b5a34..aece3d11471 100644 --- a/tests/special_npu/run_qwen3_30b_dapo_mindspeed.sh +++ b/tests/special_npu/run_qwen3_30b_dapo_mindspeed.sh @@ -4,7 +4,7 @@ set -xeuo pipefail export VLLM_ASCEND_ENABLE_NZ=0 MODEL_ID=${MODEL_ID:-Qwen/Qwen3-30B-A3B-Instruct-2507} -MODEL_PATH=${MODEL_PATH:-${HOME}/models/${MODEL_ID}} +MODEL_PATH=${MODEL_PATH:-${HOME}/.cache/models/${MODEL_ID}} USE_DIST_CKPT=${USE_DIST_CKPT:-False} DIST_CKPT_PATH=${DIST_CKPT_PATH:-${HOME}/dist_ckpt/qwen3_30b_dapo_mindspeed} @@ -73,12 +73,12 @@ python3 -m recipe.dapo.main_dapo \ algorithm.filter_groups.enable=False \ algorithm.filter_groups.max_num_gen_batches=10 \ algorithm.filter_groups.metric=acc \ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 \ actor_rollout_ref.rollout.enable_chunked_prefill=False \ actor_rollout_ref.rollout.tensor_model_parallel_size=2 \ actor_rollout_ref.rollout.name=vllm \ actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \ - actor_rollout_ref.rollout.n=8 \ + actor_rollout_ref.rollout.n=2 \ actor_rollout_ref.rollout.temperature=1.0 \ actor_rollout_ref.rollout.top_p=1.0 \ actor_rollout_ref.rollout.top_k=-1 \ @@ -96,8 +96,8 @@ python3 -m recipe.dapo.main_dapo \ actor_rollout_ref.actor.use_dynamic_bsz=True \ actor_rollout_ref.model.path="${MODEL_PATH}" \ actor_rollout_ref.actor.optim.lr=1e-6 \ - actor_rollout_ref.actor.ppo_mini_batch_size=16 \ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \ + actor_rollout_ref.actor.ppo_mini_batch_size=8 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \ actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=True \ actor_rollout_ref.ref.log_prob_use_dynamic_bsz=True \ actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=2 \ @@ -108,7 +108,7 @@ python3 -m recipe.dapo.main_dapo \ actor_rollout_ref.actor.use_kl_loss=False \ actor_rollout_ref.actor.loss_agg_mode="token-mean" \ actor_rollout_ref.ref.strategy=megatron \ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \ actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=2 \ actor_rollout_ref.ref.megatron.tensor_model_parallel_size=2 \ actor_rollout_ref.ref.megatron.expert_model_parallel_size=2 \ @@ -119,12 +119,12 @@ python3 -m recipe.dapo.main_dapo \ trainer.logger=['console'] \ trainer.project_name='verl_gsm8k_example' \ trainer.experiment_name='qwen3_30b_a3b_cut_gsm8k_mindspeed' \ - trainer.n_gpus_per_node=16 \ + trainer.n_gpus_per_node=8 \ trainer.nnodes=1 \ trainer.save_freq=-1 \ trainer.test_freq=-1 \ trainer.total_epochs=1 \ - trainer.total_training_steps=2 \ + trainer.total_training_steps=1 \ trainer.device=npu \ actor_rollout_ref.actor.use_torch_compile=False \ actor_rollout_ref.ref.use_torch_compile=False \