From 463cb753237bb0b5167de2262b6dfc2f7b8c8084 Mon Sep 17 00:00:00 2001 From: "hzji210@gmail.com" Date: Thu, 11 Dec 2025 09:12:25 +0800 Subject: [PATCH 1/4] add one step off policy test cases for npu --- .github/workflows/e2e_ascend.yml | 44 ++++++ tests/special_npu/run_one_step_off_policy.sh | 140 +++++++++++++++++++ 2 files changed, 184 insertions(+) create mode 100644 tests/special_npu/run_one_step_off_policy.sh diff --git a/.github/workflows/e2e_ascend.yml b/.github/workflows/e2e_ascend.yml index c0301be747a..04ce6a7a224 100644 --- a/.github/workflows/e2e_ascend.yml +++ b/.github/workflows/e2e_ascend.yml @@ -222,3 +222,47 @@ jobs: ray stop --force bash tests/special_npu/run_qwen2_5_vl_3b_npu.sh rm -rf $HOME/ckpts + + recipe_job: + if: github.repository_owner == 'volcengine' + name: E2E Ascend testing for recipes + runs-on: linux-aarch64-a2-8 + timeout-minutes: 60 + container: + image: swr.ap-southeast-1.myhuaweicloud.com/base_image/ascend-ci/verl/verl:verl-8.3.rc1-910b-ubuntu22.04-py3.11-latest + options: >- + --shm-size 16g + env: + HTTP_PROXY: ${{ secrets.PROXY_HTTP }} + HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }} + NO_PROXY: "localhost,127.0.0.1,hf-mirror.com" + HF_ENDPOINT: "https://hf-mirror.com" + HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable + steps: + - name: Check npu and CANN info + run: | + cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info + npu-smi info + - name: Check initial pip list from image + run: | + pip list + - name: Checkout volcengine/verl repo + uses: actions/checkout@v4 + with: + fetch-depth: 0 + clean: true + - name: Install the current repository + run: | + pip install -r requirements-npu.txt + pip install -e . + - name: Check final pip list + run: | + pip list + - name: Preprocess gsm8k dataset + run: | + python examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/.cache/datasets/openai/gsm8k + - name: Running the E2E test with one_step_off_policy algorithm on ASCEND NPU (FSDP2) + run: | + ray stop --force + bash tests/special_npu/run_one_step_off_policy.sh + rm -rf $HOME/ckpts diff --git a/tests/special_npu/run_one_step_off_policy.sh b/tests/special_npu/run_one_step_off_policy.sh new file mode 100644 index 00000000000..fa7d387253d --- /dev/null +++ b/tests/special_npu/run_one_step_off_policy.sh @@ -0,0 +1,140 @@ +#!/usr/bin/env bash +set -xeuo pipefail + +# Test script for one_step_off_policy E2E regression testing +# This script runs one_step_off_policy with FSDP2 +# to ensure the asynchronous training mechanism works correctly + +NUM_GPUS=${NUM_GPUS:-8} +ACTOR_STRATEGY="fsdp2" + +# Download model if not exists +MODEL_ID=${MODEL_ID:-Qwen/Qwen2.5-0.5B-Instruct} +MODEL_PATH=${MODEL_PATH:-${HOME}/.cache/models/${MODEL_ID}} +#huggingface-cli download "${MODEL_ID}" --local-dir "${MODEL_PATH}" + +# Algorithm parameters +adv_estimator=grpo + +use_kl_in_reward=False +kl_coef=0.0 +use_kl_loss=False +kl_loss_coef=0.0 + +clip_ratio_low=0.2 +clip_ratio_high=0.28 + +# Response length parameters +max_prompt_length=1024 +max_response_length=2048 +enable_overlong_buffer=True +overlong_buffer_len=128 +overlong_penalty_factor=1.0 + +# Training parameters +loss_agg_mode="token-mean" +train_prompt_bsz=8 +n_resp_per_prompt=3 +train_prompt_mini_bsz=4 + +# Temperature parameters +temperature=1.0 +top_p=1.0 +top_k=-1 +val_top_p=0.7 + +# One-step-off-policy specific parameters +# Allocate 2 NPUs for rollout, 2 NPUs for training +n_npus_rollout=2 +n_npus_training=2 + +exp_name="$(basename "${MODEL_ID,,}")-one-step-off-policy-${ACTOR_STRATEGY}-minimal" + +echo "Running one_step_off_policy with ${ACTOR_STRATEGY} strategy" +echo "Total GPUs: ${NUM_GPUS}, Rollout GPUs: ${n_npus_rollout}, Training GPUs: ${n_npus_training}" + +common_params=( + data.train_files="${HOME}/data/gsm8k/train.parquet" + data.val_files="${HOME}/data/gsm8k/test.parquet" + data.prompt_key=prompt + data.truncation='left' + data.max_prompt_length=${max_prompt_length} + data.max_response_length=${max_response_length} + data.train_batch_size=${train_prompt_bsz} + actor_rollout_ref.rollout.n=${n_resp_per_prompt} + algorithm.adv_estimator=${adv_estimator} + algorithm.use_kl_in_reward=${use_kl_in_reward} + algorithm.kl_ctrl.kl_coef=${kl_coef} + actor_rollout_ref.hybrid_engine=False \ + actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} + actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} + actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} + actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} + actor_rollout_ref.actor.clip_ratio_c=10.0 + actor_rollout_ref.model.path="${MODEL_PATH}" + actor_rollout_ref.actor.optim.lr=1e-6 + actor_rollout_ref.actor.optim.lr_warmup_steps=-1 + actor_rollout_ref.actor.optim.weight_decay=0.1 + actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} + actor_rollout_ref.actor.entropy_coeff=0 + actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} + actor_rollout_ref.rollout.gpu_memory_utilization=0.80 + actor_rollout_ref.rollout.temperature=${temperature} + actor_rollout_ref.rollout.top_p=${top_p} + actor_rollout_ref.rollout.top_k=${top_k} + actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} + actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} + actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} + actor_rollout_ref.rollout.val_kwargs.do_sample=True + actor_rollout_ref.rollout.val_kwargs.n=1 + actor_rollout_ref.rollout.enable_chunked_prefill=True \ + actor_rollout_ref.rollout.name=vllm \ + reward_model.reward_manager=dapo + +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} + +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} + +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} + +reward_model.reward_kwargs.overlong_buffer_cfg.log=False + +reward_model.reward_kwargs.max_resp_len=${max_response_length} + trainer.logger=['console'] + trainer.project_name='verl-test' + trainer.experiment_name="${exp_name}" + trainer.val_before_train=True + trainer.test_freq=-1 + trainer.save_freq=-1 + trainer.total_epochs=2 + trainer.total_training_steps=2 + trainer.resume_mode=disable + trainer.nnodes=1 + trainer.n_gpus_per_node=${n_npus_training} + trainer.device=npu + rollout.nnodes=1 + rollout.n_gpus_per_node=${n_npus_rollout} + +) + +# FSDP2 specific parameters +gen_tp=2 +sp_size=2 +fsdp_size=2 +ref_offload=True +actor_offload=False + +python3 -m recipe.one_step_off_policy.main_ppo \ + "${common_params[@]}" \ + actor_rollout_ref.actor.strategy=fsdp2 \ + critic.strategy=fsdp2 \ + actor_rollout_ref.actor.grad_clip=1.0 \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.model.enable_gradient_checkpointing=True \ + actor_rollout_ref.actor.use_dynamic_bsz=True \ + actor_rollout_ref.ref.log_prob_use_dynamic_bsz=True \ + actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=True \ + actor_rollout_ref.actor.fsdp_config.param_offload=${actor_offload} \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=${actor_offload} \ + actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \ + actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ + actor_rollout_ref.ref.fsdp_config.param_offload=${ref_offload} \ + actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \ + actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} $@ + +echo "One-step-off-policy E2E test completed successfully with ${ACTOR_STRATEGY} strategy" \ No newline at end of file From 527a8af121ba116fbb8b8aa2ee1f2fd197ba69f3 Mon Sep 17 00:00:00 2001 From: Huazhong Date: Sat, 27 Dec 2025 15:11:37 +0800 Subject: [PATCH 2/4] Change runner from a2-8 to a2-4 for recipe job --- .github/workflows/e2e_ascend.yml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/e2e_ascend.yml b/.github/workflows/e2e_ascend.yml index e4ba1eb56ae..db37890836d 100644 --- a/.github/workflows/e2e_ascend.yml +++ b/.github/workflows/e2e_ascend.yml @@ -232,15 +232,13 @@ jobs: recipe_job: if: github.repository_owner == 'volcengine' name: E2E Ascend testing for recipes - runs-on: linux-aarch64-a2-8 + runs-on: linux-aarch64-a2-4 timeout-minutes: 60 container: image: swr.ap-southeast-1.myhuaweicloud.com/base_image/ascend-ci/verl/verl:verl-8.3.rc1-910b-ubuntu22.04-py3.11-latest options: >- --shm-size 16g env: - HTTP_PROXY: ${{ secrets.PROXY_HTTP }} - HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }} NO_PROXY: "localhost,127.0.0.1,hf-mirror.com" HF_ENDPOINT: "https://hf-mirror.com" HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable From ed8c06f57381df887861c47e2e55ef780791acf0 Mon Sep 17 00:00:00 2001 From: Huazhong Date: Sat, 27 Dec 2025 15:13:23 +0800 Subject: [PATCH 3/4] Refactor run_one_step_off_policy.sh for clarity Removed NUM_GPUS variable and updated echo statement to reflect current GPU settings. --- tests/special_npu/run_one_step_off_policy.sh | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/tests/special_npu/run_one_step_off_policy.sh b/tests/special_npu/run_one_step_off_policy.sh index fa7d387253d..28da11cb7be 100644 --- a/tests/special_npu/run_one_step_off_policy.sh +++ b/tests/special_npu/run_one_step_off_policy.sh @@ -5,7 +5,6 @@ set -xeuo pipefail # This script runs one_step_off_policy with FSDP2 # to ensure the asynchronous training mechanism works correctly -NUM_GPUS=${NUM_GPUS:-8} ACTOR_STRATEGY="fsdp2" # Download model if not exists @@ -51,7 +50,7 @@ n_npus_training=2 exp_name="$(basename "${MODEL_ID,,}")-one-step-off-policy-${ACTOR_STRATEGY}-minimal" echo "Running one_step_off_policy with ${ACTOR_STRATEGY} strategy" -echo "Total GPUs: ${NUM_GPUS}, Rollout GPUs: ${n_npus_rollout}, Training GPUs: ${n_npus_training}" +echo "Rollout GPUs: ${n_npus_rollout}, Training GPUs: ${n_npus_training}" common_params=( data.train_files="${HOME}/data/gsm8k/train.parquet" @@ -106,7 +105,6 @@ common_params=( trainer.resume_mode=disable trainer.nnodes=1 trainer.n_gpus_per_node=${n_npus_training} - trainer.device=npu rollout.nnodes=1 rollout.n_gpus_per_node=${n_npus_rollout} @@ -121,7 +119,7 @@ actor_offload=False python3 -m recipe.one_step_off_policy.main_ppo \ "${common_params[@]}" \ - actor_rollout_ref.actor.strategy=fsdp2 \ + actor_rollout_ref.actor.strategy=$ACTOR_STRATEGY \ critic.strategy=fsdp2 \ actor_rollout_ref.actor.grad_clip=1.0 \ actor_rollout_ref.model.use_remove_padding=True \ @@ -137,4 +135,4 @@ python3 -m recipe.one_step_off_policy.main_ppo \ actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \ actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} $@ -echo "One-step-off-policy E2E test completed successfully with ${ACTOR_STRATEGY} strategy" \ No newline at end of file +echo "One-step-off-policy E2E test completed successfully with ${ACTOR_STRATEGY} strategy" From 7bb025433baf829b4af188df8b2a759a6252123c Mon Sep 17 00:00:00 2001 From: Zhen <295632982@qq.com> Date: Sat, 27 Dec 2025 15:57:56 +0800 Subject: [PATCH 4/4] remove proxy setting --- .github/workflows/e2e_ascend.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/e2e_ascend.yml b/.github/workflows/e2e_ascend.yml index db37890836d..8d55a274d26 100644 --- a/.github/workflows/e2e_ascend.yml +++ b/.github/workflows/e2e_ascend.yml @@ -239,7 +239,6 @@ jobs: options: >- --shm-size 16g env: - NO_PROXY: "localhost,127.0.0.1,hf-mirror.com" HF_ENDPOINT: "https://hf-mirror.com" HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable steps: