diff --git a/.github/workflows/e2e_ascend.yml b/.github/workflows/e2e_ascend.yml
index 41673db6541..8d55a274d26 100644
--- a/.github/workflows/e2e_ascend.yml
+++ b/.github/workflows/e2e_ascend.yml
@@ -228,3 +228,44 @@ jobs:
           ray stop --force
           bash tests/special_npu/run_qwen2_5_vl_3b_npu.sh
           rm -rf $HOME/ckpts
+
+  recipe_job:
+    if: github.repository_owner == 'volcengine'
+    name: E2E Ascend testing for recipes
+    runs-on: linux-aarch64-a2-4
+    timeout-minutes: 60
+    container:
+      image: swr.ap-southeast-1.myhuaweicloud.com/base_image/ascend-ci/verl/verl:verl-8.3.rc1-910b-ubuntu22.04-py3.11-latest
+      options: >-
+        --shm-size 16g
+    env:
+      HF_ENDPOINT: "https://hf-mirror.com"
+      HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
+    steps:
+      - name: Check npu and CANN info
+        run: |
+          cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info
+          npu-smi info
+      - name: Check initial pip list from image
+        run: |
+          pip list
+      - name: Checkout volcengine/verl repo
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          clean: true
+      - name: Install the current repository
+        run: |
+          pip install -r requirements-npu.txt
+          pip install -e .
+      - name: Check final pip list
+        run: |
+          pip list
+      - name: Preprocess gsm8k dataset
+        run: |
+          python examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/.cache/datasets/openai/gsm8k
+      - name: Run the E2E test with the one_step_off_policy algorithm on ASCEND NPU (FSDP2)
+        run: |
+          ray stop --force
+          bash tests/special_npu/run_one_step_off_policy.sh
+          rm -rf $HOME/ckpts
diff --git a/tests/special_npu/run_one_step_off_policy.sh b/tests/special_npu/run_one_step_off_policy.sh
new file mode 100644
index 00000000000..28da11cb7be
--- /dev/null
+++ b/tests/special_npu/run_one_step_off_policy.sh
@@ -0,0 +1,138 @@
+#!/usr/bin/env bash
+set -xeuo pipefail
+
+# Test script for one_step_off_policy E2E regression testing.
+# This script runs one_step_off_policy with FSDP2
+# to ensure the asynchronous training mechanism works correctly.
+
+ACTOR_STRATEGY="fsdp2"
+
+# Download the model if it is not already cached locally
+MODEL_ID=${MODEL_ID:-Qwen/Qwen2.5-0.5B-Instruct}
+MODEL_PATH=${MODEL_PATH:-${HOME}/.cache/models/${MODEL_ID}}
+#huggingface-cli download "${MODEL_ID}" --local-dir "${MODEL_PATH}"
+
+# Algorithm parameters
+adv_estimator=grpo
+
+use_kl_in_reward=False
+kl_coef=0.0
+use_kl_loss=False
+kl_loss_coef=0.0
+
+clip_ratio_low=0.2
+clip_ratio_high=0.28
+
+# Response length parameters
+max_prompt_length=1024
+max_response_length=2048
+enable_overlong_buffer=True
+overlong_buffer_len=128
+overlong_penalty_factor=1.0
+
+# Training parameters
+loss_agg_mode="token-mean"
+train_prompt_bsz=8
+n_resp_per_prompt=3
+train_prompt_mini_bsz=4
+
+# Temperature parameters
+temperature=1.0
+top_p=1.0
+top_k=-1
+val_top_p=0.7
+
+# One-step-off-policy specific parameters:
+# allocate 2 NPUs for rollout, 2 NPUs for training
+n_npus_rollout=2
+n_npus_training=2
+
+exp_name="$(basename "${MODEL_ID,,}")-one-step-off-policy-${ACTOR_STRATEGY}-minimal"
+
+echo "Running one_step_off_policy with ${ACTOR_STRATEGY} strategy"
+echo "Rollout NPUs: ${n_npus_rollout}, Training NPUs: ${n_npus_training}"
+
+common_params=(
+    data.train_files="${HOME}/data/gsm8k/train.parquet"
+    data.val_files="${HOME}/data/gsm8k/test.parquet"
+    data.prompt_key=prompt
+    data.truncation='left'
+    data.max_prompt_length=${max_prompt_length}
+    data.max_response_length=${max_response_length}
+    data.train_batch_size=${train_prompt_bsz}
+    actor_rollout_ref.rollout.n=${n_resp_per_prompt}
+    algorithm.adv_estimator=${adv_estimator}
+    algorithm.use_kl_in_reward=${use_kl_in_reward}
+    algorithm.kl_ctrl.kl_coef=${kl_coef}
+    actor_rollout_ref.hybrid_engine=False
+    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss}
+    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef}
+    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low}
+    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high}
+    actor_rollout_ref.actor.clip_ratio_c=10.0
+    actor_rollout_ref.model.path="${MODEL_PATH}"
+    actor_rollout_ref.actor.optim.lr=1e-6
+    actor_rollout_ref.actor.optim.lr_warmup_steps=-1
+    actor_rollout_ref.actor.optim.weight_decay=0.1
+    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz}
+    actor_rollout_ref.actor.entropy_coeff=0
+    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode}
+    actor_rollout_ref.rollout.gpu_memory_utilization=0.80
+    actor_rollout_ref.rollout.temperature=${temperature}
+    actor_rollout_ref.rollout.top_p=${top_p}
+    actor_rollout_ref.rollout.top_k=${top_k}
+    actor_rollout_ref.rollout.val_kwargs.temperature=${temperature}
+    actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p}
+    actor_rollout_ref.rollout.val_kwargs.top_k=${top_k}
+    actor_rollout_ref.rollout.val_kwargs.do_sample=True
+    actor_rollout_ref.rollout.val_kwargs.n=1
+    actor_rollout_ref.rollout.enable_chunked_prefill=True
+    actor_rollout_ref.rollout.name=vllm
+    reward_model.reward_manager=dapo
+    reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer}
+    reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len}
+    reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor}
+    reward_model.reward_kwargs.overlong_buffer_cfg.log=False
+    reward_model.reward_kwargs.max_resp_len=${max_response_length}
+    trainer.logger=['console']
+    trainer.project_name='verl-test'
+    trainer.experiment_name="${exp_name}"
+    trainer.val_before_train=True
+    trainer.test_freq=-1
+    trainer.save_freq=-1
+    trainer.total_epochs=2
+    trainer.total_training_steps=2
+    trainer.resume_mode=disable
+    trainer.nnodes=1
+    trainer.n_gpus_per_node=${n_npus_training}
+    rollout.nnodes=1
+    rollout.n_gpus_per_node=${n_npus_rollout}
+
+)
+
+# FSDP2 specific parameters
+gen_tp=2
+sp_size=2
+fsdp_size=2
+ref_offload=True
+actor_offload=False
+
+python3 -m recipe.one_step_off_policy.main_ppo \
+    "${common_params[@]}" \
+    actor_rollout_ref.actor.strategy=$ACTOR_STRATEGY \
+    critic.strategy=fsdp2 \
+    actor_rollout_ref.actor.grad_clip=1.0 \
+    actor_rollout_ref.model.use_remove_padding=True \
+    actor_rollout_ref.model.enable_gradient_checkpointing=True \
+    actor_rollout_ref.actor.use_dynamic_bsz=True \
+    actor_rollout_ref.ref.log_prob_use_dynamic_bsz=True \
+    actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=True \
+    actor_rollout_ref.actor.fsdp_config.param_offload=${actor_offload} \
+    actor_rollout_ref.actor.fsdp_config.optimizer_offload=${actor_offload} \
+    actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \
+    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
+    actor_rollout_ref.ref.fsdp_config.param_offload=${ref_offload} \
+    actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \
+    actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} "$@"
+
+echo "One-step-off-policy E2E test completed successfully with ${ACTOR_STRATEGY} strategy"
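
Note: the script forwards trailing arguments to the trainer via "$@", and MODEL_ID/MODEL_PATH are env-overridable, so it can be exercised outside CI without editing the file. A minimal sketch of a local invocation on a 4-NPU host (2 rollout + 2 training); the model path and the step-count override below are illustrative, not part of this change, and the gsm8k parquet files are assumed to already exist under ${HOME}/data/gsm8k:

    # run one training step as a quick smoke test, reusing a locally cached model
    MODEL_PATH=${HOME}/.cache/models/Qwen/Qwen2.5-0.5B-Instruct \
        bash tests/special_npu/run_one_step_off_policy.sh \
        trainer.total_training_steps=1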