From 027bde54d08f0c60c79ba839926d9c8725241c39 Mon Sep 17 00:00:00 2001
From: eternally-z <zzywzj@gmail.com>
Date: Tue, 9 Dec 2025 16:15:03 +0800
Subject: [PATCH 1/5] recipe:add FlashRL with sglang recipe

Co-authored-by: AniZpZ <aniz1905@gmail.com>
Co-authored-by: Wilboludriver <wilbolu@outlook.com>
---
 recipe/flashrl/run_flashrl_sglang.sh | 134 +++++++++++++++++++++++++++
 1 file changed, 134 insertions(+)
 create mode 100644 recipe/flashrl/run_flashrl_sglang.sh

diff --git a/recipe/flashrl/run_flashrl_sglang.sh b/recipe/flashrl/run_flashrl_sglang.sh
new file mode 100644
index 00000000000..0546b9271c1
--- /dev/null
+++ b/recipe/flashrl/run_flashrl_sglang.sh
@@ -0,0 +1,134 @@
+#!/usr/bin/env bash
+set -xeuo pipefail
+
+ID=${1:-"qwen2.5_14bInstruct_sglang_fp8_8gpu_tis_on"}
+CAP=${2:-"5"}
+
+project_name='DAPO-1'
+exp_name=$ID
+
+WORKING_DIR=${WORKING_DIR:-"${PWD}"}
+echo "WORKING_DIR: ${WORKING_DIR}"
+
+# Paths
+CKPTS_DIR=${CKPTS_DIR:-"${WORKING_DIR}/ckpts/${project_name}/${exp_name}"}
+
+MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen2.5-14B-Instruct"}
+
+export SGLANG_PATCH=1
+export SGLANG_PORT=30300
+export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+export RAY_TEMP_DIR=/workdir/ray_temp
+mkdir -p ${RAY_TEMP_DIR}
+
+
+# Start a new Ray cluster using GPUs 2,3,4,5 on a different port
+# This allows running multiple training tasks simultaneously
+RAY_PORT=6811
+RAY_DASHBOARD_PORT=8897
+
+# Check if Ray server is already running on the specified port
+# Use localhost instead of IP address to avoid IP mismatch issues
+echo "Checking if Ray server is already running on localhost:${RAY_PORT}..."
+if ray status --address="localhost:${RAY_PORT}" >/dev/null 2>&1; then
+    echo "Ray server is already running on localhost:${RAY_PORT}, using existing cluster."
+    RAY_RUNNING=true
+else
+    echo "Ray server is not running, starting new Ray cluster..."
+    # Start new Ray cluster with 8 GPUs
+    ray start --head \
+        --port=${RAY_PORT} \
+        --dashboard-port=${RAY_DASHBOARD_PORT} \
+        --num-gpus=8 \
+        --dashboard-host=0.0.0.0 \
+        --temp-dir=${RAY_TEMP_DIR} \
+        --disable-usage-stats || true
+    echo "Ray cluster started successfully on localhost:${RAY_PORT}"
+    # Wait a moment for Ray to fully initialize
+    echo "Waiting for Ray cluster to be ready..."
+    sleep 5
+    # Verify Ray is actually running
+    if ray status --address="localhost:${RAY_PORT}" >/dev/null 2>&1; then
+        echo "Ray cluster is ready and accessible."
+        RAY_RUNNING=true
+    else
+        echo "Warning: Ray cluster may not be fully ready yet, but continuing..."
+        RAY_RUNNING=false
+    fi
+fi
+
+PYTHONUNBUFFERED=1 python3 -m recipe.dapo.main_dapo \
+    data.train_files=$HOME/data/BytedTsinghua-SIA/DAPO-Math-17k/train.parquet \
+    data.val_files=$HOME/data/gsm8k/test.parquet \
+    data.prompt_key=prompt \
+    data.truncation='left' \
+    data.max_prompt_length=2048 \
+    data.val_max_samples=1 \
+    data.max_response_length=4096 \
+    data.gen_batch_size=96 \
+    data.train_batch_size=32 \
+    actor_rollout_ref.rollout.n=16 \
+    algorithm.adv_estimator=grpo \
+    algorithm.use_kl_in_reward=False \
+    algorithm.kl_ctrl.kl_coef=0.0 \
+    actor_rollout_ref.actor.use_kl_loss=False \
+    actor_rollout_ref.actor.kl_loss_coef=0.0 \
+    actor_rollout_ref.actor.clip_ratio_low=0.2 \
+    actor_rollout_ref.actor.clip_ratio_high=0.28 \
+    actor_rollout_ref.actor.clip_ratio_c=10.0 \
+    algorithm.filter_groups.enable=True \
+    algorithm.filter_groups.max_num_gen_batches=10 \
+    algorithm.filter_groups.metric=acc \
+    actor_rollout_ref.model.use_remove_padding=True \
+    actor_rollout_ref.actor.use_dynamic_bsz=True \
+    actor_rollout_ref.ref.log_prob_use_dynamic_bsz=True \
+    actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=True \
+    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=22528 \
+    actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=22528 \
+    actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=22528 \
+    actor_rollout_ref.model.path=$MODEL_PATH \
+    actor_rollout_ref.model.enable_gradient_checkpointing=True \
+    actor_rollout_ref.model.use_fused_kernels=True \
+    actor_rollout_ref.actor.optim.lr=1e-6 \
+    actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
+    actor_rollout_ref.actor.optim.weight_decay=0.1 \
+    actor_rollout_ref.actor.ppo_mini_batch_size=32 \
+    actor_rollout_ref.actor.fsdp_config.param_offload=True \
+    actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
+    actor_rollout_ref.actor.entropy_coeff=0 \
+    actor_rollout_ref.actor.grad_clip=1.0 \
+    actor_rollout_ref.actor.loss_agg_mode="token-mean" \
+    actor_rollout_ref.actor.ulysses_sequence_parallel_size=2 \
+    actor_rollout_ref.rollout.gpu_memory_utilization=0.70 \
+    actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
+    actor_rollout_ref.rollout.name=sglang \
+    actor_rollout_ref.rollout.load_format=flash_rl \
+    actor_rollout_ref.ref.fsdp_config.param_offload=True \
+    actor_rollout_ref.ref.ulysses_sequence_parallel_size=2 \
+    actor_rollout_ref.actor.fsdp_config.fsdp_size=-1 \
+    actor_rollout_ref.rollout.val_kwargs.temperature=1.0 \
+    actor_rollout_ref.rollout.val_kwargs.top_p=0.7 \
+    actor_rollout_ref.rollout.val_kwargs.top_k=-1 \
+    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
+    actor_rollout_ref.rollout.val_kwargs.n=1 \
+    reward_model.reward_manager=dapo \
+    reward_model.overlong_buffer.enable=True \
+    reward_model.overlong_buffer.len=4096 \
+    reward_model.overlong_buffer.penalty_factor=1.0 \
+    trainer.logger=['console'] \
+    trainer.project_name="${project_name}" \
+    trainer.experiment_name="${exp_name}" \
+    trainer.n_gpus_per_node=8 \
+    trainer.nnodes=1 \
+    trainer.total_training_steps=100 \
+    trainer.val_before_train=True \
+    trainer.test_freq=20 \
+    trainer.save_freq=-1 \
+    trainer.total_epochs=1 \
+    trainer.default_local_dir="${CKPTS_DIR}" \
+    trainer.resume_mode=auto \
+    +algorithm.rollout_is_threshold=2.0 \
+    +algorithm.rollout_is_level=token \
+    +algorithm.rollout_is_mode=truncate \
+    +algorithm.rollout_is=True \
+    actor_rollout_ref.rollout.calculate_log_probs=True
\ No newline at end of file

From 6719269d5d3bbd7a4cff011e23de53d94730edc1 Mon Sep 17 00:00:00 2001
From: eternally-z <zzywzj@gmail.com>
Date: Wed, 10 Dec 2025 13:49:27 +0800
Subject: [PATCH 2/5] minor fix

---
 recipe/flashrl/run_flashrl_sglang.sh | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/recipe/flashrl/run_flashrl_sglang.sh b/recipe/flashrl/run_flashrl_sglang.sh
index 0546b9271c1..149cb59c68e 100644
--- a/recipe/flashrl/run_flashrl_sglang.sh
+++ b/recipe/flashrl/run_flashrl_sglang.sh
@@ -127,8 +127,13 @@ PYTHONUNBUFFERED=1 python3 -m recipe.dapo.main_dapo \
     trainer.total_epochs=1 \
     trainer.default_local_dir="${CKPTS_DIR}" \
     trainer.resume_mode=auto \
-    +algorithm.rollout_is_threshold=2.0 \
-    +algorithm.rollout_is_level=token \
-    +algorithm.rollout_is_mode=truncate \
-    +algorithm.rollout_is=True \
+    algorithm.rollout_correction.rollout_is=token \
+    algorithm.rollout_correction.rollout_is_threshold=2.0 \
+    algorithm.rollout_correction.rollout_is_batch_normalize=False \
+    algorithm.rollout_correction.bypass_mode=False \
+    algorithm.rollout_correction.rollout_rs=null \
+    algorithm.rollout_correction.use_policy_gradient=False \
+    algorithm.rollout_correction.rollout_rs_threshold=null \
+    algorithm.rollout_correction.rollout_rs_threshold_lower=null \
+    algorithm.rollout_correction.rollout_token_veto_threshold=null \
     actor_rollout_ref.rollout.calculate_log_probs=True
\ No newline at end of file

From 448b4101f66a916085302917c71ed75a22017004 Mon Sep 17 00:00:00 2001
From: eternally-z <zzywzj@gmail.com>
Date: Wed, 10 Dec 2025 15:10:47 +0800
Subject: [PATCH 3/5] move the script to rollout_correction folder

---
 .../rollout_correction/run_tis_sglang.sh                          | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename recipe/flashrl/run_flashrl_sglang.sh => examples/rollout_correction/run_tis_sglang.sh (100%)

diff --git a/recipe/flashrl/run_flashrl_sglang.sh b/examples/rollout_correction/run_tis_sglang.sh
similarity index 100%
rename from recipe/flashrl/run_flashrl_sglang.sh
rename to examples/rollout_correction/run_tis_sglang.sh

From 240ef4b62538c9f300f9eca65acb1900cc0b3d9a Mon Sep 17 00:00:00 2001
From: eternally-z <zzywzj@gmail.com>
Date: Mon, 22 Dec 2025 20:58:28 +0800
Subject: [PATCH 4/5] minor fix

---
 examples/rollout_correction/run_tis_sglang.sh | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/examples/rollout_correction/run_tis_sglang.sh b/examples/rollout_correction/run_tis_sglang.sh
index 149cb59c68e..ac5be966dd1 100644
--- a/examples/rollout_correction/run_tis_sglang.sh
+++ b/examples/rollout_correction/run_tis_sglang.sh
@@ -13,7 +13,7 @@ echo "WORKING_DIR: ${WORKING_DIR}"
 # Paths
 CKPTS_DIR=${CKPTS_DIR:-"${WORKING_DIR}/ckpts/${project_name}/${exp_name}"}
 
-MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen2.5-14B-Instruct"}
+MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-8B-Base"}
 
 export SGLANG_PATCH=1
 export SGLANG_PORT=30300
@@ -59,7 +59,7 @@ fi
 
 PYTHONUNBUFFERED=1 python3 -m recipe.dapo.main_dapo \
     data.train_files=$HOME/data/BytedTsinghua-SIA/DAPO-Math-17k/train.parquet \
-    data.val_files=$HOME/data/gsm8k/test.parquet \
+    data.val_files=$HOME/data/AIME-2024/train.parquet \
     data.prompt_key=prompt \
     data.truncation='left' \
     data.max_prompt_length=2048 \
@@ -107,7 +107,7 @@ PYTHONUNBUFFERED=1 python3 -m recipe.dapo.main_dapo \
     actor_rollout_ref.ref.ulysses_sequence_parallel_size=2 \
     actor_rollout_ref.actor.fsdp_config.fsdp_size=-1 \
     actor_rollout_ref.rollout.val_kwargs.temperature=1.0 \
-    actor_rollout_ref.rollout.val_kwargs.top_p=0.7 \
+    actor_rollout_ref.rollout.val_kwargs.top_p=1.0 \
     actor_rollout_ref.rollout.val_kwargs.top_k=-1 \
     actor_rollout_ref.rollout.val_kwargs.do_sample=True \
     actor_rollout_ref.rollout.val_kwargs.n=1 \
@@ -120,9 +120,9 @@ PYTHONUNBUFFERED=1 python3 -m recipe.dapo.main_dapo \
     trainer.experiment_name="${exp_name}" \
     trainer.n_gpus_per_node=8 \
     trainer.nnodes=1 \
-    trainer.total_training_steps=100 \
+    trainer.total_training_steps=150 \
     trainer.val_before_train=True \
-    trainer.test_freq=20 \
+    trainer.test_freq=5 \
     trainer.save_freq=-1 \
     trainer.total_epochs=1 \
     trainer.default_local_dir="${CKPTS_DIR}" \

From 8e3c6d586d6ea82f9ea1ce0810a377c12a2760c7 Mon Sep 17 00:00:00 2001
From: eternally-z <zzywzj@gmail.com>
Date: Thu, 15 Jan 2026 19:39:00 +0800
Subject: [PATCH 5/5] minor fix

---
 examples/rollout_correction/run_tis_sglang.sh | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/examples/rollout_correction/run_tis_sglang.sh b/examples/rollout_correction/run_tis_sglang.sh
index ac5be966dd1..88cabfdfed3 100644
--- a/examples/rollout_correction/run_tis_sglang.sh
+++ b/examples/rollout_correction/run_tis_sglang.sh
@@ -1,7 +1,7 @@
 #!/usr/bin/env bash
 set -xeuo pipefail
 
-ID=${1:-"qwen2.5_14bInstruct_sglang_fp8_8gpu_tis_on"}
+ID=${1:-"qwen3_8b_base_sglang_fp8_8gpu_tis_on"}
 CAP=${2:-"5"}
 
 project_name='DAPO-1'
@@ -63,8 +63,8 @@ PYTHONUNBUFFERED=1 python3 -m recipe.dapo.main_dapo \
     data.prompt_key=prompt \
     data.truncation='left' \
     data.max_prompt_length=2048 \
-    data.val_max_samples=1 \
-    data.max_response_length=4096 \
+    data.val_max_samples=-1 \
+    data.max_response_length=$((1024 * 20)) \
     data.gen_batch_size=96 \
     data.train_batch_size=32 \
     actor_rollout_ref.rollout.n=16 \
@@ -115,7 +115,7 @@ PYTHONUNBUFFERED=1 python3 -m recipe.dapo.main_dapo \
     reward_model.overlong_buffer.enable=True \
     reward_model.overlong_buffer.len=4096 \
     reward_model.overlong_buffer.penalty_factor=1.0 \
-    trainer.logger=['console'] \
+    trainer.logger=['console','wandb'] \
     trainer.project_name="${project_name}" \
     trainer.experiment_name="${exp_name}" \
     trainer.n_gpus_per_node=8 \