From cc12b4e8a79a52567a2b1448c160d62493b56b21 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Thu, 24 Jul 2025 16:33:08 +0800
Subject: [PATCH 001/182] init async training pipline

---
 .../fully_async_policy/README_fully_async.md  | 183 +++++++
 .../config/fully_async_ppo_trainer.yaml       | 136 +++++
 recipe/fully_async_policy/fully_async_main.py | 244 +++++++++
 .../fully_async_policy/fully_async_trainer.py | 490 ++++++++++++++++++
 recipe/fully_async_policy/message_queue.py    | 238 +++++++++
 recipe/fully_async_policy/param_sync.py       | 175 +++++++
 recipe/fully_async_policy/rollouter.py        | 414 +++++++++++++++
 .../run_fully_async_example.sh                | 149 ++++++
 recipe/fully_async_policy/test_fully_async.py | 197 +++++++
 tests/special_sanity/check_license.py         |   4 +-
 10 files changed, 2229 insertions(+), 1 deletion(-)
 create mode 100644 recipe/fully_async_policy/README_fully_async.md
 create mode 100644 recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml
 create mode 100644 recipe/fully_async_policy/fully_async_main.py
 create mode 100644 recipe/fully_async_policy/fully_async_trainer.py
 create mode 100644 recipe/fully_async_policy/message_queue.py
 create mode 100644 recipe/fully_async_policy/param_sync.py
 create mode 100644 recipe/fully_async_policy/rollouter.py
 create mode 100644 recipe/fully_async_policy/run_fully_async_example.sh
 create mode 100644 recipe/fully_async_policy/test_fully_async.py

diff --git a/recipe/fully_async_policy/README_fully_async.md b/recipe/fully_async_policy/README_fully_async.md
new file mode 100644
index 00000000000..979f9aff783
--- /dev/null
+++ b/recipe/fully_async_policy/README_fully_async.md
@@ -0,0 +1,183 @@
+# 完全异步训练工作流 (Fully Async Training Workflow)
+
+## 概述
+
+本项目实现了基于现有 one step off policy 代码的完全异步训练工作流，将样本生成（Rollouter）和模型训练（Trainer）完全解耦，通过 MessageQueue 进行异步通信。
+
+## 架构设计
+
+### 核心组件
+
+1. **MessageQueue**: 基于 ZeroMQ 的异步消息队列，作为 Ray Actor 存在
+   - 管理生成的样本队列
+   - 支持新鲜度控制，自动丢弃过期样本
+   - 提供线程安全的生产者-消费者接口
+
+2. **Rollouter**: 专门负责样本生成的组件
+   - 持续循环生成训练样本
+   - 支持暂停/恢复机制，用于参数更新
+   - 实现新鲜度阈值控制，避免生成过多过期样本
+
+3. **FullyAsyncTrainer**: 修改后的训练器
+   - 从 MessageQueue 获取样本进行训练
+   - 训练完成后通知 Rollouter 更新参数
+   - 支持样本新鲜度监控和统计
+
+4. **ParameterSynchronizer**: 参数同步模块
+   - 基于 NCCL 实现高效的参数同步
+   - 支持 Actor 到 Rollout 的参数传递
+
+### 工作流程
+
+```
+┌─────────────┐    put_batch    ┌──────────────┐    get_batch    ┌─────────────┐
+│  Rollouter  │ ──────────────► │ MessageQueue │ ──────────────► │   Trainer   │
+│             │                 │              │                 │             │
+│ - 生成样本   │                 │ - 队列管理    │                 │ - 模型训练   │
+│ - 暂停/恢复  │                 │ - 新鲜度控制  │                 │ - 参数更新   │
+│ - 新鲜度控制 │                 │ - 统计信息    │                 │ - 同步通知   │
+└─────────────┘                 └──────────────┘                 └─────────────┘
+       ▲                                                                 │
+       │                        update_rollout_weights                   │
+       └─────────────────────────────────────────────────────────────────┘
+```
+
+## 新鲜度控制机制
+
+### 配置参数
+
+- `freshness_threshold`: 新鲜度阈值，队列中超过此版本差异的样本会被丢弃
+- `max_staleness_allowed`: 最大允许的新鲜度差异，Rollouter 会暂停生成
+- `max_queue_size`: MessageQueue 的最大队列大小
+
+### 控制逻辑
+
+1. **样本丢弃**: 当样本的参数版本与当前 Trainer 版本差异超过 `freshness_threshold` 时，样本被丢弃
+2. **生成暂停**: 当 Rollouter 的参数版本与 Trainer 版本差异超过 `max_staleness_allowed` 时，暂停生成
+3. **队列管理**: 队列长度限制为 `freshness_threshold * batch_size`，避免内存溢出
+
+## 性能优势
+
+### 相比同步训练
+
+- **GPU 利用率提升**: 生成和训练并行进行，减少 GPU 空闲时间
+- **长尾样本优化**: 训练不需要等待最慢的样本生成完成
+- **资源隔离**: 可以独立配置生成和训练的资源分配
+
+### 相比 One Step Off Policy
+
+- **更高的异步度**: 完全解耦生成和训练，支持多步异步
+- **更灵活的控制**: 支持动态的新鲜度控制和队列管理
+- **更好的监控**: 提供详细的统计信息和性能指标
+
+## 使用方法
+
+### 1. 安装依赖
+
+```bash
+pip install zmq filelock
+```
+
+### 2. 配置文件
+
+使用 `config/fully_async_ppo_trainer.yaml` 配置文件，关键配置项：
+
+```yaml
+async_training:
+  freshness_threshold: 3      # 新鲜度阈值
+  max_staleness_allowed: 5    # 最大允许新鲜度差异
+  max_queue_size: 1000        # 队列最大大小
+  min_batch_count: 1          # 最小batch数量
+  batch_timeout: 30.0         # 获取batch超时时间
+
+actor_rollout_ref:
+  rollout:
+    mode: async               # 使用异步模式
+    n_gpus: 4                # rollout专用GPU数量
+    name: vllm               # 使用vLLM引擎
+```
+
+### 3. 启动训练
+
+```bash
+python -m recipe.one_step_off_policy.fully_async_main \
+    data.train_files=~/data/train.parquet \
+    data.val_files=~/data/val.parquet \
+    actor_rollout_ref.model.path=Qwen/Qwen2-7B-Instruct \
+    trainer.total_training_steps=1000
+```
+
+### 4. 监控训练
+
+训练过程中会输出以下统计信息：
+
+- `queue_size`: 当前队列大小
+- `avg_sample_age`: 平均样本年龄（参数版本差异）
+- `max_sample_age`: 最大样本年龄
+- `param_version`: 当前参数版本
+- `processed_samples`: 已处理样本数
+- `dropped_samples`: 丢弃的过期样本数
+
+## 性能调优建议
+
+### 1. 资源分配
+
+- **生成资源**: 根据模型大小和生成速度需求分配 GPU
+- **训练资源**: 根据batch大小和训练复杂度分配 GPU
+- **比例建议**: 生成:训练 = 1:2 到 1:3
+
+### 2. 新鲜度控制
+
+- **快速生成场景**: 降低 `freshness_threshold` (2-3)
+- **慢速生成场景**: 提高 `freshness_threshold` (5-8)
+- **队列大小**: 设置为 `freshness_threshold * batch_size * 2`
+
+### 3. 网络优化
+
+- **单节点**: MessageQueue 使用 IPC 协议
+- **多节点**: MessageQueue 使用 TCP 协议，注意网络带宽
+
+## 故障排除
+
+### 常见问题
+
+1. **队列为空**: 检查 Rollouter 是否正常运行，是否被新鲜度控制暂停
+2. **内存溢出**: 减少 `max_queue_size` 或增加 `freshness_threshold`
+3. **参数同步失败**: 检查 NCCL 配置和网络连接
+4. **性能下降**: 调整资源分配比例，监控 GPU 利用率
+
+### 调试模式
+
+设置环境变量启用详细日志：
+
+```bash
+export VERL_LOGGING_LEVEL=DEBUG
+export NCCL_DEBUG=INFO
+```
+
+## 与现有系统对比
+
+| 特性 | 同步训练 | One Step Off | 完全异步 |
+|------|----------|--------------|----------|
+| 异步程度 | 无 | 一步 | 多步 |
+| 资源利用率 | 低 | 中 | 高 |
+| 实现复杂度 | 低 | 中 | 高 |
+| 样本新鲜度 | 最新 | 一步延迟 | 可控延迟 |
+| 内存使用 | 低 | 中 | 中-高 |
+
+## 实验结果预期
+
+基于现有 one step off policy 的实验结果，完全异步训练预期能够：
+
+- **训练速度**: 相比同步训练提升 30-50%
+- **GPU 利用率**: 提升至 85-95%
+- **内存开销**: 增加 20-30%（主要用于队列缓存）
+- **模型收敛**: 与同步训练基本一致（在合理的新鲜度控制下）
+
+## 后续改进
+
+1. **自适应新鲜度控制**: 根据训练进度动态调整新鲜度阈值
+2. **多队列支持**: 支持不同优先级的样本队列
+3. **分布式队列**: 支持跨节点的分布式消息队列
+4. **更精细的资源调度**: 支持动态的资源分配和调整
+
diff --git a/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml b/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml
new file mode 100644
index 00000000000..cbc7058f108
--- /dev/null
+++ b/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml
@@ -0,0 +1,136 @@
+hydra:
+  searchpath:
+    - file://verl/trainer/config
+
+defaults:
+  - ppo_trainer
+  - _self_
+
+# 完全异步训练的特殊配置
+async_training:
+  # 新鲜度阈值，超过此版本差异的样本会被丢弃
+  freshness_threshold: 3
+
+  # 最大允许的新鲜度差异，rollout会暂停生成
+  max_staleness_allowed: 5
+
+  # MessageQueue的最大队列大小
+  max_queue_size: 1000
+
+  # 最小batch数量，trainer会等待至少这么多batch
+  min_batch_count: 1
+
+  # 获取batch的超时时间（秒）
+  batch_timeout: 30.0
+
+# 重写默认的训练配置
+actor_rollout_ref:
+  hybrid_engine: false
+  rollout:
+    # 异步模式
+    mode: async
+
+    # rollout专用的GPU数量
+    n_gpus: 4
+
+    # 使用vLLM异步rollout
+    name: vllm
+
+    # 其他rollout参数
+    temperature: 1.0
+    top_k: -1
+    top_p: 1.0
+    tensor_model_parallel_size: 2
+    gpu_memory_utilization: 0.6
+    max_num_batched_tokens: 8192
+    free_cache_engine: true
+    enforce_eager: true
+
+# 训练器配置
+trainer:
+  # 总训练步数
+  total_training_steps: 1000
+
+  # 设备
+  device: cuda
+
+  # 保存频率
+  save_freq: 100
+
+  # 验证频率
+  val_freq: 50
+
+  # 日志配置
+  logger: '["console", "wandb"]'
+  project_name: "fully_async_ppo"
+  experiment_name: "test_async_training"
+
+# 数据配置
+data:
+  # 训练batch大小
+  train_batch_size: 128
+
+  # 数据文件路径
+  train_files: "~/data/train.parquet"
+  val_files: "~/data/val.parquet"
+
+  # 序列长度
+  max_prompt_length: 1024
+  max_response_length: 1024
+
+# 算法配置
+algorithm:
+  # 优势估计器
+  adv_estimator: gae
+
+  # PPO参数
+  cliprange: 0.2
+  cliprange_value: 0.2
+  vf_coeff: 0.1
+  entropy_coeff: 0.01
+
+  # KL相关
+  kl_coeff: 0.1
+  adaptive_kl: true
+  target_kl: 0.01
+
+# 模型配置
+actor_rollout_ref:
+  model:
+    # 模型路径
+    path: "Qwen/Qwen2-7B-Instruct"
+
+    # 使用LoRA
+    lora_rank: 64
+    lora_alpha: 128
+    lora_dropout: 0.1
+
+  actor:
+    # Actor优化器
+    optim:
+      lr: 1e-6
+      weight_decay: 0.01
+
+    # FSDP配置
+    fsdp_config:
+      fsdp_size: -1
+      param_offload: false
+      optimizer_offload: false
+
+    # PPO配置
+    ppo_mini_batch_size: 32
+    use_dynamic_bsz: true
+
+# Critic配置
+critic:
+  model:
+    path: "Qwen/Qwen2-7B-Instruct"
+
+  optim:
+    lr: 1e-5
+    weight_decay: 0.01
+
+  fsdp_config:
+    fsdp_size: -1
+    param_offload: false
+
diff --git a/recipe/fully_async_policy/fully_async_main.py b/recipe/fully_async_policy/fully_async_main.py
new file mode 100644
index 00000000000..3bab5d91eb1
--- /dev/null
+++ b/recipe/fully_async_policy/fully_async_main.py
@@ -0,0 +1,244 @@
+# Copyright 2025 Meituan Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import os
+import threading
+import time
+
+import hydra
+import ray
+
+from recipe.fully_async_policy.message_queue import MessageQueue, MessageQueueClient
+from recipe.fully_async_policy.rollouter import Rollouter
+from verl.trainer.main_ppo import create_rl_dataset, create_rl_sampler
+from verl.trainer.ppo.reward import load_reward_manager
+
+from .fully_async_trainer import FullyAsyncTrainer
+
+logger = logging.getLogger(__name__)
+
+
+def setup_logging():
+    """设置日志配置"""
+    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
+
+
+@ray.remote
+class RollouterActor:
+    """Rollouter的Ray Actor包装器"""
+
+    def __init__(
+        self,
+        config,
+        tokenizer,
+        role_worker_mapping,
+        resource_pool_manager,
+        ray_worker_group_cls,
+        processor=None,
+        train_dataset=None,
+        collate_fn=None,
+        train_sampler=None,
+        device_name="cuda",
+    ):
+        self.rollouter = Rollouter(
+            config=config,
+            tokenizer=tokenizer,
+            role_worker_mapping=role_worker_mapping,
+            resource_pool_manager=resource_pool_manager,
+            ray_worker_group_cls=ray_worker_group_cls,
+            processor=processor,
+            train_dataset=train_dataset,
+            collate_fn=collate_fn,
+            train_sampler=train_sampler,
+            device_name=device_name,
+        )
+
+    def init_workers(self):
+        """初始化worker"""
+        return self.rollouter.init_workers()
+
+    def set_message_queue_client(self, message_queue_client):
+        """设置消息队列客户端"""
+        return self.rollouter.set_message_queue_client(message_queue_client)
+
+    def set_parameter_synchronizer(self, param_synchronizer):
+        """设置参数同步器"""
+        return self.rollouter.set_parameter_synchronizer(param_synchronizer)
+
+    def update_rollout_weights(self, param_version: int):
+        """更新rollout权重"""
+        return self.rollouter.update_rollout_weights(param_version)
+
+    def fit(self):
+        """开始生成循环"""
+        return self.rollouter.fit()
+
+    def shutdown(self):
+        """关闭rollouter"""
+        return self.rollouter.shutdown()
+
+    def get_statistics(self):
+        """获取统计信息"""
+        return self.rollouter.get_statistics()
+
+
+def run_fully_async_ppo(config):
+    """运行完全异步的PPO训练"""
+    setup_logging()
+
+    logger.info("Starting fully async PPO training...")
+
+    # 初始化Ray
+    if not ray.is_initialized():
+        ray.init(
+            address=os.environ.get("RAY_ADDRESS", None),
+            runtime_env={"env_vars": {"NCCL_DEBUG": "WARN", "VLLM_USE_V1": "1"}},
+        )
+
+    try:
+        # 创建数据集和采样器
+        logger.info("Creating dataset and sampler...")
+        from verl.utils import hf_processor, hf_tokenizer
+
+        tokenizer = hf_tokenizer(config.actor_rollout_ref.model.path)
+        processor = hf_processor(config.actor_rollout_ref.model.path)
+
+        train_dataset, val_dataset = create_rl_dataset(config, tokenizer, processor)
+        train_sampler = create_rl_sampler(config, train_dataset)
+
+        # 创建collate function
+        from verl.trainer.ppo.ray_trainer import default_collate_fn
+
+        collate_fn = default_collate_fn
+
+        # 创建奖励函数
+        reward_fn, val_reward_fn = load_reward_manager(config, tokenizer)
+
+        # 创建资源池管理器和worker映射
+        from verl.single_controller.ray import RayWorkerGroup
+        from verl.trainer.ppo.ray_trainer import (
+            Role,
+            create_resource_pool_manager,
+            create_role_worker_mapping,
+        )
+
+        # resource_pool_manager = create_resource_pool_manager(config)
+        role_worker_mapping = create_role_worker_mapping(config)
+
+        # 1. 创建MessageQueue
+        logger.info("Creating MessageQueue...")
+        max_queue_size = config.async_training.get("max_queue_size", 1000)
+        message_queue = MessageQueue.remote(config, max_queue_size)
+        message_queue_client = MessageQueueClient(message_queue)
+
+        # 2. 创建Rollouter Actor
+        logger.info("Creating Rollouter...")
+        rollouter_actor = RollouterActor.remote(
+            config=config,
+            tokenizer=tokenizer,
+            role_worker_mapping={Role.Rollout: role_worker_mapping[Role.Rollout]},
+            resource_pool_manager=create_resource_pool_manager(config, roles=[Role.Rollout]),
+            ray_worker_group_cls=RayWorkerGroup,
+            processor=processor,
+            train_dataset=train_dataset,
+            collate_fn=collate_fn,
+            train_sampler=train_sampler,
+            device_name=config.trainer.device,
+        )
+
+        # 初始化Rollouter
+        ray.get(rollouter_actor.init_workers.remote())
+        ray.get(rollouter_actor.set_message_queue_client.remote(message_queue_client))
+
+        # 3. 创建Trainer
+        logger.info("Creating FullyAsyncTrainer...")
+        trainer_role_mapping = {
+            role: worker_cls for role, worker_cls in role_worker_mapping.items() if role != Role.Rollout
+        }
+
+        trainer = FullyAsyncTrainer(
+            config=config,
+            tokenizer=tokenizer,
+            role_worker_mapping=trainer_role_mapping,
+            resource_pool_manager=create_resource_pool_manager(config, roles=list(trainer_role_mapping.keys())),
+            ray_worker_group_cls=RayWorkerGroup,
+            processor=processor,
+            reward_fn=reward_fn,
+            val_reward_fn=val_reward_fn,
+            train_dataset=train_dataset,
+            val_dataset=val_dataset,
+            collate_fn=collate_fn,
+            train_sampler=train_sampler,
+            device_name=config.trainer.device,
+        )
+
+        # 初始化Trainer
+        trainer.init_workers()
+        trainer.set_message_queue_client(message_queue_client)
+        trainer.set_rollouter_actor(rollouter_actor)
+
+        # 4. 设置参数同步
+        logger.info("Setting up parameter synchronization...")
+        # param_synchronizer = AsyncParameterSynchronizer(
+        #     config=config, actor_wg=trainer.actor_wg, rollouter_actor=rollouter_actor
+        # )
+
+        # 5. 启动Rollouter（在后台线程中）
+        logger.info("Starting Rollouter in background...")
+
+        def run_rollouter():
+            try:
+                ray.get(rollouter_actor.fit.remote())
+            except Exception as e:
+                logger.error(f"Rollouter error: {e}")
+
+        rollouter_thread = threading.Thread(target=run_rollouter, daemon=True)
+        rollouter_thread.start()
+
+        # 等待一下让Rollouter启动
+        time.sleep(5)
+
+        # 6. 启动Trainer（主线程）
+        logger.info("Starting FullyAsyncTrainer...")
+        trainer.fit()
+
+        # 7. 关闭
+        logger.info("Shutting down...")
+        ray.get(rollouter_actor.shutdown.remote())
+
+        # 等待Rollouter线程结束
+        rollouter_thread.join(timeout=10)
+
+        # 关闭MessageQueue
+        ray.get(message_queue.shutdown.remote())
+
+        logger.info("Fully async PPO training completed successfully!")
+
+    except Exception as e:
+        logger.error(f"Error in fully async PPO training: {e}")
+        raise
+    finally:
+        if ray.is_initialized():
+            ray.shutdown()
+
+
+@hydra.main(config_path="../one_step_off_policy/config", config_name="fully_async_ppo_trainer", version_base=None)
+def main(config):
+    """主入口函数"""
+    run_fully_async_ppo(config)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py
new file mode 100644
index 00000000000..192d33817a6
--- /dev/null
+++ b/recipe/fully_async_policy/fully_async_trainer.py
@@ -0,0 +1,490 @@
+# Copyright 2025 Meituan Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+from pprint import pprint
+from typing import Optional
+
+import numpy as np
+import ray
+from omegaconf import OmegaConf
+from torch.utils.data import Dataset, Sampler
+from tqdm import tqdm
+
+from recipe.fully_async_policy.message_queue import BatchSample, MessageQueueClient
+from verl import DataProto
+from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup
+from verl.single_controller.ray.base import create_colocated_worker_cls
+from verl.trainer.ppo.metric_utils import (
+    compute_data_metrics,
+    compute_throughout_metrics,
+    compute_timing_metrics,
+)
+from verl.trainer.ppo.ray_trainer import (
+    ResourcePoolManager,
+    Role,
+    WorkerType,
+    apply_kl_penalty,
+    compute_advantage,
+    compute_response_mask,
+)
+from verl.trainer.ppo.reward import compute_reward, compute_reward_async
+from verl.utils.debug import marked_timer
+from verl.utils.metric import reduce_metrics
+from verl.utils.tracking import ValidationGenerationsLogger
+
+logger = logging.getLogger(__name__)
+
+
+class FullyAsyncTrainer:
+    """
+    完全异步的PPO训练器，从MessageQueue获取样本进行训练
+    """
+
+    def __init__(
+        self,
+        config,
+        tokenizer,
+        role_worker_mapping: dict[Role, WorkerType],
+        resource_pool_manager: ResourcePoolManager,
+        ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup,
+        processor=None,
+        reward_fn=None,
+        val_reward_fn=None,
+        train_dataset: Optional[Dataset] = None,
+        val_dataset: Optional[Dataset] = None,
+        collate_fn=None,
+        train_sampler: Optional[Sampler] = None,
+        device_name="cuda",
+    ):
+        self.config = config
+        self.tokenizer = tokenizer
+        self.processor = processor
+        self.reward_fn = reward_fn
+        self.val_reward_fn = val_reward_fn
+
+        self.role_worker_mapping = role_worker_mapping
+        self.resource_pool_manager = resource_pool_manager
+        self.ray_worker_group_cls = ray_worker_group_cls
+        self.device_name = device_name
+        self.validation_generations_logger = ValidationGenerationsLogger()
+
+        # 数据相关
+        self.train_dataset = train_dataset
+        self.val_dataset = val_dataset
+        self.collate_fn = collate_fn
+        self.train_sampler = train_sampler
+
+        # 角色配置
+        self.use_reference_policy = Role.RefPolicy in role_worker_mapping
+        self.use_rm = Role.RewardModel in role_worker_mapping
+        self.use_critic = Role.Critic in role_worker_mapping
+        self.ref_in_actor = config.actor_rollout_ref.model.get("lora_rank", 0) > 0
+
+        # Worker groups
+        self.actor_wg = None
+        self.critic_wg = None
+        self.ref_policy_wg = None
+        self.rm_wg = None
+
+        # 训练状态
+        self.global_steps = 0
+        self.current_param_version = 0
+        self.total_training_steps = config.trainer.total_training_steps
+
+        # MessageQueue客户端
+        self.message_queue_client = None
+
+        # 与Rollouter的通信
+        self.rollouter_actor = None
+
+        # 统计信息
+        self.processed_samples = 0
+        self.stale_samples_processed = 0
+
+    def set_message_queue_client(self, message_queue_client: MessageQueueClient):
+        """设置消息队列客户端"""
+        self.message_queue_client = message_queue_client
+
+    def set_rollouter_actor(self, rollouter_actor):
+        """设置Rollouter Actor的引用"""
+        self.rollouter_actor = rollouter_actor
+
+    def init_workers(self):
+        """初始化训练workers"""
+        logger.info("Initializing FullyAsyncTrainer workers...")
+
+        self.resource_pool_manager.create_resource_pool()
+        self.resource_pool_to_cls = {pool: {} for pool in self.resource_pool_manager.resource_pool_dict.values()}
+
+        # 创建actor worker
+        actor_resource_pool = self.resource_pool_manager.get_resource_pool(Role.Actor)
+        actor_cls = RayClassWithInitArgs(
+            cls=self.role_worker_mapping[Role.Actor],
+            config=self.config.actor_rollout_ref,
+            role="actor",
+        )
+        self.resource_pool_to_cls[actor_resource_pool]["actor"] = actor_cls
+
+        # 创建critic worker
+        if self.use_critic:
+            critic_resource_pool = self.resource_pool_manager.get_resource_pool(Role.Critic)
+            critic_cls = RayClassWithInitArgs(cls=self.role_worker_mapping[Role.Critic], config=self.config.critic)
+            self.resource_pool_to_cls[critic_resource_pool]["critic"] = critic_cls
+
+        # 创建reference policy worker
+        if self.use_reference_policy:
+            ref_resource_pool = self.resource_pool_manager.get_resource_pool(Role.RefPolicy)
+            ref_policy_cls = RayClassWithInitArgs(
+                cls=self.role_worker_mapping[Role.RefPolicy],
+                config=self.config.actor_rollout_ref,
+                role="ref",
+            )
+            self.resource_pool_to_cls[ref_resource_pool]["ref"] = ref_policy_cls
+
+        # 创建reward model worker
+        if self.use_rm:
+            rm_resource_pool = self.resource_pool_manager.get_resource_pool(Role.RewardModel)
+            rm_cls = RayClassWithInitArgs(
+                cls=self.role_worker_mapping[Role.RewardModel], config=self.config.reward_model
+            )
+            self.resource_pool_to_cls[rm_resource_pool]["rm"] = rm_cls
+
+        # 初始化WorkerGroup
+        all_wg = {}
+        wg_kwargs = {}
+        if OmegaConf.select(self.config.trainer, "ray_wait_register_center_timeout") is not None:
+            wg_kwargs["ray_wait_register_center_timeout"] = self.config.trainer.ray_wait_register_center_timeout
+
+        for resource_pool, class_dict in self.resource_pool_to_cls.items():
+            worker_dict_cls = create_colocated_worker_cls(class_dict=class_dict)
+            wg_dict = self.ray_worker_group_cls(
+                resource_pool=resource_pool,
+                ray_cls_with_init=worker_dict_cls,
+                device_name=self.device_name,
+                **wg_kwargs,
+            )
+            spawn_wg = wg_dict.spawn(prefix_set=class_dict.keys())
+            all_wg.update(spawn_wg)
+
+        # 分配worker groups
+        self.actor_wg = all_wg["actor"]
+        self.actor_wg.init_model()
+
+        if self.use_critic:
+            self.critic_wg = all_wg["critic"]
+            self.critic_wg.init_model()
+
+        if self.use_reference_policy and not self.ref_in_actor:
+            self.ref_policy_wg = all_wg["ref"]
+            self.ref_policy_wg.init_model()
+
+        if self.use_rm:
+            self.rm_wg = all_wg["rm"]
+            self.rm_wg.init_model()
+
+        logger.info("FullyAsyncTrainer workers initialized successfully")
+
+    def _load_checkpoint(self):
+        """加载检查点"""
+        # 简化的检查点加载逻辑
+        pass
+
+    def _validate(self):
+        """执行验证"""
+        if self.val_reward_fn is None:
+            return None
+
+        # 简化的验证逻辑
+        logger.info("Validation step skipped in async trainer")
+        return {"val_reward": 0.0}
+
+    def _save_checkpoint(self):
+        """保存检查点"""
+        # 简化的检查点保存逻辑
+        pass
+
+    def _dump_generations(self, inputs, outputs, scores, reward_extra_infos_dict, dump_path):
+        """保存生成结果"""
+        # 简化的生成结果保存逻辑
+        pass
+
+    def _update_param_version_and_sync(self):
+        """更新参数版本并同步到Rollouter"""
+        self.current_param_version += 1
+
+        # 通知MessageQueue更新参数版本
+        self.message_queue_client.update_param_version(self.current_param_version)
+
+        # 通知Rollouter更新参数
+        if self.rollouter_actor is not None:
+            ray.get(self.rollouter_actor.update_rollout_weights.remote(self.current_param_version))
+
+    def _process_batch_samples(self, batch_samples: list[BatchSample]) -> DataProto:
+        """处理从队列获取的batch样本"""
+        if len(batch_samples) == 1:
+            return batch_samples[0].data
+
+        # 如果有多个batch，需要合并
+        all_batches = [sample.data for sample in batch_samples]
+        return DataProto.concat(all_batches)
+
+    def fit(self):
+        """主训练循环"""
+        from omegaconf import OmegaConf
+
+        from verl.utils.tracking import Tracking
+
+        logger = Tracking(
+            project_name=self.config.trainer.project_name,
+            experiment_name=self.config.trainer.experiment_name,
+            default_backend=self.config.trainer.logger,
+            config=OmegaConf.to_container(self.config, resolve=True),
+        )
+
+        self.global_steps = 0
+
+        # 加载检查点
+        self._load_checkpoint()
+
+        # 验证
+        if self.val_reward_fn is not None and self.config.trainer.get("val_before_train", True):
+            val_metrics = self._validate()
+            if val_metrics:
+                pprint(f"Initial validation metrics: {val_metrics}")
+                logger.log(data=val_metrics, step=self.global_steps)
+            if self.config.trainer.get("val_only", False):
+                return
+
+        # 进度条
+        progress_bar = tqdm(total=self.total_training_steps, initial=self.global_steps, desc="Training Progress")
+
+        self.global_steps += 1
+        last_val_metrics = None
+
+        if self.message_queue_client is None:
+            raise ValueError("MessageQueue client not set. Call set_message_queue_client() first.")
+
+        logger.info("Starting fully async training loop...")
+
+        while self.global_steps <= self.total_training_steps:
+            do_profile = (
+                self.global_steps in self.config.trainer.profile_steps
+                if self.config.trainer.profile_steps is not None
+                else False
+            )
+
+            if do_profile:
+                self.actor_wg.start_profile()
+                if self.use_reference_policy:
+                    self.ref_policy_wg.start_profile()
+                if self.use_critic:
+                    self.critic_wg.start_profile()
+                if self.use_rm:
+                    self.rm_wg.start_profile()
+
+            metrics = {}
+            timing_raw = {}
+            # is_last_step = self.global_steps >= self.total_training_steps
+
+            with marked_timer("step", timing_raw):
+                # 从队列获取样本
+                with marked_timer("get_batch_from_queue", timing_raw, color="blue"):
+                    min_batch_count = self.config.async_training.get("min_batch_count", 1)
+                    batch_timeout = self.config.async_training.get("batch_timeout", 30.0)
+
+                    batch_samples = self.message_queue_client.get_batch(
+                        min_batch_count=min_batch_count, timeout=batch_timeout
+                    )
+
+                    if batch_samples is None:
+                        logger.warning("Timeout waiting for batch samples, continuing...")
+                        continue
+
+                # 处理获取的样本
+                batch = self._process_batch_samples(batch_samples)
+
+                # 计算样本的新鲜度
+                sample_ages = [self.current_param_version - sample.param_version for sample in batch_samples]
+                avg_sample_age = np.mean(sample_ages)
+                max_sample_age = max(sample_ages)
+
+                logger.info(
+                    f"Processing batch with {len(batch_samples)} samples, "
+                    f"avg_age={avg_sample_age:.1f}, max_age={max_sample_age}"
+                )
+
+                # 添加响应掩码
+                batch.batch["response_mask"] = compute_response_mask(batch)
+
+                # 计算奖励
+                with marked_timer("compute_reward", timing_raw, color="yellow"):
+                    if self.reward_fn is not None:
+                        batch, reward_extra_infos_dict = compute_reward(
+                            batch, reward_fn=self.reward_fn, tokenizer=self.tokenizer
+                        )
+                    elif self.use_rm:
+                        batch, reward_extra_infos_dict = compute_reward_async(
+                            batch, rm_wg=self.rm_wg, tokenizer=self.tokenizer
+                        )
+                    else:
+                        raise ValueError("No reward function or reward model provided")
+
+                # 计算reference log probabilities
+                if self.use_reference_policy:
+                    with marked_timer("compute_ref_log_prob", timing_raw, color="green"):
+                        if self.ref_in_actor:
+                            ref_log_prob_output = self.actor_wg.compute_ref_log_prob(batch)
+                        else:
+                            ref_log_prob_output = self.ref_policy_wg.compute_log_prob(batch)
+                        batch = batch.union(ref_log_prob_output)
+
+                # 计算actor log probabilities
+                with marked_timer("compute_log_prob", timing_raw, color="cyan"):
+                    log_prob_output = self.actor_wg.compute_log_prob(batch)
+                    batch = batch.union(log_prob_output)
+
+                # 应用KL惩罚
+                if self.use_reference_policy:
+                    batch = apply_kl_penalty(batch, self.config.algorithm)
+
+                # 计算优势
+                if self.use_critic:
+                    with marked_timer("compute_values", timing_raw, color="magenta"):
+                        values_output = self.critic_wg.compute_values(batch)
+                        batch = batch.union(values_output)
+
+                with marked_timer("compute_advantage", timing_raw, color="orange"):
+                    batch = compute_advantage(batch, self.config.algorithm)
+
+                # 更新critic
+                if self.use_critic:
+                    with marked_timer("update_critic", timing_raw, color="pink"):
+                        critic_output = self.critic_wg.update_critic(batch)
+                    critic_output_metrics = reduce_metrics(critic_output.meta_info["metrics"])
+                    metrics.update(critic_output_metrics)
+
+                # 更新actor
+                if self.config.trainer.critic_warmup <= self.global_steps:
+                    with marked_timer("update_actor", timing_raw, color="red"):
+                        batch.meta_info["multi_turn"] = self.config.actor_rollout_ref.rollout.multi_turn.enable
+                        actor_output = self.actor_wg.update_actor(batch)
+                    actor_output_metrics = reduce_metrics(actor_output.meta_info["metrics"])
+                    metrics.update(actor_output_metrics)
+
+                    # 更新参数版本并同步到Rollouter
+                    with marked_timer("sync_params", timing_raw, color="purple"):
+                        self._update_param_version_and_sync()
+
+                # 记录rollout生成
+                rollout_data_dir = self.config.trainer.get("rollout_data_dir", None)
+                if rollout_data_dir:
+                    with marked_timer("dump_rollout_generations", timing_raw, color="green"):
+                        inputs = self.tokenizer.batch_decode(batch.batch["prompts"], skip_special_tokens=True)
+                        outputs = self.tokenizer.batch_decode(batch.batch["responses"], skip_special_tokens=True)
+                        scores = batch.batch["token_level_scores"].sum(-1).cpu().tolist()
+                        self._dump_generations(
+                            inputs=inputs,
+                            outputs=outputs,
+                            scores=scores,
+                            reward_extra_infos_dict=reward_extra_infos_dict,
+                            dump_path=rollout_data_dir,
+                        )
+
+                # 验证
+                if (
+                    self.val_reward_fn is not None
+                    and self.config.trainer.val_freq is not None
+                    and self.global_steps % self.config.trainer.val_freq == 0
+                ):
+                    with marked_timer("validation", timing_raw, color="brown"):
+                        val_metrics = self._validate()
+                        if val_metrics:
+                            pprint(f"Validation metrics at step {self.global_steps}: {val_metrics}")
+                            last_val_metrics = val_metrics
+
+            # 计算性能指标
+            timing_metrics = compute_timing_metrics(timing_raw)
+            throughput_metrics = compute_throughout_metrics(timing_raw, len(batch))
+            data_metrics = compute_data_metrics(batch, self.tokenizer)
+
+            # 添加样本新鲜度指标
+            freshness_metrics = {
+                "avg_sample_age": avg_sample_age,
+                "max_sample_age": max_sample_age,
+                "processed_samples": self.processed_samples,
+                "param_version": self.current_param_version,
+            }
+
+            metrics.update(timing_metrics)
+            metrics.update(throughput_metrics)
+            metrics.update(data_metrics)
+            metrics.update(freshness_metrics)
+
+            if last_val_metrics is not None:
+                metrics.update(last_val_metrics)
+                last_val_metrics = None
+
+            # 记录日志
+            logger.log(data=metrics, step=self.global_steps)
+
+            # 更新进度条
+            progress_bar.update(1)
+            progress_bar.set_postfix(
+                {
+                    "reward": f"{metrics.get('reward/mean', 0):.3f}",
+                    "kl": f"{metrics.get('actor/approx_kl', 0):.3f}",
+                    "queue_size": self.message_queue_client.get_queue_size(),
+                    "param_version": self.current_param_version,
+                }
+            )
+
+            # 保存检查点
+            if self.config.trainer.save_freq is not None and self.global_steps % self.config.trainer.save_freq == 0:
+                self._save_checkpoint()
+
+            if do_profile:
+                self.actor_wg.end_profile()
+                if self.use_reference_policy:
+                    self.ref_policy_wg.end_profile()
+                if self.use_critic:
+                    self.critic_wg.end_profile()
+                if self.use_rm:
+                    self.rm_wg.end_profile()
+
+            self.global_steps += 1
+            self.processed_samples += len(batch_samples)
+
+        progress_bar.close()
+        logger.info(f"Training completed after {self.global_steps} steps")
+
+        # 最终验证
+        if self.val_reward_fn is not None:
+            val_metrics = self._validate()
+            if val_metrics:
+                pprint(f"Final validation metrics: {val_metrics}")
+                logger.log(data=val_metrics, step=self.global_steps)
+
+        # 最终检查点保存
+        self._save_checkpoint()
+
+    def get_statistics(self) -> dict:
+        """获取训练统计信息"""
+        return {
+            "global_steps": self.global_steps,
+            "processed_samples": self.processed_samples,
+            "stale_samples_processed": self.stale_samples_processed,
+            "current_param_version": self.current_param_version,
+            "queue_size": self.message_queue_client.get_queue_size() if self.message_queue_client else 0,
+        }
diff --git a/recipe/fully_async_policy/message_queue.py b/recipe/fully_async_policy/message_queue.py
new file mode 100644
index 00000000000..e28346a9ccd
--- /dev/null
+++ b/recipe/fully_async_policy/message_queue.py
@@ -0,0 +1,238 @@
+# Copyright 2025 Meituan Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import threading
+import time
+import uuid
+from collections import deque
+from dataclasses import dataclass
+from typing import Any, Optional
+
+import ray
+import zmq
+from filelock import FileLock
+from omegaconf import DictConfig
+
+from verl import DataProto
+
+
+@dataclass
+class BatchSample:
+    """单个batch样本，包含参数版本和新鲜度信息"""
+
+    batch_id: str
+    epoch: int
+    data: DataProto
+    param_version: int
+    timestamp: float
+    rollout_metadata: dict[str, Any]
+
+
+@ray.remote(num_cpus=1)
+class MessageQueue:
+    """
+    基于ZeroMQ的异步消息队列，用于Rollouter和Trainer之间的通信
+    """
+
+    def __init__(self, config: DictConfig, max_queue_size: int = 1000):
+        self.config = config
+        self.max_queue_size = max_queue_size
+        self.queue = deque(maxlen=max_queue_size)
+        self.current_param_version = 0
+        self.freshness_threshold = config.async_training.get("freshness_threshold", 3)
+
+        # ZeroMQ setup
+        self.context = zmq.Context()
+        self.socket = None
+        self.address = None
+        self._setup_zmq()
+
+        # Threading for message handling
+        self.running = True
+        self.lock = threading.RLock()
+        self.consumer_waiting = False
+        self.consumer_condition = threading.Condition(self.lock)
+
+        # Statistics
+        self.total_produced = 0
+        self.total_consumed = 0
+        self.dropped_samples = 0
+
+    def _setup_zmq(self):
+        """设置ZeroMQ socket"""
+        with FileLock("/tmp/verl_message_queue.lock"):
+            # 使用TCP socket
+            import socket as sock
+
+            with sock.socket() as s:
+                s.bind(("", 0))
+                port = s.getsockname()[1]
+
+            self.address = f"tcp://127.0.0.1:{port}"
+            self.socket = self.context.socket(zmq.PAIR)
+            self.socket.bind(self.address)
+
+    def put_batch(
+        self, epoch: int, batch: DataProto, param_version: int, rollout_metadata: dict[str, Any] = None
+    ) -> bool:
+        """
+        放入一个batch样本到队列
+
+        Args:
+            epoch: 当前epoch
+            batch: 样本数据
+            param_version: 参数版本号
+            rollout_metadata: rollout相关的元数据
+
+        Returns:
+            bool: 是否成功放入队列
+        """
+        with self.lock:
+            # 检查新鲜度
+            staleness = self.current_param_version - param_version
+            if staleness >= self.freshness_threshold:
+                self.dropped_samples += 1
+                return False
+
+            sample = BatchSample(
+                batch_id=str(uuid.uuid4()),
+                epoch=epoch,
+                data=batch,
+                param_version=param_version,
+                timestamp=time.time(),
+                rollout_metadata=rollout_metadata or {},
+            )
+
+            # 如果队列满了，移除最旧的样本
+            if len(self.queue) >= self.max_queue_size:
+                removed = self.queue.popleft()
+                self.dropped_samples += 1
+                print(f"Queue full, dropped sample {removed.batch_id}")
+
+            self.queue.append(sample)
+            self.total_produced += 1
+
+            # 通知等待的消费者
+            if self.consumer_waiting:
+                self.consumer_condition.notify()
+
+            return True
+
+    def get_batch(self, min_batch_count: int = 1, timeout: float = 30.0) -> Optional[list[BatchSample]]:
+        """
+        从队列获取batch样本
+
+        Args:
+            min_batch_count: 最小batch数量
+            timeout: 超时时间（秒）
+
+        Returns:
+            Optional[List[BatchSample]]: 获取的样本列表，如果超时返回None
+        """
+        with self.lock:
+            start_time = time.time()
+
+            while len(self.queue) < min_batch_count:
+                if time.time() - start_time > timeout:
+                    return None
+
+                self.consumer_waiting = True
+                self.consumer_condition.wait(timeout=1.0)
+                self.consumer_waiting = False
+
+            # 获取指定数量的样本
+            batch_count = min(min_batch_count, len(self.queue))
+            samples = []
+            for _ in range(batch_count):
+                if self.queue:
+                    samples.append(self.queue.popleft())
+
+            self.total_consumed += len(samples)
+            return samples
+
+    def update_param_version(self, version: int):
+        """更新当前参数版本"""
+        with self.lock:
+            self.current_param_version = version
+
+    def get_queue_size(self) -> int:
+        """获取当前队列长度"""
+        with self.lock:
+            return len(self.queue)
+
+    def get_statistics(self) -> dict[str, Any]:
+        """获取队列统计信息"""
+        with self.lock:
+            return {
+                "queue_size": len(self.queue),
+                "total_produced": self.total_produced,
+                "total_consumed": self.total_consumed,
+                "dropped_samples": self.dropped_samples,
+                "current_param_version": self.current_param_version,
+                "freshness_threshold": self.freshness_threshold,
+            }
+
+    def clear_queue(self):
+        """清空队列"""
+        with self.lock:
+            self.queue.clear()
+
+    def shutdown(self):
+        """关闭消息队列"""
+        self.running = False
+        if self.socket:
+            self.socket.close()
+        if self.context:
+            self.context.term()
+
+    def get_address(self) -> str:
+        """获取ZeroMQ地址"""
+        return self.address
+
+
+class MessageQueueClient:
+    """MessageQueue的客户端，用于与MessageQueue Actor通信"""
+
+    def __init__(self, queue_actor: ray.ActorHandle):
+        self.queue_actor = queue_actor
+
+    def put_batch(
+        self, epoch: int, batch: DataProto, param_version: int, rollout_metadata: dict[str, Any] = None
+    ) -> bool:
+        """放入batch到队列"""
+        return ray.get(self.queue_actor.put_batch.remote(epoch, batch, param_version, rollout_metadata))
+
+    def get_batch(self, min_batch_count: int = 1, timeout: float = 30.0) -> Optional[list[BatchSample]]:
+        """从队列获取batch"""
+        return ray.get(self.queue_actor.get_batch.remote(min_batch_count, timeout))
+
+    def update_param_version(self, version: int):
+        """更新参数版本"""
+        ray.get(self.queue_actor.update_param_version.remote(version))
+
+    def get_queue_size(self) -> int:
+        """获取队列大小"""
+        return ray.get(self.queue_actor.get_queue_size.remote())
+
+    def get_statistics(self) -> dict[str, Any]:
+        """获取统计信息"""
+        return ray.get(self.queue_actor.get_statistics.remote())
+
+    def clear_queue(self):
+        """清空队列"""
+        ray.get(self.queue_actor.clear_queue.remote())
+
+    def shutdown(self):
+        """关闭队列"""
+        ray.get(self.queue_actor.shutdown.remote())
diff --git a/recipe/fully_async_policy/param_sync.py b/recipe/fully_async_policy/param_sync.py
new file mode 100644
index 00000000000..272f890cbbc
--- /dev/null
+++ b/recipe/fully_async_policy/param_sync.py
@@ -0,0 +1,175 @@
+# Copyright 2025 Meituan Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+
+import ray
+from ray.util.collective import collective
+
+logger = logging.getLogger(__name__)
+
+
+class ParameterSynchronizer:
+    """
+    参数同步器，负责在actor和rollout之间同步模型参数
+    """
+
+    def __init__(self, config):
+        self.config = config
+        self.weights_info = None
+        self.sync_group_initialized = False
+
+    def initialize_sync_group(self, actor_workers: list, rollout_workers: list):
+        """
+        初始化参数同步组
+
+        Args:
+            actor_workers: actor worker列表
+            rollout_workers: rollout worker列表
+        """
+        logger.info("Initializing parameter synchronization group...")
+
+        try:
+            # 获取actor的权重信息
+            if actor_workers:
+                self.weights_info = ray.get(actor_workers[0].get_actor_weights_info.remote())[0]
+
+                # 设置rollout的权重信息
+                for rollout_worker in rollout_workers:
+                    ray.get(rollout_worker.set_actor_weights_info.remote(self.weights_info))
+
+            # 创建actor-rollout通信组
+            all_workers = actor_workers + rollout_workers
+            collective.create_collective_group(
+                all_workers,
+                len(all_workers),
+                list(range(0, len(all_workers))),
+                backend="nccl",
+                group_name="actor_rollout",
+            )
+
+            self.sync_group_initialized = True
+            logger.info("Parameter synchronization group initialized successfully")
+
+        except Exception as e:
+            logger.error(f"Failed to initialize sync group: {e}")
+            raise
+
+    def sync_weights(self, actor_workers: list, rollout_workers: list):
+        """
+        同步权重从actor到rollout
+
+        Args:
+            actor_workers: actor worker列表
+            rollout_workers: rollout worker列表
+        """
+        if not self.sync_group_initialized:
+            raise RuntimeError("Sync group not initialized. Call initialize_sync_group() first.")
+
+        logger.debug("Synchronizing weights from actor to rollout...")
+
+        try:
+            # 同步权重
+            sync_futures = []
+
+            # Actor端同步
+            for actor_worker in actor_workers:
+                future = actor_worker.sync_rollout_weights.remote()
+                sync_futures.append(future)
+
+            # Rollout端同步
+            for rollout_worker in rollout_workers:
+                future = rollout_worker.sync_rollout_weights.remote()
+                sync_futures.append(future)
+
+            # 等待所有同步完成
+            ray.get(sync_futures)
+
+            logger.debug("Weight synchronization completed")
+
+        except Exception as e:
+            logger.error(f"Failed to sync weights: {e}")
+            raise
+
+
+@ray.remote
+class ParameterSyncManager:
+    """
+    Ray Actor形式的参数同步管理器
+    """
+
+    def __init__(self, config):
+        self.config = config
+        self.synchronizer = ParameterSynchronizer(config)
+        self.actor_workers = []
+        self.rollout_workers = []
+
+    def register_workers(self, actor_workers: list, rollout_workers: list):
+        """注册worker"""
+        self.actor_workers = actor_workers
+        self.rollout_workers = rollout_workers
+
+        # 初始化同步组
+        self.synchronizer.initialize_sync_group(actor_workers, rollout_workers)
+
+    def sync_parameters(self):
+        """执行参数同步"""
+        self.synchronizer.sync_weights(self.actor_workers, self.rollout_workers)
+        return True
+
+    def get_weights_info(self):
+        """获取权重信息"""
+        return self.synchronizer.weights_info
+
+
+class AsyncParameterSynchronizer:
+    """
+    异步参数同步器，用于完全异步训练工作流
+    """
+
+    def __init__(self, config, actor_wg, rollouter_actor):
+        """
+        Args:
+            config: 配置
+            actor_wg: actor worker group
+            rollouter_actor: rollouter actor引用
+        """
+        self.config = config
+        self.actor_wg = actor_wg
+        self.rollouter_actor = rollouter_actor
+        self.current_version = 0
+
+    def sync_to_rollouter(self, new_version: int):
+        """
+        将actor参数同步到rollouter
+
+        Args:
+            new_version: 新的参数版本号
+        """
+        logger.info(f"Syncing parameters to rollouter, version: {new_version}")
+
+        try:
+            # 通知rollouter更新参数
+            ray.get(self.rollouter_actor.update_rollout_weights.remote(new_version))
+
+            self.current_version = new_version
+            logger.info(f"Parameter sync to rollouter completed, version: {new_version}")
+
+        except Exception as e:
+            logger.error(f"Failed to sync parameters to rollouter: {e}")
+            raise
+
+    def get_current_version(self) -> int:
+        """获取当前参数版本"""
+        return self.current_version
diff --git a/recipe/fully_async_policy/rollouter.py b/recipe/fully_async_policy/rollouter.py
new file mode 100644
index 00000000000..d98f5e5fdf5
--- /dev/null
+++ b/recipe/fully_async_policy/rollouter.py
@@ -0,0 +1,414 @@
+# Copyright 2025 Meituan Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import threading
+import time
+import uuid
+from typing import Optional
+
+import numpy as np
+import ray
+from omegaconf import OmegaConf
+from torch.utils.data import Dataset, Sampler
+
+from recipe.fully_async_policy.message_queue import MessageQueueClient
+from verl import DataProto
+from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup
+from verl.single_controller.ray.base import create_colocated_worker_cls
+from verl.trainer.ppo.ray_trainer import ResourcePoolManager, Role, WorkerType
+from verl.utils.debug import marked_timer
+
+logger = logging.getLogger(__name__)
+
+
+class RolloutController:
+    """控制rollout的暂停和恢复"""
+
+    def __init__(self):
+        self.is_paused = False
+        self.pause_event = threading.Event()
+        self.resume_event = threading.Event()
+        self.resume_event.set()  # 初始状态为可运行
+        self.pending_requests = []
+        self.lock = threading.RLock()
+
+    def pause(self):
+        """暂停rollout"""
+        with self.lock:
+            if not self.is_paused:
+                self.is_paused = True
+                self.resume_event.clear()
+                self.pause_event.set()
+                logger.info("Rollout paused")
+
+    def resume(self):
+        """恢复rollout"""
+        with self.lock:
+            if self.is_paused:
+                self.is_paused = False
+                self.pause_event.clear()
+                self.resume_event.set()
+                logger.info("Rollout resumed")
+
+    def wait_if_paused(self, timeout: float = None):
+        """如果被暂停则等待恢复"""
+        if self.is_paused:
+            self.resume_event.wait(timeout)
+
+    def is_pause_requested(self) -> bool:
+        """检查是否有暂停请求"""
+        return self.pause_event.is_set()
+
+
+class Rollouter:
+    """
+    异步样本生成器，负责持续生成训练样本并放入MessageQueue
+    """
+
+    def __init__(
+        self,
+        config,
+        tokenizer,
+        role_worker_mapping: dict[Role, WorkerType],
+        resource_pool_manager: ResourcePoolManager,
+        ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup,
+        processor=None,
+        train_dataset: Optional[Dataset] = None,
+        collate_fn=None,
+        train_sampler: Optional[Sampler] = None,
+        device_name="cuda",
+    ):
+        self.config = config
+        self.tokenizer = tokenizer
+        self.processor = processor
+        self.role_worker_mapping = role_worker_mapping
+        self.resource_pool_manager = resource_pool_manager
+        self.ray_worker_group_cls = ray_worker_group_cls
+        self.device_name = device_name
+
+        # 数据相关
+        self.train_dataset = train_dataset
+        self.collate_fn = collate_fn
+        self.train_sampler = train_sampler
+
+        # Rollout控制
+        self.rollout_controller = RolloutController()
+        self.current_param_version = 0
+
+        # 新鲜度控制
+        self.freshness_threshold = config.async_training.get("freshness_threshold", 3)
+        self.max_staleness_allowed = config.async_training.get("max_staleness_allowed", 5)
+
+        # 统计信息
+        self.total_generated_samples = 0
+        self.dropped_stale_samples = 0
+        self.pause_count = 0
+
+        # Worker groups
+        self.rollout_wg = None
+        self.message_queue_client = None
+
+        # 运行状态
+        self.running = False
+        self.generation_thread = None
+
+    def init_workers(self):
+        """初始化rollout workers"""
+        logger.info("Initializing Rollouter workers...")
+
+        self.resource_pool_manager.create_resource_pool()
+        self.resource_pool_to_cls = {pool: {} for pool in self.resource_pool_manager.resource_pool_dict.values()}
+
+        # 只创建rollout worker
+        resource_pool = self.resource_pool_manager.get_resource_pool(Role.Rollout)
+        role_cls = RayClassWithInitArgs(
+            cls=self.role_worker_mapping[Role.Rollout],
+            config=self.config.actor_rollout_ref,
+            role="rollout",
+        )
+        self.resource_pool_to_cls[resource_pool]["rollout"] = role_cls
+
+        # 初始化WorkerGroup
+        all_wg = {}
+        wg_kwargs = {}
+        if OmegaConf.select(self.config.trainer, "ray_wait_register_center_timeout") is not None:
+            wg_kwargs["ray_wait_register_center_timeout"] = self.config.trainer.ray_wait_register_center_timeout
+
+        for resource_pool, class_dict in self.resource_pool_to_cls.items():
+            worker_dict_cls = create_colocated_worker_cls(class_dict=class_dict)
+            wg_dict = self.ray_worker_group_cls(
+                resource_pool=resource_pool,
+                ray_cls_with_init=worker_dict_cls,
+                device_name=self.device_name,
+                **wg_kwargs,
+            )
+            spawn_wg = wg_dict.spawn(prefix_set=class_dict.keys())
+            all_wg.update(spawn_wg)
+
+        self.rollout_wg = all_wg["rollout"]
+        self.rollout_wg.init_model()
+        logger.info("Rollouter workers initialized successfully")
+
+    def set_message_queue_client(self, message_queue_client: MessageQueueClient):
+        """设置消息队列客户端"""
+        self.message_queue_client = message_queue_client
+
+    def update_rollout_weights(self, param_version: int):
+        """
+        更新rollout模型参数
+        这个方法由外部Trainer调用
+        """
+        logger.info(f"Updating rollout weights to version {param_version}")
+
+        # 暂停rollout
+        self.rollout_controller.pause()
+
+        try:
+            # 暂停推理引擎
+            ray.get(self.rollout_wg.sleep.remote())
+
+            # 执行参数同步
+            # 这里需要与actor建立同步机制
+            if hasattr(self, "param_synchronizer") and self.param_synchronizer:
+                self.param_synchronizer.sync_weights()
+            else:
+                logger.warning("Parameter synchronizer not available, skipping weight sync")
+
+            # 更新参数版本
+            self.current_param_version = param_version
+
+            # 恢复推理引擎
+            ray.get(self.rollout_wg.wake_up.remote())
+
+        finally:
+            # 恢复rollout
+            self.rollout_controller.resume()
+
+        logger.info(f"Rollout weights updated to version {param_version}")
+
+    def set_parameter_synchronizer(self, param_synchronizer):
+        """设置参数同步器"""
+        self.param_synchronizer = param_synchronizer
+
+    def _create_dataloader(self):
+        """创建数据加载器"""
+        from torch.utils.data import DataLoader
+
+        return DataLoader(
+            self.train_dataset,
+            batch_size=self.config.data.train_batch_size,
+            sampler=self.train_sampler,
+            collate_fn=self.collate_fn,
+            num_workers=self.config.data.get("dataloader_num_workers", 0),
+            drop_last=True,
+        )
+
+    def _create_continuous_iterator(self):
+        """创建连续的数据迭代器"""
+        dataloader = self._create_dataloader()
+
+        for epoch in range(self.config.trainer.total_epochs):
+            for batch_dict in dataloader:
+                yield epoch, batch_dict
+
+    def _should_pause_generation(self) -> bool:
+        """
+        判断是否应该暂停生成，基于新鲜度控制
+        """
+        if self.message_queue_client is None:
+            return False
+
+        queue_stats = self.message_queue_client.get_statistics()
+        queue_size = queue_stats["queue_size"]
+        current_trainer_version = queue_stats["current_param_version"]
+
+        # 计算参数版本差异
+        version_diff = self.current_param_version - current_trainer_version
+
+        # 如果版本差异过大，暂停生成
+        if version_diff >= self.max_staleness_allowed:
+            logger.info(
+                f"Pausing generation due to staleness: rollout_version={self.current_param_version}, "
+                f"trainer_version={current_trainer_version}, diff={version_diff}"
+            )
+            return True
+
+        # 如果队列太满，也暂停生成
+        max_queue_size = self.freshness_threshold * self.config.data.train_batch_size
+        if queue_size >= max_queue_size:
+            logger.info(f"Pausing generation due to full queue: size={queue_size}, max={max_queue_size}")
+            return True
+
+        return False
+
+    def _generate_batch(self, epoch: int, batch_dict: dict) -> Optional[DataProto]:
+        """生成单个batch的样本"""
+        try:
+            batch = DataProto.from_single_dict(batch_dict)
+
+            # 处理batch用于生成
+            batch_keys_to_pop = ["input_ids", "attention_mask", "position_ids"]
+            non_tensor_batch_keys_to_pop = ["raw_prompt_ids"]
+
+            # 处理多模态数据
+            if "multi_modal_data" in batch.non_tensor_batch:
+                non_tensor_batch_keys_to_pop.append("multi_modal_data")
+            if "raw_prompt" in batch.non_tensor_batch:
+                non_tensor_batch_keys_to_pop.append("raw_prompt")
+            if "tools_kwargs" in batch.non_tensor_batch:
+                non_tensor_batch_keys_to_pop.append("tools_kwargs")
+            if "interaction_kwargs" in batch.non_tensor_batch:
+                non_tensor_batch_keys_to_pop.append("interaction_kwargs")
+
+            gen_batch = batch.pop(
+                batch_keys=batch_keys_to_pop,
+                non_tensor_batch_keys=non_tensor_batch_keys_to_pop,
+            )
+
+            # 重复生成多个响应
+            gen_batch = gen_batch.repeat(repeat_times=self.config.actor_rollout_ref.rollout.n, interleave=True)
+
+            # 执行生成
+            if self.config.actor_rollout_ref.rollout.mode == "async":
+                gen_batch_output = ray.get(self.rollout_wg.async_generate_sequences.remote(gen_batch))
+            else:
+                gen_batch_output = ray.get(self.rollout_wg.generate_sequences.remote(gen_batch))
+
+            # 添加UID
+            batch.non_tensor_batch["uid"] = np.array([str(uuid.uuid4()) for _ in range(len(batch.batch))], dtype=object)
+
+            # 重复原始batch以对齐生成的响应
+            batch = batch.repeat(repeat_times=self.config.actor_rollout_ref.rollout.n, interleave=True)
+
+            # 合并数据
+            final_batch = batch.union(gen_batch_output)
+
+            return final_batch
+
+        except Exception as e:
+            logger.error(f"Error generating batch: {e}")
+            return None
+
+    def _generation_loop(self):
+        """主要的生成循环"""
+        logger.info("Starting generation loop...")
+
+        continuous_iterator = self._create_continuous_iterator()
+
+        for epoch, batch_dict in continuous_iterator:
+            if not self.running:
+                break
+
+            # 等待如果被暂停
+            self.rollout_controller.wait_if_paused(timeout=1.0)
+
+            if not self.running:
+                break
+
+            # 检查是否应该暂停生成
+            if self._should_pause_generation():
+                time.sleep(1.0)  # 等待一段时间再检查
+                continue
+
+            # 生成样本
+            timing_raw = {}
+            with marked_timer("generate_batch", timing_raw):
+                generated_batch = self._generate_batch(epoch, batch_dict)
+
+            if generated_batch is not None:
+                # 放入队列
+                rollout_metadata = {
+                    "timing": timing_raw,
+                    "generation_timestamp": time.time(),
+                }
+
+                success = self.message_queue_client.put_batch(
+                    epoch=epoch,
+                    batch=generated_batch,
+                    param_version=self.current_param_version,
+                    rollout_metadata=rollout_metadata,
+                )
+
+                if success:
+                    self.total_generated_samples += 1
+                    if self.total_generated_samples % 10 == 0:
+                        logger.info(
+                            f"Generated {self.total_generated_samples} batches, "
+                            f"param_version={self.current_param_version}"
+                        )
+                else:
+                    self.dropped_stale_samples += 1
+                    logger.warning(f"Dropped stale sample, total dropped: {self.dropped_stale_samples}")
+
+        logger.info("Generation loop finished")
+
+    def fit(self):
+        """开始异步生成样本"""
+        logger.info("Starting Rollouter...")
+
+        if self.message_queue_client is None:
+            raise ValueError("MessageQueue client not set. Call set_message_queue_client() first.")
+
+        self.running = True
+
+        # 在单独的线程中运行生成循环
+        self.generation_thread = threading.Thread(target=self._generation_loop, daemon=True)
+        self.generation_thread.start()
+
+        try:
+            # 主线程保持运行，处理控制信号
+            while self.running:
+                time.sleep(1.0)
+
+                # 定期打印统计信息
+                if self.total_generated_samples > 0 and self.total_generated_samples % 100 == 0:
+                    queue_stats = self.message_queue_client.get_statistics()
+                    logger.info(
+                        f"Rollouter stats - Generated: {self.total_generated_samples}, "
+                        f"Dropped: {self.dropped_stale_samples}, "
+                        f"Queue size: {queue_stats['queue_size']}, "
+                        f"Param version: {self.current_param_version}"
+                    )
+
+        except KeyboardInterrupt:
+            logger.info("Received interrupt signal, shutting down...")
+        finally:
+            self.shutdown()
+
+    def shutdown(self):
+        """关闭Rollouter"""
+        logger.info("Shutting down Rollouter...")
+
+        self.running = False
+
+        # 恢复可能被暂停的生成线程
+        self.rollout_controller.resume()
+
+        # 等待生成线程结束
+        if self.generation_thread and self.generation_thread.is_alive():
+            self.generation_thread.join(timeout=5.0)
+
+        logger.info("Rollouter shutdown complete")
+
+    def get_statistics(self) -> dict:
+        """获取统计信息"""
+        return {
+            "total_generated_samples": self.total_generated_samples,
+            "dropped_stale_samples": self.dropped_stale_samples,
+            "current_param_version": self.current_param_version,
+            "pause_count": self.pause_count,
+            "is_running": self.running,
+            "is_paused": self.rollout_controller.is_paused,
+        }
diff --git a/recipe/fully_async_policy/run_fully_async_example.sh b/recipe/fully_async_policy/run_fully_async_example.sh
new file mode 100644
index 00000000000..d58e4ecc771
--- /dev/null
+++ b/recipe/fully_async_policy/run_fully_async_example.sh
@@ -0,0 +1,149 @@
+#!/bin/bash
+# Copyright 2025 Meituan Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -x
+
+# 实验配置
+project_name='FullyAsyncPPO'
+exp_name='async-qwen2.5-7b-test'
+
+# 模型和数据路径
+MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen2.5-7B-Instruct"}
+TRAIN_FILE=${TRAIN_FILE:-"~/data/train.parquet"}
+VAL_FILE=${VAL_FILE:-"~/data/val.parquet"}
+
+# 硬件配置
+NNODES=${NNODES:-1}
+NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
+
+# 异步训练资源分配
+n_gpus_rollout=3  # rollout专用GPU数量
+n_gpus_training=$((NGPUS_PER_NODE - n_gpus_rollout))  # 训练GPU数量
+
+echo "==================================="
+echo "完全异步PPO训练启动"
+echo "==================================="
+echo "模型路径: $MODEL_PATH"
+echo "训练数据: $TRAIN_FILE"
+echo "验证数据: $VAL_FILE"
+echo "节点数: $NNODES"
+echo "每节点GPU数: $NGPUS_PER_NODE"
+echo "Rollout GPU数: $n_gpus_rollout"
+echo "训练GPU数: $n_gpus_training"
+echo "==================================="
+
+# 算法参数
+temperature=1.0
+top_p=1.0
+top_k=-1
+
+# 序列长度
+max_prompt_length=1024
+max_response_length=1024
+
+# 异步训练参数
+freshness_threshold=3
+max_staleness_allowed=5
+max_queue_size=1000
+min_batch_count=1
+batch_timeout=30.0
+
+# 训练参数
+train_batch_size=128
+total_training_steps=1000
+save_freq=100
+val_freq=50
+
+# 设置环境变量
+export NCCL_DEBUG=WARN
+export VLLM_USE_V1=1
+export VERL_LOGGING_LEVEL=INFO
+
+# 启动训练
+python -m recipe.one_step_off_policy.fully_async_main \
+    trainer.project_name="$project_name" \
+    trainer.experiment_name="$exp_name" \
+    trainer.device=cuda \
+    trainer.nnodes=$NNODES \
+    trainer.n_gpus_per_node=$NGPUS_PER_NODE \
+    data.train_files="$TRAIN_FILE" \
+    data.val_files="$VAL_FILE" \
+    data.train_batch_size=$train_batch_size \
+    data.max_prompt_length=$max_prompt_length \
+    data.max_response_length=$max_response_length \
+    data.train_files="$TRAIN_FILE" \
+    data.val_files="$VAL_FILE" \
+    data.train_batch_size=$train_batch_size \
+    data.max_prompt_length=$max_prompt_length \
+    data.max_response_length=$max_response_length \
+    \
+    # 模型配置
+    actor_rollout_ref.model.path="$MODEL_PATH" \
+    actor_rollout_ref.model.lora_rank=64 \
+    actor_rollout_ref.model.lora_alpha=128 \
+    \
+    # Rollout配置
+    actor_rollout_ref.rollout.mode=async \
+    actor_rollout_ref.rollout.n_gpus=$n_gpus_rollout \
+    actor_rollout_ref.rollout.name=vllm \
+    actor_rollout_ref.rollout.temperature=$temperature \
+    actor_rollout_ref.rollout.top_k=$top_k \
+    actor_rollout_ref.rollout.top_p=$top_p \
+    actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
+    actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
+    actor_rollout_ref.rollout.max_num_batched_tokens=8192 \
+    actor_rollout_ref.rollout.free_cache_engine=true \
+    actor_rollout_ref.rollout.enforce_eager=true \
+    \
+    # Actor配置
+    actor_rollout_ref.actor.optim.lr=1e-6 \
+    actor_rollout_ref.actor.ppo_mini_batch_size=32 \
+    actor_rollout_ref.actor.use_dynamic_bsz=true \
+    actor_rollout_ref.actor.fsdp_config.param_offload=false \
+    actor_rollout_ref.actor.fsdp_config.optimizer_offload=false \
+    \
+    # Critic配置
+    critic.model.path="$MODEL_PATH" \
+    critic.optim.lr=1e-5 \
+    critic.fsdp_config.param_offload=false \
+    \
+    # 异步训练配置
+    async_training.freshness_threshold=$freshness_threshold \
+    async_training.max_staleness_allowed=$max_staleness_allowed \
+    async_training.max_queue_size=$max_queue_size \
+    async_training.min_batch_count=$min_batch_count \
+    async_training.batch_timeout=$batch_timeout \
+    \
+    # 训练配置
+    trainer.total_training_steps=$total_training_steps \
+    trainer.save_freq=$save_freq \
+    trainer.val_freq=$val_freq \
+    trainer.critic_warmup=0 \
+    \
+    # 算法配置
+    algorithm.adv_estimator=gae \
+    algorithm.cliprange=0.2 \
+    algorithm.vf_coeff=0.1 \
+    algorithm.entropy_coeff=0.01 \
+    algorithm.kl_coeff=0.1 \
+    \
+    # 日志配置
+    trainer.logger='["console", "wandb"]' \
+    trainer.val_before_train=false
+
+echo "==================================="
+echo "完全异步PPO训练完成"
+echo "==================================="
+
diff --git a/recipe/fully_async_policy/test_fully_async.py b/recipe/fully_async_policy/test_fully_async.py
new file mode 100644
index 00000000000..b2f7f866fd7
--- /dev/null
+++ b/recipe/fully_async_policy/test_fully_async.py
@@ -0,0 +1,197 @@
+# Copyright 2025 Meituan Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+测试完全异步训练工作流的组件
+"""
+
+import logging
+import unittest
+from unittest.mock import Mock
+
+import ray
+from omegaconf import OmegaConf
+
+from recipe.fully_async_policy.message_queue import MessageQueue, MessageQueueClient
+from verl import DataProto
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+class TestMessageQueue(unittest.TestCase):
+    """测试MessageQueue组件"""
+
+    def setUp(self):
+        """设置测试环境"""
+        if not ray.is_initialized():
+            ray.init(local_mode=True)
+
+        config = OmegaConf.create(
+            {
+                "async_training": {
+                    "freshness_threshold": 3,
+                    "max_staleness_allowed": 5,
+                }
+            }
+        )
+
+        self.message_queue = MessageQueue.remote(config, max_queue_size=100)
+        self.client = MessageQueueClient(self.message_queue)
+
+    def tearDown(self):
+        """清理测试环境"""
+        ray.get(self.message_queue.shutdown.remote())
+        if ray.is_initialized():
+            ray.shutdown()
+
+    def test_basic_put_get(self):
+        """测试基本的put和get操作"""
+        # 创建mock数据
+        mock_batch = Mock(spec=DataProto)
+
+        # 放入样本
+        success = self.client.put_batch(epoch=0, batch=mock_batch, param_version=1, rollout_metadata={"test": "data"})
+        self.assertTrue(success)
+
+        # 获取样本
+        samples = self.client.get_batch(min_batch_count=1, timeout=5.0)
+        self.assertIsNotNone(samples)
+        self.assertEqual(len(samples), 1)
+        self.assertEqual(samples[0].epoch, 0)
+        self.assertEqual(samples[0].param_version, 1)
+
+    def test_freshness_control(self):
+        """测试新鲜度控制"""
+        mock_batch = Mock(spec=DataProto)
+
+        # 更新参数版本
+        self.client.update_param_version(10)
+
+        # 尝试放入过期样本
+        success = self.client.put_batch(
+            epoch=0,
+            batch=mock_batch,
+            param_version=5,  # 版本差异为5，超过阈值3
+            rollout_metadata={},
+        )
+        self.assertFalse(success)  # 应该被拒绝
+
+    def test_queue_statistics(self):
+        """测试队列统计信息"""
+        stats = self.client.get_statistics()
+        self.assertIn("queue_size", stats)
+        self.assertIn("total_produced", stats)
+        self.assertIn("total_consumed", stats)
+        self.assertIn("dropped_samples", stats)
+
+
+class TestRollouterComponents(unittest.TestCase):
+    """测试Rollouter相关组件"""
+
+    def setUp(self):
+        """设置测试环境"""
+        from .rollouter import RolloutController
+
+        self.controller = RolloutController()
+
+    def test_rollout_controller(self):
+        """测试rollout控制器"""
+        # 初始状态应该是运行的
+        self.assertFalse(self.controller.is_paused)
+
+        # 测试暂停
+        self.controller.pause()
+        self.assertTrue(self.controller.is_paused)
+
+        # 测试恢复
+        self.controller.resume()
+        self.assertFalse(self.controller.is_paused)
+
+
+class TestParameterSync(unittest.TestCase):
+    """测试参数同步组件"""
+
+    def test_async_parameter_synchronizer(self):
+        """测试异步参数同步器"""
+        from recipe.fully_async_policy.param_sync import AsyncParameterSynchronizer
+
+        config = OmegaConf.create({})
+        mock_actor_wg = Mock()
+        mock_rollouter_actor = Mock()
+
+        sync = AsyncParameterSynchronizer(config, mock_actor_wg, mock_rollouter_actor)
+
+        self.assertEqual(sync.get_current_version(), 0)
+
+
+def test_integration():
+    """集成测试"""
+    logger.info("Starting integration test...")
+
+    if not ray.is_initialized():
+        ray.init(local_mode=True)
+
+    try:
+        # 测试MessageQueue和客户端的集成
+        config = OmegaConf.create(
+            {
+                "async_training": {
+                    "freshness_threshold": 3,
+                    "max_staleness_allowed": 5,
+                }
+            }
+        )
+
+        message_queue = MessageQueue.remote(config, max_queue_size=10)
+        client = MessageQueueClient(message_queue)
+
+        # 模拟生产者-消费者场景
+        mock_batch = Mock(spec=DataProto)
+
+        # 生产样本
+        for i in range(5):
+            success = client.put_batch(epoch=i, batch=mock_batch, param_version=i, rollout_metadata={"batch_id": i})
+            assert success, f"Failed to put batch {i}"
+
+        # 消费样本
+        samples = client.get_batch(min_batch_count=3, timeout=10.0)
+        assert samples is not None, "Failed to get samples"
+        assert len(samples) == 3, f"Expected 3 samples, got {len(samples)}"
+
+        # 检查统计信息
+        stats = client.get_statistics()
+        assert stats["total_produced"] == 5
+        assert stats["total_consumed"] == 3
+
+        logger.info("Integration test passed!")
+
+        # 清理
+        ray.get(message_queue.shutdown.remote())
+
+    finally:
+        if ray.is_initialized():
+            ray.shutdown()
+
+
+if __name__ == "__main__":
+    # 运行单元测试
+    unittest.main(argv=[""], exit=False, verbosity=2)
+
+    # 运行集成测试
+    test_integration()
+
+    print("\n" + "=" * 50)
+    print("所有测试完成!")
+    print("=" * 50)
diff --git a/tests/special_sanity/check_license.py b/tests/special_sanity/check_license.py
index 1a2073e6b02..7582c6c18f4 100644
--- a/tests/special_sanity/check_license.py
+++ b/tests/special_sanity/check_license.py
@@ -23,6 +23,7 @@
 license_head_modelbest = "Copyright 2025 ModelBest Inc. and/or its affiliates"
 license_head_amazon = "Copyright 2025 Amazon.com Inc and/or its affiliates"
 license_head_facebook = "Copyright (c) 2016-     Facebook, Inc"
+license_head_meituan = "Copyright 2025 Meituan Ltd. and/or its affiliates"
 license_headers = [
     license_head_bytedance,
     license_head_bytedance_25,
@@ -32,6 +33,7 @@
     license_head_modelbest,
     license_head_amazon,
     license_head_facebook,
+    license_head_meituan,
 ]
 
 
@@ -54,4 +56,4 @@
                 if lh in file_content:
                     has_license = True
                     break
-            assert has_license, f"file {path_in_str} does not contain license"
+            assert has_license, f"file {path_in_str} does not contain license \n {file_content}"

From eb7990380929f73a8121ffa5150466db9eff018a Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Thu, 24 Jul 2025 16:33:19 +0800
Subject: [PATCH 002/182] init async training pipline

---
 tests/special_sanity/check_license.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/special_sanity/check_license.py b/tests/special_sanity/check_license.py
index 7582c6c18f4..d759a417ff4 100644
--- a/tests/special_sanity/check_license.py
+++ b/tests/special_sanity/check_license.py
@@ -56,4 +56,4 @@
                 if lh in file_content:
                     has_license = True
                     break
-            assert has_license, f"file {path_in_str} does not contain license \n {file_content}"
+            assert has_license, f"file {path_in_str} does not contain license"

From 0459298aec438ab75a381b84774b1b643443f0d1 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Fri, 25 Jul 2025 19:59:34 +0800
Subject: [PATCH 003/182] update code

---
 recipe/fully_async_policy/RollouterActor.py   |  75 ++++
 recipe/fully_async_policy/fully_async_main.py | 263 ++++++------
 .../fully_async_policy/fully_async_trainer.py |   7 +-
 recipe/fully_async_policy/message_queue.py    |  15 +-
 recipe/fully_async_policy/rollouter.py        |   7 +-
 recipe/fully_async_policy/test_mq.py          | 374 ++++++++++++++++++
 verl/trainer/main_ppo.py                      |   9 +-
 7 files changed, 620 insertions(+), 130 deletions(-)
 create mode 100644 recipe/fully_async_policy/RollouterActor.py
 create mode 100644 recipe/fully_async_policy/test_mq.py

diff --git a/recipe/fully_async_policy/RollouterActor.py b/recipe/fully_async_policy/RollouterActor.py
new file mode 100644
index 00000000000..fb5212b577a
--- /dev/null
+++ b/recipe/fully_async_policy/RollouterActor.py
@@ -0,0 +1,75 @@
+# Copyright 2025 Meituan Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import ray
+
+from recipe.fully_async_policy.rollouter import Rollouter
+
+
+@ray.remote
+class RollouterActor:
+    """Rollouter的Ray Actor包装器"""
+
+    def __init__(
+        self,
+        config,
+        tokenizer,
+        role_worker_mapping,
+        resource_pool_manager,
+        ray_worker_group_cls,
+        processor=None,
+        train_dataset=None,
+        collate_fn=None,
+        train_sampler=None,
+        device_name="cuda",
+    ):
+        self.rollouter = Rollouter(
+            config=config,
+            tokenizer=tokenizer,
+            role_worker_mapping=role_worker_mapping,
+            resource_pool_manager=resource_pool_manager,
+            ray_worker_group_cls=ray_worker_group_cls,
+            processor=processor,
+            train_dataset=train_dataset,
+            collate_fn=collate_fn,
+            train_sampler=train_sampler,
+            device_name=device_name,
+        )
+
+    def init_workers(self):
+        """初始化worker"""
+        return self.rollouter.init_workers()
+
+    def set_message_queue_client(self, message_queue_client):
+        """设置消息队列客户端"""
+        return self.rollouter.set_message_queue_client(message_queue_client)
+
+    def set_parameter_synchronizer(self, param_synchronizer):
+        """设置参数同步器"""
+        return self.rollouter.set_parameter_synchronizer(param_synchronizer)
+
+    def update_rollout_weights(self, param_version: int):
+        """更新rollout权重"""
+        return self.rollouter.update_rollout_weights(param_version)
+
+    def fit(self):
+        """开始生成循环"""
+        return self.rollouter.fit()
+
+    def shutdown(self):
+        """关闭rollouter"""
+        return self.rollouter.shutdown()
+
+    def get_statistics(self):
+        """获取统计信息"""
+        return self.rollouter.get_statistics()
diff --git a/recipe/fully_async_policy/fully_async_main.py b/recipe/fully_async_policy/fully_async_main.py
index 3bab5d91eb1..f0689a5d28c 100644
--- a/recipe/fully_async_policy/fully_async_main.py
+++ b/recipe/fully_async_policy/fully_async_main.py
@@ -13,19 +13,32 @@
 # limitations under the License.
 
 import logging
-import os
 import threading
 import time
+import os
+import socket
+
+import hydra
+import ray
+from omegaconf import OmegaConf
+
+from recipe.fully_async_policy.RollouterActor import RollouterActor
+from verl.experimental.dataset.sampler import AbstractSampler
+from verl.trainer.constants_ppo import get_ppo_ray_runtime_env
+from verl.trainer.ppo.ray_trainer import RayPPOTrainer
+from verl.trainer.ppo.reward import load_reward_manager
+from verl.utils.device import is_cuda_available
+from verl.utils.import_utils import load_extern_type
 
 import hydra
 import ray
 
 from recipe.fully_async_policy.message_queue import MessageQueue, MessageQueueClient
-from recipe.fully_async_policy.rollouter import Rollouter
-from verl.trainer.main_ppo import create_rl_dataset, create_rl_sampler
+from verl.trainer.main_ppo import create_rl_dataset, create_rl_sampler, run_ppo
 from verl.trainer.ppo.reward import load_reward_manager
+from verl.utils.dataset.rl_dataset import collate_fn
 
-from .fully_async_trainer import FullyAsyncTrainer
+from fully_async_trainer import FullyAsyncTrainer
 
 logger = logging.getLogger(__name__)
 
@@ -35,110 +48,157 @@ def setup_logging():
     logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
 
 
-@ray.remote
-class RollouterActor:
-    """Rollouter的Ray Actor包装器"""
-
-    def __init__(
-        self,
-        config,
-        tokenizer,
-        role_worker_mapping,
-        resource_pool_manager,
-        ray_worker_group_cls,
-        processor=None,
-        train_dataset=None,
-        collate_fn=None,
-        train_sampler=None,
-        device_name="cuda",
-    ):
-        self.rollouter = Rollouter(
-            config=config,
-            tokenizer=tokenizer,
-            role_worker_mapping=role_worker_mapping,
-            resource_pool_manager=resource_pool_manager,
-            ray_worker_group_cls=ray_worker_group_cls,
-            processor=processor,
-            train_dataset=train_dataset,
-            collate_fn=collate_fn,
-            train_sampler=train_sampler,
-            device_name=device_name,
-        )
-
-    def init_workers(self):
-        """初始化worker"""
-        return self.rollouter.init_workers()
-
-    def set_message_queue_client(self, message_queue_client):
-        """设置消息队列客户端"""
-        return self.rollouter.set_message_queue_client(message_queue_client)
-
-    def set_parameter_synchronizer(self, param_synchronizer):
-        """设置参数同步器"""
-        return self.rollouter.set_parameter_synchronizer(param_synchronizer)
+@ray.remote(num_cpus=1)  # please make sure main_task is not scheduled on head
+class FullyAsyncTaskRunner:
+    """Ray remote class for executing distributed PPO training tasks.
 
-    def update_rollout_weights(self, param_version: int):
-        """更新rollout权重"""
-        return self.rollouter.update_rollout_weights(param_version)
+    This class encapsulates the main training logic and runs as a Ray remote actor
+    to enable distributed execution across multiple nodes and GPUs.
+    """
 
-    def fit(self):
-        """开始生成循环"""
-        return self.rollouter.fit()
+    def run(self, config):
+        """运行完全异步的PPO训练"""
+        setup_logging()
 
-    def shutdown(self):
-        """关闭rollouter"""
-        return self.rollouter.shutdown()
+        logger.info("Starting fully async PPO training...")
+        # 创建数据集和采样器
+        logger.info("Creating dataset and sampler...")
+        from verl.utils import hf_processor, hf_tokenizer
 
-    def get_statistics(self):
-        """获取统计信息"""
-        return self.rollouter.get_statistics()
+        # Print the initial configuration. `resolve=True` will evaluate symbolic values.
+        from pprint import pprint
 
+        from omegaconf import OmegaConf
 
-def run_fully_async_ppo(config):
-    """运行完全异步的PPO训练"""
-    setup_logging()
+        from verl.utils.fs import copy_to_local
 
-    logger.info("Starting fully async PPO training...")
+        print(f"TaskRunner hostname: {socket.gethostname()}, PID: {os.getpid()}")
+        pprint(OmegaConf.to_container(config, resolve=True))
+        OmegaConf.resolve(config)
 
-    # 初始化Ray
-    if not ray.is_initialized():
-        ray.init(
-            address=os.environ.get("RAY_ADDRESS", None),
-            runtime_env={"env_vars": {"NCCL_DEBUG": "WARN", "VLLM_USE_V1": "1"}},
+        # Download the checkpoint from HDFS to the local machine.
+        # `use_shm` determines whether to use shared memory, which could lead to faster model loading if turned on
+        local_path = copy_to_local(
+            config.actor_rollout_ref.model.path, use_shm=config.actor_rollout_ref.model.get("use_shm", False)
         )
 
-    try:
-        # 创建数据集和采样器
-        logger.info("Creating dataset and sampler...")
+        # Instantiate the tokenizer and processor.
         from verl.utils import hf_processor, hf_tokenizer
 
-        tokenizer = hf_tokenizer(config.actor_rollout_ref.model.path)
-        processor = hf_processor(config.actor_rollout_ref.model.path)
-
-        train_dataset, val_dataset = create_rl_dataset(config, tokenizer, processor)
-        train_sampler = create_rl_sampler(config, train_dataset)
+        trust_remote_code = config.data.get("trust_remote_code", False)
+        tokenizer = hf_tokenizer(local_path, trust_remote_code=trust_remote_code)
+        # Used for multimodal LLM, could be None
+        processor = hf_processor(local_path, trust_remote_code=trust_remote_code, use_fast=True)
+
+        # Define worker classes based on the actor strategy.
+        if config.actor_rollout_ref.actor.strategy == "fsdp2":
+            assert config.actor_rollout_ref.actor.strategy == config.critic.strategy
+            from verl.single_controller.ray import RayWorkerGroup
+
+            from recipe.one_step_off_policy.fsdp_workers import (
+                ActorRolloutRefWorker,
+                AsyncActorRolloutRefWorker,
+                CriticWorker,
+                RolloutWorker,
+            )
+
+            actor_rollout_cls = (
+                AsyncActorRolloutRefWorker
+                if config.actor_rollout_ref.rollout.mode == "async"
+                else ActorRolloutRefWorker
+            )
+            ray_worker_group_cls = RayWorkerGroup
+
+        elif config.actor_rollout_ref.actor.strategy == "megatron":
+            assert config.actor_rollout_ref.actor.strategy == config.critic.strategy
+            from verl.single_controller.ray.megatron import NVMegatronRayWorkerGroup
+
+            from recipe.one_step_off_policy.megatron_workers import (
+                ActorRolloutRefWorker,
+                AsyncActorRolloutRefWorker,
+                CriticWorker,
+                RolloutWorker,
+            )
+
+            actor_rollout_cls = (
+                AsyncActorRolloutRefWorker
+                if config.actor_rollout_ref.rollout.mode == "async"
+                else ActorRolloutRefWorker
+            )
+            ray_worker_group_cls = NVMegatronRayWorkerGroup
+
+        else:
+            raise NotImplementedError
+
+        from recipe.one_step_off_policy.ray_trainer import ResourcePoolManager, Role
+
+        role_worker_mapping = {
+            Role.Actor: ray.remote(actor_rollout_cls),
+            Role.Rollout: ray.remote(RolloutWorker),
+            Role.Critic: ray.remote(CriticWorker),
+        }
 
-        # 创建collate function
-        from verl.trainer.ppo.ray_trainer import default_collate_fn
+        global_pool_id = "actor_pool"
+        rollout_pool_id = "rollout_pool"
 
-        collate_fn = default_collate_fn
+        assert config.trainer.n_gpus_per_node > 0, "config.trainer.n_gpus_per_node must be greater than 0"
+        assert config.trainer.nnodes > 0, "config.trainer.nnodes must be greater than 0"
+        assert config.rollout.n_gpus_per_node > 0, "config.rollout.n_gpus_per_node must be greater than 0"
+        assert config.rollout.nnodes > 0, "config.rollout.nnodes must be greater than 0"
 
-        # 创建奖励函数
-        reward_fn, val_reward_fn = load_reward_manager(config, tokenizer)
+        actor_pool = [config.trainer.n_gpus_per_node] * config.trainer.nnodes
+        rollout_pool = [config.rollout.n_gpus_per_node] * config.rollout.nnodes
 
-        # 创建资源池管理器和worker映射
-        from verl.single_controller.ray import RayWorkerGroup
-        from verl.trainer.ppo.ray_trainer import (
-            Role,
-            create_resource_pool_manager,
-            create_role_worker_mapping,
+        resource_pool_spec = {
+            "actor_pool": actor_pool,
+            "rollout_pool": rollout_pool,
+        }
+        mapping = {
+            Role.Actor: global_pool_id,
+            Role.Rollout: rollout_pool_id,
+            Role.Critic: global_pool_id,
+        }
+        print(f"resource_pool_spec: {resource_pool_spec}")
+        # We should adopt a multi-source reward function here:
+        # - for rule-based rm, we directly call a reward score
+        # - for model-based rm, we call a model
+        # - for code related prompt, we send to a sandbox if there are test cases
+        # finally, we combine all the rewards together
+        # The reward type depends on the tag of the data
+        if config.reward_model.enable:
+            if config.reward_model.strategy == "fsdp2":
+                from verl.workers.fsdp_workers import RewardModelWorker
+            elif config.reward_model.strategy == "megatron":
+                from verl.workers.megatron_workers import RewardModelWorker
+            else:
+                raise NotImplementedError
+            role_worker_mapping[Role.RewardModel] = ray.remote(RewardModelWorker)
+            mapping[Role.RewardModel] = global_pool_id
+
+        # Add a reference policy worker if KL loss or KL reward is used.
+        if config.algorithm.use_kl_in_reward or config.actor_rollout_ref.actor.use_kl_loss:
+            role_worker_mapping[Role.RefPolicy] = ray.remote(ActorRolloutRefWorker)
+            mapping[Role.RefPolicy] = global_pool_id
+
+        # Load the reward manager for training and validation.
+        reward_fn = load_reward_manager(
+            config, tokenizer, num_examine=0, **config.reward_model.get("reward_kwargs", {})
         )
+        val_reward_fn = load_reward_manager(
+            config, tokenizer, num_examine=1, **config.reward_model.get("reward_kwargs", {})
+        )
+        resource_pool_manager = ResourcePoolManager(resource_pool_spec=resource_pool_spec, mapping=mapping)
+
+        from verl.utils.dataset.rl_dataset import collate_fn
 
-        # resource_pool_manager = create_resource_pool_manager(config)
-        role_worker_mapping = create_role_worker_mapping(config)
+        # Create training and validation datasets.
+        train_dataset = create_rl_dataset(config.data.train_files, config.data, tokenizer, processor)
+        val_dataset = create_rl_dataset(config.data.val_files, config.data, tokenizer, processor)
+        train_sampler = create_rl_sampler(config.data, train_dataset)
 
         # 1. 创建MessageQueue
         logger.info("Creating MessageQueue...")
+        # todo max_queue_size auto compute
         max_queue_size = config.async_training.get("max_queue_size", 1000)
         message_queue = MessageQueue.remote(config, max_queue_size)
         message_queue_client = MessageQueueClient(message_queue)
@@ -168,6 +228,9 @@ def run_fully_async_ppo(config):
             role: worker_cls for role, worker_cls in role_worker_mapping.items() if role != Role.Rollout
         }
 
+        # 创建奖励函数
+        reward_fn, val_reward_fn = load_reward_manager(config, tokenizer)
+
         trainer = FullyAsyncTrainer(
             config=config,
             tokenizer=tokenizer,
@@ -207,37 +270,15 @@ def run_rollouter():
         rollouter_thread = threading.Thread(target=run_rollouter, daemon=True)
         rollouter_thread.start()
 
-        # 等待一下让Rollouter启动
-        time.sleep(5)
-
         # 6. 启动Trainer（主线程）
         logger.info("Starting FullyAsyncTrainer...")
         trainer.fit()
 
-        # 7. 关闭
-        logger.info("Shutting down...")
-        ray.get(rollouter_actor.shutdown.remote())
-
-        # 等待Rollouter线程结束
-        rollouter_thread.join(timeout=10)
-
-        # 关闭MessageQueue
-        ray.get(message_queue.shutdown.remote())
-
-        logger.info("Fully async PPO training completed successfully!")
-
-    except Exception as e:
-        logger.error(f"Error in fully async PPO training: {e}")
-        raise
-    finally:
-        if ray.is_initialized():
-            ray.shutdown()
-
 
-@hydra.main(config_path="../one_step_off_policy/config", config_name="fully_async_ppo_trainer", version_base=None)
+@hydra.main(config_path="config", config_name="fully_async_ppo_trainer", version_base=None)
 def main(config):
     """主入口函数"""
-    run_fully_async_ppo(config)
+    run_ppo(config, FullyAsyncTaskRunner)
 
 
 if __name__ == "__main__":
diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py
index 192d33817a6..2487387b163 100644
--- a/recipe/fully_async_policy/fully_async_trainer.py
+++ b/recipe/fully_async_policy/fully_async_trainer.py
@@ -14,7 +14,6 @@
 
 import logging
 from pprint import pprint
-from typing import Optional
 
 import numpy as np
 import ray
@@ -62,10 +61,10 @@ def __init__(
         processor=None,
         reward_fn=None,
         val_reward_fn=None,
-        train_dataset: Optional[Dataset] = None,
-        val_dataset: Optional[Dataset] = None,
+        train_dataset: Dataset | None = None,
+        val_dataset: Dataset | None = None,
         collate_fn=None,
-        train_sampler: Optional[Sampler] = None,
+        train_sampler: Sampler | None = None,
         device_name="cuda",
     ):
         self.config = config
diff --git a/recipe/fully_async_policy/message_queue.py b/recipe/fully_async_policy/message_queue.py
index e28346a9ccd..dd9b5c5e8a9 100644
--- a/recipe/fully_async_policy/message_queue.py
+++ b/recipe/fully_async_policy/message_queue.py
@@ -12,12 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import threading
-import time
+import threadingimport time
 import uuid
 from collections import deque
 from dataclasses import dataclass
-from typing import Any, Optional
+from typing import Any
 
 import ray
 import zmq
@@ -39,7 +38,7 @@ class BatchSample:
     rollout_metadata: dict[str, Any]
 
 
-@ray.remote(num_cpus=1)
+@ray.remote(num_cpus=24)
 class MessageQueue:
     """
     基于ZeroMQ的异步消息队列，用于Rollouter和Trainer之间的通信
@@ -84,7 +83,7 @@ def _setup_zmq(self):
             self.socket.bind(self.address)
 
     def put_batch(
-        self, epoch: int, batch: DataProto, param_version: int, rollout_metadata: dict[str, Any] = None
+            self, epoch: int, batch: DataProto, param_version: int, rollout_metadata: dict[str, Any] = None
     ) -> bool:
         """
         放入一个batch样本到队列
@@ -129,7 +128,7 @@ def put_batch(
 
             return True
 
-    def get_batch(self, min_batch_count: int = 1, timeout: float = 30.0) -> Optional[list[BatchSample]]:
+    def get_batch(self, min_batch_count: int = 1, timeout: float = 30.0) -> list[BatchSample] | None:
         """
         从队列获取batch样本
 
@@ -208,12 +207,12 @@ def __init__(self, queue_actor: ray.ActorHandle):
         self.queue_actor = queue_actor
 
     def put_batch(
-        self, epoch: int, batch: DataProto, param_version: int, rollout_metadata: dict[str, Any] = None
+            self, epoch: int, batch: DataProto, param_version: int, rollout_metadata: dict[str, Any] = None
     ) -> bool:
         """放入batch到队列"""
         return ray.get(self.queue_actor.put_batch.remote(epoch, batch, param_version, rollout_metadata))
 
-    def get_batch(self, min_batch_count: int = 1, timeout: float = 30.0) -> Optional[list[BatchSample]]:
+    def get_batch(self, min_batch_count: int = 1, timeout: float = 30.0) -> list[BatchSample] | None:
         """从队列获取batch"""
         return ray.get(self.queue_actor.get_batch.remote(min_batch_count, timeout))
 
diff --git a/recipe/fully_async_policy/rollouter.py b/recipe/fully_async_policy/rollouter.py
index d98f5e5fdf5..ac43b6e3dbf 100644
--- a/recipe/fully_async_policy/rollouter.py
+++ b/recipe/fully_async_policy/rollouter.py
@@ -16,7 +16,6 @@
 import threading
 import time
 import uuid
-from typing import Optional
 
 import numpy as np
 import ray
@@ -85,9 +84,9 @@ def __init__(
         resource_pool_manager: ResourcePoolManager,
         ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup,
         processor=None,
-        train_dataset: Optional[Dataset] = None,
+        train_dataset: Dataset | None = None,
         collate_fn=None,
-        train_sampler: Optional[Sampler] = None,
+        train_sampler: Sampler | None = None,
         device_name="cuda",
     ):
         self.config = config
@@ -253,7 +252,7 @@ def _should_pause_generation(self) -> bool:
 
         return False
 
-    def _generate_batch(self, epoch: int, batch_dict: dict) -> Optional[DataProto]:
+    def _generate_batch(self, epoch: int, batch_dict: dict) -> DataProto | None:
         """生成单个batch的样本"""
         try:
             batch = DataProto.from_single_dict(batch_dict)
diff --git a/recipe/fully_async_policy/test_mq.py b/recipe/fully_async_policy/test_mq.py
new file mode 100644
index 00000000000..a8aaa8add5f
--- /dev/null
+++ b/recipe/fully_async_policy/test_mq.py
@@ -0,0 +1,374 @@
+# Copyright 2025 Meituan Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+import time
+import threading
+from unittest.mock import Mock, patch, MagicMock
+from omegaconf import DictConfig
+import ray
+
+from message_queue import BatchSample, MessageQueue, MessageQueueClient
+
+
+@pytest.fixture
+def mock_data_proto():
+    """Mock DataProto对象"""
+    return Mock()
+
+
+@pytest.fixture
+def basic_config():
+    """基础配置"""
+    return DictConfig({
+        'async_training': {
+            'freshness_threshold': 3
+        }
+    })
+
+
+@pytest.fixture
+def queue_config():
+    """队列配置"""
+    return DictConfig({
+        'async_training': {
+            'freshness_threshold': 2
+        }
+    })
+
+
+class TestBatchSample:
+    """测试BatchSample数据类"""
+
+    def test_batch_sample_creation(self, mock_data_proto):
+        """测试BatchSample创建"""
+        sample = BatchSample(
+            batch_id="test-123",
+            epoch=1,
+            data=mock_data_proto,
+            param_version=5,
+            timestamp=1234567890.0,
+            rollout_metadata={"key": "value"}
+        )
+
+        assert sample.batch_id == "test-123"
+        assert sample.epoch == 1
+        assert sample.data == mock_data_proto
+        assert sample.param_version == 5
+        assert sample.timestamp == 1234567890.0
+        assert sample.rollout_metadata == {"key": "value"}
+
+
+class TestMessageQueue:
+    """测试MessageQueue类（需要在非Ray环境下测试内部逻辑）"""
+
+    @patch('message_queue.zmq.Context')
+    @patch('message_queue.FileLock')
+    @patch('socket.socket')
+    def test_message_queue_init(self, mock_socket, mock_filelock, mock_zmq_context, basic_config):
+        """测试MessageQueue初始化"""
+        # Mock socket
+        mock_sock_instance = Mock()
+        mock_sock_instance.getsockname.return_value = ('127.0.0.1', 12345)
+        mock_socket.return_value.__enter__.return_value = mock_sock_instance
+
+        # Mock ZMQ
+        mock_context = Mock()
+        mock_zmq_context.return_value = mock_context
+        mock_zmq_socket = Mock()
+        mock_context.socket.return_value = mock_zmq_socket
+
+        # Mock FileLock
+        mock_filelock.return_value.__enter__ = Mock(return_value=None)
+        mock_filelock.return_value.__exit__ = Mock(return_value=None)
+
+        # 创建MessageQueue实例（不使用Ray装饰器）
+        queue = MessageQueue.__wrapped__(basic_config, max_queue_size=100)
+
+        assert queue.max_queue_size == 100
+        assert queue.current_param_version == 0
+        assert queue.freshness_threshold == 3
+        assert len(queue.queue) == 0
+        assert queue.total_produced == 0
+        assert queue.total_consumed == 0
+        assert queue.dropped_samples == 0
+
+
+@pytest.fixture
+def ray_setup():
+    """设置Ray环境"""
+    if not ray.is_initialized():
+        ray.init(local_mode=True, ignore_reinit_error=True)
+    yield
+    ray.shutdown()
+
+
+@pytest.fixture
+def message_queue_actor(ray_setup, basic_config):
+    """创建MessageQueue actor"""
+    with patch('message_queue.zmq.Context'), \
+            patch('message_queue.FileLock'), \
+            patch('socket.socket') as mock_socket:
+        # Mock socket setup
+        mock_sock_instance = Mock()
+        mock_sock_instance.getsockname.return_value = ('127.0.0.1', 12345)
+        mock_socket.return_value.__enter__.return_value = mock_sock_instance
+
+        actor = MessageQueue.remote(basic_config, max_queue_size=10)
+        yield actor
+        ray.get(actor.shutdown.remote())
+
+
+class TestMessageQueueActor:
+    """测试MessageQueue Actor"""
+
+    def test_put_batch_success(self, message_queue_actor, mock_data_proto):
+        """测试成功放入batch"""
+        result = ray.get(message_queue_actor.put_batch.remote(
+            epoch=1,
+            batch=mock_data_proto,
+            param_version=1,
+            rollout_metadata={"test": "data"}
+        ))
+
+        assert result is True
+
+        # 检查队列大小
+        queue_size = ray.get(message_queue_actor.get_queue_size.remote())
+        assert queue_size == 1
+
+        # 检查统计信息
+        stats = ray.get(message_queue_actor.get_statistics.remote())
+        assert stats["total_produced"] == 1
+        assert stats["queue_size"] == 1
+
+    def test_put_batch_staleness_check(self, message_queue_actor, mock_data_proto):
+        """测试新鲜度检查"""
+        # 更新参数版本为5
+        ray.get(message_queue_actor.update_param_version.remote(5))
+
+        # 尝试放入版本过旧的batch（版本差异>=3会被拒绝）
+        result = ray.get(message_queue_actor.put_batch.remote(
+            epoch=1,
+            batch=mock_data_proto,
+            param_version=2,  # 5-2=3, 达到阈值
+            rollout_metadata={}
+        ))
+
+        assert result is False
+
+        # 检查统计信息中的丢弃样本数
+        stats = ray.get(message_queue_actor.get_statistics.remote())
+        assert stats["dropped_samples"] == 1
+
+    def test_put_batch_queue_overflow(self, message_queue_actor, mock_data_proto):
+        """测试队列溢出处理"""
+        # 填满队列（最大容量10）
+        for i in range(12):  # 超过最大容量
+            ray.get(message_queue_actor.put_batch.remote(
+                epoch=1,
+                batch=mock_data_proto,
+                param_version=1,
+                rollout_metadata={}
+            ))
+
+        # 队列大小应该保持在最大值
+        queue_size = ray.get(message_queue_actor.get_queue_size.remote())
+        assert queue_size == 10
+
+        # 检查统计信息
+        stats = ray.get(message_queue_actor.get_statistics.remote())
+        assert stats["dropped_samples"] == 2  # 超出的2个被丢弃
+
+    def test_get_batch_success(self, message_queue_actor, mock_data_proto):
+        """测试成功获取batch"""
+        # 先放入一些batch
+        for i in range(3):
+            ray.get(message_queue_actor.put_batch.remote(
+                epoch=i,
+                batch=mock_data_proto,
+                param_version=1,
+                rollout_metadata={"index": i}
+            ))
+
+        # 获取2个batch
+        samples = ray.get(message_queue_actor.get_batch.remote(min_batch_count=2, timeout=5.0))
+
+        assert samples is not None
+        assert len(samples) == 2
+        assert all(isinstance(sample, BatchSample) for sample in samples)
+
+        # 检查队列大小减少
+        queue_size = ray.get(message_queue_actor.get_queue_size.remote())
+        assert queue_size == 1
+
+        # 检查统计信息
+        stats = ray.get(message_queue_actor.get_statistics.remote())
+        assert stats["total_consumed"] == 2
+
+    def test_get_batch_timeout(self, message_queue_actor):
+        """测试获取batch超时"""
+        # 空队列情况下获取batch应该超时
+        samples = ray.get(message_queue_actor.get_batch.remote(min_batch_count=1, timeout=1.0))
+        assert samples is None
+
+    def test_update_param_version(self, message_queue_actor):
+        """测试更新参数版本"""
+        ray.get(message_queue_actor.update_param_version.remote(10))
+
+        stats = ray.get(message_queue_actor.get_statistics.remote())
+        assert stats["current_param_version"] == 10
+
+    def test_clear_queue(self, message_queue_actor, mock_data_proto):
+        """测试清空队列"""
+        # 先添加一些样本
+        for i in range(3):
+            ray.get(message_queue_actor.put_batch.remote(
+                epoch=i, batch=mock_data_proto, param_version=1
+            ))
+
+        # 清空队列
+        ray.get(message_queue_actor.clear_queue.remote())
+
+        # 检查队列大小
+        queue_size = ray.get(message_queue_actor.get_queue_size.remote())
+        assert queue_size == 0
+
+    def test_get_statistics(self, message_queue_actor):
+        """测试获取统计信息"""
+        stats = ray.get(message_queue_actor.get_statistics.remote())
+
+        expected_keys = {
+            "queue_size", "total_produced", "total_consumed",
+            "dropped_samples", "current_param_version", "freshness_threshold"
+        }
+        assert set(stats.keys()) == expected_keys
+        assert isinstance(stats["queue_size"], int)
+        assert isinstance(stats["total_produced"], int)
+        assert isinstance(stats["total_consumed"], int)
+
+
+class TestMessageQueueClient:
+    """测试MessageQueueClient"""
+
+    def test_client_put_batch(self, message_queue_actor, mock_data_proto):
+        """测试客户端放入batch"""
+        client = MessageQueueClient(message_queue_actor)
+
+        result = client.put_batch(
+            epoch=1,
+            batch=mock_data_proto,
+            param_version=1,
+            rollout_metadata={"test": "client"}
+        )
+
+        assert result is True
+        assert client.get_queue_size() == 1
+
+    def test_client_get_batch(self, message_queue_actor, mock_data_proto):
+        """测试客户端获取batch"""
+        client = MessageQueueClient(message_queue_actor)
+
+        # 先放入一个batch
+        client.put_batch(epoch=1, batch=mock_data_proto, param_version=1)
+
+        # 获取batch
+        samples = client.get_batch(min_batch_count=1, timeout=5.0)
+
+        assert samples is not None
+        assert len(samples) == 1
+        assert isinstance(samples[0], BatchSample)
+
+    def test_client_update_param_version(self, message_queue_actor):
+        """测试客户端更新参数版本"""
+        client = MessageQueueClient(message_queue_actor)
+
+        client.update_param_version(15)
+
+        stats = client.get_statistics()
+        assert stats["current_param_version"] == 15
+
+    def test_client_get_queue_size(self, message_queue_actor, mock_data_proto):
+        """测试客户端获取队列大小"""
+        client = MessageQueueClient(message_queue_actor)
+
+        assert client.get_queue_size() == 0
+
+        client.put_batch(epoch=1, batch=mock_data_proto, param_version=1)
+        assert client.get_queue_size() == 1
+
+    def test_client_clear_queue(self, message_queue_actor, mock_data_proto):
+        """测试客户端清空队列"""
+        client = MessageQueueClient(message_queue_actor)
+
+        # 添加样本
+        client.put_batch(epoch=1, batch=mock_data_proto, param_version=1)
+        assert client.get_queue_size() == 1
+
+        # 清空队列
+        client.clear_queue()
+        assert client.get_queue_size() == 0
+
+    def test_client_shutdown(self, message_queue_actor):
+        """测试客户端关闭"""
+        client = MessageQueueClient(message_queue_actor)
+
+        # 关闭不应该抛出异常
+        client.shutdown()
+
+
+class TestConcurrency:
+    """测试并发场景"""
+
+    def test_concurrent_put_get(self, message_queue_actor, mock_data_proto):
+        """测试并发放入和获取"""
+        client = MessageQueueClient(message_queue_actor)
+        results = []
+
+        def producer():
+            for i in range(5):
+                result = client.put_batch(
+                    epoch=i, batch=mock_data_proto, param_version=1
+                )
+                results.append(("put", result))
+                time.sleep(0.1)
+
+        def consumer():
+            for _ in range(3):
+                samples = client.get_batch(min_batch_count=1, timeout=2.0)
+                results.append(("get", samples is not None))
+                time.sleep(0.1)
+
+        # 启动生产者和消费者线程
+        producer_thread = threading.Thread(target=producer)
+        consumer_thread = threading.Thread(target=consumer)
+
+        producer_thread.start()
+        time.sleep(0.05)  # 让生产者先开始
+        consumer_thread.start()
+
+        producer_thread.join()
+        consumer_thread.join()
+
+        # 检查结果
+        put_results = [r[1] for r in results if r[0] == "put"]
+        get_results = [r[1] for r in results if r[0] == "get"]
+
+        assert all(put_results)  # 所有放入操作都应该成功
+        assert all(get_results)  # 所有获取操作都应该成功
+
+
+# 运行测试的示例配置
+if __name__ == "__main__":
+    pytest.main([__file__, "-v", "--tb=short"])
diff --git a/verl/trainer/main_ppo.py b/verl/trainer/main_ppo.py
index a9ea554687a..e81d0b32c1d 100644
--- a/verl/trainer/main_ppo.py
+++ b/verl/trainer/main_ppo.py
@@ -41,7 +41,7 @@ def main(config):
 
 
 # Define a function to run the PPO-like training process
-def run_ppo(config) -> None:
+def run_ppo(config, task_runner_class = None) -> None:
     """Initialize Ray cluster and run distributed PPO training process.
 
     Args:
@@ -59,6 +59,9 @@ def run_ppo(config) -> None:
             runtime_env=get_ppo_ray_runtime_env(),
             num_cpus=config.ray_init.num_cpus,
         )
+    # for recipe to change TaskRunner
+    if task_runner_class is None:
+        task_runner_class = TaskRunner
 
     # Create a remote instance of the TaskRunner class, and
     # Execute the `run` method of the TaskRunner instance remotely and wait for it to complete
@@ -68,9 +71,9 @@ def run_ppo(config) -> None:
         and len(config.trainer.get("profile_steps", [])) > 0
     ):
         nsight_options = OmegaConf.to_container(config.trainer.controller_nsight_options)
-        runner = TaskRunner.options(runtime_env={"nsight": nsight_options}).remote()
+        runner = task_runner_class.options(runtime_env={"nsight": nsight_options}).remote()
     else:
-        runner = TaskRunner.remote()
+        runner = task_runner_class.remote()
     ray.get(runner.run.remote(config))
 
     # [Optional] get the path of the timeline trace file from the configuration, default to None

From 5c9dd6d7d824d803761451179218840b1dda3947 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Fri, 25 Jul 2025 22:16:35 +0800
Subject: [PATCH 004/182] test message queue

---
 recipe/fully_async_policy/message_queue.py    |  45 +++---
 recipe/fully_async_policy/test_fully_async.py |   3 +-
 recipe/fully_async_policy/test_mq.py          | 138 +++++++-----------
 3 files changed, 81 insertions(+), 105 deletions(-)

diff --git a/recipe/fully_async_policy/message_queue.py b/recipe/fully_async_policy/message_queue.py
index dd9b5c5e8a9..f57d1e15325 100644
--- a/recipe/fully_async_policy/message_queue.py
+++ b/recipe/fully_async_policy/message_queue.py
@@ -12,19 +12,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import threadingimport time
+import threading
+import time
 import uuid
 from collections import deque
 from dataclasses import dataclass
-from typing import Any
+from typing import Any, Optional
 
 import ray
 import zmq
 from filelock import FileLock
 from omegaconf import DictConfig
 
-from verl import DataProto
-
 
 @dataclass
 class BatchSample:
@@ -32,13 +31,13 @@ class BatchSample:
 
     batch_id: str
     epoch: int
-    data: DataProto
+    data: Any
     param_version: int
     timestamp: float
     rollout_metadata: dict[str, Any]
 
 
-@ray.remote(num_cpus=24)
+@ray.remote(num_cpus=1)
 class MessageQueue:
     """
     基于ZeroMQ的异步消息队列，用于Rollouter和Trainer之间的通信
@@ -49,13 +48,24 @@ def __init__(self, config: DictConfig, max_queue_size: int = 1000):
         self.max_queue_size = max_queue_size
         self.queue = deque(maxlen=max_queue_size)
         self.current_param_version = 0
-        self.freshness_threshold = config.async_training.get("freshness_threshold", 3)
+
+        # 安全地获取配置值，避免递归问题
+        try:
+            if hasattr(config, "async_training") and config.async_training is not None:
+                self.freshness_threshold = getattr(config.async_training, "freshness_threshold", 3)
+            else:
+                self.freshness_threshold = 3
+        except (AttributeError, RecursionError):
+            self.freshness_threshold = 3
 
         # ZeroMQ setup
-        self.context = zmq.Context()
+        self.context = None
         self.socket = None
         self.address = None
-        self._setup_zmq()
+        try:
+            self._setup_zmq()
+        except Exception as e:
+            print(f"Warning: ZeroMQ setup failed: {e}. Queue will work without ZeroMQ.")
 
         # Threading for message handling
         self.running = True
@@ -71,6 +81,9 @@ def __init__(self, config: DictConfig, max_queue_size: int = 1000):
     def _setup_zmq(self):
         """设置ZeroMQ socket"""
         with FileLock("/tmp/verl_message_queue.lock"):
+            # 初始化 ZeroMQ context
+            self.context = zmq.Context()
+
             # 使用TCP socket
             import socket as sock
 
@@ -82,9 +95,7 @@ def _setup_zmq(self):
             self.socket = self.context.socket(zmq.PAIR)
             self.socket.bind(self.address)
 
-    def put_batch(
-            self, epoch: int, batch: DataProto, param_version: int, rollout_metadata: dict[str, Any] = None
-    ) -> bool:
+    def put_batch(self, epoch: int, batch: Any, param_version: int, rollout_metadata: dict[str, Any] = None) -> bool:
         """
         放入一个batch样本到队列
 
@@ -128,7 +139,7 @@ def put_batch(
 
             return True
 
-    def get_batch(self, min_batch_count: int = 1, timeout: float = 30.0) -> list[BatchSample] | None:
+    def get_batch(self, min_batch_count: int = 1, timeout: float = 30.0) -> Optional[list[BatchSample]]:
         """
         从队列获取batch样本
 
@@ -203,16 +214,14 @@ def get_address(self) -> str:
 class MessageQueueClient:
     """MessageQueue的客户端，用于与MessageQueue Actor通信"""
 
-    def __init__(self, queue_actor: ray.ActorHandle):
+    def __init__(self, queue_actor: Any):
         self.queue_actor = queue_actor
 
-    def put_batch(
-            self, epoch: int, batch: DataProto, param_version: int, rollout_metadata: dict[str, Any] = None
-    ) -> bool:
+    def put_batch(self, epoch: int, batch: Any, param_version: int, rollout_metadata: dict[str, Any] = None) -> bool:
         """放入batch到队列"""
         return ray.get(self.queue_actor.put_batch.remote(epoch, batch, param_version, rollout_metadata))
 
-    def get_batch(self, min_batch_count: int = 1, timeout: float = 30.0) -> list[BatchSample] | None:
+    def get_batch(self, min_batch_count: int = 1, timeout: float = 30.0) -> Optional[list[BatchSample]]:
         """从队列获取batch"""
         return ray.get(self.queue_actor.get_batch.remote(min_batch_count, timeout))
 
diff --git a/recipe/fully_async_policy/test_fully_async.py b/recipe/fully_async_policy/test_fully_async.py
index b2f7f866fd7..eaa9313254a 100644
--- a/recipe/fully_async_policy/test_fully_async.py
+++ b/recipe/fully_async_policy/test_fully_async.py
@@ -23,8 +23,7 @@
 import ray
 from omegaconf import OmegaConf
 
-from recipe.fully_async_policy.message_queue import MessageQueue, MessageQueueClient
-from verl import DataProto
+from recipe.fully_async_policy.message_queue import DataProto, MessageQueue, MessageQueueClient
 
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
diff --git a/recipe/fully_async_policy/test_mq.py b/recipe/fully_async_policy/test_mq.py
index a8aaa8add5f..488b7d12614 100644
--- a/recipe/fully_async_policy/test_mq.py
+++ b/recipe/fully_async_policy/test_mq.py
@@ -12,14 +12,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import pytest
-import time
 import threading
-from unittest.mock import Mock, patch, MagicMock
-from omegaconf import DictConfig
-import ray
+import time
+from unittest.mock import Mock
 
+import pytest
+import ray
 from message_queue import BatchSample, MessageQueue, MessageQueueClient
+from omegaconf import DictConfig
 
 
 @pytest.fixture
@@ -31,21 +31,13 @@ def mock_data_proto():
 @pytest.fixture
 def basic_config():
     """基础配置"""
-    return DictConfig({
-        'async_training': {
-            'freshness_threshold': 3
-        }
-    })
+    return DictConfig({"async_training": {"freshness_threshold": 3}})
 
 
 @pytest.fixture
 def queue_config():
     """队列配置"""
-    return DictConfig({
-        'async_training': {
-            'freshness_threshold': 2
-        }
-    })
+    return DictConfig({"async_training": {"freshness_threshold": 2}})
 
 
 class TestBatchSample:
@@ -59,7 +51,7 @@ def test_batch_sample_creation(self, mock_data_proto):
             data=mock_data_proto,
             param_version=5,
             timestamp=1234567890.0,
-            rollout_metadata={"key": "value"}
+            rollout_metadata={"key": "value"},
         )
 
         assert sample.batch_id == "test-123"
@@ -73,29 +65,16 @@ def test_batch_sample_creation(self, mock_data_proto):
 class TestMessageQueue:
     """测试MessageQueue类（需要在非Ray环境下测试内部逻辑）"""
 
-    @patch('message_queue.zmq.Context')
-    @patch('message_queue.FileLock')
-    @patch('socket.socket')
-    def test_message_queue_init(self, mock_socket, mock_filelock, mock_zmq_context, basic_config):
+    def test_message_queue_init(self, basic_config):
         """测试MessageQueue初始化"""
-        # Mock socket
-        mock_sock_instance = Mock()
-        mock_sock_instance.getsockname.return_value = ('127.0.0.1', 12345)
-        mock_socket.return_value.__enter__.return_value = mock_sock_instance
-
-        # Mock ZMQ
-        mock_context = Mock()
-        mock_zmq_context.return_value = mock_context
-        mock_zmq_socket = Mock()
-        mock_context.socket.return_value = mock_zmq_socket
+        # 直接创建MessageQueue实例（不使用Ray装饰器）
+        queue = MessageQueue.__ray_actor_class__(basic_config, max_queue_size=100)
 
-        # Mock FileLock
-        mock_filelock.return_value.__enter__ = Mock(return_value=None)
-        mock_filelock.return_value.__exit__ = Mock(return_value=None)
-
-        # 创建MessageQueue实例（不使用Ray装饰器）
-        queue = MessageQueue.__wrapped__(basic_config, max_queue_size=100)
+        # 确保ZeroMQ初始化成功
+        assert queue.context is not None
+        assert queue.socket is not None
 
+        # 基本属性检查
         assert queue.max_queue_size == 100
         assert queue.current_param_version == 0
         assert queue.freshness_threshold == 3
@@ -104,6 +83,9 @@ def test_message_queue_init(self, mock_socket, mock_filelock, mock_zmq_context,
         assert queue.total_consumed == 0
         assert queue.dropped_samples == 0
 
+        # 清理资源
+        queue.shutdown()
+
 
 @pytest.fixture
 def ray_setup():
@@ -117,17 +99,9 @@ def ray_setup():
 @pytest.fixture
 def message_queue_actor(ray_setup, basic_config):
     """创建MessageQueue actor"""
-    with patch('message_queue.zmq.Context'), \
-            patch('message_queue.FileLock'), \
-            patch('socket.socket') as mock_socket:
-        # Mock socket setup
-        mock_sock_instance = Mock()
-        mock_sock_instance.getsockname.return_value = ('127.0.0.1', 12345)
-        mock_socket.return_value.__enter__.return_value = mock_sock_instance
-
-        actor = MessageQueue.remote(basic_config, max_queue_size=10)
-        yield actor
-        ray.get(actor.shutdown.remote())
+    actor = MessageQueue.remote(basic_config, max_queue_size=10)
+    yield actor
+    ray.get(actor.shutdown.remote())
 
 
 class TestMessageQueueActor:
@@ -135,12 +109,11 @@ class TestMessageQueueActor:
 
     def test_put_batch_success(self, message_queue_actor, mock_data_proto):
         """测试成功放入batch"""
-        result = ray.get(message_queue_actor.put_batch.remote(
-            epoch=1,
-            batch=mock_data_proto,
-            param_version=1,
-            rollout_metadata={"test": "data"}
-        ))
+        result = ray.get(
+            message_queue_actor.put_batch.remote(
+                epoch=1, batch=mock_data_proto, param_version=1, rollout_metadata={"test": "data"}
+            )
+        )
 
         assert result is True
 
@@ -159,12 +132,14 @@ def test_put_batch_staleness_check(self, message_queue_actor, mock_data_proto):
         ray.get(message_queue_actor.update_param_version.remote(5))
 
         # 尝试放入版本过旧的batch（版本差异>=3会被拒绝）
-        result = ray.get(message_queue_actor.put_batch.remote(
-            epoch=1,
-            batch=mock_data_proto,
-            param_version=2,  # 5-2=3, 达到阈值
-            rollout_metadata={}
-        ))
+        result = ray.get(
+            message_queue_actor.put_batch.remote(
+                epoch=1,
+                batch=mock_data_proto,
+                param_version=2,  # 5-2=3, 达到阈值
+                rollout_metadata={},
+            )
+        )
 
         assert result is False
 
@@ -176,12 +151,11 @@ def test_put_batch_queue_overflow(self, message_queue_actor, mock_data_proto):
         """测试队列溢出处理"""
         # 填满队列（最大容量10）
         for i in range(12):  # 超过最大容量
-            ray.get(message_queue_actor.put_batch.remote(
-                epoch=1,
-                batch=mock_data_proto,
-                param_version=1,
-                rollout_metadata={}
-            ))
+            ray.get(
+                message_queue_actor.put_batch.remote(
+                    epoch=1, batch=mock_data_proto, param_version=1, rollout_metadata={}
+                )
+            )
 
         # 队列大小应该保持在最大值
         queue_size = ray.get(message_queue_actor.get_queue_size.remote())
@@ -195,12 +169,11 @@ def test_get_batch_success(self, message_queue_actor, mock_data_proto):
         """测试成功获取batch"""
         # 先放入一些batch
         for i in range(3):
-            ray.get(message_queue_actor.put_batch.remote(
-                epoch=i,
-                batch=mock_data_proto,
-                param_version=1,
-                rollout_metadata={"index": i}
-            ))
+            ray.get(
+                message_queue_actor.put_batch.remote(
+                    epoch=i, batch=mock_data_proto, param_version=1, rollout_metadata={"index": i}
+                )
+            )
 
         # 获取2个batch
         samples = ray.get(message_queue_actor.get_batch.remote(min_batch_count=2, timeout=5.0))
@@ -234,9 +207,7 @@ def test_clear_queue(self, message_queue_actor, mock_data_proto):
         """测试清空队列"""
         # 先添加一些样本
         for i in range(3):
-            ray.get(message_queue_actor.put_batch.remote(
-                epoch=i, batch=mock_data_proto, param_version=1
-            ))
+            ray.get(message_queue_actor.put_batch.remote(epoch=i, batch=mock_data_proto, param_version=1))
 
         # 清空队列
         ray.get(message_queue_actor.clear_queue.remote())
@@ -250,8 +221,12 @@ def test_get_statistics(self, message_queue_actor):
         stats = ray.get(message_queue_actor.get_statistics.remote())
 
         expected_keys = {
-            "queue_size", "total_produced", "total_consumed",
-            "dropped_samples", "current_param_version", "freshness_threshold"
+            "queue_size",
+            "total_produced",
+            "total_consumed",
+            "dropped_samples",
+            "current_param_version",
+            "freshness_threshold",
         }
         assert set(stats.keys()) == expected_keys
         assert isinstance(stats["queue_size"], int)
@@ -266,12 +241,7 @@ def test_client_put_batch(self, message_queue_actor, mock_data_proto):
         """测试客户端放入batch"""
         client = MessageQueueClient(message_queue_actor)
 
-        result = client.put_batch(
-            epoch=1,
-            batch=mock_data_proto,
-            param_version=1,
-            rollout_metadata={"test": "client"}
-        )
+        result = client.put_batch(epoch=1, batch=mock_data_proto, param_version=1, rollout_metadata={"test": "client"})
 
         assert result is True
         assert client.get_queue_size() == 1
@@ -338,9 +308,7 @@ def test_concurrent_put_get(self, message_queue_actor, mock_data_proto):
 
         def producer():
             for i in range(5):
-                result = client.put_batch(
-                    epoch=i, batch=mock_data_proto, param_version=1
-                )
+                result = client.put_batch(epoch=i, batch=mock_data_proto, param_version=1)
                 results.append(("put", result))
                 time.sleep(0.1)
 

From 3fd7020e0d34f2f03d3c0e0ffb5f9bfb84e873c2 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Wed, 30 Jul 2025 14:03:27 +0800
Subject: [PATCH 005/182] main

---
 recipe/fully_async_policy/RollouterActor.py   |  75 --
 recipe/fully_async_policy/fully_async_main.py | 597 ++++++++++-----
 .../fully_async_rollouter.py                  | 681 ++++++++++++++++++
 recipe/fully_async_policy/rollouter.py        | 413 -----------
 4 files changed, 1103 insertions(+), 663 deletions(-)
 delete mode 100644 recipe/fully_async_policy/RollouterActor.py
 create mode 100644 recipe/fully_async_policy/fully_async_rollouter.py
 delete mode 100644 recipe/fully_async_policy/rollouter.py

diff --git a/recipe/fully_async_policy/RollouterActor.py b/recipe/fully_async_policy/RollouterActor.py
deleted file mode 100644
index fb5212b577a..00000000000
--- a/recipe/fully_async_policy/RollouterActor.py
+++ /dev/null
@@ -1,75 +0,0 @@
-# Copyright 2025 Meituan Ltd. and/or its affiliates
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import ray
-
-from recipe.fully_async_policy.rollouter import Rollouter
-
-
-@ray.remote
-class RollouterActor:
-    """Rollouter的Ray Actor包装器"""
-
-    def __init__(
-        self,
-        config,
-        tokenizer,
-        role_worker_mapping,
-        resource_pool_manager,
-        ray_worker_group_cls,
-        processor=None,
-        train_dataset=None,
-        collate_fn=None,
-        train_sampler=None,
-        device_name="cuda",
-    ):
-        self.rollouter = Rollouter(
-            config=config,
-            tokenizer=tokenizer,
-            role_worker_mapping=role_worker_mapping,
-            resource_pool_manager=resource_pool_manager,
-            ray_worker_group_cls=ray_worker_group_cls,
-            processor=processor,
-            train_dataset=train_dataset,
-            collate_fn=collate_fn,
-            train_sampler=train_sampler,
-            device_name=device_name,
-        )
-
-    def init_workers(self):
-        """初始化worker"""
-        return self.rollouter.init_workers()
-
-    def set_message_queue_client(self, message_queue_client):
-        """设置消息队列客户端"""
-        return self.rollouter.set_message_queue_client(message_queue_client)
-
-    def set_parameter_synchronizer(self, param_synchronizer):
-        """设置参数同步器"""
-        return self.rollouter.set_parameter_synchronizer(param_synchronizer)
-
-    def update_rollout_weights(self, param_version: int):
-        """更新rollout权重"""
-        return self.rollouter.update_rollout_weights(param_version)
-
-    def fit(self):
-        """开始生成循环"""
-        return self.rollouter.fit()
-
-    def shutdown(self):
-        """关闭rollouter"""
-        return self.rollouter.shutdown()
-
-    def get_statistics(self):
-        """获取统计信息"""
-        return self.rollouter.get_statistics()
diff --git a/recipe/fully_async_policy/fully_async_main.py b/recipe/fully_async_policy/fully_async_main.py
index f0689a5d28c..e57e3e119b7 100644
--- a/recipe/fully_async_policy/fully_async_main.py
+++ b/recipe/fully_async_policy/fully_async_main.py
@@ -13,75 +13,203 @@
 # limitations under the License.
 
 import logging
-import threading
-import time
 import os
+import signal
 import socket
+import threading
+import time
+from pprint import pprint
 
 import hydra
 import ray
 from omegaconf import OmegaConf
 
-from recipe.fully_async_policy.RollouterActor import RollouterActor
-from verl.experimental.dataset.sampler import AbstractSampler
-from verl.trainer.constants_ppo import get_ppo_ray_runtime_env
-from verl.trainer.ppo.ray_trainer import RayPPOTrainer
-from verl.trainer.ppo.reward import load_reward_manager
-from verl.utils.device import is_cuda_available
-from verl.utils.import_utils import load_extern_type
-
-import hydra
-import ray
-
+from recipe.fully_async_policy.fully_async_rollouter import FullyAsyncRollouter
+from recipe.fully_async_policy.fully_async_trainer import FullyAsyncTrainer
 from recipe.fully_async_policy.message_queue import MessageQueue, MessageQueueClient
-from verl.trainer.main_ppo import create_rl_dataset, create_rl_sampler, run_ppo
+from recipe.fully_async_policy.param_sync import AsyncParameterSynchronizer
+from verl.trainer.ppo.ray_trainer import ResourcePoolManager, Role
 from verl.trainer.ppo.reward import load_reward_manager
-from verl.utils.dataset.rl_dataset import collate_fn
-
-from fully_async_trainer import FullyAsyncTrainer
+from verl.utils.fs import copy_to_local
 
 logger = logging.getLogger(__name__)
 
 
 def setup_logging():
     """设置日志配置"""
-    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
+    logging.basicConfig(
+        level=logging.INFO,
+        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+        handlers=[logging.StreamHandler(), logging.FileHandler("fully_async_training.log")],
+    )
 
 
-@ray.remote(num_cpus=1)  # please make sure main_task is not scheduled on head
-class FullyAsyncTaskRunner:
-    """Ray remote class for executing distributed PPO training tasks.
+def create_resource_pool_manager(config, roles: list) -> ResourcePoolManager:
+    """
+    创建资源池管理器
 
-    This class encapsulates the main training logic and runs as a Ray remote actor
-    to enable distributed execution across multiple nodes and GPUs.
+    Args:
+        config: 配置对象
+        roles: 需要创建资源池的角色列表
+
+    Returns:
+        ResourcePoolManager: 资源池管理器
     """
+    # 构建资源池规格
+    resource_pool_spec = {}
+    mapping = {}
+
+    # Actor/Critic资源池（训练相关）
+    if any(role in roles for role in [Role.Actor, Role.Critic, Role.RefPolicy, Role.RewardModel]):
+        assert config.trainer.n_gpus_per_node > 0, "config.trainer.n_gpus_per_node must be greater than 0"
+        assert config.trainer.nnodes > 0, "config.trainer.nnodes must be greater than 0"
+
+        trainer_pool = [config.trainer.n_gpus_per_node] * config.trainer.nnodes
+        resource_pool_spec["trainer_pool"] = trainer_pool
+
+        # 训练相关角色映射到同一个资源池
+        for role in [Role.Actor, Role.Critic, Role.RefPolicy, Role.RewardModel]:
+            if role in roles:
+                mapping[role] = "trainer_pool"
+
+    # Rollout资源池
+    if Role.Rollout in roles:
+        assert config.rollout.n_gpus_per_node > 0, "config.rollout.n_gpus_per_node must be greater than 0"
+        assert config.rollout.nnodes > 0, "config.rollout.nnodes must be greater than 0"
+
+        rollout_pool = [config.rollout.n_gpus_per_node] * config.rollout.nnodes
+        resource_pool_spec["rollout_pool"] = rollout_pool
+        mapping[Role.Rollout] = "rollout_pool"
+
+    logger.info(f"Resource pool specification: {resource_pool_spec}")
+    logger.info(f"Role mapping: {mapping}")
+
+    return ResourcePoolManager(resource_pool_spec=resource_pool_spec, mapping=mapping)
+
+
+def create_role_worker_mapping(config):
+    """
+    创建角色到worker类的映射
+
+    Args:
+        config: 配置对象
+
+    Returns:
+        dict: 角色到worker类的映射
+    """
+    # 根据策略选择worker类
+    if config.actor_rollout_ref.actor.strategy == "fsdp2":
+        assert config.actor_rollout_ref.actor.strategy == config.critic.strategy
+        from recipe.one_step_off_policy.fsdp_workers import (
+            ActorRolloutRefWorker,
+            AsyncActorRolloutRefWorker,
+            CriticWorker,
+            RolloutWorker,
+        )
+        from verl.single_controller.ray import RayWorkerGroup
+
+        actor_rollout_cls = (
+            AsyncActorRolloutRefWorker if config.actor_rollout_ref.rollout.mode == "async" else ActorRolloutRefWorker
+        )
+        ray_worker_group_cls = RayWorkerGroup
+
+    elif config.actor_rollout_ref.actor.strategy == "megatron":
+        assert config.actor_rollout_ref.actor.strategy == config.critic.strategy
+        from recipe.one_step_off_policy.megatron_workers import (
+            ActorRolloutRefWorker,
+            AsyncActorRolloutRefWorker,
+            CriticWorker,
+            RolloutWorker,
+        )
+        from verl.single_controller.ray.megatron import NVMegatronRayWorkerGroup
+
+        actor_rollout_cls = (
+            AsyncActorRolloutRefWorker if config.actor_rollout_ref.rollout.mode == "async" else ActorRolloutRefWorker
+        )
+        ray_worker_group_cls = NVMegatronRayWorkerGroup
+
+    else:
+        raise NotImplementedError(f"Unsupported strategy: {config.actor_rollout_ref.actor.strategy}")
+
+    role_worker_mapping = {
+        Role.Actor: ray.remote(actor_rollout_cls),
+        Role.Rollout: ray.remote(RolloutWorker),
+        Role.Critic: ray.remote(CriticWorker),
+    }
+
+    # 添加reward model（如果启用）
+    if config.reward_model.enable:
+        if config.reward_model.strategy == "fsdp2":
+            from verl.workers.fsdp_workers import RewardModelWorker
+        elif config.reward_model.strategy == "megatron":
+            from verl.workers.megatron_workers import RewardModelWorker
+        else:
+            raise NotImplementedError(f"Unsupported reward model strategy: {config.reward_model.strategy}")
+
+        role_worker_mapping[Role.RewardModel] = ray.remote(RewardModelWorker)
+
+    # 添加reference policy（如果需要KL loss或reward）
+    if config.algorithm.use_kl_in_reward or config.actor_rollout_ref.actor.use_kl_loss:
+        role_worker_mapping[Role.RefPolicy] = ray.remote(ActorRolloutRefWorker)
+
+    return role_worker_mapping, ray_worker_group_cls
+
+
+@ray.remote(num_cpus=1)
+class FullyAsyncTaskRunner:
+    """
+    Ray remote class for executing distributed PPO training tasks.
+    """
+
+    def __init__(self):
+        self.running = False
+        self.components = {}
+        self.shutdown_event = threading.Event()
 
     def run(self, config):
         """运行完全异步的PPO训练"""
         setup_logging()
-
         logger.info("Starting fully async PPO training...")
-        # 创建数据集和采样器
-        logger.info("Creating dataset and sampler...")
-        from verl.utils import hf_processor, hf_tokenizer
+        # 设置信号处理
+        self._setup_signal_handlers()
+        # 初始化基础组件
+        self._initialize_components(config)
+        # 启动训练流程
+        self._run_training_loop()
+
+        self._cleanup_resources()
 
-        # Print the initial configuration. `resolve=True` will evaluate symbolic values.
-        from pprint import pprint
+    def _setup_signal_handlers(self):
+        """设置信号处理器"""
 
-        from omegaconf import OmegaConf
+        def signal_handler(signum, frame):
+            logger.info(f"Received signal {signum}, initiating shutdown...")
+            self.running = False
+            self.shutdown_event.set()
 
-        from verl.utils.fs import copy_to_local
+        signal.signal(signal.SIGINT, signal_handler)
+        signal.signal(signal.SIGTERM, signal_handler)
 
+    def _initialize_components(self, config) -> None:
+        """
+        初始化所有组件
+
+        Args:
+            config: 配置对象
+
+        Returns:
+            bool: 是否初始化成功
+        """
+        # 打印配置信息
         print(f"TaskRunner hostname: {socket.gethostname()}, PID: {os.getpid()}")
         pprint(OmegaConf.to_container(config, resolve=True))
         OmegaConf.resolve(config)
 
-        # Download the checkpoint from HDFS to the local machine.
-        # `use_shm` determines whether to use shared memory, which could lead to faster model loading if turned on
+        # 初始化模型路径和tokenizer
+        logger.info("Initializing model and tokenizer...")
         local_path = copy_to_local(
             config.actor_rollout_ref.model.path, use_shm=config.actor_rollout_ref.model.get("use_shm", False)
         )
-
         # Instantiate the tokenizer and processor.
         from verl.utils import hf_processor, hf_tokenizer
 
@@ -90,195 +218,314 @@ def run(self, config):
         # Used for multimodal LLM, could be None
         processor = hf_processor(local_path, trust_remote_code=trust_remote_code, use_fast=True)
 
-        # Define worker classes based on the actor strategy.
-        if config.actor_rollout_ref.actor.strategy == "fsdp2":
-            assert config.actor_rollout_ref.actor.strategy == config.critic.strategy
-            from verl.single_controller.ray import RayWorkerGroup
-
-            from recipe.one_step_off_policy.fsdp_workers import (
-                ActorRolloutRefWorker,
-                AsyncActorRolloutRefWorker,
-                CriticWorker,
-                RolloutWorker,
-            )
-
-            actor_rollout_cls = (
-                AsyncActorRolloutRefWorker
-                if config.actor_rollout_ref.rollout.mode == "async"
-                else ActorRolloutRefWorker
-            )
-            ray_worker_group_cls = RayWorkerGroup
-
-        elif config.actor_rollout_ref.actor.strategy == "megatron":
-            assert config.actor_rollout_ref.actor.strategy == config.critic.strategy
-            from verl.single_controller.ray.megatron import NVMegatronRayWorkerGroup
-
-            from recipe.one_step_off_policy.megatron_workers import (
-                ActorRolloutRefWorker,
-                AsyncActorRolloutRefWorker,
-                CriticWorker,
-                RolloutWorker,
-            )
-
-            actor_rollout_cls = (
-                AsyncActorRolloutRefWorker
-                if config.actor_rollout_ref.rollout.mode == "async"
-                else ActorRolloutRefWorker
-            )
-            ray_worker_group_cls = NVMegatronRayWorkerGroup
-
-        else:
-            raise NotImplementedError
+        self.components["tokenizer"] = tokenizer
+        self.components["processor"] = processor
 
-        from recipe.one_step_off_policy.ray_trainer import ResourcePoolManager, Role
+        # 创建worker映射和资源池
+        logger.info("Creating worker mapping and resource pools...")
+        role_worker_mapping, ray_worker_group_cls = create_role_worker_mapping(config)
+        self.components["role_worker_mapping"] = role_worker_mapping
+        self.components["ray_worker_group_cls"] = ray_worker_group_cls
 
-        role_worker_mapping = {
-            Role.Actor: ray.remote(actor_rollout_cls),
-            Role.Rollout: ray.remote(RolloutWorker),
-            Role.Critic: ray.remote(CriticWorker),
-        }
-
-        global_pool_id = "actor_pool"
-        rollout_pool_id = "rollout_pool"
+        # 创建数据集
+        logger.info("Creating datasets...")
+        from verl.trainer.main_ppo import create_rl_dataset, create_rl_sampler
+        from verl.utils.dataset.rl_dataset import collate_fn
 
-        assert config.trainer.n_gpus_per_node > 0, "config.trainer.n_gpus_per_node must be greater than 0"
-        assert config.trainer.nnodes > 0, "config.trainer.nnodes must be greater than 0"
-        assert config.rollout.n_gpus_per_node > 0, "config.rollout.n_gpus_per_node must be greater than 0"
-        assert config.rollout.nnodes > 0, "config.rollout.nnodes must be greater than 0"
+        train_dataset = create_rl_dataset(config.data.train_files, config.data, tokenizer, processor)
+        val_dataset = create_rl_dataset(config.data.val_files, config.data, tokenizer, processor)
+        train_sampler = create_rl_sampler(config.data, train_dataset)
 
-        actor_pool = [config.trainer.n_gpus_per_node] * config.trainer.nnodes
-        rollout_pool = [config.rollout.n_gpus_per_node] * config.rollout.nnodes
+        self.components["train_dataset"] = train_dataset
+        self.components["val_dataset"] = val_dataset
+        self.components["train_sampler"] = train_sampler
+        self.components["collate_fn"] = collate_fn
 
-        resource_pool_spec = {
-            "actor_pool": actor_pool,
-            "rollout_pool": rollout_pool,
-        }
-        mapping = {
-            Role.Actor: global_pool_id,
-            Role.Rollout: rollout_pool_id,
-            Role.Critic: global_pool_id,
-        }
-        print(f"resource_pool_spec: {resource_pool_spec}")
-        # We should adopt a multi-source reward function here:
-        # - for rule-based rm, we directly call a reward score
-        # - for model-based rm, we call a model
-        # - for code related prompt, we send to a sandbox if there are test cases
-        # finally, we combine all the rewards together
-        # The reward type depends on the tag of the data
-        if config.reward_model.enable:
-            if config.reward_model.strategy == "fsdp2":
-                from verl.workers.fsdp_workers import RewardModelWorker
-            elif config.reward_model.strategy == "megatron":
-                from verl.workers.megatron_workers import RewardModelWorker
-            else:
-                raise NotImplementedError
-            role_worker_mapping[Role.RewardModel] = ray.remote(RewardModelWorker)
-            mapping[Role.RewardModel] = global_pool_id
-
-        # Add a reference policy worker if KL loss or KL reward is used.
-        if config.algorithm.use_kl_in_reward or config.actor_rollout_ref.actor.use_kl_loss:
-            role_worker_mapping[Role.RefPolicy] = ray.remote(ActorRolloutRefWorker)
-            mapping[Role.RefPolicy] = global_pool_id
-
-        # Load the reward manager for training and validation.
+        # 创建奖励函数
+        logger.info("Loading reward functions...")
         reward_fn = load_reward_manager(
             config, tokenizer, num_examine=0, **config.reward_model.get("reward_kwargs", {})
         )
         val_reward_fn = load_reward_manager(
             config, tokenizer, num_examine=1, **config.reward_model.get("reward_kwargs", {})
         )
-        resource_pool_manager = ResourcePoolManager(resource_pool_spec=resource_pool_spec, mapping=mapping)
-
-        from verl.utils.dataset.rl_dataset import collate_fn
+        self.components["reward_fn"] = reward_fn
+        self.components["val_reward_fn"] = val_reward_fn
 
-        # Create training and validation datasets.
-        train_dataset = create_rl_dataset(config.data.train_files, config.data, tokenizer, processor)
-        val_dataset = create_rl_dataset(config.data.val_files, config.data, tokenizer, processor)
-        train_sampler = create_rl_sampler(config.data, train_dataset)
-
-        # 1. 创建MessageQueue
+        # 创建MessageQueue
         logger.info("Creating MessageQueue...")
-        # todo max_queue_size auto compute
         max_queue_size = config.async_training.get("max_queue_size", 1000)
         message_queue = MessageQueue.remote(config, max_queue_size)
         message_queue_client = MessageQueueClient(message_queue)
 
-        # 2. 创建Rollouter Actor
+        self.components["message_queue"] = message_queue
+        self.components["message_queue_client"] = message_queue_client
+
+        # 创建Rollouter
         logger.info("Creating Rollouter...")
-        rollouter_actor = RollouterActor.remote(
+        self._create_rollouter(config)
+
+        # 创建Trainer
+        logger.info("Creating FullyAsyncTrainer...")
+        self._create_trainer(config)
+
+        # 设置参数同步
+        logger.info("Setting up parameter synchronization...")
+        param_synchronizer = AsyncParameterSynchronizer(
             config=config,
-            tokenizer=tokenizer,
-            role_worker_mapping={Role.Rollout: role_worker_mapping[Role.Rollout]},
+            actor_wg=self.components["trainer"].actor_wg,
+            rollouter=self.components["rollouter"],
+        )
+        self.components["param_synchronizer"] = param_synchronizer
+        logger.info("All components initialized successfully")
+
+    def _create_rollouter(self, config) -> None:
+        """创建Rollouter"""
+        rollouter = FullyAsyncRollouter.remote(
+            config=config,
+            tokenizer=self.components["tokenizer"],
+            role_worker_mapping={Role.Rollout: self.components["role_worker_mapping"][Role.Rollout]},
             resource_pool_manager=create_resource_pool_manager(config, roles=[Role.Rollout]),
-            ray_worker_group_cls=RayWorkerGroup,
-            processor=processor,
-            train_dataset=train_dataset,
-            collate_fn=collate_fn,
-            train_sampler=train_sampler,
+            ray_worker_group_cls=self.components["ray_worker_group_cls"],
+            processor=self.components["processor"],
+            train_dataset=self.components["train_dataset"],
+            collate_fn=self.components["collate_fn"],
+            train_sampler=self.components["train_sampler"],
             device_name=config.trainer.device,
         )
 
         # 初始化Rollouter
-        ray.get(rollouter_actor.init_workers.remote())
-        ray.get(rollouter_actor.set_message_queue_client.remote(message_queue_client))
+        init_future = rollouter.init_workers.remote()
+        ray.get(init_future, timeout=60.0)
 
-        # 3. 创建Trainer
-        logger.info("Creating FullyAsyncTrainer...")
+        set_queue_future = rollouter.set_message_queue_client.remote(self.components["message_queue_client"])
+        ray.get(set_queue_future, timeout=10.0)
+
+        self.components["rollouter"] = rollouter
+        logger.info("Rollouter created and initialized successfully")
+
+    def _create_trainer(self, config) -> None:
+        """创建Trainer"""
+        # 创建trainer角色映射（排除Rollout）
         trainer_role_mapping = {
-            role: worker_cls for role, worker_cls in role_worker_mapping.items() if role != Role.Rollout
+            role: worker_cls
+            for role, worker_cls in self.components["role_worker_mapping"].items()
+            if role != Role.Rollout
         }
 
-        # 创建奖励函数
-        reward_fn, val_reward_fn = load_reward_manager(config, tokenizer)
-
-        trainer = FullyAsyncTrainer(
+        trainer = FullyAsyncTrainer.remote(
             config=config,
-            tokenizer=tokenizer,
+            tokenizer=self.components["tokenizer"],
             role_worker_mapping=trainer_role_mapping,
             resource_pool_manager=create_resource_pool_manager(config, roles=list(trainer_role_mapping.keys())),
-            ray_worker_group_cls=RayWorkerGroup,
-            processor=processor,
-            reward_fn=reward_fn,
-            val_reward_fn=val_reward_fn,
-            train_dataset=train_dataset,
-            val_dataset=val_dataset,
-            collate_fn=collate_fn,
-            train_sampler=train_sampler,
+            ray_worker_group_cls=self.components["ray_worker_group_cls"],
+            processor=self.components["processor"],
+            reward_fn=self.components["reward_fn"],
+            val_reward_fn=self.components["val_reward_fn"],
+            train_dataset=self.components["train_dataset"],
+            val_dataset=self.components["val_dataset"],
+            collate_fn=self.components["collate_fn"],
+            train_sampler=self.components["train_sampler"],
             device_name=config.trainer.device,
         )
 
         # 初始化Trainer
         trainer.init_workers()
-        trainer.set_message_queue_client(message_queue_client)
-        trainer.set_rollouter_actor(rollouter_actor)
+        trainer.set_message_queue_client(self.components["message_queue_client"])
+        trainer.set_rollouter(self.components["rollouter"])
 
-        # 4. 设置参数同步
-        logger.info("Setting up parameter synchronization...")
-        # param_synchronizer = AsyncParameterSynchronizer(
-        #     config=config, actor_wg=trainer.actor_wg, rollouter_actor=rollouter_actor
-        # )
+        self.components["trainer"] = trainer
+        logger.info("FullyAsyncTrainer created and initialized successfully")
 
-        # 5. 启动Rollouter（在后台线程中）
-        logger.info("Starting Rollouter in background...")
+    def _run_training_loop(self):
+        """运行训练循环"""
+        self.running = True
 
-        def run_rollouter():
+        logger.info("Starting Rollouter in background...")
+        rollouter_future = self.components["rollouter"].fit.remote()
+        time.sleep(2.0)
+        trainer_future = self.components["trainer"].fit.remote()
+        self._monitor_components()
+        ray.get(rollouter_future)
+        ray.get(trainer_future)
+
+        logger.info("Training completed or interrupted")
+
+    def _run_rollouter(self):
+        try:
+            ray.get(self.components["rollouter"].fit.remote())
+        except Exception as e:
+            logger.error(f"Rollouter error: {e}")
+            self.running = False
+            self.shutdown_event.set()
+
+    def _run_trainer(self):
+        """运行trainer"""
+        try:
+            self.components["trainer"].fit()
+        except Exception as e:
+            logger.error(f"Trainer error: {e}")
+        finally:
+            self.running = False
+            self.shutdown_event.set()
+
+    def _monitor_components(self):
+        """监控组件状态"""
+        logger.info("Starting component monitoring...")
+
+        last_stats_time = time.time()
+        stats_interval = 60.0  # 60秒报告一次统计
+
+        while self.running and not self.shutdown_event.is_set():
             try:
-                ray.get(rollouter_actor.fit.remote())
+                # 等待一段时间或直到收到停止信号
+                if self.shutdown_event.wait(timeout=10.0):
+                    break
+
+                # 定期报告统计信息
+                current_time = time.time()
+                if current_time - last_stats_time >= stats_interval:
+                    self._log_component_statistics()
+                    last_stats_time = current_time
+
+                # 检查组件健康状态
+                self._check_component_health()
+
             except Exception as e:
-                logger.error(f"Rollouter error: {e}")
+                logger.error(f"Error in component monitoring: {e}")
+
+        logger.info("Component monitoring stopped")
+
+    def _log_component_statistics(self):
+        """记录组件统计信息"""
+        try:
+            # 获取Trainer统计
+            trainer_stats = self.components["trainer"].get_statistics()
+
+            # 获取Rollouter统计
+            rollouter_stats = ray.get(self.components["rollouter"].get_statistics.remote(), timeout=5.0)
+
+            # 获取队列统计
+            queue_stats = self.components["message_queue_client"].get_statistics()
 
-        rollouter_thread = threading.Thread(target=run_rollouter, daemon=True)
-        rollouter_thread.start()
+            logger.info("=== Component Statistics ===")
+            logger.info(
+                f"Trainer - Steps: {trainer_stats['global_steps']}, "
+                f"Samples: {trainer_stats['processed_samples']}, "
+                f"Param version: {trainer_stats['current_param_version']}"
+            )
+
+            logger.info(
+                f"Rollouter - Generated: {rollouter_stats['total_generated_samples']}, "
+                f"Dropped: {rollouter_stats['dropped_stale_samples']}, "
+                f"Errors: {rollouter_stats['generation_errors']}"
+            )
 
-        # 6. 启动Trainer（主线程）
-        logger.info("Starting FullyAsyncTrainer...")
-        trainer.fit()
+            logger.info(
+                f"Queue - Size: {queue_stats['queue_size']}, "
+                f"Produced: {queue_stats['total_produced']}, "
+                f"Consumed: {queue_stats['total_consumed']}"
+            )
+
+        except Exception as e:
+            logger.error(f"Error getting component statistics: {e}")
+
+    def _check_component_health(self):
+        """检查组件健康状态"""
+        try:
+            # 检查trainer是否仍在运行
+            if hasattr(self.components["trainer"], "global_steps"):
+                current_steps = self.components["trainer"].global_steps
+                # 可以添加更多健康检查逻辑
+                print(current_steps)
+
+            # 检查rollouter是否仍在运行
+            rollouter_stats = ray.get(self.components["rollouter"].get_statistics.remote(), timeout=5.0)
+
+            if not rollouter_stats["is_running"]:
+                logger.warning("Rollouter is not running!")
+                # 可以尝试重启或报告错误
+
+        except Exception as e:
+            logger.warning(f"Health check failed: {e}")
+
+    def _cleanup_resources(self):
+        """清理资源"""
+        logger.info("Cleaning up resources...")
+
+        try:
+            # 停止Rollouter
+            if "rollouter" in self.components:
+                logger.info("Shutting down Rollouter...")
+                try:
+                    shutdown_future = self.components["rollouter"].shutdown.remote()
+                    ray.get(shutdown_future, timeout=10.0)
+                except Exception as e:
+                    logger.warning(f"Error shutting down Rollouter: {e}")
+
+            # 清理MessageQueue
+            if "message_queue_client" in self.components:
+                logger.info("Cleaning up MessageQueue...")
+                try:
+                    self.components["message_queue_client"].shutdown()
+                except Exception as e:
+                    logger.warning(f"Error cleaning up MessageQueue: {e}")
+
+            # 清理参数同步器
+            if "param_synchronizer" in self.components:
+                logger.info("Cleaning up parameter synchronizer...")
+                # TODO: 添加参数同步器的清理逻辑
+
+            logger.info("Resource cleanup completed")
+
+        except Exception as e:
+            logger.error(f"Error during cleanup: {e}")
+
+    def get_training_status(self) -> dict:
+        """获取训练状态"""
+        if not self.running or "trainer" not in self.components:
+            return {"status": "not_running"}
+
+        try:
+            trainer_stats = self.components["trainer"].get_statistics()
+            rollouter_stats = ray.get(self.components["rollouter"].get_statistics.remote(), timeout=5.0)
+
+            return {
+                "status": "running",
+                "trainer_stats": trainer_stats,
+                "rollouter_stats": rollouter_stats,
+            }
+        except Exception as e:
+            logger.error(f"Error getting training status: {e}")
+            return {"status": "error", "error": str(e)}
 
 
 @hydra.main(config_path="config", config_name="fully_async_ppo_trainer", version_base=None)
 def main(config):
     """主入口函数"""
-    run_ppo(config, FullyAsyncTaskRunner)
+    from verl.trainer.main_ppo import run_ppo
+
+    # 确保异步训练配置存在
+    if not hasattr(config, "async_training"):
+        # 设置默认异步训练配置
+        config.async_training = OmegaConf.create(
+            {
+                "freshness_threshold": 3,
+                "max_staleness_allowed": 5,
+                "max_queue_size": 1000,
+                "min_batch_count": 1,
+                "batch_timeout": 30.0,
+                "generation_timeout": 30.0,
+                "batch_generation_interval": 0.1,
+                "max_sync_retries": 3,
+                "sync_timeout": 30.0,
+                "sync_retry_delay": 1.0,
+            }
+        )
+        logger.info("Using default async training configuration")
+
+    logger.info("Starting fully async PPO training with improved architecture")
+    run_ppo(config, task_runner_class=FullyAsyncTaskRunner)
 
 
 if __name__ == "__main__":
diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py
new file mode 100644
index 00000000000..c127b242704
--- /dev/null
+++ b/recipe/fully_async_policy/fully_async_rollouter.py
@@ -0,0 +1,681 @@
+# Copyright 2025 Meituan Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import threading
+import time
+import uuid
+from concurrent.futures import ThreadPoolExecutor
+from typing import Optional
+
+import numpy as np
+import ray
+from omegaconf import OmegaConf
+from torch.utils.data import Dataset, Sampler
+
+from recipe.fully_async_policy.message_queue import MessageQueueClient
+from verl import DataProto
+from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup
+from verl.single_controller.ray.base import create_colocated_worker_cls
+from verl.trainer.ppo.ray_trainer import ResourcePoolManager, Role, WorkerType
+from verl.utils.debug import marked_timer
+
+logger = logging.getLogger(__name__)
+
+
+class RolloutController:
+    """控制rollout的暂停和恢复 - 改进的控制机制"""
+
+    def __init__(self):
+        self.is_paused = False
+        self.pause_event = threading.Event()
+        self.resume_event = threading.Event()
+        self.resume_event.set()  # 初始状态为可运行
+        self.pending_requests = []
+        self.lock = threading.RLock()
+        self.pause_count = 0
+
+    def pause(self, timeout: Optional[float] = None) -> bool:
+        """
+        暂停rollout
+
+        Args:
+            timeout: 暂停超时时间，如果为None则无限等待
+
+        Returns:
+            bool: 是否成功暂停
+        """
+        with self.lock:
+            if not self.is_paused:
+                self.is_paused = True
+                self.resume_event.clear()
+                self.pause_event.set()
+                self.pause_count += 1
+                logger.info(f"Rollout paused (count: {self.pause_count})")
+                return True
+            else:
+                logger.debug("Rollout already paused")
+                return True
+
+    def resume(self) -> bool:
+        """
+        恢复rollout
+
+        Returns:
+            bool: 是否成功恢复
+        """
+        with self.lock:
+            if self.is_paused:
+                self.is_paused = False
+                self.pause_event.clear()
+                self.resume_event.set()
+                logger.info("Rollout resumed")
+                return True
+            else:
+                logger.debug("Rollout already running")
+                return True
+
+    def wait_if_paused(self, timeout: float = None) -> bool:
+        """
+        如果被暂停则等待恢复
+
+        Args:
+            timeout: 等待超时时间
+
+        Returns:
+            bool: 是否成功等待（未超时）
+        """
+        if self.is_paused:
+            logger.debug(f"Waiting for resume (timeout: {timeout})")
+            return self.resume_event.wait(timeout)
+        return True
+
+    def is_pause_requested(self) -> bool:
+        """检查是否有暂停请求"""
+        return self.pause_event.is_set()
+
+    def get_status(self) -> dict:
+        """获取控制器状态"""
+        with self.lock:
+            return {
+                "is_paused": self.is_paused,
+                "pause_count": self.pause_count,
+                "has_pending_requests": len(self.pending_requests) > 0,
+            }
+
+
+class Rollouter:
+    """
+    异步样本生成器，负责持续生成训练样本并放入MessageQueue
+    基于OneStepOffRayTrainer的成熟实现改进
+    """
+
+    def __init__(
+        self,
+        config,
+        tokenizer,
+        role_worker_mapping: dict[Role, WorkerType],
+        resource_pool_manager: ResourcePoolManager,
+        ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup,
+        processor=None,
+        train_dataset: Dataset | None = None,
+        collate_fn=None,
+        train_sampler: Sampler | None = None,
+        device_name="cuda",
+    ):
+        self.config = config
+        self.tokenizer = tokenizer
+        self.processor = processor
+        self.role_worker_mapping = role_worker_mapping
+        self.resource_pool_manager = resource_pool_manager
+        self.ray_worker_group_cls = ray_worker_group_cls
+        self.device_name = device_name
+
+        # 数据相关
+        self.train_dataset = train_dataset
+        self.collate_fn = collate_fn
+        self.train_sampler = train_sampler
+
+        # Rollout控制
+        self.rollout_controller = RolloutController()
+        self.current_param_version = 0
+
+        # 新鲜度控制 - 改进的配置管理
+        async_config = config.async_training
+        self.freshness_threshold = async_config.get("freshness_threshold", 3)
+        self.max_staleness_allowed = async_config.get("max_staleness_allowed", 5)
+        self.generation_timeout = async_config.get("generation_timeout", 30.0)
+        self.batch_generation_interval = async_config.get("batch_generation_interval", 0.1)
+
+        # 统计信息
+        self.total_generated_samples = 0
+        self.dropped_stale_samples = 0
+        self.generation_errors = 0
+        self.param_sync_requests = 0
+
+        # Worker groups
+        self.rollout_wg = None
+        self.message_queue_client = None
+
+        # 运行状态
+        self.running = False
+        self.generation_thread = None
+        self.thread_executor = ThreadPoolExecutor(max_workers=2)
+
+        # 参数同步相关
+        self.param_synchronizer = None
+        self.last_sync_time = 0
+        self.sync_in_progress = False
+        self.sync_lock = threading.Lock()
+
+        # 异步rollout模式
+        self.async_rollout_mode = config.actor_rollout_ref.rollout.mode == "async"
+
+        self._validate_config()
+
+    def _validate_config(self):
+        """验证配置"""
+        required_configs = [
+            "data.train_batch_size",
+            "actor_rollout_ref.rollout.n",
+            "async_training.freshness_threshold",
+        ]
+
+        for config_path in required_configs:
+            if not OmegaConf.select(self.config, config_path):
+                logger.warning(f"Missing recommended config: {config_path}")
+
+        # 验证异步训练配置
+        if not hasattr(self.config, "async_training"):
+            raise ValueError("Missing async_training configuration")
+
+    def init_workers(self):
+        """初始化rollout workers - 参考OneStepOffRayTrainer的实现"""
+        logger.info("Initializing Rollouter workers...")
+
+        self.resource_pool_manager.create_resource_pool()
+        self.resource_pool_to_cls = {pool: {} for pool in self.resource_pool_manager.resource_pool_dict.values()}
+
+        # 只创建rollout worker
+        resource_pool = self.resource_pool_manager.get_resource_pool(Role.Rollout)
+        role_cls = RayClassWithInitArgs(
+            cls=self.role_worker_mapping[Role.Rollout],
+            config=self.config.actor_rollout_ref,
+            role="rollout",
+        )
+        self.resource_pool_to_cls[resource_pool]["rollout"] = role_cls
+
+        # 初始化WorkerGroup
+        all_wg = {}
+        wg_kwargs = {}
+        if OmegaConf.select(self.config.trainer, "ray_wait_register_center_timeout") is not None:
+            wg_kwargs["ray_wait_register_center_timeout"] = self.config.trainer.ray_wait_register_center_timeout
+        if OmegaConf.select(self.config.trainer, "profile_steps") is not None:
+            wg_kwargs["profile_steps"] = OmegaConf.select(self.config.trainer, "profile_steps")
+            if OmegaConf.select(self.config.trainer, "worker_nsight_options") is not None:
+                wg_kwargs["worker_nsight_options"] = OmegaConf.to_container(
+                    OmegaConf.select(self.config.trainer, "worker_nsight_options")
+                )
+
+        for resource_pool, class_dict in self.resource_pool_to_cls.items():
+            worker_dict_cls = create_colocated_worker_cls(class_dict=class_dict)
+            wg_dict = self.ray_worker_group_cls(
+                resource_pool=resource_pool,
+                ray_cls_with_init=worker_dict_cls,
+                device_name=self.device_name,
+                **wg_kwargs,
+            )
+            spawn_wg = wg_dict.spawn(prefix_set=class_dict.keys())
+            all_wg.update(spawn_wg)
+
+        self.rollout_wg = all_wg["rollout"]
+        self.rollout_wg.init_model()
+
+        # 初始化异步rollout管理器（如果需要）
+        if self.async_rollout_mode:
+            self._init_async_rollout_manager()
+
+        logger.info("Rollouter workers initialized successfully")
+
+    def _init_async_rollout_manager(self):
+        """初始化异步rollout管理器"""
+        try:
+            from verl.workers.rollout.async_server import AsyncLLMServerManager
+
+            self.async_rollout_manager = AsyncLLMServerManager(
+                config=self.config,
+                worker_group=self.rollout_wg,
+            )
+            logger.info("Async rollout manager initialized")
+        except Exception as e:
+            logger.warning(f"Failed to initialize async rollout manager: {e}")
+            self.async_rollout_mode = False
+
+    def set_message_queue_client(self, message_queue_client: MessageQueueClient):
+        """设置消息队列客户端"""
+        self.message_queue_client = message_queue_client
+
+    def set_parameter_synchronizer(self, param_synchronizer):
+        """设置参数同步器"""
+        self.param_synchronizer = param_synchronizer
+
+    def update_rollout_weights(self, param_version: int) -> bool:
+        """
+        更新rollout模型参数 - 改进的参数同步实现
+        这个方法由外部Trainer调用
+
+        Args:
+            param_version: 新的参数版本号
+
+        Returns:
+            bool: 是否成功更新参数
+        """
+        logger.info(f"Updating rollout weights to version {param_version}")
+
+        with self.sync_lock:
+            if self.sync_in_progress:
+                logger.warning(f"Sync already in progress, skipping version {param_version}")
+                return False
+
+            self.sync_in_progress = True
+
+        try:
+            # 暂停rollout - 带超时机制
+            if not self.rollout_controller.pause(timeout=10.0):
+                logger.error("Failed to pause rollout within timeout")
+                return False
+
+            # 等待当前generation完成（如果有的话）
+            time.sleep(0.1)
+
+            # 执行参数同步
+            sync_success = self._execute_parameter_sync(param_version)
+
+            if sync_success:
+                self.current_param_version = param_version
+                self.param_sync_requests += 1
+                self.last_sync_time = time.time()
+                logger.info(f"Successfully updated rollout weights to version {param_version}")
+            else:
+                logger.error(f"Failed to sync parameters to version {param_version}")
+
+        except Exception as e:
+            logger.error(f"Error during parameter sync: {e}")
+            sync_success = False
+        finally:
+            # 恢复rollout
+            self.rollout_controller.resume()
+            self.sync_in_progress = False
+
+        return sync_success
+
+    def _execute_parameter_sync(self, param_version: int) -> bool:
+        """
+        执行实际的参数同步 - 改进的同步逻辑
+
+        Args:
+            param_version: 目标参数版本
+
+        Returns:
+            bool: 是否同步成功
+        """
+        try:
+            # 暂停推理引擎
+            if self.async_rollout_mode and hasattr(self, "async_rollout_manager"):
+                # 对于异步模式，暂停服务器
+                pass  # 异步服务器的暂停在 pause() 中已经处理
+            else:
+                # 对于同步模式，使用sleep/wake_up机制
+                sleep_futures = self.rollout_wg.sleep()
+                ray.get(sleep_futures)
+
+            # 执行参数同步
+            if self.param_synchronizer:
+                self.param_synchronizer.sync_weights()
+                logger.debug("Parameter synchronization completed via synchronizer")
+            else:
+                # 直接使用rollout worker group的同步机制
+                if hasattr(self.rollout_wg, "sync_rollout_weights"):
+                    sync_futures = self.rollout_wg.sync_rollout_weights()
+                    ray.get(sync_futures)
+                    logger.debug("Parameter synchronization completed via rollout worker group")
+                else:
+                    logger.warning("No parameter synchronization mechanism available")
+                    return False
+
+            # 恢复推理引擎
+            if self.async_rollout_mode and hasattr(self, "async_rollout_manager"):
+                # 对于异步模式，恢复服务器
+                pass  # 异步服务器的恢复在 resume() 中已经处理
+            else:
+                # 对于同步模式，唤醒workers
+                wake_futures = self.rollout_wg.wake_up()
+                ray.get(wake_futures)
+
+            return True
+
+        except Exception as e:
+            logger.error(f"Parameter sync execution failed: {e}")
+            return False
+
+    def _create_dataloader(self):
+        """创建数据加载器"""
+        from torch.utils.data import DataLoader
+
+        if self.train_dataset is None:
+            raise ValueError("Training dataset not provided")
+
+        return DataLoader(
+            self.train_dataset,
+            batch_size=self.config.data.train_batch_size,
+            sampler=self.train_sampler,
+            collate_fn=self.collate_fn,
+            num_workers=self.config.data.get("dataloader_num_workers", 0),
+            drop_last=True,
+            pin_memory=True,  # 改进内存管理
+        )
+
+    def _create_continuous_iterator(self):
+        """创建连续的数据迭代器"""
+        dataloader = self._create_dataloader()
+
+        epoch = 0
+        while self.running:
+            try:
+                for batch_dict in dataloader:
+                    if not self.running:
+                        return
+                    yield epoch, batch_dict
+                epoch += 1
+            except Exception as e:
+                logger.error(f"Error in data iterator: {e}")
+                time.sleep(1.0)  # 避免快速重试
+                continue
+
+    def _should_pause_generation(self) -> bool:
+        """
+        判断是否应该暂停生成，基于新鲜度控制 - 改进的判断逻辑
+        """
+        if self.message_queue_client is None:
+            return False
+
+        try:
+            queue_stats = self.message_queue_client.get_statistics()
+            queue_size = queue_stats["queue_size"]
+            current_trainer_version = queue_stats["current_param_version"]
+
+            # 计算参数版本差异
+            version_diff = self.current_param_version - current_trainer_version
+
+            # 如果版本差异过大，暂停生成
+            if version_diff >= self.max_staleness_allowed:
+                logger.debug(
+                    f"Should pause due to staleness: rollout_version={self.current_param_version}, "
+                    f"trainer_version={current_trainer_version}, diff={version_diff}"
+                )
+                return True
+
+            # 如果队列太满，也暂停生成
+            max_queue_size = self.freshness_threshold * self.config.data.train_batch_size
+            if queue_size >= max_queue_size:
+                logger.debug(f"Should pause due to full queue: size={queue_size}, max={max_queue_size}")
+                return True
+
+            return False
+
+        except Exception as e:
+            logger.error(f"Error checking pause conditions: {e}")
+            return True  # 出错时暂停生成
+
+    def _generate_batch(self, epoch: int, batch_dict: dict) -> Optional[DataProto]:
+        """生成单个batch的样本 - 改进的生成逻辑"""
+        try:
+            batch = DataProto.from_single_dict(batch_dict)
+
+            # 处理batch用于生成 - 参考OneStepOffRayTrainer的处理逻辑
+            batch_keys_to_pop = ["input_ids", "attention_mask", "position_ids"]
+            non_tensor_batch_keys_to_pop = ["raw_prompt_ids"]
+
+            # 处理多模态数据和其他可选字段
+            optional_keys = ["multi_modal_data", "raw_prompt", "tools_kwargs", "interaction_kwargs"]
+            for key in optional_keys:
+                if key in batch.non_tensor_batch:
+                    non_tensor_batch_keys_to_pop.append(key)
+
+            gen_batch = batch.pop(
+                batch_keys=batch_keys_to_pop,
+                non_tensor_batch_keys=non_tensor_batch_keys_to_pop,
+            )
+
+            # 重复生成多个响应 - 参考OneStepOffRayTrainer
+            n_repeats = self.config.actor_rollout_ref.rollout.n
+            gen_batch = gen_batch.repeat(repeat_times=n_repeats, interleave=True)
+
+            # 执行生成
+            if self.async_rollout_mode:
+                # 异步生成
+                gen_batch_output = ray.get(
+                    self.rollout_wg.async_generate_sequences.remote(gen_batch), timeout=self.generation_timeout
+                )
+            else:
+                # 同步生成
+                gen_batch_output = ray.get(
+                    self.rollout_wg.generate_sequences.remote(gen_batch), timeout=self.generation_timeout
+                )
+
+            # 添加UID - 确保每个样本有唯一标识
+            batch.non_tensor_batch["uid"] = np.array([str(uuid.uuid4()) for _ in range(len(batch.batch))], dtype=object)
+
+            # 重复原始batch以对齐生成的响应
+            batch = batch.repeat(repeat_times=n_repeats, interleave=True)
+
+            # 合并数据
+            final_batch = batch.union(gen_batch_output)
+
+            # 添加rollout metadata
+            final_batch.meta_info["rollout_param_version"] = self.current_param_version
+            final_batch.meta_info["generation_timestamp"] = time.time()
+
+            return final_batch
+
+        except Exception as e:
+            logger.error(f"Error generating batch: {e}")
+            self.generation_errors += 1
+            return None
+
+    def _generation_loop(self):
+        """主要的生成循环 - 改进的循环逻辑"""
+        logger.info("Starting generation loop...")
+
+        try:
+            continuous_iterator = self._create_continuous_iterator()
+
+            for epoch, batch_dict in continuous_iterator:
+                if not self.running:
+                    break
+
+                # 等待如果被暂停
+                if not self.rollout_controller.wait_if_paused(timeout=1.0):
+                    if not self.running:
+                        break
+                    continue
+
+                # 检查是否应该暂停生成
+                if self._should_pause_generation():
+                    time.sleep(self.batch_generation_interval)
+                    continue
+
+                # 生成样本
+                timing_raw = {}
+                with marked_timer("generate_batch", timing_raw):
+                    generated_batch = self._generate_batch(epoch, batch_dict)
+
+                if generated_batch is not None:
+                    # 准备rollout metadata
+                    rollout_metadata = {
+                        "timing": timing_raw,
+                        "generation_timestamp": time.time(),
+                        "rollout_param_version": self.current_param_version,
+                        "epoch": epoch,
+                    }
+
+                    # 放入队列
+                    success = self.message_queue_client.put_batch(
+                        epoch=epoch,
+                        batch=generated_batch,
+                        param_version=self.current_param_version,
+                        rollout_metadata=rollout_metadata,
+                    )
+
+                    if success:
+                        self.total_generated_samples += 1
+                        if self.total_generated_samples % 10 == 0:
+                            logger.info(
+                                f"Generated {self.total_generated_samples} batches, "
+                                f"param_version={self.current_param_version}, "
+                                f"errors={self.generation_errors}"
+                            )
+                    else:
+                        self.dropped_stale_samples += 1
+                        if self.dropped_stale_samples % 5 == 0:
+                            logger.warning(f"Dropped stale samples: {self.dropped_stale_samples}")
+
+                # 控制生成频率
+                if self.batch_generation_interval > 0:
+                    time.sleep(self.batch_generation_interval)
+
+        except Exception as e:
+            logger.error(f"Generation loop error: {e}")
+        finally:
+            logger.info("Generation loop finished")
+
+    def fit(self):
+        """开始异步生成样本 - 改进的主运行逻辑"""
+        logger.info("Starting Rollouter...")
+
+        if self.message_queue_client is None:
+            raise ValueError("MessageQueue client not set. Call set_message_queue_client() first.")
+
+        self.running = True
+
+        # 在单独的线程中运行生成循环
+        self.generation_thread = threading.Thread(target=self._generation_loop, daemon=True)
+        self.generation_thread.start()
+
+        logger.info("Rollouter started successfully")
+
+        try:
+            # 主线程保持运行，处理控制信号和状态监控
+            last_stats_time = time.time()
+            stats_interval = 30.0  # 30秒报告一次统计
+
+            while self.running:
+                time.sleep(1.0)
+
+                # 定期打印统计信息
+                current_time = time.time()
+                if current_time - last_stats_time >= stats_interval:
+                    self._log_statistics()
+                    last_stats_time = current_time
+
+                # 检查生成线程状态
+                if not self.generation_thread.is_alive():
+                    logger.error("Generation thread died, restarting...")
+                    self.generation_thread = threading.Thread(target=self._generation_loop, daemon=True)
+                    self.generation_thread.start()
+
+        except KeyboardInterrupt:
+            logger.info("Received interrupt signal, shutting down...")
+        except Exception as e:
+            logger.error(f"Error in main loop: {e}")
+        finally:
+            self.shutdown()
+
+    def _log_statistics(self):
+        """记录统计信息"""
+        try:
+            controller_status = self.rollout_controller.get_status()
+            queue_stats = self.message_queue_client.get_statistics()
+
+            logger.info(
+                f"Rollouter stats - Generated: {self.total_generated_samples}, "
+                f"Dropped: {self.dropped_stale_samples}, "
+                f"Errors: {self.generation_errors}, "
+                f"Queue size: {queue_stats['queue_size']}, "
+                f"Param version: {self.current_param_version}, "
+                f"Paused: {controller_status['is_paused']}, "
+                f"Sync requests: {self.param_sync_requests}"
+            )
+        except Exception as e:
+            logger.error(f"Error logging statistics: {e}")
+
+    def shutdown(self):
+        """关闭Rollouter - 改进的关闭逻辑"""
+        logger.info("Shutting down Rollouter...")
+
+        self.running = False
+
+        # 恢复可能被暂停的生成线程
+        self.rollout_controller.resume()
+
+        # 等待生成线程结束
+        if self.generation_thread and self.generation_thread.is_alive():
+            logger.info("Waiting for generation thread to finish...")
+            self.generation_thread.join(timeout=10.0)
+
+            if self.generation_thread.is_alive():
+                logger.warning("Generation thread did not finish within timeout")
+
+        # 关闭线程池
+        if self.thread_executor:
+            self.thread_executor.shutdown(wait=True)
+
+        # 清理异步rollout管理器
+        if hasattr(self, "async_rollout_manager"):
+            try:
+                # TODO: 添加异步rollout管理器的清理逻辑
+                pass
+            except Exception as e:
+                logger.warning(f"Error cleaning up async rollout manager: {e}")
+
+        logger.info("Rollouter shutdown complete")
+
+    def get_statistics(self) -> dict:
+        """获取统计信息 - 改进的统计信息"""
+        controller_status = self.rollout_controller.get_status()
+
+        stats = {
+            "total_generated_samples": self.total_generated_samples,
+            "dropped_stale_samples": self.dropped_stale_samples,
+            "generation_errors": self.generation_errors,
+            "current_param_version": self.current_param_version,
+            "param_sync_requests": self.param_sync_requests,
+            "last_sync_time": self.last_sync_time,
+            "is_running": self.running,
+            "sync_in_progress": self.sync_in_progress,
+        }
+
+        stats.update(controller_status)
+
+        # 添加队列统计（如果可用）
+        if self.message_queue_client:
+            try:
+                queue_stats = self.message_queue_client.get_statistics()
+                stats["queue_size"] = queue_stats.get("queue_size", 0)
+                stats["queue_total_produced"] = queue_stats.get("total_produced", 0)
+                stats["queue_dropped_samples"] = queue_stats.get("dropped_samples", 0)
+            except Exception as e:
+                logger.debug(f"Error getting queue statistics: {e}")
+
+        return stats
diff --git a/recipe/fully_async_policy/rollouter.py b/recipe/fully_async_policy/rollouter.py
deleted file mode 100644
index ac43b6e3dbf..00000000000
--- a/recipe/fully_async_policy/rollouter.py
+++ /dev/null
@@ -1,413 +0,0 @@
-# Copyright 2025 Meituan Ltd. and/or its affiliates
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import logging
-import threading
-import time
-import uuid
-
-import numpy as np
-import ray
-from omegaconf import OmegaConf
-from torch.utils.data import Dataset, Sampler
-
-from recipe.fully_async_policy.message_queue import MessageQueueClient
-from verl import DataProto
-from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup
-from verl.single_controller.ray.base import create_colocated_worker_cls
-from verl.trainer.ppo.ray_trainer import ResourcePoolManager, Role, WorkerType
-from verl.utils.debug import marked_timer
-
-logger = logging.getLogger(__name__)
-
-
-class RolloutController:
-    """控制rollout的暂停和恢复"""
-
-    def __init__(self):
-        self.is_paused = False
-        self.pause_event = threading.Event()
-        self.resume_event = threading.Event()
-        self.resume_event.set()  # 初始状态为可运行
-        self.pending_requests = []
-        self.lock = threading.RLock()
-
-    def pause(self):
-        """暂停rollout"""
-        with self.lock:
-            if not self.is_paused:
-                self.is_paused = True
-                self.resume_event.clear()
-                self.pause_event.set()
-                logger.info("Rollout paused")
-
-    def resume(self):
-        """恢复rollout"""
-        with self.lock:
-            if self.is_paused:
-                self.is_paused = False
-                self.pause_event.clear()
-                self.resume_event.set()
-                logger.info("Rollout resumed")
-
-    def wait_if_paused(self, timeout: float = None):
-        """如果被暂停则等待恢复"""
-        if self.is_paused:
-            self.resume_event.wait(timeout)
-
-    def is_pause_requested(self) -> bool:
-        """检查是否有暂停请求"""
-        return self.pause_event.is_set()
-
-
-class Rollouter:
-    """
-    异步样本生成器，负责持续生成训练样本并放入MessageQueue
-    """
-
-    def __init__(
-        self,
-        config,
-        tokenizer,
-        role_worker_mapping: dict[Role, WorkerType],
-        resource_pool_manager: ResourcePoolManager,
-        ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup,
-        processor=None,
-        train_dataset: Dataset | None = None,
-        collate_fn=None,
-        train_sampler: Sampler | None = None,
-        device_name="cuda",
-    ):
-        self.config = config
-        self.tokenizer = tokenizer
-        self.processor = processor
-        self.role_worker_mapping = role_worker_mapping
-        self.resource_pool_manager = resource_pool_manager
-        self.ray_worker_group_cls = ray_worker_group_cls
-        self.device_name = device_name
-
-        # 数据相关
-        self.train_dataset = train_dataset
-        self.collate_fn = collate_fn
-        self.train_sampler = train_sampler
-
-        # Rollout控制
-        self.rollout_controller = RolloutController()
-        self.current_param_version = 0
-
-        # 新鲜度控制
-        self.freshness_threshold = config.async_training.get("freshness_threshold", 3)
-        self.max_staleness_allowed = config.async_training.get("max_staleness_allowed", 5)
-
-        # 统计信息
-        self.total_generated_samples = 0
-        self.dropped_stale_samples = 0
-        self.pause_count = 0
-
-        # Worker groups
-        self.rollout_wg = None
-        self.message_queue_client = None
-
-        # 运行状态
-        self.running = False
-        self.generation_thread = None
-
-    def init_workers(self):
-        """初始化rollout workers"""
-        logger.info("Initializing Rollouter workers...")
-
-        self.resource_pool_manager.create_resource_pool()
-        self.resource_pool_to_cls = {pool: {} for pool in self.resource_pool_manager.resource_pool_dict.values()}
-
-        # 只创建rollout worker
-        resource_pool = self.resource_pool_manager.get_resource_pool(Role.Rollout)
-        role_cls = RayClassWithInitArgs(
-            cls=self.role_worker_mapping[Role.Rollout],
-            config=self.config.actor_rollout_ref,
-            role="rollout",
-        )
-        self.resource_pool_to_cls[resource_pool]["rollout"] = role_cls
-
-        # 初始化WorkerGroup
-        all_wg = {}
-        wg_kwargs = {}
-        if OmegaConf.select(self.config.trainer, "ray_wait_register_center_timeout") is not None:
-            wg_kwargs["ray_wait_register_center_timeout"] = self.config.trainer.ray_wait_register_center_timeout
-
-        for resource_pool, class_dict in self.resource_pool_to_cls.items():
-            worker_dict_cls = create_colocated_worker_cls(class_dict=class_dict)
-            wg_dict = self.ray_worker_group_cls(
-                resource_pool=resource_pool,
-                ray_cls_with_init=worker_dict_cls,
-                device_name=self.device_name,
-                **wg_kwargs,
-            )
-            spawn_wg = wg_dict.spawn(prefix_set=class_dict.keys())
-            all_wg.update(spawn_wg)
-
-        self.rollout_wg = all_wg["rollout"]
-        self.rollout_wg.init_model()
-        logger.info("Rollouter workers initialized successfully")
-
-    def set_message_queue_client(self, message_queue_client: MessageQueueClient):
-        """设置消息队列客户端"""
-        self.message_queue_client = message_queue_client
-
-    def update_rollout_weights(self, param_version: int):
-        """
-        更新rollout模型参数
-        这个方法由外部Trainer调用
-        """
-        logger.info(f"Updating rollout weights to version {param_version}")
-
-        # 暂停rollout
-        self.rollout_controller.pause()
-
-        try:
-            # 暂停推理引擎
-            ray.get(self.rollout_wg.sleep.remote())
-
-            # 执行参数同步
-            # 这里需要与actor建立同步机制
-            if hasattr(self, "param_synchronizer") and self.param_synchronizer:
-                self.param_synchronizer.sync_weights()
-            else:
-                logger.warning("Parameter synchronizer not available, skipping weight sync")
-
-            # 更新参数版本
-            self.current_param_version = param_version
-
-            # 恢复推理引擎
-            ray.get(self.rollout_wg.wake_up.remote())
-
-        finally:
-            # 恢复rollout
-            self.rollout_controller.resume()
-
-        logger.info(f"Rollout weights updated to version {param_version}")
-
-    def set_parameter_synchronizer(self, param_synchronizer):
-        """设置参数同步器"""
-        self.param_synchronizer = param_synchronizer
-
-    def _create_dataloader(self):
-        """创建数据加载器"""
-        from torch.utils.data import DataLoader
-
-        return DataLoader(
-            self.train_dataset,
-            batch_size=self.config.data.train_batch_size,
-            sampler=self.train_sampler,
-            collate_fn=self.collate_fn,
-            num_workers=self.config.data.get("dataloader_num_workers", 0),
-            drop_last=True,
-        )
-
-    def _create_continuous_iterator(self):
-        """创建连续的数据迭代器"""
-        dataloader = self._create_dataloader()
-
-        for epoch in range(self.config.trainer.total_epochs):
-            for batch_dict in dataloader:
-                yield epoch, batch_dict
-
-    def _should_pause_generation(self) -> bool:
-        """
-        判断是否应该暂停生成，基于新鲜度控制
-        """
-        if self.message_queue_client is None:
-            return False
-
-        queue_stats = self.message_queue_client.get_statistics()
-        queue_size = queue_stats["queue_size"]
-        current_trainer_version = queue_stats["current_param_version"]
-
-        # 计算参数版本差异
-        version_diff = self.current_param_version - current_trainer_version
-
-        # 如果版本差异过大，暂停生成
-        if version_diff >= self.max_staleness_allowed:
-            logger.info(
-                f"Pausing generation due to staleness: rollout_version={self.current_param_version}, "
-                f"trainer_version={current_trainer_version}, diff={version_diff}"
-            )
-            return True
-
-        # 如果队列太满，也暂停生成
-        max_queue_size = self.freshness_threshold * self.config.data.train_batch_size
-        if queue_size >= max_queue_size:
-            logger.info(f"Pausing generation due to full queue: size={queue_size}, max={max_queue_size}")
-            return True
-
-        return False
-
-    def _generate_batch(self, epoch: int, batch_dict: dict) -> DataProto | None:
-        """生成单个batch的样本"""
-        try:
-            batch = DataProto.from_single_dict(batch_dict)
-
-            # 处理batch用于生成
-            batch_keys_to_pop = ["input_ids", "attention_mask", "position_ids"]
-            non_tensor_batch_keys_to_pop = ["raw_prompt_ids"]
-
-            # 处理多模态数据
-            if "multi_modal_data" in batch.non_tensor_batch:
-                non_tensor_batch_keys_to_pop.append("multi_modal_data")
-            if "raw_prompt" in batch.non_tensor_batch:
-                non_tensor_batch_keys_to_pop.append("raw_prompt")
-            if "tools_kwargs" in batch.non_tensor_batch:
-                non_tensor_batch_keys_to_pop.append("tools_kwargs")
-            if "interaction_kwargs" in batch.non_tensor_batch:
-                non_tensor_batch_keys_to_pop.append("interaction_kwargs")
-
-            gen_batch = batch.pop(
-                batch_keys=batch_keys_to_pop,
-                non_tensor_batch_keys=non_tensor_batch_keys_to_pop,
-            )
-
-            # 重复生成多个响应
-            gen_batch = gen_batch.repeat(repeat_times=self.config.actor_rollout_ref.rollout.n, interleave=True)
-
-            # 执行生成
-            if self.config.actor_rollout_ref.rollout.mode == "async":
-                gen_batch_output = ray.get(self.rollout_wg.async_generate_sequences.remote(gen_batch))
-            else:
-                gen_batch_output = ray.get(self.rollout_wg.generate_sequences.remote(gen_batch))
-
-            # 添加UID
-            batch.non_tensor_batch["uid"] = np.array([str(uuid.uuid4()) for _ in range(len(batch.batch))], dtype=object)
-
-            # 重复原始batch以对齐生成的响应
-            batch = batch.repeat(repeat_times=self.config.actor_rollout_ref.rollout.n, interleave=True)
-
-            # 合并数据
-            final_batch = batch.union(gen_batch_output)
-
-            return final_batch
-
-        except Exception as e:
-            logger.error(f"Error generating batch: {e}")
-            return None
-
-    def _generation_loop(self):
-        """主要的生成循环"""
-        logger.info("Starting generation loop...")
-
-        continuous_iterator = self._create_continuous_iterator()
-
-        for epoch, batch_dict in continuous_iterator:
-            if not self.running:
-                break
-
-            # 等待如果被暂停
-            self.rollout_controller.wait_if_paused(timeout=1.0)
-
-            if not self.running:
-                break
-
-            # 检查是否应该暂停生成
-            if self._should_pause_generation():
-                time.sleep(1.0)  # 等待一段时间再检查
-                continue
-
-            # 生成样本
-            timing_raw = {}
-            with marked_timer("generate_batch", timing_raw):
-                generated_batch = self._generate_batch(epoch, batch_dict)
-
-            if generated_batch is not None:
-                # 放入队列
-                rollout_metadata = {
-                    "timing": timing_raw,
-                    "generation_timestamp": time.time(),
-                }
-
-                success = self.message_queue_client.put_batch(
-                    epoch=epoch,
-                    batch=generated_batch,
-                    param_version=self.current_param_version,
-                    rollout_metadata=rollout_metadata,
-                )
-
-                if success:
-                    self.total_generated_samples += 1
-                    if self.total_generated_samples % 10 == 0:
-                        logger.info(
-                            f"Generated {self.total_generated_samples} batches, "
-                            f"param_version={self.current_param_version}"
-                        )
-                else:
-                    self.dropped_stale_samples += 1
-                    logger.warning(f"Dropped stale sample, total dropped: {self.dropped_stale_samples}")
-
-        logger.info("Generation loop finished")
-
-    def fit(self):
-        """开始异步生成样本"""
-        logger.info("Starting Rollouter...")
-
-        if self.message_queue_client is None:
-            raise ValueError("MessageQueue client not set. Call set_message_queue_client() first.")
-
-        self.running = True
-
-        # 在单独的线程中运行生成循环
-        self.generation_thread = threading.Thread(target=self._generation_loop, daemon=True)
-        self.generation_thread.start()
-
-        try:
-            # 主线程保持运行，处理控制信号
-            while self.running:
-                time.sleep(1.0)
-
-                # 定期打印统计信息
-                if self.total_generated_samples > 0 and self.total_generated_samples % 100 == 0:
-                    queue_stats = self.message_queue_client.get_statistics()
-                    logger.info(
-                        f"Rollouter stats - Generated: {self.total_generated_samples}, "
-                        f"Dropped: {self.dropped_stale_samples}, "
-                        f"Queue size: {queue_stats['queue_size']}, "
-                        f"Param version: {self.current_param_version}"
-                    )
-
-        except KeyboardInterrupt:
-            logger.info("Received interrupt signal, shutting down...")
-        finally:
-            self.shutdown()
-
-    def shutdown(self):
-        """关闭Rollouter"""
-        logger.info("Shutting down Rollouter...")
-
-        self.running = False
-
-        # 恢复可能被暂停的生成线程
-        self.rollout_controller.resume()
-
-        # 等待生成线程结束
-        if self.generation_thread and self.generation_thread.is_alive():
-            self.generation_thread.join(timeout=5.0)
-
-        logger.info("Rollouter shutdown complete")
-
-    def get_statistics(self) -> dict:
-        """获取统计信息"""
-        return {
-            "total_generated_samples": self.total_generated_samples,
-            "dropped_stale_samples": self.dropped_stale_samples,
-            "current_param_version": self.current_param_version,
-            "pause_count": self.pause_count,
-            "is_running": self.running,
-            "is_paused": self.rollout_controller.is_paused,
-        }

From 2df18111a7b2e66e289c6cea94389c8d2f677568 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Wed, 30 Jul 2025 21:35:53 +0800
Subject: [PATCH 006/182] cpu mq

---
 recipe/fully_async_policy/message_queue.py | 60 ++++++++++++++++++++--
 recipe/fully_async_policy/test_mq.py       |  1 +
 2 files changed, 57 insertions(+), 4 deletions(-)

diff --git a/recipe/fully_async_policy/message_queue.py b/recipe/fully_async_policy/message_queue.py
index f57d1e15325..58996d4266e 100644
--- a/recipe/fully_async_policy/message_queue.py
+++ b/recipe/fully_async_policy/message_queue.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import logging
 import threading
 import time
 import uuid
@@ -24,6 +25,8 @@
 from filelock import FileLock
 from omegaconf import DictConfig
 
+logger = logging.getLogger(__name__)
+
 
 @dataclass
 class BatchSample:
@@ -40,7 +43,8 @@ class BatchSample:
 @ray.remote(num_cpus=1)
 class MessageQueue:
     """
-    基于ZeroMQ的异步消息队列，用于Rollouter和Trainer之间的通信
+    简化的Ray-based异步消息队列，用于Rollouter和Trainer之间的通信
+    去掉了ZeroMQ的复杂性，使用更可靠的Ray机制
     """
 
     def __init__(self, config: DictConfig, max_queue_size: int = 1000):
@@ -49,7 +53,7 @@ def __init__(self, config: DictConfig, max_queue_size: int = 1000):
         self.queue = deque(maxlen=max_queue_size)
         self.current_param_version = 0
 
-        # 安全地获取配置值，避免递归问题
+        # 安全地获取配置值
         try:
             if hasattr(config, "async_training") and config.async_training is not None:
                 self.freshness_threshold = getattr(config.async_training, "freshness_threshold", 3)
@@ -69,15 +73,22 @@ def __init__(self, config: DictConfig, max_queue_size: int = 1000):
 
         # Threading for message handling
         self.running = True
+
+        # 线程安全
         self.lock = threading.RLock()
         self.consumer_waiting = False
         self.consumer_condition = threading.Condition(self.lock)
 
-        # Statistics
+        # 统计信息
         self.total_produced = 0
         self.total_consumed = 0
         self.dropped_samples = 0
 
+        logger.info(
+            f"MessageQueue initialized with max_queue_size={max_queue_size},"
+            "freshness_threshold={self.freshness_threshold}"
+        )
+
     def _setup_zmq(self):
         """设置ZeroMQ socket"""
         with FileLock("/tmp/verl_message_queue.lock"):
@@ -113,6 +124,7 @@ def put_batch(self, epoch: int, batch: Any, param_version: int, rollout_metadata
             staleness = self.current_param_version - param_version
             if staleness >= self.freshness_threshold:
                 self.dropped_samples += 1
+                logger.debug(f"Dropped stale sample: staleness={staleness}, threshold={self.freshness_threshold}")
                 return False
 
             sample = BatchSample(
@@ -128,7 +140,7 @@ def put_batch(self, epoch: int, batch: Any, param_version: int, rollout_metadata
             if len(self.queue) >= self.max_queue_size:
                 removed = self.queue.popleft()
                 self.dropped_samples += 1
-                print(f"Queue full, dropped sample {removed.batch_id}")
+                logger.warning(f"Queue full, dropped sample {removed.batch_id}")
 
             self.queue.append(sample)
             self.total_produced += 1
@@ -137,6 +149,9 @@ def put_batch(self, epoch: int, batch: Any, param_version: int, rollout_metadata
             if self.consumer_waiting:
                 self.consumer_condition.notify()
 
+            if self.total_produced % 100 == 0:
+                logger.debug(f"MessageQueue stats: produced={self.total_produced}, queue_size={len(self.queue)}")
+
             return True
 
     def get_batch(self, min_batch_count: int = 1, timeout: float = 30.0) -> Optional[list[BatchSample]]:
@@ -174,7 +189,9 @@ def get_batch(self, min_batch_count: int = 1, timeout: float = 30.0) -> Optional
     def update_param_version(self, version: int):
         """更新当前参数版本"""
         with self.lock:
+            old_version = self.current_param_version
             self.current_param_version = version
+            logger.debug(f"Parameter version updated from {old_version} to {version}")
 
     def get_queue_size(self) -> int:
         """获取当前队列长度"""
@@ -191,12 +208,15 @@ def get_statistics(self) -> dict[str, Any]:
                 "dropped_samples": self.dropped_samples,
                 "current_param_version": self.current_param_version,
                 "freshness_threshold": self.freshness_threshold,
+                "max_queue_size": self.max_queue_size,
             }
 
     def clear_queue(self):
         """清空队列"""
         with self.lock:
+            cleared_count = len(self.queue)
             self.queue.clear()
+            logger.info(f"Cleared {cleared_count} samples from queue")
 
     def shutdown(self):
         """关闭消息队列"""
@@ -206,6 +226,34 @@ def shutdown(self):
         if self.context:
             self.context.term()
 
+    def get_memory_usage(self) -> dict:
+        """获取内存使用统计"""
+        with self.lock:
+            # 估算队列中样本的内存使用
+            import sys
+
+            total_size = 0
+            sample_count = len(self.queue)
+
+            if sample_count > 0:
+                # 估算单个样本的大小（简化估算）
+                sample = list(self.queue)[0]
+                try:
+                    sample_size = sys.getsizeof(sample)
+                    if hasattr(sample.data, "batch") and hasattr(sample.data.batch, "__len__"):
+                        # 如果有batch信息，估算数据大小
+                        batch_size = len(sample.data.batch)
+                        sample_size += batch_size * 1000  # 粗略估算每个batch条目1KB
+                    total_size = sample_size * sample_count
+                except Exception:
+                    total_size = sample_count * 10000  # 粗略估算每个样本10KB
+
+            return {
+                "queue_samples": sample_count,
+                "estimated_memory_bytes": total_size,
+                "estimated_memory_mb": total_size / (1024 * 1024),
+            }
+
     def get_address(self) -> str:
         """获取ZeroMQ地址"""
         return self.address
@@ -244,3 +292,7 @@ def clear_queue(self):
     def shutdown(self):
         """关闭队列"""
         ray.get(self.queue_actor.shutdown.remote())
+
+    def get_memory_usage(self) -> dict:
+        """获取内存使用统计"""
+        return ray.get(self.queue_actor.get_memory_usage.remote())
diff --git a/recipe/fully_async_policy/test_mq.py b/recipe/fully_async_policy/test_mq.py
index 488b7d12614..3659911319e 100644
--- a/recipe/fully_async_policy/test_mq.py
+++ b/recipe/fully_async_policy/test_mq.py
@@ -227,6 +227,7 @@ def test_get_statistics(self, message_queue_actor):
             "dropped_samples",
             "current_param_version",
             "freshness_threshold",
+            "max_queue_size",
         }
         assert set(stats.keys()) == expected_keys
         assert isinstance(stats["queue_size"], int)

From 48e91a3d2471457510dffa5a79cf9f00f25976d1 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Wed, 30 Jul 2025 21:50:29 +0800
Subject: [PATCH 007/182] one_step_off_policy

---
 recipe/one_step_off_policy/main_ppo.py | 51 +++++---------------------
 1 file changed, 9 insertions(+), 42 deletions(-)

diff --git a/recipe/one_step_off_policy/main_ppo.py b/recipe/one_step_off_policy/main_ppo.py
index 44a0f4b8675..d6072c5521e 100644
--- a/recipe/one_step_off_policy/main_ppo.py
+++ b/recipe/one_step_off_policy/main_ppo.py
@@ -23,58 +23,18 @@
 import ray
 from omegaconf import OmegaConf
 
-from verl.trainer.constants_ppo import get_ppo_ray_runtime_env
 from verl.trainer.main_ppo import create_rl_dataset, create_rl_sampler
 from verl.trainer.ppo.reward import load_reward_manager
 
 from .ray_trainer import OneStepOffRayTrainer
 
 
-@hydra.main(config_path="config", config_name="one_step_off_ppo_trainer", version_base=None)
-def main(config):
-    run_ppo(config)
-
-
-# Define a function to run the PPO-like training process
-def run_ppo(config) -> None:
-    # Check if Ray is not initialized
-    if not ray.is_initialized():
-        # Initialize Ray with a local cluster configuration
-        # Set environment variables in the runtime environment to control tokenizer parallelism,
-        # NCCL debug level, VLLM logging level, and allow runtime LoRA updating
-        # `num_cpus` specifies the number of CPU cores Ray can use, obtained from the configuration
-        ray.init(
-            runtime_env=get_ppo_ray_runtime_env(),
-            num_cpus=config.ray_init.num_cpus,
-        )
-
-    # Create a remote instance of the TaskRunner class, and
-    # Execute the `run` method of the TaskRunner instance remotely and wait for it to complete
-    if (
-        OmegaConf.select(config.trainer, "profile_steps") is not None
-        and len(OmegaConf.select(config.trainer, "profile_steps")) > 0
-    ):
-        nsight_options = OmegaConf.to_container(config.trainer.controller_nsight_options)
-        runner = TaskRunner.options(runtime_env={"nsight": nsight_options}).remote()
-    else:
-        runner = TaskRunner.remote()
-    ray.get(runner.run.remote(config))
-
-    # [Optional] get the path of the timeline trace file from the configuration, default to None
-    # This file is used for performance analysis
-    timeline_json_file = config.ray_init.get("timeline_json_file", None)
-    if timeline_json_file:
-        ray.timeline(filename=timeline_json_file)
-
-
 @ray.remote(num_cpus=1)  # please make sure main_task is not scheduled on head
-class TaskRunner:
+class OneStepOffTaskRunner:
     def run(self, config):
         # Print the initial configuration. `resolve=True` will evaluate symbolic values.
         from pprint import pprint
 
-        from omegaconf import OmegaConf
-
         from verl.utils.fs import copy_to_local
 
         print(f"TaskRunner hostname: {socket.gethostname()}, PID: {os.getpid()}")
@@ -172,7 +132,7 @@ def run(self, config):
         # finally, we combine all the rewards together
         # The reward type depends on the tag of the data
         if config.reward_model.enable:
-            if config.reward_model.strategy in ["fsdp2"]:
+            if config.reward_model.strategy == "fsdp2":
                 from verl.workers.fsdp_workers import RewardModelWorker
             elif config.reward_model.strategy == "megatron":
                 from verl.workers.megatron_workers import RewardModelWorker
@@ -224,5 +184,12 @@ def run(self, config):
         trainer.fit()
 
 
+@hydra.main(config_path="config", config_name="one_step_off_ppo_trainer", version_base=None)
+def main(config):
+    from verl.trainer.main_ppo import run_ppo
+
+    run_ppo(config)
+
+
 if __name__ == "__main__":
     main()

From 07f2e62de973fc12bcd68b0f91cbf69622d580d8 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Wed, 30 Jul 2025 21:51:04 +0800
Subject: [PATCH 008/182] md

---
 .../fully_async_policy/README_fully_async.md  | 381 ++++++++++++------
 1 file changed, 253 insertions(+), 128 deletions(-)

diff --git a/recipe/fully_async_policy/README_fully_async.md b/recipe/fully_async_policy/README_fully_async.md
index 979f9aff783..4c1866788a5 100644
--- a/recipe/fully_async_policy/README_fully_async.md
+++ b/recipe/fully_async_policy/README_fully_async.md
@@ -1,183 +1,308 @@
-# 完全异步训练工作流 (Fully Async Training Workflow)
+# 完全异步PPO训练系统 (Fully Async Policy)
 
-## 概述
+本文档介绍了基于 OneStepOffRayTrainer 成熟实现改进的完全异步PPO训练系统，该系统实现了 Trainer 和 Rollouter 的完全解耦，支持异步样本生成和训练。
 
-本项目实现了基于现有 one step off policy 代码的完全异步训练工作流，将样本生成（Rollouter）和模型训练（Trainer）完全解耦，通过 MessageQueue 进行异步通信。
+## 🚀 **系统特性**
 
-## 架构设计
+### 核心特性
+- **完全异步训练**: Trainer 和 Rollouter 在独立的Ray Actor中运行，实现真正的并行处理
+- **智能新鲜度控制**: 基于参数版本和时间戳的样本新鲜度管理，防止过期样本影响训练
+- **健壮的参数同步**: 改进的参数同步机制，支持错误重试和状态管理
+- **简化的消息队列**: 去除ZeroMQ依赖，使用Ray-based消息传递，更稳定可靠
+- **完善的监控**: 详细的性能指标和组件健康状态监控
 
-### 核心组件
+### 改进亮点
+- **参考OneStepOffRayTrainer**: 使用成熟的训练逻辑，确保训练稳定性
+- **错误处理和恢复**: 完善的异常处理和资源清理机制
+- **组件协调**: 统一的组件生命周期管理和状态监控
+- **配置验证**: 智能的配置验证和默认值设置
 
-1. **MessageQueue**: 基于 ZeroMQ 的异步消息队列，作为 Ray Actor 存在
-   - 管理生成的样本队列
-   - 支持新鲜度控制，自动丢弃过期样本
-   - 提供线程安全的生产者-消费者接口
+## 🏗️ **系统架构**
 
-2. **Rollouter**: 专门负责样本生成的组件
-   - 持续循环生成训练样本
-   - 支持暂停/恢复机制，用于参数更新
-   - 实现新鲜度阈值控制，避免生成过多过期样本
+### 组件结构
 
-3. **FullyAsyncTrainer**: 修改后的训练器
-   - 从 MessageQueue 获取样本进行训练
-   - 训练完成后通知 Rollouter 更新参数
-   - 支持样本新鲜度监控和统计
-
-4. **ParameterSynchronizer**: 参数同步模块
-   - 基于 NCCL 实现高效的参数同步
-   - 支持 Actor 到 Rollout 的参数传递
+```
+┌─────────────────┐    ┌─────────────────┐    ┌─────────────────┐
+│  FullyAsyncMain │────│ MessageQueue    │────│ FullyAsyncTrainer│
+│  (Coordinator)  │    │  (Ray Actor)    │    │   (Ray Actor)   │
+└─────────────────┘    └─────────────────┘    └─────────────────┘
+         │                       │                       │
+         └───────────────────────┼───────────────────────┘
+                                 │
+                    ┌─────────────────┐
+                    │   Rollouter     │
+                    │  (Ray Actor)    │
+                    └─────────────────┘
+                             │
+                    ┌─────────────────┐
+                    │ ParameterSync   │
+                    │   Manager       │
+                    └─────────────────┘
+```
 
-### 工作流程
+### 数据流
 
 ```
-┌─────────────┐    put_batch    ┌──────────────┐    get_batch    ┌─────────────┐
-│  Rollouter  │ ──────────────► │ MessageQueue │ ──────────────► │   Trainer   │
-│             │                 │              │                 │             │
-│ - 生成样本   │                 │ - 队列管理    │                 │ - 模型训练   │
-│ - 暂停/恢复  │                 │ - 新鲜度控制  │                 │ - 参数更新   │
-│ - 新鲜度控制 │                 │ - 统计信息    │                 │ - 同步通知   │
-└─────────────┘                 └──────────────┘                 └─────────────┘
-       ▲                                                                 │
-       │                        update_rollout_weights                   │
-       └─────────────────────────────────────────────────────────────────┘
+1. 数据生成: Rollouter → MessageQueue
+2. 训练消费: MessageQueue → FullyAsyncTrainer
+3. 参数同步: FullyAsyncTrainer → Rollouter
+4. 状态监控: FullyAsyncMain → All Components
 ```
 
-## 新鲜度控制机制
+## 📋 **核心组件**
+
+### 1. FullyAsyncTrainer
+- **功能**: 从MessageQueue获取样本进行异步训练
+- **特性**:
+  - 基于OneStepOffRayTrainer的成熟训练逻辑
+  - 智能的样本新鲜度指标计算
+  - 完善的错误处理和重试机制
+  - 详细的训练性能监控
+
+### 2. Rollouter
+- **功能**: 持续生成训练样本并放入MessageQueue
+- **特性**:
+  - 智能的暂停/恢复控制机制
+  - 基于新鲜度的生成控制
+  - 改进的参数同步处理
+  - 异步/同步生成模式支持
+
+### 3. MessageQueue
+- **功能**: Ray-based消息队列，管理样本传递
+- **特性**:
+  - 去除ZeroMQ依赖，更稳定可靠
+  - 智能的样本过期检测
+  - 线程安全的队列操作
+  - 内存使用监控
+
+### 4. ParameterSynchronizer
+- **功能**: 管理Actor和Rollout间的参数同步
+- **特性**:
+  - 支持错误重试和超时处理
+  - 详细的同步状态跟踪
+  - 集群通信组管理
+
+### 5. FullyAsyncMain
+- **功能**: 系统协调器，管理所有组件的生命周期
+- **特性**:
+  - 统一的组件初始化和清理
+  - 实时的健康状态监控
+  - 优雅的关闭和错误恢复
+
+## ⚙️ **配置说明**
+
+### 异步训练配置 (async_training)
 
-### 配置参数
+```yaml
+async_training:
+  # 新鲜度控制
+  freshness_threshold: 3              # 样本新鲜度阈值
+  max_staleness_allowed: 5            # 最大允许的样本陈旧度
+
+  # 队列管理
+  max_queue_size: 1000               # 消息队列最大大小
+  min_batch_count: 1                 # 每次获取的最小batch数量
+  batch_timeout: 30.0                # 获取batch的超时时间
+
+  # 生成控制
+  generation_timeout: 30.0           # 单次生成的超时时间
+  batch_generation_interval: 0.1     # batch生成间隔
+
+  # 参数同步
+  max_sync_retries: 3                # 参数同步最大重试次数
+  sync_timeout: 30.0                 # 同步超时时间
+  sync_retry_delay: 1.0              # 重试延迟时间
+```
 
-- `freshness_threshold`: 新鲜度阈值，队列中超过此版本差异的样本会被丢弃
-- `max_staleness_allowed`: 最大允许的新鲜度差异，Rollouter 会暂停生成
-- `max_queue_size`: MessageQueue 的最大队列大小
+### 资源配置
 
-### 控制逻辑
+```yaml
+trainer:
+  n_gpus_per_node: 4                 # 每个训练节点的GPU数量
+  nnodes: 2                          # 训练节点数量
+  device: cuda
+
+rollout:
+  n_gpus_per_node: 2                 # 每个rollout节点的GPU数量
+  nnodes: 1                          # rollout节点数量
+```
 
-1. **样本丢弃**: 当样本的参数版本与当前 Trainer 版本差异超过 `freshness_threshold` 时，样本被丢弃
-2. **生成暂停**: 当 Rollouter 的参数版本与 Trainer 版本差异超过 `max_staleness_allowed` 时，暂停生成
-3. **队列管理**: 队列长度限制为 `freshness_threshold * batch_size`，避免内存溢出
+## 🔧 **使用方法**
 
-## 性能优势
+### 1. 基本运行
 
-### 相比同步训练
+```bash
+# 使用默认配置运行
+python fully_async_main.py
 
-- **GPU 利用率提升**: 生成和训练并行进行，减少 GPU 空闲时间
-- **长尾样本优化**: 训练不需要等待最慢的样本生成完成
-- **资源隔离**: 可以独立配置生成和训练的资源分配
+# 使用自定义配置
+python fully_async_main.py --config-path /path/to/config --config-name my_config
+```
 
-### 相比 One Step Off Policy
+### 2. 配置自定义
 
-- **更高的异步度**: 完全解耦生成和训练，支持多步异步
-- **更灵活的控制**: 支持动态的新鲜度控制和队列管理
-- **更好的监控**: 提供详细的统计信息和性能指标
+```python
+# 在配置文件中自定义异步训练参数
+async_training:
+  freshness_threshold: 5
+  max_queue_size: 2000
+  generation_timeout: 60.0
+```
 
-## 使用方法
+### 3. 监控和调试
 
-### 1. 安装依赖
+```python
+# 系统会自动输出详细的统计信息
+# 包括: Trainer状态、Rollouter状态、队列状态等
 
-```bash
-pip install zmq filelock
+# 日志文件: fully_async_training.log
+# 包含所有组件的详细日志信息
 ```
 
-### 2. 配置文件
+## 📊 **性能监控**
 
-使用 `config/fully_async_ppo_trainer.yaml` 配置文件，关键配置项：
+### 关键指标
 
-```yaml
-async_training:
-  freshness_threshold: 3      # 新鲜度阈值
-  max_staleness_allowed: 5    # 最大允许新鲜度差异
-  max_queue_size: 1000        # 队列最大大小
-  min_batch_count: 1          # 最小batch数量
-  batch_timeout: 30.0         # 获取batch超时时间
-
-actor_rollout_ref:
-  rollout:
-    mode: async               # 使用异步模式
-    n_gpus: 4                # rollout专用GPU数量
-    name: vllm               # 使用vLLM引擎
-```
+#### Trainer指标
+- `global_steps`: 训练步数
+- `processed_samples`: 已处理样本数
+- `current_param_version`: 当前参数版本
+- `param_sync_count`: 参数同步次数
 
-### 3. 启动训练
+#### Rollouter指标
+- `total_generated_samples`: 总生成样本数
+- `dropped_stale_samples`: 丢弃的过期样本数
+- `generation_errors`: 生成错误数
+- `param_sync_requests`: 参数同步请求数
 
-```bash
-python -m recipe.one_step_off_policy.fully_async_main \
-    data.train_files=~/data/train.parquet \
-    data.val_files=~/data/val.parquet \
-    actor_rollout_ref.model.path=Qwen/Qwen2-7B-Instruct \
-    trainer.total_training_steps=1000
-```
+#### 新鲜度指标
+- `avg_sample_age`: 样本平均年龄
+- `max_sample_age`: 样本最大年龄
+- `stale_samples_ratio`: 过期样本比例
 
-### 4. 监控训练
+#### 队列指标
+- `queue_size`: 当前队列大小
+- `total_produced`: 总生产样本数
+- `total_consumed`: 总消费样本数
+- `dropped_samples`: 总丢弃样本数
 
-训练过程中会输出以下统计信息：
+## 🔍 **故障排查**
 
-- `queue_size`: 当前队列大小
-- `avg_sample_age`: 平均样本年龄（参数版本差异）
-- `max_sample_age`: 最大样本年龄
-- `param_version`: 当前参数版本
-- `processed_samples`: 已处理样本数
-- `dropped_samples`: 丢弃的过期样本数
+### 常见问题
 
-## 性能调优建议
+1. **样本生成过慢**
+   - 检查 `generation_timeout` 设置
+   - 监控 `generation_errors` 指标
+   - 调整 `batch_generation_interval`
 
-### 1. 资源分配
+2. **样本过期严重**
+   - 调整 `freshness_threshold`
+   - 检查参数同步频率
+   - 监控 `stale_samples_ratio`
 
-- **生成资源**: 根据模型大小和生成速度需求分配 GPU
-- **训练资源**: 根据batch大小和训练复杂度分配 GPU
-- **比例建议**: 生成:训练 = 1:2 到 1:3
+3. **队列溢出**
+   - 增加 `max_queue_size`
+   - 优化训练速度
+   - 调整 `min_batch_count`
 
-### 2. 新鲜度控制
+4. **参数同步失败**
+   - 检查 `sync_timeout` 设置
+   - 监控 `sync_failures` 指标
+   - 调整 `max_sync_retries`
 
-- **快速生成场景**: 降低 `freshness_threshold` (2-3)
-- **慢速生成场景**: 提高 `freshness_threshold` (5-8)
-- **队列大小**: 设置为 `freshness_threshold * batch_size * 2`
+### 日志分析
 
-### 3. 网络优化
+```bash
+# 查看主要错误
+grep "ERROR" fully_async_training.log
 
-- **单节点**: MessageQueue 使用 IPC 协议
-- **多节点**: MessageQueue 使用 TCP 协议，注意网络带宽
+# 查看组件统计
+grep "Component Statistics" fully_async_training.log
 
-## 故障排除
+# 查看参数同步状态
+grep "Parameter sync" fully_async_training.log
+```
 
-### 常见问题
+## 🚀 **性能优化建议**
 
-1. **队列为空**: 检查 Rollouter 是否正常运行，是否被新鲜度控制暂停
-2. **内存溢出**: 减少 `max_queue_size` 或增加 `freshness_threshold`
-3. **参数同步失败**: 检查 NCCL 配置和网络连接
-4. **性能下降**: 调整资源分配比例，监控 GPU 利用率
+### 1. 资源配置优化
+- 根据模型大小合理配置GPU数量
+- 训练和rollout使用独立的资源池
+- 考虑内存和计算的平衡
 
-### 调试模式
+### 2. 新鲜度控制优化
+- 根据模型收敛速度调整新鲜度阈值
+- 监控样本年龄分布，避免过度丢弃
+- 动态调整队列大小
 
-设置环境变量启用详细日志：
+### 3. 参数同步优化
+- 合理设置同步频率，平衡性能和一致性
+- 使用异步同步减少等待时间
+- 监控同步耗时，及时发现问题
 
-```bash
-export VERL_LOGGING_LEVEL=DEBUG
-export NCCL_DEBUG=INFO
+## 🔧 **扩展和定制**
+
+### 自定义组件
+
+```python
+# 自定义Trainer
+class CustomFullyAsyncTrainer(FullyAsyncTrainer):
+    def _compute_custom_metrics(self, batch):
+        # 添加自定义指标计算
+        pass
+
+# 自定义Rollouter
+class CustomRollouter(Rollouter):
+    def _custom_generation_logic(self, batch):
+        # 添加自定义生成逻辑
+        pass
+```
+
+### 自定义监控
+
+```python
+# 添加自定义监控指标
+def custom_monitor(trainer_stats, rollouter_stats):
+    # 实现自定义监控逻辑
+    custom_metric = calculate_custom_metric(trainer_stats)
+    logger.info(f"Custom metric: {custom_metric}")
 ```
 
-## 与现有系统对比
+## 📚 **与OneStepOffRayTrainer的对比**
+
+| 特性 | OneStepOffRayTrainer | FullyAsyncTrainer |
+|------|---------------------|------------------|
+| 训练模式 | 同步批处理 | 异步流处理 |
+| 参数更新 | 批次同步更新 | 实时异步更新 |
+| 资源利用 | 阶段性利用 | 持续高效利用 |
+| 新鲜度控制 | 无需考虑 | 智能控制 |
+| 复杂度 | 相对简单 | 更复杂但更灵活 |
+| 适用场景 | 标准训练 | 大规模持续训练 |
+
+## 📖 **最佳实践**
 
-| 特性 | 同步训练 | One Step Off | 完全异步 |
-|------|----------|--------------|----------|
-| 异步程度 | 无 | 一步 | 多步 |
-| 资源利用率 | 低 | 中 | 高 |
-| 实现复杂度 | 低 | 中 | 高 |
-| 样本新鲜度 | 最新 | 一步延迟 | 可控延迟 |
-| 内存使用 | 低 | 中 | 中-高 |
+1. **配置调优**: 从默认配置开始，根据监控指标逐步优化
+2. **资源规划**: 合理分配训练和生成资源，避免瓶颈
+3. **监控预警**: 设置关键指标的阈值报警
+4. **定期检查**: 定期检查日志和性能指标
+5. **版本管理**: 记录配置变更和性能影响
 
-## 实验结果预期
+## 🤝 **贡献和反馈**
 
-基于现有 one step off policy 的实验结果，完全异步训练预期能够：
+欢迎提交issue和PR来改进这个异步训练系统！
 
-- **训练速度**: 相比同步训练提升 30-50%
-- **GPU 利用率**: 提升至 85-95%
-- **内存开销**: 增加 20-30%（主要用于队列缓存）
-- **模型收敛**: 与同步训练基本一致（在合理的新鲜度控制下）
+## 📄 **更新日志**
 
-## 后续改进
+### v2.0 (改进版本)
+- ✅ 基于OneStepOffRayTrainer重构训练逻辑
+- ✅ 简化MessageQueue实现，去除ZeroMQ依赖
+- ✅ 改进参数同步机制，支持错误重试
+- ✅ 完善组件协调和监控系统
+- ✅ 优化错误处理和资源管理
+- ✅ 增加详细的性能指标和日志
 
-1. **自适应新鲜度控制**: 根据训练进度动态调整新鲜度阈值
-2. **多队列支持**: 支持不同优先级的样本队列
-3. **分布式队列**: 支持跨节点的分布式消息队列
-4. **更精细的资源调度**: 支持动态的资源分配和调整
+### v1.0 (原始版本)
+- 基础异步训练框架
+- 简单的消息队列实现
+- 基本的参数同步功能
 

From 502de26f9ba6606dbef099db28cbc2b46551a0e3 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Wed, 30 Jul 2025 21:54:32 +0800
Subject: [PATCH 009/182] rollouter

---
 recipe/fully_async_policy/fully_async_rollouter.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py
index c127b242704..3ece39d0f10 100644
--- a/recipe/fully_async_policy/fully_async_rollouter.py
+++ b/recipe/fully_async_policy/fully_async_rollouter.py
@@ -115,7 +115,8 @@ def get_status(self) -> dict:
             }
 
 
-class Rollouter:
+@ray.remote
+class FullyAsyncRollouter:
     """
     异步样本生成器，负责持续生成训练样本并放入MessageQueue
     基于OneStepOffRayTrainer的成熟实现改进

From dbdfdbfed992b95feb51f8563133ebfd6462b2ba Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Thu, 31 Jul 2025 11:50:55 +0800
Subject: [PATCH 010/182] yaml

---
 .../config/fully_async_ppo_trainer.yaml       | 153 ++++--------------
 1 file changed, 27 insertions(+), 126 deletions(-)

diff --git a/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml b/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml
index cbc7058f108..19c4aa01339 100644
--- a/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml
+++ b/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml
@@ -6,131 +6,32 @@ defaults:
   - ppo_trainer
   - _self_
 
-# 完全异步训练的特殊配置
-async_training:
-  # 新鲜度阈值，超过此版本差异的样本会被丢弃
-  freshness_threshold: 3
-
-  # 最大允许的新鲜度差异，rollout会暂停生成
-  max_staleness_allowed: 5
-
-  # MessageQueue的最大队列大小
-  max_queue_size: 1000
-
-  # 最小batch数量，trainer会等待至少这么多batch
-  min_batch_count: 1
-
-  # 获取batch的超时时间（秒）
-  batch_timeout: 30.0
-
-# 重写默认的训练配置
-actor_rollout_ref:
-  hybrid_engine: false
-  rollout:
-    # 异步模式
-    mode: async
-
-    # rollout专用的GPU数量
-    n_gpus: 4
-
-    # 使用vLLM异步rollout
-    name: vllm
-
-    # 其他rollout参数
-    temperature: 1.0
-    top_k: -1
-    top_p: 1.0
-    tensor_model_parallel_size: 2
-    gpu_memory_utilization: 0.6
-    max_num_batched_tokens: 8192
-    free_cache_engine: true
-    enforce_eager: true
-
-# 训练器配置
-trainer:
-  # 总训练步数
-  total_training_steps: 1000
-
-  # 设备
-  device: cuda
-
-  # 保存频率
-  save_freq: 100
-
-  # 验证频率
-  val_freq: 50
-
-  # 日志配置
-  logger: '["console", "wandb"]'
-  project_name: "fully_async_ppo"
-  experiment_name: "test_async_training"
+# ============= 完全异步训练配置 (Fully Async Training Config) =============
 
-# 数据配置
-data:
-  # 训练batch大小
-  train_batch_size: 128
-
-  # 数据文件路径
-  train_files: "~/data/train.parquet"
-  val_files: "~/data/val.parquet"
-
-  # 序列长度
-  max_prompt_length: 1024
-  max_response_length: 1024
-
-# 算法配置
-algorithm:
-  # 优势估计器
-  adv_estimator: gae
-
-  # PPO参数
-  cliprange: 0.2
-  cliprange_value: 0.2
-  vf_coeff: 0.1
-  entropy_coeff: 0.01
-
-  # KL相关
-  kl_coeff: 0.1
-  adaptive_kl: true
-  target_kl: 0.01
-
-# 模型配置
-actor_rollout_ref:
-  model:
-    # 模型路径
-    path: "Qwen/Qwen2-7B-Instruct"
-
-    # 使用LoRA
-    lora_rank: 64
-    lora_alpha: 128
-    lora_dropout: 0.1
-
-  actor:
-    # Actor优化器
-    optim:
-      lr: 1e-6
-      weight_decay: 0.01
-
-    # FSDP配置
-    fsdp_config:
-      fsdp_size: -1
-      param_offload: false
-      optimizer_offload: false
-
-    # PPO配置
-    ppo_mini_batch_size: 32
-    use_dynamic_bsz: true
-
-# Critic配置
-critic:
-  model:
-    path: "Qwen/Qwen2-7B-Instruct"
-
-  optim:
-    lr: 1e-5
-    weight_decay: 0.01
-
-  fsdp_config:
-    fsdp_size: -1
-    param_offload: false
+async_training:
+  # 新鲜度控制 (Freshness Control)
+  freshness_threshold: 3              # 样本新鲜度阈值
+  max_staleness_allowed: 5            # 最大允许的样本陈旧度
+
+  # 队列管理 (Queue Management)
+  max_queue_size: 1000               # 消息队列最大大小
+  min_batch_count: 1                 # 每次获取的最小batch数量
+  batch_timeout: 30.0                # 获取batch的超时时间(秒)
+
+  # 生成控制 (Generation Control)
+  generation_timeout: 30.0           # 单次生成的超时时间(秒)
+  batch_generation_interval: 0.1     # batch生成间隔(秒)
+
+  # 参数同步 (Parameter Synchronization)
+  max_sync_retries: 3                # 参数同步最大重试次数
+  sync_timeout: 30.0                 # 同步超时时间(秒)
+  sync_retry_delay: 1.0              # 重试延迟时间(秒)
+
+# Rollout配置
+rollout:
+  nnodes: 1                          # Number of nodes used in the rollout
+  n_gpus_per_node: 8                 # Number of GPUs per node
+  mode: async                        # rollout模式: sync, async
+  name: vllm                         # rollout引擎: vllm, sglang
+  n: 4                               # 每个prompt生成的响应数量
 

From 08c1ba14b9e93aae9bf7c91410d4512587d627a2 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Thu, 31 Jul 2025 14:09:07 +0800
Subject: [PATCH 011/182] trainer

---
 .../fully_async_policy/fully_async_trainer.py | 395 ++++++++++++------
 recipe/fully_async_policy/param_sync.py       | 305 ++++++++++++--
 recipe/fully_async_policy/test_fully_async.py |   2 +-
 3 files changed, 535 insertions(+), 167 deletions(-)

diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py
index 2487387b163..e66bc895c9c 100644
--- a/recipe/fully_async_policy/fully_async_trainer.py
+++ b/recipe/fully_async_policy/fully_async_trainer.py
@@ -13,10 +13,12 @@
 # limitations under the License.
 
 import logging
+import time
 from pprint import pprint
 
 import numpy as np
 import ray
+import torch
 from omegaconf import OmegaConf
 from torch.utils.data import Dataset, Sampler
 from tqdm import tqdm
@@ -25,12 +27,15 @@
 from verl import DataProto
 from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup
 from verl.single_controller.ray.base import create_colocated_worker_cls
+from verl.trainer.ppo import core_algos
+from verl.trainer.ppo.core_algos import AdvantageEstimator, agg_loss
 from verl.trainer.ppo.metric_utils import (
     compute_data_metrics,
     compute_throughout_metrics,
     compute_timing_metrics,
 )
 from verl.trainer.ppo.ray_trainer import (
+    RayPPOTrainer,
     ResourcePoolManager,
     Role,
     WorkerType,
@@ -46,9 +51,11 @@
 logger = logging.getLogger(__name__)
 
 
-class FullyAsyncTrainer:
+@ray.remote
+class FullyAsyncTrainer(RayPPOTrainer):
     """
     完全异步的PPO训练器，从MessageQueue获取样本进行训练
+    基于OneStepOffRayTrainer的成熟实现改进
     """
 
     def __init__(
@@ -73,6 +80,9 @@ def __init__(
         self.reward_fn = reward_fn
         self.val_reward_fn = val_reward_fn
 
+        self.hybrid_engine = config.actor_rollout_ref.hybrid_engine
+        assert not self.hybrid_engine
+
         self.role_worker_mapping = role_worker_mapping
         self.resource_pool_manager = resource_pool_manager
         self.ray_worker_group_cls = ray_worker_group_cls
@@ -85,12 +95,32 @@ def __init__(
         self.collate_fn = collate_fn
         self.train_sampler = train_sampler
 
-        # 角色配置
+        # 角色配置 - 参考OneStepOffRayTrainer的配置
         self.use_reference_policy = Role.RefPolicy in role_worker_mapping
         self.use_rm = Role.RewardModel in role_worker_mapping
-        self.use_critic = Role.Critic in role_worker_mapping
         self.ref_in_actor = config.actor_rollout_ref.model.get("lora_rank", 0) > 0
 
+        # KL控制器
+        if config.algorithm.use_kl_in_reward:
+            self.kl_ctrl_in_reward = core_algos.get_kl_controller(config.algorithm.kl_ctrl)
+
+        # 确定是否使用critic - 参考OneStepOffRayTrainer的逻辑
+        if self.config.algorithm.adv_estimator == AdvantageEstimator.GAE:
+            self.use_critic = True
+        elif self.config.algorithm.adv_estimator in [
+            AdvantageEstimator.GRPO,
+            AdvantageEstimator.GRPO_PASSK,
+            AdvantageEstimator.REINFORCE_PLUS_PLUS,
+            # AdvantageEstimator.REMAX, # TODO:REMAX advantage estimator is not yet supported in one_step_off_policy
+            AdvantageEstimator.RLOO,
+            AdvantageEstimator.OPO,
+            AdvantageEstimator.REINFORCE_PLUS_PLUS_BASELINE,
+            AdvantageEstimator.GPG,
+        ]:
+            self.use_critic = False
+        else:
+            raise NotImplementedError(f"Unsupported advantage estimator: {self.config.algorithm.adv_estimator}")
+
         # Worker groups
         self.actor_wg = None
         self.critic_wg = None
@@ -111,6 +141,17 @@ def __init__(
         # 统计信息
         self.processed_samples = 0
         self.stale_samples_processed = 0
+        self.param_sync_count = 0
+
+        self._validate_config()
+
+    def _validate_config(self):
+        """验证配置"""
+        required_configs = ["trainer.total_training_steps", "algorithm.adv_estimator", "data.train_batch_size"]
+
+        for config_path in required_configs:
+            if not OmegaConf.select(self.config, config_path):
+                raise ValueError(f"Missing required config: {config_path}")
 
     def set_message_queue_client(self, message_queue_client: MessageQueueClient):
         """设置消息队列客户端"""
@@ -121,50 +162,58 @@ def set_rollouter_actor(self, rollouter_actor):
         self.rollouter_actor = rollouter_actor
 
     def init_workers(self):
-        """初始化训练workers"""
+        """初始化训练workers - 参考OneStepOffRayTrainer的实现"""
         logger.info("Initializing FullyAsyncTrainer workers...")
 
         self.resource_pool_manager.create_resource_pool()
         self.resource_pool_to_cls = {pool: {} for pool in self.resource_pool_manager.resource_pool_dict.values()}
 
         # 创建actor worker
-        actor_resource_pool = self.resource_pool_manager.get_resource_pool(Role.Actor)
+        resource_pool = self.resource_pool_manager.get_resource_pool(Role.Actor)
         actor_cls = RayClassWithInitArgs(
             cls=self.role_worker_mapping[Role.Actor],
             config=self.config.actor_rollout_ref,
             role="actor",
         )
-        self.resource_pool_to_cls[actor_resource_pool]["actor"] = actor_cls
+        self.resource_pool_to_cls[resource_pool]["actor"] = actor_cls
 
         # 创建critic worker
         if self.use_critic:
-            critic_resource_pool = self.resource_pool_manager.get_resource_pool(Role.Critic)
+            resource_pool = self.resource_pool_manager.get_resource_pool(Role.Critic)
             critic_cls = RayClassWithInitArgs(cls=self.role_worker_mapping[Role.Critic], config=self.config.critic)
-            self.resource_pool_to_cls[critic_resource_pool]["critic"] = critic_cls
+            self.resource_pool_to_cls[resource_pool]["critic"] = critic_cls
 
         # 创建reference policy worker
         if self.use_reference_policy:
-            ref_resource_pool = self.resource_pool_manager.get_resource_pool(Role.RefPolicy)
+            resource_pool = self.resource_pool_manager.get_resource_pool(Role.RefPolicy)
             ref_policy_cls = RayClassWithInitArgs(
                 cls=self.role_worker_mapping[Role.RefPolicy],
                 config=self.config.actor_rollout_ref,
                 role="ref",
             )
-            self.resource_pool_to_cls[ref_resource_pool]["ref"] = ref_policy_cls
+            self.resource_pool_to_cls[resource_pool]["ref"] = ref_policy_cls
 
         # 创建reward model worker
         if self.use_rm:
-            rm_resource_pool = self.resource_pool_manager.get_resource_pool(Role.RewardModel)
+            resource_pool = self.resource_pool_manager.get_resource_pool(Role.RewardModel)
             rm_cls = RayClassWithInitArgs(
                 cls=self.role_worker_mapping[Role.RewardModel], config=self.config.reward_model
             )
-            self.resource_pool_to_cls[rm_resource_pool]["rm"] = rm_cls
+            self.resource_pool_to_cls[resource_pool]["rm"] = rm_cls
 
-        # 初始化WorkerGroup
+        # 初始化WorkerGroup - 参考OneStepOffRayTrainer的实现
         all_wg = {}
         wg_kwargs = {}
         if OmegaConf.select(self.config.trainer, "ray_wait_register_center_timeout") is not None:
             wg_kwargs["ray_wait_register_center_timeout"] = self.config.trainer.ray_wait_register_center_timeout
+        if OmegaConf.select(self.config.trainer, "profile_steps") is not None:
+            wg_kwargs["profile_steps"] = OmegaConf.select(self.config.trainer, "profile_steps")
+            assert OmegaConf.select(self.config.trainer, "worker_nsight_options") is not None, (
+                "worker_nsight_options must be set when profile_steps is set"
+            )
+            wg_kwargs["worker_nsight_options"] = OmegaConf.to_container(
+                OmegaConf.select(self.config.trainer, "worker_nsight_options")
+            )
 
         for resource_pool, class_dict in self.resource_pool_to_cls.items():
             worker_dict_cls = create_colocated_worker_cls(class_dict=class_dict)
@@ -197,55 +246,98 @@ def init_workers(self):
 
     def _load_checkpoint(self):
         """加载检查点"""
-        # 简化的检查点加载逻辑
-        pass
+        # TODO: 实现检查点加载逻辑
+        logger.info("Checkpoint loading not implemented yet")
 
     def _validate(self):
-        """执行验证"""
+        """执行验证 - 参考OneStepOffRayTrainer的验证逻辑"""
         if self.val_reward_fn is None:
             return None
 
-        # 简化的验证逻辑
-        logger.info("Validation step skipped in async trainer")
-        return {"val_reward": 0.0}
+        # TODO: 实现完整的验证逻辑
+        logger.info("Running validation...")
+        val_metrics = {"val_reward": 0.0}  # 简化的验证指标
+        return val_metrics
 
     def _save_checkpoint(self):
         """保存检查点"""
-        # 简化的检查点保存逻辑
-        pass
+        # TODO: 实现检查点保存逻辑
+        logger.info("Checkpoint saving not implemented yet")
 
     def _dump_generations(self, inputs, outputs, scores, reward_extra_infos_dict, dump_path):
         """保存生成结果"""
-        # 简化的生成结果保存逻辑
+        # TODO: 实现生成结果保存逻辑
+        logger.debug(f"Dumping generations to {dump_path}")
+
+    def _balance_batch(self, batch: DataProto, metrics: dict):
+        """平衡batch中的有效token数量 - 参考OneStepOffRayTrainer的实现"""
+        # TODO: 实现batch平衡逻辑
         pass
 
-    def _update_param_version_and_sync(self):
-        """更新参数版本并同步到Rollouter"""
+    def _sync_parameters_to_rollouter(self):
+        """同步参数到Rollouter - 改进的同步机制"""
+        if self.rollouter_actor is None:
+            logger.warning("Rollouter actor not set, skipping parameter sync")
+            return
+
         self.current_param_version += 1
 
-        # 通知MessageQueue更新参数版本
-        self.message_queue_client.update_param_version(self.current_param_version)
+        try:
+            # 通知MessageQueue更新参数版本
+            self.message_queue_client.update_param_version(self.current_param_version)
 
-        # 通知Rollouter更新参数
-        if self.rollouter_actor is not None:
-            ray.get(self.rollouter_actor.update_rollout_weights.remote(self.current_param_version))
+            # 同步参数到Rollouter
+            sync_future = self.rollouter_actor.update_rollout_weights.remote(self.current_param_version)
+            ray.get(sync_future)
+
+            self.param_sync_count += 1
+            logger.info(f"Parameter sync completed, version: {self.current_param_version}")
+
+        except Exception as e:
+            logger.error(f"Failed to sync parameters: {e}")
+            self.current_param_version -= 1  # 回滚版本号
+            raise
 
     def _process_batch_samples(self, batch_samples: list[BatchSample]) -> DataProto:
-        """处理从队列获取的batch样本"""
+        """处理从队列获取的batch样本 - 改进的批处理逻辑"""
+        if not batch_samples:
+            raise ValueError("Empty batch samples")
+
         if len(batch_samples) == 1:
             return batch_samples[0].data
 
-        # 如果有多个batch，需要合并
-        all_batches = [sample.data for sample in batch_samples]
-        return DataProto.concat(all_batches)
+        # 合并多个batch - 使用DataProto的concat方法
+        try:
+            all_batches = [sample.data for sample in batch_samples]
+            merged_batch = DataProto.concat(all_batches)
+            logger.debug(f"Successfully merged {len(batch_samples)} batches")
+            return merged_batch
+        except Exception as e:
+            logger.error(f"Failed to merge batch samples: {e}")
+            raise
+
+    def _compute_sample_freshness_metrics(self, batch_samples: list[BatchSample]) -> dict:
+        """计算样本新鲜度指标"""
+        sample_ages = [self.current_param_version - sample.param_version for sample in batch_samples]
+        current_time = time.time()
+        sample_latencies = [current_time - sample.timestamp for sample in batch_samples]
+
+        return {
+            "freshness/avg_sample_age": np.mean(sample_ages),
+            "freshness/max_sample_age": max(sample_ages),
+            "freshness/min_sample_age": min(sample_ages),
+            "freshness/avg_sample_latency": np.mean(sample_latencies),
+            "freshness/max_sample_latency": max(sample_latencies),
+            "freshness/stale_samples_ratio": sum(1 for age in sample_ages if age > 1) / len(sample_ages),
+        }
 
     def fit(self):
-        """主训练循环"""
+        """主训练循环 - 基于OneStepOffRayTrainer的成熟实现"""
         from omegaconf import OmegaConf
 
         from verl.utils.tracking import Tracking
 
-        logger = Tracking(
+        logger_tracker = Tracking(
             project_name=self.config.trainer.project_name,
             experiment_name=self.config.trainer.experiment_name,
             default_backend=self.config.trainer.logger,
@@ -257,17 +349,17 @@ def fit(self):
         # 加载检查点
         self._load_checkpoint()
 
-        # 验证
+        # 初始验证
         if self.val_reward_fn is not None and self.config.trainer.get("val_before_train", True):
             val_metrics = self._validate()
             if val_metrics:
                 pprint(f"Initial validation metrics: {val_metrics}")
-                logger.log(data=val_metrics, step=self.global_steps)
+                logger_tracker.log(data=val_metrics, step=self.global_steps)
             if self.config.trainer.get("val_only", False):
                 return
 
         # 进度条
-        progress_bar = tqdm(total=self.total_training_steps, initial=self.global_steps, desc="Training Progress")
+        progress_bar = tqdm(total=self.total_training_steps, initial=self.global_steps, desc="Async Training")
 
         self.global_steps += 1
         last_val_metrics = None
@@ -278,6 +370,7 @@ def fit(self):
         logger.info("Starting fully async training loop...")
 
         while self.global_steps <= self.total_training_steps:
+            # 性能分析
             do_profile = (
                 self.global_steps in self.config.trainer.profile_steps
                 if self.config.trainer.profile_steps is not None
@@ -286,7 +379,7 @@ def fit(self):
 
             if do_profile:
                 self.actor_wg.start_profile()
-                if self.use_reference_policy:
+                if self.use_reference_policy and not self.ref_in_actor:
                     self.ref_policy_wg.start_profile()
                 if self.use_critic:
                     self.critic_wg.start_profile()
@@ -295,7 +388,7 @@ def fit(self):
 
             metrics = {}
             timing_raw = {}
-            # is_last_step = self.global_steps >= self.total_training_steps
+            is_last_step = self.global_steps >= self.total_training_steps
 
             with marked_timer("step", timing_raw):
                 # 从队列获取样本
@@ -308,64 +401,102 @@ def fit(self):
                     )
 
                     if batch_samples is None:
-                        logger.warning("Timeout waiting for batch samples, continuing...")
+                        logger.warning("Timeout waiting for batch samples, retrying...")
+                        time.sleep(1.0)
                         continue
 
                 # 处理获取的样本
-                batch = self._process_batch_samples(batch_samples)
+                with marked_timer("process_batch_samples", timing_raw, color="cyan"):
+                    batch = self._process_batch_samples(batch_samples)
 
-                # 计算样本的新鲜度
-                sample_ages = [self.current_param_version - sample.param_version for sample in batch_samples]
-                avg_sample_age = np.mean(sample_ages)
-                max_sample_age = max(sample_ages)
+                    # 计算样本新鲜度指标
+                    freshness_metrics = self._compute_sample_freshness_metrics(batch_samples)
+                    metrics.update(freshness_metrics)
 
-                logger.info(
-                    f"Processing batch with {len(batch_samples)} samples, "
-                    f"avg_age={avg_sample_age:.1f}, max_age={max_sample_age}"
-                )
+                    logger.info(
+                        f"Processing batch: {len(batch_samples)} samples, "
+                        f"avg_age={freshness_metrics['freshness/avg_sample_age']:.1f}, "
+                        f"max_age={freshness_metrics['freshness/max_sample_age']}"
+                    )
 
-                # 添加响应掩码
+                # 添加响应掩码 - 参考OneStepOffRayTrainer
                 batch.batch["response_mask"] = compute_response_mask(batch)
 
-                # 计算奖励
-                with marked_timer("compute_reward", timing_raw, color="yellow"):
-                    if self.reward_fn is not None:
-                        batch, reward_extra_infos_dict = compute_reward(
-                            batch, reward_fn=self.reward_fn, tokenizer=self.tokenizer
-                        )
-                    elif self.use_rm:
-                        batch, reward_extra_infos_dict = compute_reward_async(
-                            batch, rm_wg=self.rm_wg, tokenizer=self.tokenizer
-                        )
+                # 平衡batch
+                if self.config.trainer.balance_batch:
+                    self._balance_batch(batch, metrics=metrics)
+
+                # 计算全局有效token数量
+                batch.meta_info["global_token_num"] = torch.sum(batch.batch["attention_mask"], dim=-1).tolist()
+
+                # 计算奖励 - 参考OneStepOffRayTrainer的实现
+                with marked_timer("reward", timing_raw, color="yellow"):
+                    if self.use_rm:
+                        reward_tensor = self.rm_wg.compute_rm_score(batch)
+                        batch = batch.union(reward_tensor)
+
+                    if self.config.reward_model.get("launch_reward_fn_async", False):
+                        future_reward = compute_reward_async.remote(batch, self.config, self.tokenizer)
                     else:
-                        raise ValueError("No reward function or reward model provided")
+                        reward_tensor, reward_extra_infos_dict = compute_reward(batch, self.reward_fn)
+
+                # 计算旧的log probabilities - 参考OneStepOffRayTrainer
+                with marked_timer("old_log_prob", timing_raw, color="blue"):
+                    old_log_prob = self.actor_wg.compute_log_prob(batch)
+                    entropys = old_log_prob.batch["entropys"]
+                    response_masks = batch.batch["response_mask"]
+                    loss_agg_mode = self.config.actor_rollout_ref.actor.loss_agg_mode
+                    entropy_agg = agg_loss(loss_mat=entropys, loss_mask=response_masks, loss_agg_mode=loss_agg_mode)
+                    old_log_prob_metrics = {"actor/entropy": entropy_agg.detach().item()}
+                    metrics.update(old_log_prob_metrics)
+                    old_log_prob.batch.pop("entropys")
+                    batch = batch.union(old_log_prob)
 
                 # 计算reference log probabilities
                 if self.use_reference_policy:
-                    with marked_timer("compute_ref_log_prob", timing_raw, color="green"):
-                        if self.ref_in_actor:
-                            ref_log_prob_output = self.actor_wg.compute_ref_log_prob(batch)
+                    with marked_timer("ref", timing_raw, color="olive"):
+                        if not self.ref_in_actor:
+                            ref_log_prob = self.ref_policy_wg.compute_ref_log_prob(batch)
                         else:
-                            ref_log_prob_output = self.ref_policy_wg.compute_log_prob(batch)
-                        batch = batch.union(ref_log_prob_output)
-
-                # 计算actor log probabilities
-                with marked_timer("compute_log_prob", timing_raw, color="cyan"):
-                    log_prob_output = self.actor_wg.compute_log_prob(batch)
-                    batch = batch.union(log_prob_output)
-
-                # 应用KL惩罚
-                if self.use_reference_policy:
-                    batch = apply_kl_penalty(batch, self.config.algorithm)
+                            ref_log_prob = self.actor_wg.compute_ref_log_prob(batch)
+                        batch = batch.union(ref_log_prob)
 
-                # 计算优势
+                # 计算values
                 if self.use_critic:
-                    with marked_timer("compute_values", timing_raw, color="magenta"):
-                        values_output = self.critic_wg.compute_values(batch)
-                        batch = batch.union(values_output)
-
-                with marked_timer("compute_advantage", timing_raw, color="orange"):
-                    batch = compute_advantage(batch, self.config.algorithm)
+                    with marked_timer("values", timing_raw, color="cyan"):
+                        values = self.critic_wg.compute_values(batch)
+                        batch = batch.union(values)
+
+                # 处理奖励和优势计算
+                with marked_timer("adv", timing_raw, color="brown"):
+                    if self.config.reward_model.get("launch_reward_fn_async", False):
+                        reward_tensor, reward_extra_infos_dict = ray.get(future_reward)
+                    batch.batch["token_level_scores"] = reward_tensor
+
+                    if reward_extra_infos_dict:
+                        batch.non_tensor_batch.update({k: np.array(v) for k, v in reward_extra_infos_dict.items()})
+
+                    # 应用KL惩罚
+                    if self.config.algorithm.use_kl_in_reward:
+                        batch, kl_metrics = apply_kl_penalty(
+                            batch, kl_ctrl=self.kl_ctrl_in_reward, kl_penalty=self.config.algorithm.kl_penalty
+                        )
+                        metrics.update(kl_metrics)
+                    else:
+                        batch.batch["token_level_rewards"] = batch.batch["token_level_scores"]
+
+                    # 计算优势
+                    norm_adv_by_std_in_grpo = self.config.algorithm.get("norm_adv_by_std_in_grpo", True)
+
+                    batch = compute_advantage(
+                        batch,
+                        adv_estimator=self.config.algorithm.adv_estimator,
+                        gamma=self.config.algorithm.gamma,
+                        lam=self.config.algorithm.lam,
+                        num_repeat=self.config.actor_rollout_ref.rollout.n,
+                        norm_adv_by_std_in_grpo=norm_adv_by_std_in_grpo,
+                        config=self.config.algorithm,
+                    )
 
                 # 更新critic
                 if self.use_critic:
@@ -382,9 +513,9 @@ def fit(self):
                     actor_output_metrics = reduce_metrics(actor_output.meta_info["metrics"])
                     metrics.update(actor_output_metrics)
 
-                    # 更新参数版本并同步到Rollouter
+                    # 同步参数到Rollouter
                     with marked_timer("sync_params", timing_raw, color="purple"):
-                        self._update_param_version_and_sync()
+                        self._sync_parameters_to_rollouter()
 
                 # 记录rollout生成
                 rollout_data_dir = self.config.trainer.get("rollout_data_dir", None)
@@ -404,39 +535,54 @@ def fit(self):
                 # 验证
                 if (
                     self.val_reward_fn is not None
-                    and self.config.trainer.val_freq is not None
-                    and self.global_steps % self.config.trainer.val_freq == 0
+                    and self.config.trainer.test_freq > 0
+                    and (is_last_step or self.global_steps % self.config.trainer.test_freq == 0)
                 ):
-                    with marked_timer("validation", timing_raw, color="brown"):
+                    with marked_timer("testing", timing_raw, color="green"):
                         val_metrics = self._validate()
-                        if val_metrics:
-                            pprint(f"Validation metrics at step {self.global_steps}: {val_metrics}")
+                        if is_last_step:
                             last_val_metrics = val_metrics
+                            print(last_val_metrics)
+                    if val_metrics:
+                        metrics.update(val_metrics)
 
-            # 计算性能指标
-            timing_metrics = compute_timing_metrics(timing_raw)
-            throughput_metrics = compute_throughout_metrics(timing_raw, len(batch))
-            data_metrics = compute_data_metrics(batch, self.tokenizer)
+                # 保存检查点
+                if self.config.trainer.save_freq > 0 and (
+                    is_last_step or self.global_steps % self.config.trainer.save_freq == 0
+                ):
+                    with marked_timer("save_checkpoint", timing_raw, color="green"):
+                        self._save_checkpoint()
 
-            # 添加样本新鲜度指标
-            freshness_metrics = {
-                "avg_sample_age": avg_sample_age,
-                "max_sample_age": max_sample_age,
-                "processed_samples": self.processed_samples,
-                "param_version": self.current_param_version,
-            }
+            # 收集指标 - 参考OneStepOffRayTrainer的指标收集
+            metrics.update(
+                {
+                    "training/global_step": self.global_steps,
+                    "training/param_version": self.current_param_version,
+                    "training/param_sync_count": self.param_sync_count,
+                }
+            )
 
-            metrics.update(timing_metrics)
-            metrics.update(throughput_metrics)
-            metrics.update(data_metrics)
-            metrics.update(freshness_metrics)
+            # 数据和性能指标
+            metrics.update(compute_data_metrics(batch=batch, use_critic=self.use_critic))
+            metrics.update(compute_timing_metrics(batch=batch, timing_raw=timing_raw))
 
-            if last_val_metrics is not None:
-                metrics.update(last_val_metrics)
-                last_val_metrics = None
+            n_gpus = self.resource_pool_manager.get_n_gpus()
+            metrics.update(compute_throughout_metrics(batch=batch, timing_raw=timing_raw, n_gpus=n_gpus))
+
+            # 队列状态指标
+            queue_size = self.message_queue_client.get_queue_size()
+            queue_stats = self.message_queue_client.get_statistics()
+            metrics.update(
+                {
+                    "queue/size": queue_size,
+                    "queue/total_produced": queue_stats["total_produced"],
+                    "queue/total_consumed": queue_stats["total_consumed"],
+                    "queue/dropped_samples": queue_stats["dropped_samples"],
+                }
+            )
 
             # 记录日志
-            logger.log(data=metrics, step=self.global_steps)
+            logger_tracker.log(data=metrics, step=self.global_steps)
 
             # 更新进度条
             progress_bar.update(1)
@@ -444,27 +590,27 @@ def fit(self):
                 {
                     "reward": f"{metrics.get('reward/mean', 0):.3f}",
                     "kl": f"{metrics.get('actor/approx_kl', 0):.3f}",
-                    "queue_size": self.message_queue_client.get_queue_size(),
-                    "param_version": self.current_param_version,
+                    "queue_size": queue_size,
+                    "param_ver": self.current_param_version,
+                    "avg_age": f"{metrics.get('freshness/avg_sample_age', 0):.1f}",
                 }
             )
 
-            # 保存检查点
-            if self.config.trainer.save_freq is not None and self.global_steps % self.config.trainer.save_freq == 0:
-                self._save_checkpoint()
-
             if do_profile:
-                self.actor_wg.end_profile()
-                if self.use_reference_policy:
-                    self.ref_policy_wg.end_profile()
+                self.actor_wg.stop_profile()
+                if self.use_reference_policy and not self.ref_in_actor:
+                    self.ref_policy_wg.stop_profile()
                 if self.use_critic:
-                    self.critic_wg.end_profile()
+                    self.critic_wg.stop_profile()
                 if self.use_rm:
-                    self.rm_wg.end_profile()
+                    self.rm_wg.stop_profile()
 
             self.global_steps += 1
             self.processed_samples += len(batch_samples)
 
+            if is_last_step:
+                break
+
         progress_bar.close()
         logger.info(f"Training completed after {self.global_steps} steps")
 
@@ -473,17 +619,22 @@ def fit(self):
             val_metrics = self._validate()
             if val_metrics:
                 pprint(f"Final validation metrics: {val_metrics}")
-                logger.log(data=val_metrics, step=self.global_steps)
+                logger_tracker.log(data=val_metrics, step=self.global_steps)
 
         # 最终检查点保存
         self._save_checkpoint()
 
     def get_statistics(self) -> dict:
         """获取训练统计信息"""
+        queue_stats = self.message_queue_client.get_statistics() if self.message_queue_client else {}
         return {
             "global_steps": self.global_steps,
             "processed_samples": self.processed_samples,
             "stale_samples_processed": self.stale_samples_processed,
             "current_param_version": self.current_param_version,
-            "queue_size": self.message_queue_client.get_queue_size() if self.message_queue_client else 0,
+            "param_sync_count": self.param_sync_count,
+            "queue_size": queue_stats.get("queue_size", 0),
+            "queue_total_produced": queue_stats.get("total_produced", 0),
+            "queue_total_consumed": queue_stats.get("total_consumed", 0),
+            "queue_dropped_samples": queue_stats.get("dropped_samples", 0),
         }
diff --git a/recipe/fully_async_policy/param_sync.py b/recipe/fully_async_policy/param_sync.py
index 272f890cbbc..023475ef777 100644
--- a/recipe/fully_async_policy/param_sync.py
+++ b/recipe/fully_async_policy/param_sync.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import logging
+import time
 
 import ray
 from ray.util.collective import collective
@@ -23,64 +24,146 @@
 class ParameterSynchronizer:
     """
     参数同步器，负责在actor和rollout之间同步模型参数
+    改进版本，具有更好的错误处理和重试机制
     """
 
     def __init__(self, config):
         self.config = config
         self.weights_info = None
         self.sync_group_initialized = False
+        self.sync_group_name = "actor_rollout"
 
-    def initialize_sync_group(self, actor_workers: list, rollout_workers: list):
+        # 同步配置
+        self.max_sync_retries = config.async_training.get("max_sync_retries", 3)
+        self.sync_timeout = config.async_training.get("sync_timeout", 30.0)
+        self.retry_delay = config.async_training.get("sync_retry_delay", 1.0)
+
+        # 统计信息
+        self.sync_count = 0
+        self.sync_failures = 0
+        self.last_sync_time = 0
+
+    def initialize_sync_group(self, actor_workers: list, rollout_workers: list) -> bool:
         """
         初始化参数同步组
 
         Args:
             actor_workers: actor worker列表
             rollout_workers: rollout worker列表
+
+        Returns:
+            bool: 是否成功初始化
         """
         logger.info("Initializing parameter synchronization group...")
 
         try:
+            # 验证workers
+            if not actor_workers:
+                raise ValueError("No actor workers provided")
+            if not rollout_workers:
+                raise ValueError("No rollout workers provided")
+
             # 获取actor的权重信息
-            if actor_workers:
-                self.weights_info = ray.get(actor_workers[0].get_actor_weights_info.remote())[0]
+            logger.debug("Getting actor weights info...")
+            weights_info_future = actor_workers[0].get_actor_weights_info.remote()
+            self.weights_info = ray.get(weights_info_future, timeout=10.0)[0]
 
-                # 设置rollout的权重信息
-                for rollout_worker in rollout_workers:
-                    ray.get(rollout_worker.set_actor_weights_info.remote(self.weights_info))
+            if not self.weights_info:
+                raise ValueError("Failed to get actor weights info")
+
+            # 设置rollout的权重信息
+            logger.debug("Setting rollout weights info...")
+            set_weights_futures = []
+            for rollout_worker in rollout_workers:
+                future = rollout_worker.set_actor_weights_info.remote(self.weights_info)
+                set_weights_futures.append(future)
+
+            ray.get(set_weights_futures, timeout=10.0)
 
             # 创建actor-rollout通信组
+            logger.debug("Creating collective communication group...")
             all_workers = actor_workers + rollout_workers
+
+            # 清理可能存在的旧组
+            try:
+                collective.destroy_collective_group(self.sync_group_name)
+            except Exception:
+                pass  # 忽略清理错误
+
             collective.create_collective_group(
                 all_workers,
                 len(all_workers),
                 list(range(0, len(all_workers))),
                 backend="nccl",
-                group_name="actor_rollout",
+                group_name=self.sync_group_name,
             )
 
             self.sync_group_initialized = True
             logger.info("Parameter synchronization group initialized successfully")
+            return True
 
         except Exception as e:
             logger.error(f"Failed to initialize sync group: {e}")
-            raise
+            self.sync_group_initialized = False
+            return False
 
-    def sync_weights(self, actor_workers: list, rollout_workers: list):
+    def sync_weights(self, actor_workers: list, rollout_workers: list) -> bool:
         """
-        同步权重从actor到rollout
+        同步权重从actor到rollout - 改进版本，具有重试和错误处理
 
         Args:
             actor_workers: actor worker列表
             rollout_workers: rollout worker列表
+
+        Returns:
+            bool: 是否同步成功
         """
         if not self.sync_group_initialized:
-            raise RuntimeError("Sync group not initialized. Call initialize_sync_group() first.")
+            logger.error("Sync group not initialized. Call initialize_sync_group() first.")
+            return False
+
+        logger.debug("Starting weight synchronization...")
+        start_time = time.time()
+
+        for attempt in range(self.max_sync_retries):
+            try:
+                # 执行同步
+                success = self._execute_sync(actor_workers, rollout_workers)
+
+                if success:
+                    self.sync_count += 1
+                    self.last_sync_time = time.time()
+                    sync_duration = self.last_sync_time - start_time
+                    logger.debug(f"Weight synchronization completed in {sync_duration:.2f}s")
+                    return True
+                else:
+                    logger.warning(f"Sync attempt {attempt + 1} failed")
+
+            except Exception as e:
+                logger.warning(f"Sync attempt {attempt + 1} failed with error: {e}")
+
+            # 如果不是最后一次尝试，等待后重试
+            if attempt < self.max_sync_retries - 1:
+                logger.info(f"Retrying sync in {self.retry_delay}s...")
+                time.sleep(self.retry_delay)
+
+        # 所有重试都失败
+        self.sync_failures += 1
+        logger.error(f"All sync attempts failed. Total failures: {self.sync_failures}")
+        return False
+
+    def _execute_sync(self, actor_workers: list, rollout_workers: list) -> bool:
+        """
+        执行实际的同步操作
 
-        logger.debug("Synchronizing weights from actor to rollout...")
+        Args:
+            actor_workers: actor worker列表
+            rollout_workers: rollout worker列表
 
+        Returns:
+            bool: 是否同步成功
+        """
         try:
-            # 同步权重
             sync_futures = []
 
             # Actor端同步
@@ -93,20 +176,39 @@ def sync_weights(self, actor_workers: list, rollout_workers: list):
                 future = rollout_worker.sync_rollout_weights.remote()
                 sync_futures.append(future)
 
-            # 等待所有同步完成
-            ray.get(sync_futures)
-
-            logger.debug("Weight synchronization completed")
+            # 等待所有同步完成，带超时
+            ray.get(sync_futures, timeout=self.sync_timeout)
+            return True
 
         except Exception as e:
-            logger.error(f"Failed to sync weights: {e}")
-            raise
+            logger.error(f"Sync execution failed: {e}")
+            return False
+
+    def cleanup(self):
+        """清理同步组"""
+        if self.sync_group_initialized:
+            try:
+                collective.destroy_collective_group(self.sync_group_name)
+                logger.info("Sync group cleaned up")
+            except Exception as e:
+                logger.warning(f"Error cleaning up sync group: {e}")
+            finally:
+                self.sync_group_initialized = False
+
+    def get_statistics(self) -> dict:
+        """获取同步统计信息"""
+        return {
+            "sync_count": self.sync_count,
+            "sync_failures": self.sync_failures,
+            "last_sync_time": self.last_sync_time,
+            "sync_group_initialized": self.sync_group_initialized,
+        }
 
 
 @ray.remote
 class ParameterSyncManager:
     """
-    Ray Actor形式的参数同步管理器
+    Ray Actor形式的参数同步管理器 - 改进版本
     """
 
     def __init__(self, config):
@@ -114,28 +216,69 @@ def __init__(self, config):
         self.synchronizer = ParameterSynchronizer(config)
         self.actor_workers = []
         self.rollout_workers = []
+        self.is_ready = False
 
-    def register_workers(self, actor_workers: list, rollout_workers: list):
-        """注册worker"""
-        self.actor_workers = actor_workers
-        self.rollout_workers = rollout_workers
+    def register_workers(self, actor_workers: list, rollout_workers: list) -> bool:
+        """
+        注册worker
 
-        # 初始化同步组
-        self.synchronizer.initialize_sync_group(actor_workers, rollout_workers)
+        Args:
+            actor_workers: actor worker列表
+            rollout_workers: rollout worker列表
+
+        Returns:
+            bool: 是否成功注册
+        """
+        try:
+            self.actor_workers = actor_workers
+            self.rollout_workers = rollout_workers
+
+            # 初始化同步组
+            success = self.synchronizer.initialize_sync_group(actor_workers, rollout_workers)
+            self.is_ready = success
+
+            if success:
+                logger.info("ParameterSyncManager ready")
+            else:
+                logger.error("ParameterSyncManager initialization failed")
+
+            return success
+        except Exception as e:
+            logger.error(f"Failed to register workers: {e}")
+            return False
+
+    def sync_parameters(self) -> bool:
+        """
+        执行参数同步
 
-    def sync_parameters(self):
-        """执行参数同步"""
-        self.synchronizer.sync_weights(self.actor_workers, self.rollout_workers)
-        return True
+        Returns:
+            bool: 是否同步成功
+        """
+        if not self.is_ready:
+            logger.error("SyncManager not ready. Call register_workers() first.")
+            return False
+
+        return self.synchronizer.sync_weights(self.actor_workers, self.rollout_workers)
 
     def get_weights_info(self):
         """获取权重信息"""
         return self.synchronizer.weights_info
 
+    def get_statistics(self) -> dict:
+        """获取统计信息"""
+        stats = self.synchronizer.get_statistics()
+        stats["is_ready"] = self.is_ready
+        return stats
+
+    def cleanup(self):
+        """清理资源"""
+        self.synchronizer.cleanup()
+        self.is_ready = False
+
 
 class AsyncParameterSynchronizer:
     """
-    异步参数同步器，用于完全异步训练工作流
+    异步参数同步器，用于完全异步训练工作流 - 改进版本
     """
 
     def __init__(self, config, actor_wg, rollouter_actor):
@@ -150,26 +293,100 @@ def __init__(self, config, actor_wg, rollouter_actor):
         self.rollouter_actor = rollouter_actor
         self.current_version = 0
 
-    def sync_to_rollouter(self, new_version: int):
-        """
-        将actor参数同步到rollouter
+        # 同步配置
+        self.sync_timeout = config.async_training.get("sync_timeout", 30.0)
+        self.max_sync_retries = config.async_training.get("max_sync_retries", 3)
+        self.retry_delay = config.async_training.get("sync_retry_delay", 1.0)
 
-        Args:
-            new_version: 新的参数版本号
-        """
-        logger.info(f"Syncing parameters to rollouter, version: {new_version}")
+        # 统计信息
+        self.sync_count = 0
+        self.sync_failures = 0
+        self.last_sync_time = 0
+
+        # 初始化同步组
+        self._init_sync_group()
 
+    def _init_sync_group(self):
+        """初始化同步组"""
         try:
-            # 通知rollouter更新参数
-            ray.get(self.rollouter_actor.update_rollout_weights.remote(new_version))
+            # 获取actor权重信息
+            weights_info = self.actor_wg.get_actor_weights_info()[0]
+
+            # 通知rollouter设置权重信息
+            ray.get(self.rollouter_actor.set_weights_info.remote(weights_info), timeout=10.0)
+
+            # 创建同步通信组
+            actor_workers = self.actor_wg.workers
+            rollout_workers = ray.get(self.rollouter_actor.get_rollout_workers.remote(), timeout=10.0)
+
+            all_workers = actor_workers + rollout_workers
+            collective.create_collective_group(
+                all_workers,
+                len(all_workers),
+                list(range(0, len(all_workers))),
+                backend="nccl",
+                group_name="async_actor_rollout",
+            )
 
-            self.current_version = new_version
-            logger.info(f"Parameter sync to rollouter completed, version: {new_version}")
+            logger.info("Async parameter synchronizer initialized")
 
         except Exception as e:
-            logger.error(f"Failed to sync parameters to rollouter: {e}")
-            raise
+            logger.warning(f"Failed to initialize async sync group: {e}")
+
+    def sync_to_rollouter(self, new_version: int) -> bool:
+        """
+        将actor参数同步到rollouter - 改进版本，具有重试机制
+
+        Args:
+            new_version: 新的参数版本号
+
+        Returns:
+            bool: 是否同步成功
+        """
+        logger.info(f"Syncing parameters to rollouter, version: {new_version}")
+        start_time = time.time()
+
+        for attempt in range(self.max_sync_retries):
+            try:
+                # 首先同步actor到rollout worker group
+                self.actor_wg.sync_rollout_weights()
+
+                # 然后通知rollouter更新参数版本
+                sync_future = self.rollouter_actor.update_rollout_weights.remote(new_version)
+                sync_result = ray.get(sync_future, timeout=self.sync_timeout)
+
+                if sync_result:
+                    self.current_version = new_version
+                    self.sync_count += 1
+                    self.last_sync_time = time.time()
+                    sync_duration = self.last_sync_time - start_time
+                    logger.info(f"Parameter sync completed in {sync_duration:.2f}s, version: {new_version}")
+                    return True
+                else:
+                    logger.warning(f"Rollouter rejected sync for version {new_version}")
+
+            except Exception as e:
+                logger.warning(f"Sync attempt {attempt + 1} failed: {e}")
+
+            # 如果不是最后一次尝试，等待后重试
+            if attempt < self.max_sync_retries - 1:
+                logger.info(f"Retrying sync in {self.retry_delay}s...")
+                time.sleep(self.retry_delay)
+
+        # 所有重试都失败
+        self.sync_failures += 1
+        logger.error(f"Failed to sync parameters to rollouter after {self.max_sync_retries} attempts")
+        return False
 
     def get_current_version(self) -> int:
         """获取当前参数版本"""
         return self.current_version
+
+    def get_statistics(self) -> dict:
+        """获取统计信息"""
+        return {
+            "current_version": self.current_version,
+            "sync_count": self.sync_count,
+            "sync_failures": self.sync_failures,
+            "last_sync_time": self.last_sync_time,
+        }
diff --git a/recipe/fully_async_policy/test_fully_async.py b/recipe/fully_async_policy/test_fully_async.py
index eaa9313254a..6332a4dd4d8 100644
--- a/recipe/fully_async_policy/test_fully_async.py
+++ b/recipe/fully_async_policy/test_fully_async.py
@@ -101,7 +101,7 @@ class TestRollouterComponents(unittest.TestCase):
 
     def setUp(self):
         """设置测试环境"""
-        from .rollouter import RolloutController
+        from .fully_async_rollouter import RolloutController
 
         self.controller = RolloutController()
 

From 289a4a5833cf5074d892792bb2df25f8f3aaa6c2 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Thu, 31 Jul 2025 16:06:31 +0800
Subject: [PATCH 012/182] message_queue

---
 .../fully_async_policy/README_fully_async.md  |   6 +-
 .../config/fully_async_ppo_trainer.yaml       |   2 +-
 recipe/fully_async_policy/fully_async_main.py |   2 +-
 .../fully_async_rollouter.py                  |  10 +-
 .../fully_async_policy/fully_async_trainer.py |  10 +-
 recipe/fully_async_policy/message_queue.py    | 152 +++----
 .../run_fully_async_example.sh                |   4 +-
 recipe/fully_async_policy/test_fully_async.py |   4 +-
 recipe/fully_async_policy/test_mq.py          | 343 ----------------
 recipe/fully_async_policy/unittest/test_mq.py | 373 ++++++++++++++++++
 10 files changed, 454 insertions(+), 452 deletions(-)
 delete mode 100644 recipe/fully_async_policy/test_mq.py
 create mode 100644 recipe/fully_async_policy/unittest/test_mq.py

diff --git a/recipe/fully_async_policy/README_fully_async.md b/recipe/fully_async_policy/README_fully_async.md
index 4c1866788a5..1708be5ae34 100644
--- a/recipe/fully_async_policy/README_fully_async.md
+++ b/recipe/fully_async_policy/README_fully_async.md
@@ -96,7 +96,7 @@
 ```yaml
 async_training:
   # 新鲜度控制
-  freshness_threshold: 3              # 样本新鲜度阈值
+  staleness_threshold: 3              # 样本新鲜度阈值
   max_staleness_allowed: 5            # 最大允许的样本陈旧度
 
   # 队列管理
@@ -144,7 +144,7 @@ python fully_async_main.py --config-path /path/to/config --config-name my_config
 ```python
 # 在配置文件中自定义异步训练参数
 async_training:
-  freshness_threshold: 5
+  staleness_threshold: 5
   max_queue_size: 2000
   generation_timeout: 60.0
 ```
@@ -196,7 +196,7 @@ async_training:
    - 调整 `batch_generation_interval`
 
 2. **样本过期严重**
-   - 调整 `freshness_threshold`
+   - 调整 `staleness_threshold`
    - 检查参数同步频率
    - 监控 `stale_samples_ratio`
 
diff --git a/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml b/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml
index 19c4aa01339..d97484d88f4 100644
--- a/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml
+++ b/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml
@@ -10,7 +10,7 @@ defaults:
 
 async_training:
   # 新鲜度控制 (Freshness Control)
-  freshness_threshold: 3              # 样本新鲜度阈值
+  staleness_threshold: 3              # 样本新鲜度阈值
   max_staleness_allowed: 5            # 最大允许的样本陈旧度
 
   # 队列管理 (Queue Management)
diff --git a/recipe/fully_async_policy/fully_async_main.py b/recipe/fully_async_policy/fully_async_main.py
index e57e3e119b7..3773d90d8d7 100644
--- a/recipe/fully_async_policy/fully_async_main.py
+++ b/recipe/fully_async_policy/fully_async_main.py
@@ -510,7 +510,7 @@ def main(config):
         # 设置默认异步训练配置
         config.async_training = OmegaConf.create(
             {
-                "freshness_threshold": 3,
+                "staleness_threshold": 3,
                 "max_staleness_allowed": 5,
                 "max_queue_size": 1000,
                 "min_batch_count": 1,
diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py
index 3ece39d0f10..06380803aee 100644
--- a/recipe/fully_async_policy/fully_async_rollouter.py
+++ b/recipe/fully_async_policy/fully_async_rollouter.py
@@ -154,7 +154,7 @@ def __init__(
 
         # 新鲜度控制 - 改进的配置管理
         async_config = config.async_training
-        self.freshness_threshold = async_config.get("freshness_threshold", 3)
+        self.staleness_threshold = async_config.get("staleness_threshold", 3)
         self.max_staleness_allowed = async_config.get("max_staleness_allowed", 5)
         self.generation_timeout = async_config.get("generation_timeout", 30.0)
         self.batch_generation_interval = async_config.get("batch_generation_interval", 0.1)
@@ -190,7 +190,7 @@ def _validate_config(self):
         required_configs = [
             "data.train_batch_size",
             "actor_rollout_ref.rollout.n",
-            "async_training.freshness_threshold",
+            "async_training.staleness_threshold",
         ]
 
         for config_path in required_configs:
@@ -428,7 +428,7 @@ def _should_pause_generation(self) -> bool:
                 return True
 
             # 如果队列太满，也暂停生成
-            max_queue_size = self.freshness_threshold * self.config.data.train_batch_size
+            max_queue_size = self.staleness_threshold * self.config.data.train_batch_size
             if queue_size >= max_queue_size:
                 logger.debug(f"Should pause due to full queue: size={queue_size}, max={max_queue_size}")
                 return True
@@ -532,9 +532,9 @@ def _generation_loop(self):
                     }
 
                     # 放入队列
-                    success = self.message_queue_client.put_batch(
+                    success = self.message_queue_client.put_samples(
                         epoch=epoch,
-                        batch=generated_batch,
+                        sample=generated_batch,
                         param_version=self.current_param_version,
                         rollout_metadata=rollout_metadata,
                     )
diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py
index e66bc895c9c..36687861ae8 100644
--- a/recipe/fully_async_policy/fully_async_trainer.py
+++ b/recipe/fully_async_policy/fully_async_trainer.py
@@ -23,7 +23,7 @@
 from torch.utils.data import Dataset, Sampler
 from tqdm import tqdm
 
-from recipe.fully_async_policy.message_queue import BatchSample, MessageQueueClient
+from recipe.fully_async_policy.message_queue import QueueSample, MessageQueueClient
 from verl import DataProto
 from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup
 from verl.single_controller.ray.base import create_colocated_worker_cls
@@ -298,7 +298,7 @@ def _sync_parameters_to_rollouter(self):
             self.current_param_version -= 1  # 回滚版本号
             raise
 
-    def _process_batch_samples(self, batch_samples: list[BatchSample]) -> DataProto:
+    def _process_batch_samples(self, batch_samples: list[QueueSample]) -> DataProto:
         """处理从队列获取的batch样本 - 改进的批处理逻辑"""
         if not batch_samples:
             raise ValueError("Empty batch samples")
@@ -316,7 +316,7 @@ def _process_batch_samples(self, batch_samples: list[BatchSample]) -> DataProto:
             logger.error(f"Failed to merge batch samples: {e}")
             raise
 
-    def _compute_sample_freshness_metrics(self, batch_samples: list[BatchSample]) -> dict:
+    def _compute_sample_freshness_metrics(self, batch_samples: list[QueueSample]) -> dict:
         """计算样本新鲜度指标"""
         sample_ages = [self.current_param_version - sample.param_version for sample in batch_samples]
         current_time = time.time()
@@ -396,8 +396,8 @@ def fit(self):
                     min_batch_count = self.config.async_training.get("min_batch_count", 1)
                     batch_timeout = self.config.async_training.get("batch_timeout", 30.0)
 
-                    batch_samples = self.message_queue_client.get_batch(
-                        min_batch_count=min_batch_count, timeout=batch_timeout
+                    batch_samples = self.message_queue_client.get_samples(
+                        min_batch=min_batch_count, timeout=batch_timeout
                     )
 
                     if batch_samples is None:
diff --git a/recipe/fully_async_policy/message_queue.py b/recipe/fully_async_policy/message_queue.py
index 58996d4266e..5866dcfd4a9 100644
--- a/recipe/fully_async_policy/message_queue.py
+++ b/recipe/fully_async_policy/message_queue.py
@@ -18,21 +18,19 @@
 import uuid
 from collections import deque
 from dataclasses import dataclass
-from typing import Any, Optional
+from typing import Any, Optional, List
 
 import ray
-import zmq
-from filelock import FileLock
 from omegaconf import DictConfig
 
 logger = logging.getLogger(__name__)
 
 
 @dataclass
-class BatchSample:
+class QueueSample:
     """单个batch样本，包含参数版本和新鲜度信息"""
 
-    batch_id: str
+    id: str
     epoch: int
     data: Any
     param_version: int
@@ -40,11 +38,10 @@ class BatchSample:
     rollout_metadata: dict[str, Any]
 
 
-@ray.remote(num_cpus=1)
+@ray.remote(num_cpus=10, max_concurrency=10)
 class MessageQueue:
     """
     简化的Ray-based异步消息队列，用于Rollouter和Trainer之间的通信
-    去掉了ZeroMQ的复杂性，使用更可靠的Ray机制
     """
 
     def __init__(self, config: DictConfig, max_queue_size: int = 1000):
@@ -56,27 +53,17 @@ def __init__(self, config: DictConfig, max_queue_size: int = 1000):
         # 安全地获取配置值
         try:
             if hasattr(config, "async_training") and config.async_training is not None:
-                self.freshness_threshold = getattr(config.async_training, "freshness_threshold", 3)
+                self.staleness_threshold = getattr(config.async_training, "staleness_threshold", 3)
             else:
-                self.freshness_threshold = 3
+                self.staleness_threshold = 3
         except (AttributeError, RecursionError):
-            self.freshness_threshold = 3
-
-        # ZeroMQ setup
-        self.context = None
-        self.socket = None
-        self.address = None
-        try:
-            self._setup_zmq()
-        except Exception as e:
-            print(f"Warning: ZeroMQ setup failed: {e}. Queue will work without ZeroMQ.")
+            self.staleness_threshold = 3
 
         # Threading for message handling
         self.running = True
 
         # 线程安全
         self.lock = threading.RLock()
-        self.consumer_waiting = False
         self.consumer_condition = threading.Condition(self.lock)
 
         # 统计信息
@@ -86,35 +73,19 @@ def __init__(self, config: DictConfig, max_queue_size: int = 1000):
 
         logger.info(
             f"MessageQueue initialized with max_queue_size={max_queue_size},"
-            "freshness_threshold={self.freshness_threshold}"
+            "staleness_threshold={self.staleness_threshold}"
         )
 
-    def _setup_zmq(self):
-        """设置ZeroMQ socket"""
-        with FileLock("/tmp/verl_message_queue.lock"):
-            # 初始化 ZeroMQ context
-            self.context = zmq.Context()
-
-            # 使用TCP socket
-            import socket as sock
-
-            with sock.socket() as s:
-                s.bind(("", 0))
-                port = s.getsockname()[1]
-
-            self.address = f"tcp://127.0.0.1:{port}"
-            self.socket = self.context.socket(zmq.PAIR)
-            self.socket.bind(self.address)
-
-    def put_batch(self, epoch: int, batch: Any, param_version: int, rollout_metadata: dict[str, Any] = None) -> bool:
+    def put_samples(self, epoch: int, samples: List[Any], param_version: int,
+                    rollout_metadata_list: List[dict[str, Any]] = None) -> bool:
         """
         放入一个batch样本到队列
 
         Args:
             epoch: 当前epoch
-            batch: 样本数据
+            samples: 样本数据
             param_version: 参数版本号
-            rollout_metadata: rollout相关的元数据
+            rollout_metadata_list: rollout相关的元数据
 
         Returns:
             bool: 是否成功放入队列
@@ -122,62 +93,67 @@ def put_batch(self, epoch: int, batch: Any, param_version: int, rollout_metadata
         with self.lock:
             # 检查新鲜度
             staleness = self.current_param_version - param_version
-            if staleness >= self.freshness_threshold:
+            if staleness >= self.staleness_threshold:
                 self.dropped_samples += 1
-                logger.debug(f"Dropped stale sample: staleness={staleness}, threshold={self.freshness_threshold}")
+                logger.debug(f"Dropped stale sample: staleness={staleness}, threshold={self.staleness_threshold}")
                 return False
 
-            sample = BatchSample(
-                batch_id=str(uuid.uuid4()),
-                epoch=epoch,
-                data=batch,
-                param_version=param_version,
-                timestamp=time.time(),
-                rollout_metadata=rollout_metadata or {},
-            )
-
-            # 如果队列满了，移除最旧的样本
-            if len(self.queue) >= self.max_queue_size:
-                removed = self.queue.popleft()
-                self.dropped_samples += 1
-                logger.warning(f"Queue full, dropped sample {removed.batch_id}")
+            # 处理 rollout_metadatas 为 None 的情况
+            if rollout_metadata_list is None:
+                rollout_metadata_list = [{}] * len(samples)
+
+            if len(rollout_metadata_list) != len(samples):
+                logger.warning(
+                    f"len(rollout_metadata_list):{len(rollout_metadata_list)} != len(samples:{len(samples)}")
+                return False
 
-            self.queue.append(sample)
-            self.total_produced += 1
+            for sample, meta in zip(samples, rollout_metadata_list):
+                queue_sample = QueueSample(
+                    id=str(uuid.uuid4()),
+                    epoch=epoch,
+                    data=sample,
+                    param_version=param_version,
+                    timestamp=time.time(),
+                    rollout_metadata=meta or {},
+                )
+
+                # 如果队列满了，移除最旧的样本，一般不会发生
+                if len(self.queue) >= self.max_queue_size:
+                    removed = self.queue.popleft()
+                    self.dropped_samples += 1
+                    logger.warning(f"Queue full, dropped sample {removed.id}")
+
+                self.queue.append(queue_sample)
+                self.total_produced += 1
 
             # 通知等待的消费者
-            if self.consumer_waiting:
-                self.consumer_condition.notify()
+            self.consumer_condition.notify()
 
             if self.total_produced % 100 == 0:
                 logger.debug(f"MessageQueue stats: produced={self.total_produced}, queue_size={len(self.queue)}")
 
             return True
 
-    def get_batch(self, min_batch_count: int = 1, timeout: float = 30.0) -> Optional[list[BatchSample]]:
+    def get_samples(self, min_batch: int = 1) -> list[QueueSample]:
         """
-        从队列获取batch样本
+        从队列获取batch样本，一直等待直到有足够样本
 
         Args:
-            min_batch_count: 最小batch数量
-            timeout: 超时时间（秒）
+            min_batch: sample数量满足min_batch，一次性获取
 
         Returns:
-            Optional[List[BatchSample]]: 获取的样本列表，如果超时返回None
+            List[QueueSample]: 获取的样本列表
         """
         with self.lock:
-            start_time = time.time()
+            while len(self.queue) < min_batch and self.running:
+                self.consumer_condition.wait()
 
-            while len(self.queue) < min_batch_count:
-                if time.time() - start_time > timeout:
-                    return None
-
-                self.consumer_waiting = True
-                self.consumer_condition.wait(timeout=1.0)
-                self.consumer_waiting = False
+            # 如果队列已关闭且没有足够样本，返回空列表
+            if not self.running and len(self.queue) < min_batch:
+                return []
 
             # 获取指定数量的样本
-            batch_count = min(min_batch_count, len(self.queue))
+            batch_count = min(min_batch, len(self.queue))
             samples = []
             for _ in range(batch_count):
                 if self.queue:
@@ -207,7 +183,7 @@ def get_statistics(self) -> dict[str, Any]:
                 "total_consumed": self.total_consumed,
                 "dropped_samples": self.dropped_samples,
                 "current_param_version": self.current_param_version,
-                "freshness_threshold": self.freshness_threshold,
+                "staleness_threshold": self.staleness_threshold,
                 "max_queue_size": self.max_queue_size,
             }
 
@@ -220,11 +196,11 @@ def clear_queue(self):
 
     def shutdown(self):
         """关闭消息队列"""
-        self.running = False
-        if self.socket:
-            self.socket.close()
-        if self.context:
-            self.context.term()
+        with self.lock:  # 修正：需要加锁
+            self.running = False
+            # 通知所有等待的线程，让它们能够退出
+            self.consumer_condition.notify_all()
+        logger.info("MessageQueue shutdown")
 
     def get_memory_usage(self) -> dict:
         """获取内存使用统计"""
@@ -254,10 +230,6 @@ def get_memory_usage(self) -> dict:
                 "estimated_memory_mb": total_size / (1024 * 1024),
             }
 
-    def get_address(self) -> str:
-        """获取ZeroMQ地址"""
-        return self.address
-
 
 class MessageQueueClient:
     """MessageQueue的客户端，用于与MessageQueue Actor通信"""
@@ -265,13 +237,13 @@ class MessageQueueClient:
     def __init__(self, queue_actor: Any):
         self.queue_actor = queue_actor
 
-    def put_batch(self, epoch: int, batch: Any, param_version: int, rollout_metadata: dict[str, Any] = None) -> bool:
+    def put_batch(self, epoch: int, batch: List[Any], param_version: int, rollout_metadata_list: List[dict[str, Any]] = None) -> bool:
         """放入batch到队列"""
-        return ray.get(self.queue_actor.put_batch.remote(epoch, batch, param_version, rollout_metadata))
+        return ray.get(self.queue_actor.put_samples.remote(epoch, batch, param_version, rollout_metadata_list))
 
-    def get_batch(self, min_batch_count: int = 1, timeout: float = 30.0) -> Optional[list[BatchSample]]:
-        """从队列获取batch"""
-        return ray.get(self.queue_actor.get_batch.remote(min_batch_count, timeout))
+    def get_batch(self, min_batch_count: int = 1) -> list[QueueSample]:
+        """从队列获取batch，一直等待直到有足够样本"""
+        return ray.get(self.queue_actor.get_samples.remote(min_batch_count))
 
     def update_param_version(self, version: int):
         """更新参数版本"""
diff --git a/recipe/fully_async_policy/run_fully_async_example.sh b/recipe/fully_async_policy/run_fully_async_example.sh
index d58e4ecc771..180071318a1 100644
--- a/recipe/fully_async_policy/run_fully_async_example.sh
+++ b/recipe/fully_async_policy/run_fully_async_example.sh
@@ -54,7 +54,7 @@ max_prompt_length=1024
 max_response_length=1024
 
 # 异步训练参数
-freshness_threshold=3
+staleness_threshold=3
 max_staleness_allowed=5
 max_queue_size=1000
 min_batch_count=1
@@ -120,7 +120,7 @@ python -m recipe.one_step_off_policy.fully_async_main \
     critic.fsdp_config.param_offload=false \
     \
     # 异步训练配置
-    async_training.freshness_threshold=$freshness_threshold \
+    async_training.staleness_threshold=$staleness_threshold \
     async_training.max_staleness_allowed=$max_staleness_allowed \
     async_training.max_queue_size=$max_queue_size \
     async_training.min_batch_count=$min_batch_count \
diff --git a/recipe/fully_async_policy/test_fully_async.py b/recipe/fully_async_policy/test_fully_async.py
index 6332a4dd4d8..c138debcaa0 100644
--- a/recipe/fully_async_policy/test_fully_async.py
+++ b/recipe/fully_async_policy/test_fully_async.py
@@ -40,7 +40,7 @@ def setUp(self):
         config = OmegaConf.create(
             {
                 "async_training": {
-                    "freshness_threshold": 3,
+                    "staleness_threshold": 3,
                     "max_staleness_allowed": 5,
                 }
             }
@@ -147,7 +147,7 @@ def test_integration():
         config = OmegaConf.create(
             {
                 "async_training": {
-                    "freshness_threshold": 3,
+                    "staleness_threshold": 3,
                     "max_staleness_allowed": 5,
                 }
             }
diff --git a/recipe/fully_async_policy/test_mq.py b/recipe/fully_async_policy/test_mq.py
deleted file mode 100644
index 3659911319e..00000000000
--- a/recipe/fully_async_policy/test_mq.py
+++ /dev/null
@@ -1,343 +0,0 @@
-# Copyright 2025 Meituan Ltd. and/or its affiliates
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import threading
-import time
-from unittest.mock import Mock
-
-import pytest
-import ray
-from message_queue import BatchSample, MessageQueue, MessageQueueClient
-from omegaconf import DictConfig
-
-
-@pytest.fixture
-def mock_data_proto():
-    """Mock DataProto对象"""
-    return Mock()
-
-
-@pytest.fixture
-def basic_config():
-    """基础配置"""
-    return DictConfig({"async_training": {"freshness_threshold": 3}})
-
-
-@pytest.fixture
-def queue_config():
-    """队列配置"""
-    return DictConfig({"async_training": {"freshness_threshold": 2}})
-
-
-class TestBatchSample:
-    """测试BatchSample数据类"""
-
-    def test_batch_sample_creation(self, mock_data_proto):
-        """测试BatchSample创建"""
-        sample = BatchSample(
-            batch_id="test-123",
-            epoch=1,
-            data=mock_data_proto,
-            param_version=5,
-            timestamp=1234567890.0,
-            rollout_metadata={"key": "value"},
-        )
-
-        assert sample.batch_id == "test-123"
-        assert sample.epoch == 1
-        assert sample.data == mock_data_proto
-        assert sample.param_version == 5
-        assert sample.timestamp == 1234567890.0
-        assert sample.rollout_metadata == {"key": "value"}
-
-
-class TestMessageQueue:
-    """测试MessageQueue类（需要在非Ray环境下测试内部逻辑）"""
-
-    def test_message_queue_init(self, basic_config):
-        """测试MessageQueue初始化"""
-        # 直接创建MessageQueue实例（不使用Ray装饰器）
-        queue = MessageQueue.__ray_actor_class__(basic_config, max_queue_size=100)
-
-        # 确保ZeroMQ初始化成功
-        assert queue.context is not None
-        assert queue.socket is not None
-
-        # 基本属性检查
-        assert queue.max_queue_size == 100
-        assert queue.current_param_version == 0
-        assert queue.freshness_threshold == 3
-        assert len(queue.queue) == 0
-        assert queue.total_produced == 0
-        assert queue.total_consumed == 0
-        assert queue.dropped_samples == 0
-
-        # 清理资源
-        queue.shutdown()
-
-
-@pytest.fixture
-def ray_setup():
-    """设置Ray环境"""
-    if not ray.is_initialized():
-        ray.init(local_mode=True, ignore_reinit_error=True)
-    yield
-    ray.shutdown()
-
-
-@pytest.fixture
-def message_queue_actor(ray_setup, basic_config):
-    """创建MessageQueue actor"""
-    actor = MessageQueue.remote(basic_config, max_queue_size=10)
-    yield actor
-    ray.get(actor.shutdown.remote())
-
-
-class TestMessageQueueActor:
-    """测试MessageQueue Actor"""
-
-    def test_put_batch_success(self, message_queue_actor, mock_data_proto):
-        """测试成功放入batch"""
-        result = ray.get(
-            message_queue_actor.put_batch.remote(
-                epoch=1, batch=mock_data_proto, param_version=1, rollout_metadata={"test": "data"}
-            )
-        )
-
-        assert result is True
-
-        # 检查队列大小
-        queue_size = ray.get(message_queue_actor.get_queue_size.remote())
-        assert queue_size == 1
-
-        # 检查统计信息
-        stats = ray.get(message_queue_actor.get_statistics.remote())
-        assert stats["total_produced"] == 1
-        assert stats["queue_size"] == 1
-
-    def test_put_batch_staleness_check(self, message_queue_actor, mock_data_proto):
-        """测试新鲜度检查"""
-        # 更新参数版本为5
-        ray.get(message_queue_actor.update_param_version.remote(5))
-
-        # 尝试放入版本过旧的batch（版本差异>=3会被拒绝）
-        result = ray.get(
-            message_queue_actor.put_batch.remote(
-                epoch=1,
-                batch=mock_data_proto,
-                param_version=2,  # 5-2=3, 达到阈值
-                rollout_metadata={},
-            )
-        )
-
-        assert result is False
-
-        # 检查统计信息中的丢弃样本数
-        stats = ray.get(message_queue_actor.get_statistics.remote())
-        assert stats["dropped_samples"] == 1
-
-    def test_put_batch_queue_overflow(self, message_queue_actor, mock_data_proto):
-        """测试队列溢出处理"""
-        # 填满队列（最大容量10）
-        for i in range(12):  # 超过最大容量
-            ray.get(
-                message_queue_actor.put_batch.remote(
-                    epoch=1, batch=mock_data_proto, param_version=1, rollout_metadata={}
-                )
-            )
-
-        # 队列大小应该保持在最大值
-        queue_size = ray.get(message_queue_actor.get_queue_size.remote())
-        assert queue_size == 10
-
-        # 检查统计信息
-        stats = ray.get(message_queue_actor.get_statistics.remote())
-        assert stats["dropped_samples"] == 2  # 超出的2个被丢弃
-
-    def test_get_batch_success(self, message_queue_actor, mock_data_proto):
-        """测试成功获取batch"""
-        # 先放入一些batch
-        for i in range(3):
-            ray.get(
-                message_queue_actor.put_batch.remote(
-                    epoch=i, batch=mock_data_proto, param_version=1, rollout_metadata={"index": i}
-                )
-            )
-
-        # 获取2个batch
-        samples = ray.get(message_queue_actor.get_batch.remote(min_batch_count=2, timeout=5.0))
-
-        assert samples is not None
-        assert len(samples) == 2
-        assert all(isinstance(sample, BatchSample) for sample in samples)
-
-        # 检查队列大小减少
-        queue_size = ray.get(message_queue_actor.get_queue_size.remote())
-        assert queue_size == 1
-
-        # 检查统计信息
-        stats = ray.get(message_queue_actor.get_statistics.remote())
-        assert stats["total_consumed"] == 2
-
-    def test_get_batch_timeout(self, message_queue_actor):
-        """测试获取batch超时"""
-        # 空队列情况下获取batch应该超时
-        samples = ray.get(message_queue_actor.get_batch.remote(min_batch_count=1, timeout=1.0))
-        assert samples is None
-
-    def test_update_param_version(self, message_queue_actor):
-        """测试更新参数版本"""
-        ray.get(message_queue_actor.update_param_version.remote(10))
-
-        stats = ray.get(message_queue_actor.get_statistics.remote())
-        assert stats["current_param_version"] == 10
-
-    def test_clear_queue(self, message_queue_actor, mock_data_proto):
-        """测试清空队列"""
-        # 先添加一些样本
-        for i in range(3):
-            ray.get(message_queue_actor.put_batch.remote(epoch=i, batch=mock_data_proto, param_version=1))
-
-        # 清空队列
-        ray.get(message_queue_actor.clear_queue.remote())
-
-        # 检查队列大小
-        queue_size = ray.get(message_queue_actor.get_queue_size.remote())
-        assert queue_size == 0
-
-    def test_get_statistics(self, message_queue_actor):
-        """测试获取统计信息"""
-        stats = ray.get(message_queue_actor.get_statistics.remote())
-
-        expected_keys = {
-            "queue_size",
-            "total_produced",
-            "total_consumed",
-            "dropped_samples",
-            "current_param_version",
-            "freshness_threshold",
-            "max_queue_size",
-        }
-        assert set(stats.keys()) == expected_keys
-        assert isinstance(stats["queue_size"], int)
-        assert isinstance(stats["total_produced"], int)
-        assert isinstance(stats["total_consumed"], int)
-
-
-class TestMessageQueueClient:
-    """测试MessageQueueClient"""
-
-    def test_client_put_batch(self, message_queue_actor, mock_data_proto):
-        """测试客户端放入batch"""
-        client = MessageQueueClient(message_queue_actor)
-
-        result = client.put_batch(epoch=1, batch=mock_data_proto, param_version=1, rollout_metadata={"test": "client"})
-
-        assert result is True
-        assert client.get_queue_size() == 1
-
-    def test_client_get_batch(self, message_queue_actor, mock_data_proto):
-        """测试客户端获取batch"""
-        client = MessageQueueClient(message_queue_actor)
-
-        # 先放入一个batch
-        client.put_batch(epoch=1, batch=mock_data_proto, param_version=1)
-
-        # 获取batch
-        samples = client.get_batch(min_batch_count=1, timeout=5.0)
-
-        assert samples is not None
-        assert len(samples) == 1
-        assert isinstance(samples[0], BatchSample)
-
-    def test_client_update_param_version(self, message_queue_actor):
-        """测试客户端更新参数版本"""
-        client = MessageQueueClient(message_queue_actor)
-
-        client.update_param_version(15)
-
-        stats = client.get_statistics()
-        assert stats["current_param_version"] == 15
-
-    def test_client_get_queue_size(self, message_queue_actor, mock_data_proto):
-        """测试客户端获取队列大小"""
-        client = MessageQueueClient(message_queue_actor)
-
-        assert client.get_queue_size() == 0
-
-        client.put_batch(epoch=1, batch=mock_data_proto, param_version=1)
-        assert client.get_queue_size() == 1
-
-    def test_client_clear_queue(self, message_queue_actor, mock_data_proto):
-        """测试客户端清空队列"""
-        client = MessageQueueClient(message_queue_actor)
-
-        # 添加样本
-        client.put_batch(epoch=1, batch=mock_data_proto, param_version=1)
-        assert client.get_queue_size() == 1
-
-        # 清空队列
-        client.clear_queue()
-        assert client.get_queue_size() == 0
-
-    def test_client_shutdown(self, message_queue_actor):
-        """测试客户端关闭"""
-        client = MessageQueueClient(message_queue_actor)
-
-        # 关闭不应该抛出异常
-        client.shutdown()
-
-
-class TestConcurrency:
-    """测试并发场景"""
-
-    def test_concurrent_put_get(self, message_queue_actor, mock_data_proto):
-        """测试并发放入和获取"""
-        client = MessageQueueClient(message_queue_actor)
-        results = []
-
-        def producer():
-            for i in range(5):
-                result = client.put_batch(epoch=i, batch=mock_data_proto, param_version=1)
-                results.append(("put", result))
-                time.sleep(0.1)
-
-        def consumer():
-            for _ in range(3):
-                samples = client.get_batch(min_batch_count=1, timeout=2.0)
-                results.append(("get", samples is not None))
-                time.sleep(0.1)
-
-        # 启动生产者和消费者线程
-        producer_thread = threading.Thread(target=producer)
-        consumer_thread = threading.Thread(target=consumer)
-
-        producer_thread.start()
-        time.sleep(0.05)  # 让生产者先开始
-        consumer_thread.start()
-
-        producer_thread.join()
-        consumer_thread.join()
-
-        # 检查结果
-        put_results = [r[1] for r in results if r[0] == "put"]
-        get_results = [r[1] for r in results if r[0] == "get"]
-
-        assert all(put_results)  # 所有放入操作都应该成功
-        assert all(get_results)  # 所有获取操作都应该成功
-
-
-# 运行测试的示例配置
-if __name__ == "__main__":
-    pytest.main([__file__, "-v", "--tb=short"])
diff --git a/recipe/fully_async_policy/unittest/test_mq.py b/recipe/fully_async_policy/unittest/test_mq.py
new file mode 100644
index 00000000000..dbc29c3e9ce
--- /dev/null
+++ b/recipe/fully_async_policy/unittest/test_mq.py
@@ -0,0 +1,373 @@
+# Copyright 2025 Meituan Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import threading
+import time
+from unittest.mock import Mock
+
+import pytest
+import ray
+from recipe.fully_async_policy.message_queue import QueueSample, MessageQueue, MessageQueueClient
+from omegaconf import DictConfig
+
+
+@pytest.fixture
+def mock_data_proto():
+    """Mock数据对象"""
+    return Mock()
+
+
+@pytest.fixture
+def basic_config():
+    """基础配置"""
+    return DictConfig({"async_training": {"staleness_threshold": 3}})
+
+
+@pytest.fixture
+def queue_config():
+    """队列配置"""
+    return DictConfig({"async_training": {"staleness_threshold": 2}})
+
+
+@pytest.fixture
+def ray_setup():
+    """设置Ray环境"""
+    if not ray.is_initialized():
+        ray.init(local_mode=True, ignore_reinit_error=True)
+    yield
+    ray.shutdown()
+
+
+@pytest.fixture
+def message_queue_client(ray_setup, basic_config):
+    """创建MessageQueue actor并返回其客户端"""
+    actor = MessageQueue.remote(basic_config, max_queue_size=10)
+    client = MessageQueueClient(actor)
+    yield client
+    client.shutdown()
+
+
+class TestMessageQueue:
+    """测试MessageQueue（通过MessageQueueClient）"""
+
+    def test_put_samples_success(self, message_queue_client, mock_data_proto):
+        """测试成功放入samples"""
+        samples = [mock_data_proto, mock_data_proto]
+        metadata_list = [{"test": "data1"}, {"test": "data2"}]
+
+        result = message_queue_client.put_batch(
+            epoch=1,
+            batch=samples,
+            param_version=1,
+            rollout_metadata_list=metadata_list
+        )
+
+        assert result is True
+
+        # 检查队列大小
+        queue_size = message_queue_client.get_queue_size()
+        assert queue_size == 2
+
+        # 检查统计信息
+        stats = message_queue_client.get_statistics()
+        assert stats["total_produced"] == 2
+        assert stats["queue_size"] == 2
+
+    def test_put_samples_without_metadata(self, message_queue_client, mock_data_proto):
+        """测试不提供metadata时的处理"""
+        samples = [mock_data_proto, mock_data_proto]
+
+        result = message_queue_client.put_batch(
+            epoch=1,
+            batch=samples,
+            param_version=1,
+            rollout_metadata_list=None
+        )
+
+        assert result is True
+        queue_size = message_queue_client.get_queue_size()
+        assert queue_size == 2
+
+    def test_put_samples_metadata_mismatch(self, message_queue_client, mock_data_proto):
+        """测试metadata长度不匹配的处理"""
+        samples = [mock_data_proto, mock_data_proto]
+        metadata_list = [{"test": "data1"}]  # 长度不匹配
+
+        result = message_queue_client.put_batch(
+            epoch=1,
+            batch=samples,
+            param_version=1,
+            rollout_metadata_list=metadata_list
+        )
+
+        assert result is False  # 应该失败
+        queue_size = message_queue_client.get_queue_size()
+        assert queue_size == 0
+
+    def test_put_samples_staleness_check(self, message_queue_client, mock_data_proto):
+        """测试新鲜度检查"""
+        # 更新参数版本为5
+        message_queue_client.update_param_version(5)
+
+        # 尝试放入版本过旧的batch（版本差异>=3会被拒绝）
+        samples = [mock_data_proto]
+        result = message_queue_client.put_batch(
+            epoch=1,
+            batch=samples,
+            param_version=2,  # 5-2=3, 达到阈值
+            rollout_metadata_list=None
+        )
+
+        assert result is False
+
+        # 检查统计信息中的丢弃样本数
+        stats = message_queue_client.get_statistics()
+        assert stats["dropped_samples"] == 1
+
+    def test_put_samples_queue_overflow(self, message_queue_client, mock_data_proto):
+        """测试队列溢出处理"""
+        # 填满队列（最大容量10）
+        for i in range(6):  # 每次放入2个，总共12个，超过最大容量10
+            samples = [mock_data_proto, mock_data_proto]
+            message_queue_client.put_batch(
+                epoch=1,
+                batch=samples,
+                param_version=1,
+                rollout_metadata_list=None
+            )
+
+        # 队列大小应该保持在最大值
+        queue_size = message_queue_client.get_queue_size()
+        assert queue_size == 10
+
+        # 检查统计信息
+        stats = message_queue_client.get_statistics()
+        assert stats["dropped_samples"] == 2  # 超出的2个被丢弃
+
+    def test_get_samples_success(self, message_queue_client, mock_data_proto):
+        """测试成功获取samples"""
+        # 先放入一些samples
+        samples = [mock_data_proto, mock_data_proto, mock_data_proto]
+        metadata_list = [{"index": 0}, {"index": 1}, {"index": 2}]
+        message_queue_client.put_batch(
+            epoch=1,
+            batch=samples,
+            param_version=1,
+            rollout_metadata_list=metadata_list
+        )
+
+        # 获取2个samples
+        retrieved_samples = message_queue_client.get_batch(min_batch_count=2)
+
+        assert retrieved_samples is not None
+        assert len(retrieved_samples) == 2
+        assert all(isinstance(sample, QueueSample) for sample in retrieved_samples)
+
+        # 检查队列大小减少
+        queue_size = message_queue_client.get_queue_size()
+        assert queue_size == 1
+
+        # 检查统计信息
+        stats = message_queue_client.get_statistics()
+        assert stats["total_consumed"] == 2
+
+    def test_get_samples_blocking_behavior(self, message_queue_client, mock_data_proto):
+        """测试阻塞行为"""
+        result = []
+
+        def get_samples():
+            # 这会阻塞直到有足够样本
+            samples = message_queue_client.get_batch(min_batch_count=2)
+            result.append(samples)
+
+        def put_samples_later():
+            time.sleep(0.5)  # 延迟放入
+            samples = [mock_data_proto, mock_data_proto]
+            message_queue_client.put_batch(
+                epoch=1,
+                batch=samples,
+                param_version=1,
+                rollout_metadata_list=None
+            )
+
+        # 启动消费者线程
+        consumer_thread = threading.Thread(target=get_samples)
+        producer_thread = threading.Thread(target=put_samples_later)
+
+        consumer_thread.start()
+        producer_thread.start()
+
+        # 等待两个线程完成
+        producer_thread.join(timeout=2)
+        consumer_thread.join(timeout=2)
+
+        assert len(result) == 1
+        assert len(result[0]) == 2
+
+    def test_update_param_version(self, message_queue_client):
+        """测试更新参数版本"""
+        message_queue_client.update_param_version(10)
+        stats = message_queue_client.get_statistics()
+        assert stats["current_param_version"] == 10
+
+    def test_clear_queue(self, message_queue_client, mock_data_proto):
+        """测试清空队列"""
+        # 先添加一些样本
+        samples = [mock_data_proto, mock_data_proto, mock_data_proto]
+        message_queue_client.put_batch(
+            epoch=1,
+            batch=samples,
+            param_version=1,
+            rollout_metadata_list=None
+        )
+
+        # 清空队列
+        message_queue_client.clear_queue()
+
+        # 检查队列大小
+        queue_size = message_queue_client.get_queue_size()
+        assert queue_size == 0
+
+    def test_get_queue_size(self, message_queue_client, mock_data_proto):
+        """测试获取队列大小"""
+        assert message_queue_client.get_queue_size() == 0
+
+        samples = [mock_data_proto]
+        message_queue_client.put_batch(
+            epoch=1,
+            batch=samples,
+            param_version=1,
+            rollout_metadata_list=None
+        )
+        assert message_queue_client.get_queue_size() == 1
+
+    def test_get_statistics(self, message_queue_client):
+        """测试获取统计信息"""
+        stats = message_queue_client.get_statistics()
+
+        expected_keys = {
+            "queue_size",
+            "total_produced",
+            "total_consumed",
+            "dropped_samples",
+            "current_param_version",
+            "staleness_threshold",
+            "max_queue_size",
+        }
+        assert set(stats.keys()) == expected_keys
+        assert isinstance(stats["queue_size"], int)
+        assert isinstance(stats["total_produced"], int)
+        assert isinstance(stats["total_consumed"], int)
+
+    def test_get_memory_usage(self, message_queue_client, mock_data_proto):
+        """测试获取内存使用统计"""
+        # 添加一些样本
+        samples = [mock_data_proto, mock_data_proto]
+        message_queue_client.put_batch(
+            epoch=1,
+            batch=samples,
+            param_version=1,
+            rollout_metadata_list=None
+        )
+
+        memory_stats = message_queue_client.get_memory_usage()
+
+        expected_keys = {"queue_samples", "estimated_memory_bytes", "estimated_memory_mb"}
+        assert set(memory_stats.keys()) == expected_keys
+        assert memory_stats["queue_samples"] == 2
+        assert memory_stats["estimated_memory_bytes"] > 0
+        assert memory_stats["estimated_memory_mb"] > 0
+
+    def test_shutdown(self, ray_setup, basic_config):
+        """测试关闭功能"""
+        # 创建新的actor用于测试关闭
+        actor = MessageQueue.remote(basic_config, max_queue_size=10)
+        client = MessageQueueClient(actor)
+
+        # 关闭应该不抛出异常
+        client.shutdown()
+
+
+class TestConcurrency:
+    """测试并发场景"""
+
+    def setup_method(self):
+        """每个测试方法前的设置"""
+        if not ray.is_initialized():
+            ray.init(local_mode=True, ignore_reinit_error=True)
+
+    def teardown_method(self):
+        """每个测试方法后的清理"""
+        if ray.is_initialized():
+            ray.shutdown()
+
+    def create_message_queue_client(self, config=None):
+        """创建MessageQueue client的辅助方法"""
+        if config is None:
+            config = DictConfig({"async_training": {"staleness_threshold": 3}})
+        actor = MessageQueue.remote(config, max_queue_size=10)
+        return MessageQueueClient(actor)
+
+    def test_concurrent_put_get(self, mock_data_proto):
+        """测试并发放入和获取"""
+        client = self.create_message_queue_client()
+        try:
+            results = []
+
+            def producer():
+                for i in range(50):
+                    samples = [mock_data_proto, mock_data_proto]
+                    result = client.put_batch(
+                        epoch=i,
+                        batch=samples,
+                        param_version=1,
+                        rollout_metadata_list=None
+                    )
+                    results.append(("put", result))
+                    time.sleep(0.1)
+
+            def consumer():
+                for _ in range(100):
+                    try:
+                        retrieved_samples = client.get_batch(min_batch_count=1)
+                        results.append(("get", len(retrieved_samples) > 0))
+                    except Exception as e:
+                        print(e)
+                        results.append(("get", False))
+                    time.sleep(0.1)
+
+            # 启动生产者和消费者线程
+            producer_thread = threading.Thread(target=producer)
+            consumer_thread = threading.Thread(target=consumer)
+
+            producer_thread.start()
+            time.sleep(0.05)
+            consumer_thread.start()
+
+            producer_thread.join(timeout=5)
+            consumer_thread.join(timeout=5)
+
+            # 检查结果
+            put_results = [r[1] for r in results if r[0] == "put"]
+            get_results = [r[1] for r in results if r[0] == "get"]
+
+            assert all(put_results)
+            assert all(get_results)
+        finally:
+            client.shutdown()
+
+
+# 运行测试的示例配置
+if __name__ == "__main__":
+    pytest.main([__file__, "-v", "--tb=short"])

From a89991cb1f6ef6178a7327b2a63f00a78d461ff1 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Thu, 31 Jul 2025 19:26:25 +0800
Subject: [PATCH 013/182] train

---
 .../fully_async_policy/fully_async_trainer.py | 461 ++++++++----------
 recipe/one_step_off_policy/ray_trainer.py     |  11 +-
 2 files changed, 221 insertions(+), 251 deletions(-)

diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py
index 36687861ae8..9122a97c8fa 100644
--- a/recipe/fully_async_policy/fully_async_trainer.py
+++ b/recipe/fully_async_policy/fully_async_trainer.py
@@ -14,6 +14,7 @@
 
 import logging
 import time
+import warnings
 from pprint import pprint
 
 import numpy as np
@@ -59,24 +60,45 @@ class FullyAsyncTrainer(RayPPOTrainer):
     """
 
     def __init__(
-        self,
-        config,
-        tokenizer,
-        role_worker_mapping: dict[Role, WorkerType],
-        resource_pool_manager: ResourcePoolManager,
-        ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup,
-        processor=None,
-        reward_fn=None,
-        val_reward_fn=None,
-        train_dataset: Dataset | None = None,
-        val_dataset: Dataset | None = None,
-        collate_fn=None,
-        train_sampler: Sampler | None = None,
-        device_name="cuda",
+            self,
+            config,
+            tokenizer,
+            role_worker_mapping: dict[Role, WorkerType],
+            resource_pool_manager: ResourcePoolManager,
+            ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup,
+            processor=None,
+            reward_fn=None,
+            val_reward_fn=None,
+            train_dataset: Optional[Dataset] = None,
+            val_dataset: Optional[Dataset] = None,
+            collate_fn=None,
+            train_sampler: Optional[Sampler] = None,
+            device_name=None,
     ):
-        self.config = config
+        """
+        Initialize distributed PPO trainer with Ray backend.
+        Note that this trainer runs on the driver process on a single CPU/GPU node.
+
+        Args:
+            config: Configuration object containing training parameters.
+            tokenizer: Tokenizer used for encoding and decoding text.
+            role_worker_mapping (dict[Role, WorkerType]): Mapping from roles to worker classes.
+            resource_pool_manager (ResourcePoolManager): Manager for Ray resource pools.
+            ray_worker_group_cls (RayWorkerGroup, optional): Class for Ray worker groups. Defaults to RayWorkerGroup.
+            processor: Optional data processor, used for multimodal data
+            reward_fn: Function for computing rewards during training.
+            val_reward_fn: Function for computing rewards during validation.
+            train_dataset (Optional[Dataset], optional): Training dataset. Defaults to None.
+            val_dataset (Optional[Dataset], optional): Validation dataset. Defaults to None.
+            collate_fn: Function to collate data samples into batches.
+            train_sampler (Optional[Sampler], optional): Sampler for the training dataset. Defaults to None.
+            device_name (str, optional): Device name for training (e.g., "cuda", "cpu"). Defaults to None.
+        """
+
+        # Store the tokenizer for text processing
         self.tokenizer = tokenizer
         self.processor = processor
+        self.config = config
         self.reward_fn = reward_fn
         self.val_reward_fn = val_reward_fn
 
@@ -85,87 +107,55 @@ def __init__(
 
         self.role_worker_mapping = role_worker_mapping
         self.resource_pool_manager = resource_pool_manager
-        self.ray_worker_group_cls = ray_worker_group_cls
-        self.device_name = device_name
-        self.validation_generations_logger = ValidationGenerationsLogger()
-
-        # 数据相关
-        self.train_dataset = train_dataset
-        self.val_dataset = val_dataset
-        self.collate_fn = collate_fn
-        self.train_sampler = train_sampler
-
-        # 角色配置 - 参考OneStepOffRayTrainer的配置
         self.use_reference_policy = Role.RefPolicy in role_worker_mapping
         self.use_rm = Role.RewardModel in role_worker_mapping
+        self.ray_worker_group_cls = ray_worker_group_cls
+        self.device_name = device_name if device_name else self.config.trainer.device
+        self.validation_generations_logger = ValidationGenerationsLogger(
+            project_name=self.config.trainer.project_name,
+            experiment_name=self.config.trainer.experiment_name,
+        )
+
+        # if ref_in_actor is True, the reference policy will be actor without lora applied
         self.ref_in_actor = config.actor_rollout_ref.model.get("lora_rank", 0) > 0
 
-        # KL控制器
-        if config.algorithm.use_kl_in_reward:
-            self.kl_ctrl_in_reward = core_algos.get_kl_controller(config.algorithm.kl_ctrl)
+        # define in-reward KL control
+        # kl loss control currently not suppoorted
+        if self.config.algorithm.use_kl_in_reward:
+            self.kl_ctrl_in_reward = core_algos.get_kl_controller(self.config.algorithm.kl_ctrl)
 
-        # 确定是否使用critic - 参考OneStepOffRayTrainer的逻辑
-        if self.config.algorithm.adv_estimator == AdvantageEstimator.GAE:
+        if config.critic.enable is not None:
+            self.use_critic = bool(config.critic.enable)
+        elif self.config.algorithm.adv_estimator == AdvantageEstimator.GAE:
             self.use_critic = True
-        elif self.config.algorithm.adv_estimator in [
-            AdvantageEstimator.GRPO,
-            AdvantageEstimator.GRPO_PASSK,
-            AdvantageEstimator.REINFORCE_PLUS_PLUS,
-            # AdvantageEstimator.REMAX, # TODO:REMAX advantage estimator is not yet supported in one_step_off_policy
-            AdvantageEstimator.RLOO,
-            AdvantageEstimator.OPO,
-            AdvantageEstimator.REINFORCE_PLUS_PLUS_BASELINE,
-            AdvantageEstimator.GPG,
-        ]:
-            self.use_critic = False
         else:
-            raise NotImplementedError(f"Unsupported advantage estimator: {self.config.algorithm.adv_estimator}")
-
-        # Worker groups
-        self.actor_wg = None
-        self.critic_wg = None
-        self.ref_policy_wg = None
-        self.rm_wg = None
-
-        # 训练状态
-        self.global_steps = 0
-        self.current_param_version = 0
-        self.total_training_steps = config.trainer.total_training_steps
-
-        # MessageQueue客户端
-        self.message_queue_client = None
-
-        # 与Rollouter的通信
-        self.rollouter_actor = None
-
-        # 统计信息
-        self.processed_samples = 0
-        self.stale_samples_processed = 0
-        self.param_sync_count = 0
+            warnings.warn(
+                "Disabled critic as algorithm.adv_estimator != gae. "
+                "If it is not intended, please set critic.enable=True",
+                stacklevel=2,
+            )
+            self.use_critic = False
 
         self._validate_config()
-
-    def _validate_config(self):
-        """验证配置"""
-        required_configs = ["trainer.total_training_steps", "algorithm.adv_estimator", "data.train_batch_size"]
-
-        for config_path in required_configs:
-            if not OmegaConf.select(self.config, config_path):
-                raise ValueError(f"Missing required config: {config_path}")
+        self._create_dataloader(train_dataset, val_dataset, collate_fn, train_sampler)
 
     def set_message_queue_client(self, message_queue_client: MessageQueueClient):
         """设置消息队列客户端"""
         self.message_queue_client = message_queue_client
 
-    def set_rollouter_actor(self, rollouter_actor):
-        """设置Rollouter Actor的引用"""
-        self.rollouter_actor = rollouter_actor
+    def _validate(self):
+        """执行验证 - 参考OneStepOffRayTrainer的验证逻辑"""
+        return None
 
     def init_workers(self):
-        """初始化训练workers - 参考OneStepOffRayTrainer的实现"""
-        logger.info("Initializing FullyAsyncTrainer workers...")
+        """Initialize distributed training workers using Ray backend.
 
+        Creates:
+        1. Ray resource pools from configuration
+        2. Worker groups for each role (actor, critic, etc.)
+        """
         self.resource_pool_manager.create_resource_pool()
+
         self.resource_pool_to_cls = {pool: {} for pool in self.resource_pool_manager.resource_pool_dict.values()}
 
         # 创建actor worker
@@ -244,36 +234,6 @@ def init_workers(self):
 
         logger.info("FullyAsyncTrainer workers initialized successfully")
 
-    def _load_checkpoint(self):
-        """加载检查点"""
-        # TODO: 实现检查点加载逻辑
-        logger.info("Checkpoint loading not implemented yet")
-
-    def _validate(self):
-        """执行验证 - 参考OneStepOffRayTrainer的验证逻辑"""
-        if self.val_reward_fn is None:
-            return None
-
-        # TODO: 实现完整的验证逻辑
-        logger.info("Running validation...")
-        val_metrics = {"val_reward": 0.0}  # 简化的验证指标
-        return val_metrics
-
-    def _save_checkpoint(self):
-        """保存检查点"""
-        # TODO: 实现检查点保存逻辑
-        logger.info("Checkpoint saving not implemented yet")
-
-    def _dump_generations(self, inputs, outputs, scores, reward_extra_infos_dict, dump_path):
-        """保存生成结果"""
-        # TODO: 实现生成结果保存逻辑
-        logger.debug(f"Dumping generations to {dump_path}")
-
-    def _balance_batch(self, batch: DataProto, metrics: dict):
-        """平衡batch中的有效token数量 - 参考OneStepOffRayTrainer的实现"""
-        # TODO: 实现batch平衡逻辑
-        pass
-
     def _sync_parameters_to_rollouter(self):
         """同步参数到Rollouter - 改进的同步机制"""
         if self.rollouter_actor is None:
@@ -332,12 +292,17 @@ def _compute_sample_freshness_metrics(self, batch_samples: list[QueueSample]) ->
         }
 
     def fit(self):
-        """主训练循环 - 基于OneStepOffRayTrainer的成熟实现"""
+        """
+        The training loop of PPO.
+        The driver process only need to call the compute functions of the worker group through RPC
+        to construct the PPO dataflow.
+        The light-weight advantage computation is done on the driver process.
+        """
         from omegaconf import OmegaConf
 
         from verl.utils.tracking import Tracking
 
-        logger_tracker = Tracking(
+        logger = Tracking(
             project_name=self.config.trainer.project_name,
             experiment_name=self.config.trainer.experiment_name,
             default_backend=self.config.trainer.logger,
@@ -346,101 +311,90 @@ def fit(self):
 
         self.global_steps = 0
 
-        # 加载检查点
+        # load checkpoint before doing anything
         self._load_checkpoint()
 
-        # 初始验证
+        # perform validation before training
+        # currently, we only support validation using the reward_function.
         if self.val_reward_fn is not None and self.config.trainer.get("val_before_train", True):
             val_metrics = self._validate()
-            if val_metrics:
-                pprint(f"Initial validation metrics: {val_metrics}")
-                logger_tracker.log(data=val_metrics, step=self.global_steps)
+            assert val_metrics, f"{val_metrics=}"
+            pprint(f"Initial validation metrics: {val_metrics}")
+            logger.log(data=val_metrics, step=self.global_steps)
             if self.config.trainer.get("val_only", False):
                 return
 
-        # 进度条
-        progress_bar = tqdm(total=self.total_training_steps, initial=self.global_steps, desc="Async Training")
+        # add tqdm
+        progress_bar = tqdm(total=self.total_training_steps, initial=self.global_steps, desc="Training Progress")
 
+        # we start from step 1
         self.global_steps += 1
         last_val_metrics = None
 
-        if self.message_queue_client is None:
-            raise ValueError("MessageQueue client not set. Call set_message_queue_client() first.")
+        # across epoch iterator
+        continuous_iterator = self._create_continuous_iterator()
 
-        logger.info("Starting fully async training loop...")
+        # Start the first asynchronous generation task.
+        batch_data_future = self._async_gen_next_batch(continuous_iterator)
+
+        while batch_data_future is not None:
+            metrics = {}
+            timing_raw = {}
 
-        while self.global_steps <= self.total_training_steps:
-            # 性能分析
             do_profile = (
                 self.global_steps in self.config.trainer.profile_steps
                 if self.config.trainer.profile_steps is not None
                 else False
             )
+            with marked_timer("start_profile", timing_raw):
+                self._start_profiling(do_profile)
 
-            if do_profile:
-                self.actor_wg.start_profile()
-                if self.use_reference_policy and not self.ref_in_actor:
-                    self.ref_policy_wg.start_profile()
-                if self.use_critic:
-                    self.critic_wg.start_profile()
-                if self.use_rm:
-                    self.rm_wg.start_profile()
-
-            metrics = {}
-            timing_raw = {}
             is_last_step = self.global_steps >= self.total_training_steps
 
             with marked_timer("step", timing_raw):
-                # 从队列获取样本
-                with marked_timer("get_batch_from_queue", timing_raw, color="blue"):
-                    min_batch_count = self.config.async_training.get("min_batch_count", 1)
-                    batch_timeout = self.config.async_training.get("batch_timeout", 30.0)
-
-                    batch_samples = self.message_queue_client.get_samples(
-                        min_batch=min_batch_count, timeout=batch_timeout
-                    )
-
-                    if batch_samples is None:
-                        logger.warning("Timeout waiting for batch samples, retrying...")
-                        time.sleep(1.0)
-                        continue
-
-                # 处理获取的样本
-                with marked_timer("process_batch_samples", timing_raw, color="cyan"):
-                    batch = self._process_batch_samples(batch_samples)
-
-                    # 计算样本新鲜度指标
-                    freshness_metrics = self._compute_sample_freshness_metrics(batch_samples)
-                    metrics.update(freshness_metrics)
-
-                    logger.info(
-                        f"Processing batch: {len(batch_samples)} samples, "
-                        f"avg_age={freshness_metrics['freshness/avg_sample_age']:.1f}, "
-                        f"max_age={freshness_metrics['freshness/max_sample_age']}"
-                    )
-
-                # 添加响应掩码 - 参考OneStepOffRayTrainer
-                batch.batch["response_mask"] = compute_response_mask(batch)
-
-                # 平衡batch
+                # wait for the previous batch
+                with marked_timer("wait_prev_gen", timing_raw, color="red"):
+                    epoch, batch, gen_batch_output = batch_data_future.get()
+                    timing_raw.update(gen_batch_output.meta_info["timing"])
+                    gen_batch_output.meta_info.pop("timing", None)
+
+                # asys next generation (with syns weights from actor to rollout)
+                with marked_timer("sync_rollout_weights", timing_raw, color="purple"):
+                    if not is_last_step:
+                        batch_data_future = self._async_gen_next_batch(continuous_iterator)
+
+                batch.non_tensor_batch["uid"] = np.array(
+                    [str(uuid.uuid4()) for _ in range(len(batch.batch))], dtype=object
+                )
+                # repeat to align with repeated responses in rollout
+                batch = batch.repeat(repeat_times=self.config.actor_rollout_ref.rollout.n, interleave=True)
+                batch = batch.union(gen_batch_output)
+
+                if "response_mask" not in batch.batch.keys():
+                    batch.batch["response_mask"] = compute_response_mask(batch)
+                # Balance the number of valid tokens across DP ranks.
+                # NOTE: This usually changes the order of data in the `batch`,
+                # which won't affect the advantage calculation (since it's based on uid),
+                # but might affect the loss calculation (due to the change of mini-batching).
+                # TODO: Decouple the DP balancing and mini-batching.
                 if self.config.trainer.balance_batch:
                     self._balance_batch(batch, metrics=metrics)
 
-                # 计算全局有效token数量
+                # compute global_valid tokens
                 batch.meta_info["global_token_num"] = torch.sum(batch.batch["attention_mask"], dim=-1).tolist()
 
-                # 计算奖励 - 参考OneStepOffRayTrainer的实现
                 with marked_timer("reward", timing_raw, color="yellow"):
+                    # compute reward model score
                     if self.use_rm:
                         reward_tensor = self.rm_wg.compute_rm_score(batch)
                         batch = batch.union(reward_tensor)
 
-                    if self.config.reward_model.get("launch_reward_fn_async", False):
+                    if self.config.reward_model.launch_reward_fn_async:
                         future_reward = compute_reward_async.remote(batch, self.config, self.tokenizer)
                     else:
                         reward_tensor, reward_extra_infos_dict = compute_reward(batch, self.reward_fn)
 
-                # 计算旧的log probabilities - 参考OneStepOffRayTrainer
+                # recompute old_log_probs
                 with marked_timer("old_log_prob", timing_raw, color="blue"):
                     old_log_prob = self.actor_wg.compute_log_prob(batch)
                     entropys = old_log_prob.batch["entropys"]
@@ -452,8 +406,32 @@ def fit(self):
                     old_log_prob.batch.pop("entropys")
                     batch = batch.union(old_log_prob)
 
-                # 计算reference log probabilities
+                    if "rollout_log_probs" in batch.batch.keys():
+                        # TODO: we may want to add diff of probs too.
+                        rollout_old_log_probs = batch.batch["rollout_log_probs"]
+                        actor_old_log_probs = batch.batch["old_log_probs"]
+                        attention_mask = batch.batch["attention_mask"]
+                        responses = batch.batch["responses"]
+                        response_length = responses.size(1)
+                        response_mask = attention_mask[:, -response_length:]
+
+                        rollout_probs = torch.exp(rollout_old_log_probs)
+                        actor_probs = torch.exp(actor_old_log_probs)
+                        rollout_probs_diff = torch.abs(rollout_probs - actor_probs)
+                        rollout_probs_diff = torch.masked_select(rollout_probs_diff, response_mask.bool())
+                        rollout_probs_diff_max = torch.max(rollout_probs_diff)
+                        rollout_probs_diff_mean = torch.mean(rollout_probs_diff)
+                        rollout_probs_diff_std = torch.std(rollout_probs_diff)
+                        metrics.update(
+                            {
+                                "training/rollout_probs_diff_max": rollout_probs_diff_max.detach().item(),
+                                "training/rollout_probs_diff_mean": rollout_probs_diff_mean.detach().item(),
+                                "training/rollout_probs_diff_std": rollout_probs_diff_std.detach().item(),
+                            }
+                        )
+
                 if self.use_reference_policy:
+                    # compute reference log_prob
                     with marked_timer("ref", timing_raw, color="olive"):
                         if not self.ref_in_actor:
                             ref_log_prob = self.ref_policy_wg.compute_ref_log_prob(batch)
@@ -461,22 +439,23 @@ def fit(self):
                             ref_log_prob = self.actor_wg.compute_ref_log_prob(batch)
                         batch = batch.union(ref_log_prob)
 
-                # 计算values
+                # compute values
                 if self.use_critic:
                     with marked_timer("values", timing_raw, color="cyan"):
                         values = self.critic_wg.compute_values(batch)
                         batch = batch.union(values)
 
-                # 处理奖励和优势计算
                 with marked_timer("adv", timing_raw, color="brown"):
-                    if self.config.reward_model.get("launch_reward_fn_async", False):
+                    # we combine with rule-based rm
+                    reward_extra_infos_dict: dict[str, list]
+                    if self.config.reward_model.launch_reward_fn_async:
                         reward_tensor, reward_extra_infos_dict = ray.get(future_reward)
                     batch.batch["token_level_scores"] = reward_tensor
 
                     if reward_extra_infos_dict:
                         batch.non_tensor_batch.update({k: np.array(v) for k, v in reward_extra_infos_dict.items()})
 
-                    # 应用KL惩罚
+                    # compute rewards. apply_kl_penalty if available
                     if self.config.algorithm.use_kl_in_reward:
                         batch, kl_metrics = apply_kl_penalty(
                             batch, kl_ctrl=self.kl_ctrl_in_reward, kl_penalty=self.config.algorithm.kl_penalty
@@ -485,8 +464,11 @@ def fit(self):
                     else:
                         batch.batch["token_level_rewards"] = batch.batch["token_level_scores"]
 
-                    # 计算优势
-                    norm_adv_by_std_in_grpo = self.config.algorithm.get("norm_adv_by_std_in_grpo", True)
+                    # compute advantages, executed on the driver process
+
+                    norm_adv_by_std_in_grpo = self.config.algorithm.get(
+                        "norm_adv_by_std_in_grpo", True
+                    )  # GRPO adv normalization factor
 
                     batch = compute_advantage(
                         batch,
@@ -498,32 +480,34 @@ def fit(self):
                         config=self.config.algorithm,
                     )
 
-                # 更新critic
+                # update critic
                 if self.use_critic:
                     with marked_timer("update_critic", timing_raw, color="pink"):
                         critic_output = self.critic_wg.update_critic(batch)
                     critic_output_metrics = reduce_metrics(critic_output.meta_info["metrics"])
                     metrics.update(critic_output_metrics)
 
-                # 更新actor
+                # implement critic warmup
                 if self.config.trainer.critic_warmup <= self.global_steps:
+                    # update actor
                     with marked_timer("update_actor", timing_raw, color="red"):
                         batch.meta_info["multi_turn"] = self.config.actor_rollout_ref.rollout.multi_turn.enable
                         actor_output = self.actor_wg.update_actor(batch)
                     actor_output_metrics = reduce_metrics(actor_output.meta_info["metrics"])
                     metrics.update(actor_output_metrics)
 
-                    # 同步参数到Rollouter
-                    with marked_timer("sync_params", timing_raw, color="purple"):
-                        self._sync_parameters_to_rollouter()
-
-                # 记录rollout生成
+                # Log rollout generations if enabled
                 rollout_data_dir = self.config.trainer.get("rollout_data_dir", None)
                 if rollout_data_dir:
                     with marked_timer("dump_rollout_generations", timing_raw, color="green"):
                         inputs = self.tokenizer.batch_decode(batch.batch["prompts"], skip_special_tokens=True)
                         outputs = self.tokenizer.batch_decode(batch.batch["responses"], skip_special_tokens=True)
                         scores = batch.batch["token_level_scores"].sum(-1).cpu().tolist()
+                        if "request_id" in batch.non_tensor_batch:
+                            reward_extra_infos_dict.setdefault(
+                                "request_id",
+                                batch.non_tensor_batch["request_id"].tolist(),
+                            )
                         self._dump_generations(
                             inputs=inputs,
                             outputs=outputs,
@@ -532,97 +516,80 @@ def fit(self):
                             dump_path=rollout_data_dir,
                         )
 
-                # 验证
+                # validate
                 if (
-                    self.val_reward_fn is not None
-                    and self.config.trainer.test_freq > 0
-                    and (is_last_step or self.global_steps % self.config.trainer.test_freq == 0)
+                        self.val_reward_fn is not None
+                        and self.config.trainer.test_freq > 0
+                        and (is_last_step or self.global_steps % self.config.trainer.test_freq == 0)
                 ):
                     with marked_timer("testing", timing_raw, color="green"):
-                        val_metrics = self._validate()
+                        val_metrics: dict = self._validate()
                         if is_last_step:
                             last_val_metrics = val_metrics
-                            print(last_val_metrics)
-                    if val_metrics:
-                        metrics.update(val_metrics)
-
-                # 保存检查点
+                    metrics.update(val_metrics)
+
+                # Check if the ESI (Elastic Server Instance)/training plan is close to expiration.
+                esi_close_to_expiration = should_save_ckpt_esi(
+                    max_steps_duration=self.max_steps_duration,
+                    redundant_time=self.config.trainer.esi_redundant_time,
+                )
+                # Check if the conditions for saving a checkpoint are met.
+                # The conditions include a mandatory condition (1) and
+                # one of the following optional conditions (2/3/4):
+                # 1. The save frequency is set to a positive value.
+                # 2. It's the last training step.
+                # 3. The current step number is a multiple of the save frequency.
+                # 4. The ESI(Elastic Server Instance)/training plan is close to expiration. q
                 if self.config.trainer.save_freq > 0 and (
-                    is_last_step or self.global_steps % self.config.trainer.save_freq == 0
+                        is_last_step
+                        or self.global_steps % self.config.trainer.save_freq == 0
+                        or esi_close_to_expiration
                 ):
+                    if esi_close_to_expiration:
+                        print("Force saving checkpoint: ESI instance expiration approaching.")
                     with marked_timer("save_checkpoint", timing_raw, color="green"):
                         self._save_checkpoint()
 
-            # 收集指标 - 参考OneStepOffRayTrainer的指标收集
+            with marked_timer("stop_profile", timing_raw):
+                self._stop_profiling(do_profile)
+
+            steps_duration = timing_raw["step"]
+            self.max_steps_duration = max(self.max_steps_duration, steps_duration)
+
+            # training metrics
             metrics.update(
                 {
                     "training/global_step": self.global_steps,
-                    "training/param_version": self.current_param_version,
-                    "training/param_sync_count": self.param_sync_count,
+                    "training/epoch": epoch,
                 }
             )
-
-            # 数据和性能指标
+            # collect metrics
             metrics.update(compute_data_metrics(batch=batch, use_critic=self.use_critic))
             metrics.update(compute_timing_metrics(batch=batch, timing_raw=timing_raw))
-
+            # TODO: implement actual tflpo and theoretical tflpo
             n_gpus = self.resource_pool_manager.get_n_gpus()
             metrics.update(compute_throughout_metrics(batch=batch, timing_raw=timing_raw, n_gpus=n_gpus))
 
-            # 队列状态指标
-            queue_size = self.message_queue_client.get_queue_size()
-            queue_stats = self.message_queue_client.get_statistics()
-            metrics.update(
-                {
-                    "queue/size": queue_size,
-                    "queue/total_produced": queue_stats["total_produced"],
-                    "queue/total_consumed": queue_stats["total_consumed"],
-                    "queue/dropped_samples": queue_stats["dropped_samples"],
-                }
-            )
+            # this is experimental and may be changed/removed in the future in favor of a general-purpose one
+            if isinstance(self.train_dataloader.sampler, AbstractCurriculumSampler):
+                self.train_dataloader.sampler.update(batch=batch)
 
-            # 记录日志
-            logger_tracker.log(data=metrics, step=self.global_steps)
+            # TODO: make a canonical logger that supports various backend
+            logger.log(data=metrics, step=self.global_steps)
 
-            # 更新进度条
             progress_bar.update(1)
-            progress_bar.set_postfix(
-                {
-                    "reward": f"{metrics.get('reward/mean', 0):.3f}",
-                    "kl": f"{metrics.get('actor/approx_kl', 0):.3f}",
-                    "queue_size": queue_size,
-                    "param_ver": self.current_param_version,
-                    "avg_age": f"{metrics.get('freshness/avg_sample_age', 0):.1f}",
-                }
-            )
-
-            if do_profile:
-                self.actor_wg.stop_profile()
-                if self.use_reference_policy and not self.ref_in_actor:
-                    self.ref_policy_wg.stop_profile()
-                if self.use_critic:
-                    self.critic_wg.stop_profile()
-                if self.use_rm:
-                    self.rm_wg.stop_profile()
-
             self.global_steps += 1
-            self.processed_samples += len(batch_samples)
 
             if is_last_step:
-                break
-
-        progress_bar.close()
-        logger.info(f"Training completed after {self.global_steps} steps")
-
-        # 最终验证
-        if self.val_reward_fn is not None:
-            val_metrics = self._validate()
-            if val_metrics:
-                pprint(f"Final validation metrics: {val_metrics}")
-                logger_tracker.log(data=val_metrics, step=self.global_steps)
+                pprint(f"Final validation metrics: {last_val_metrics}")
+                progress_bar.close()
+                return
 
-        # 最终检查点保存
-        self._save_checkpoint()
+            # this is experimental and may be changed/removed in the future
+            # in favor of a general-purpose data buffer pool
+            if hasattr(self.train_dataset, "on_batch_end"):
+                # The dataset may be changed after each training batch
+                self.train_dataset.on_batch_end(batch=batch)
 
     def get_statistics(self) -> dict:
         """获取训练统计信息"""
diff --git a/recipe/one_step_off_policy/ray_trainer.py b/recipe/one_step_off_policy/ray_trainer.py
index 1f7011bdf54..127ed2d0c24 100644
--- a/recipe/one_step_off_policy/ray_trainer.py
+++ b/recipe/one_step_off_policy/ray_trainer.py
@@ -105,7 +105,7 @@ def __init__(
         val_dataset: Dataset | None = None,
         collate_fn=None,
         train_sampler: Sampler | None = None,
-        device_name="cuda",
+        device_name=None,
     ):
         """
         Initialize distributed PPO trainer with Ray backend.
@@ -143,8 +143,11 @@ def __init__(
         self.use_reference_policy = Role.RefPolicy in role_worker_mapping
         self.use_rm = Role.RewardModel in role_worker_mapping
         self.ray_worker_group_cls = ray_worker_group_cls
-        self.device_name = device_name
-        self.validation_generations_logger = ValidationGenerationsLogger()
+        self.device_name = device_name if device_name else self.config.trainer.device
+        self.validation_generations_logger = ValidationGenerationsLogger(
+            project_name=self.config.trainer.project_name,
+            experiment_name=self.config.trainer.experiment_name,
+        )
 
         # if ref_in_actor is True, the reference policy will be actor without lora applied
         self.ref_in_actor = config.actor_rollout_ref.model.get("lora_rank", 0) > 0
@@ -286,7 +289,7 @@ def init_workers(self):
 
         # create async rollout manager and request scheduler
         self.async_rollout_mode = False
-        if self.config.actor_rollout_ref.rollout.mode == "async" and self._is_rollout:
+        if self.config.actor_rollout_ref.rollout.mode == "async":
             from verl.workers.rollout.async_server import AsyncLLMServerManager
 
             self.async_rollout_mode = True

From a5ee455ecaad386f3288108da4f174cdf06b6e7f Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Thu, 31 Jul 2025 21:09:28 +0800
Subject: [PATCH 014/182] train

---
 recipe/one_step_off_policy/main_ppo.py    |   2 +-
 recipe/one_step_off_policy/ray_trainer.py | 278 ++----------
 verl/trainer/ppo/ray_trainer.py           | 527 +++++++++++-----------
 3 files changed, 295 insertions(+), 512 deletions(-)

diff --git a/recipe/one_step_off_policy/main_ppo.py b/recipe/one_step_off_policy/main_ppo.py
index d6072c5521e..0a037df17fa 100644
--- a/recipe/one_step_off_policy/main_ppo.py
+++ b/recipe/one_step_off_policy/main_ppo.py
@@ -188,7 +188,7 @@ def run(self, config):
 def main(config):
     from verl.trainer.main_ppo import run_ppo
 
-    run_ppo(config)
+    run_ppo(config, OneStepOffTaskRunner)
 
 
 if __name__ == "__main__":
diff --git a/recipe/one_step_off_policy/ray_trainer.py b/recipe/one_step_off_policy/ray_trainer.py
index 127ed2d0c24..c1687561d01 100644
--- a/recipe/one_step_off_policy/ray_trainer.py
+++ b/recipe/one_step_off_policy/ray_trainer.py
@@ -18,40 +18,25 @@
 This trainer supports model-agonistic model initialization with huggingface
 """
 
-import uuid
+import warnings
 from pprint import pprint
 
-import numpy as np
 import ray
-import torch
 from omegaconf import OmegaConf
 from torch.utils.data import Dataset, Sampler
 from tqdm import tqdm
 
-from verl import DataProto
 from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup
 from verl.single_controller.ray.base import create_colocated_worker_cls
 from verl.trainer.ppo import core_algos
-from verl.trainer.ppo.core_algos import AdvantageEstimator, agg_loss
-from verl.trainer.ppo.metric_utils import (
-    compute_data_metrics,
-    compute_throughout_metrics,
-    compute_timing_metrics,
-)
+from verl.trainer.ppo.core_algos import AdvantageEstimator
 from verl.trainer.ppo.ray_trainer import (
     RayPPOTrainer,
     ResourcePoolManager,
     Role,
     WorkerType,
-    apply_kl_penalty,
-    compute_advantage,
-    compute_response_mask,
 )
-from verl.trainer.ppo.reward import compute_reward, compute_reward_async
 from verl.utils.debug import marked_timer
-from verl.utils.metric import (
-    reduce_metrics,
-)
 from verl.utils.tracking import ValidationGenerationsLogger
 
 
@@ -154,24 +139,20 @@ def __init__(
 
         # define in-reward KL control
         # kl loss control currently not suppoorted
-        if config.algorithm.use_kl_in_reward:
-            self.kl_ctrl_in_reward = core_algos.get_kl_controller(config.algorithm.kl_ctrl)
+        if self.config.algorithm.use_kl_in_reward:
+            self.kl_ctrl_in_reward = core_algos.get_kl_controller(self.config.algorithm.kl_ctrl)
 
-        if self.config.algorithm.adv_estimator == AdvantageEstimator.GAE:
+        if config.critic.enable is not None:
+            self.use_critic = bool(config.critic.enable)
+        elif self.config.algorithm.adv_estimator == AdvantageEstimator.GAE:
             self.use_critic = True
-        elif self.config.algorithm.adv_estimator in [
-            AdvantageEstimator.GRPO,
-            AdvantageEstimator.GRPO_PASSK,
-            AdvantageEstimator.REINFORCE_PLUS_PLUS,
-            # AdvantageEstimator.REMAX, # TODO:REMAX advantage estimator is not yet supported in one_step_off_policy
-            AdvantageEstimator.RLOO,
-            AdvantageEstimator.OPO,
-            AdvantageEstimator.REINFORCE_PLUS_PLUS_BASELINE,
-            AdvantageEstimator.GPG,
-        ]:
-            self.use_critic = False
         else:
-            raise NotImplementedError
+            warnings.warn(
+                "Disabled critic as algorithm.adv_estimator != gae. "
+                "If it is not intended, please set critic.enable=True",
+                stacklevel=2,
+            )
+            self.use_critic = False
 
         self._validate_config()
         self._create_dataloader(train_dataset, val_dataset, collate_fn, train_sampler)
@@ -323,23 +304,7 @@ def _async_gen_next_batch(self, continuous_iterator):
         except Exception as e:
             print(f"Error in async_gen_next_batch: {e}")
             return None
-        batch = DataProto.from_single_dict(batch_dict)
-        # pop those keys for generation
-        batch_keys_to_pop = ["input_ids", "attention_mask", "position_ids"]
-        non_tensor_batch_keys_to_pop = ["raw_prompt_ids"]
-        if "multi_modal_data" in batch.non_tensor_batch:
-            non_tensor_batch_keys_to_pop.append("multi_modal_data")
-        if "raw_prompt" in batch.non_tensor_batch:
-            non_tensor_batch_keys_to_pop.append("raw_prompt")
-        if "tools_kwargs" in batch.non_tensor_batch:
-            non_tensor_batch_keys_to_pop.append("tools_kwargs")
-        if "interaction_kwargs" in batch.non_tensor_batch:
-            non_tensor_batch_keys_to_pop.append("interaction_kwargs")
-        gen_batch = batch.pop(
-            batch_keys=batch_keys_to_pop,
-            non_tensor_batch_keys=non_tensor_batch_keys_to_pop,
-        )
-        gen_batch = gen_batch.repeat(repeat_times=self.config.actor_rollout_ref.rollout.n, interleave=True)
+        batch, gen_batch = self._prepare_generate_batch(batch_dict)
         # sync weights from actor to rollout
         self.sync_rollout_weights()
         # async generation
@@ -385,6 +350,7 @@ def fit(self):
         # we start from step 1
         self.global_steps += 1
         last_val_metrics = None
+        self.max_steps_duration = 0
 
         # across epoch iterator
         continuous_iterator = self._create_continuous_iterator()
@@ -393,24 +359,16 @@ def fit(self):
         batch_data_future = self._async_gen_next_batch(continuous_iterator)
 
         while batch_data_future is not None:
+            metrics = {}
+            timing_raw = {}
+
             do_profile = (
                 self.global_steps in self.config.trainer.profile_steps
                 if self.config.trainer.profile_steps is not None
                 else False
             )
-            if do_profile:
-                self.actor_wg.start_profile()
-                if not self.hybrid_engine:
-                    self.rollout_wg.start_profile()
-                if self.use_reference_policy:
-                    self.ref_policy_wg.start_profile()
-                if self.use_critic:
-                    self.critic_wg.start_profile()
-                if self.use_rm:
-                    self.rm_wg.start_profile()
+            self._start_profiling(do_profile, timing_raw)
 
-            metrics = {}
-            timing_raw = {}
             is_last_step = self.global_steps >= self.total_training_steps
 
             with marked_timer("step", timing_raw):
@@ -425,184 +383,15 @@ def fit(self):
                     if not is_last_step:
                         batch_data_future = self._async_gen_next_batch(continuous_iterator)
 
-                batch.non_tensor_batch["uid"] = np.array(
-                    [str(uuid.uuid4()) for _ in range(len(batch.batch))], dtype=object
-                )
-                # repeat to align with repeated responses in rollout
-                batch = batch.repeat(repeat_times=self.config.actor_rollout_ref.rollout.n, interleave=True)
-                batch = batch.union(gen_batch_output)
-
-                batch.batch["response_mask"] = compute_response_mask(batch)
-                # Balance the number of valid tokens across DP ranks.
-                # NOTE: This usually changes the order of data in the `batch`,
-                # which won't affect the advantage calculation (since it's based on uid),
-                # but might affect the loss calculation (due to the change of mini-batching).
-                # TODO: Decouple the DP balancing and mini-batching.
-                if self.config.trainer.balance_batch:
-                    self._balance_batch(batch, metrics=metrics)
-
-                # compute global_valid tokens
-                batch.meta_info["global_token_num"] = torch.sum(batch.batch["attention_mask"], dim=-1).tolist()
-
-                with marked_timer("reward", timing_raw, color="yellow"):
-                    # compute reward model score
-                    if self.use_rm:
-                        reward_tensor = self.rm_wg.compute_rm_score(batch)
-                        batch = batch.union(reward_tensor)
-
-                    if self.config.reward_model.launch_reward_fn_async:
-                        future_reward = compute_reward_async.remote(batch, self.config, self.tokenizer)
-                    else:
-                        reward_tensor, reward_extra_infos_dict = compute_reward(batch, self.reward_fn)
-
-                # recompute old_log_probs
-                with marked_timer("old_log_prob", timing_raw, color="blue"):
-                    old_log_prob = self.actor_wg.compute_log_prob(batch)
-                    entropys = old_log_prob.batch["entropys"]
-                    response_masks = batch.batch["response_mask"]
-                    loss_agg_mode = self.config.actor_rollout_ref.actor.loss_agg_mode
-                    entropy_agg = agg_loss(loss_mat=entropys, loss_mask=response_masks, loss_agg_mode=loss_agg_mode)
-                    old_log_prob_metrics = {"actor/entropy": entropy_agg.detach().item()}
-                    metrics.update(old_log_prob_metrics)
-                    old_log_prob.batch.pop("entropys")
-                    batch = batch.union(old_log_prob)
-
-                    if "rollout_log_probs" in batch.batch.keys():
-                        # TODO: we may want to add diff of probs too.
-                        rollout_old_log_probs = batch.batch["rollout_log_probs"]
-                        actor_old_log_probs = batch.batch["old_log_probs"]
-                        attention_mask = batch.batch["attention_mask"]
-                        responses = batch.batch["responses"]
-                        response_length = responses.size(1)
-                        response_mask = attention_mask[:, -response_length:]
-
-                        rollout_probs = torch.exp(rollout_old_log_probs)
-                        actor_probs = torch.exp(actor_old_log_probs)
-                        rollout_probs_diff = torch.abs(rollout_probs - actor_probs)
-                        rollout_probs_diff = torch.masked_select(rollout_probs_diff, response_mask.bool())
-                        rollout_probs_diff_max = torch.max(rollout_probs_diff)
-                        rollout_probs_diff_mean = torch.mean(rollout_probs_diff)
-                        rollout_probs_diff_std = torch.std(rollout_probs_diff)
-                        metrics.update(
-                            {
-                                "training/rollout_probs_diff_max": rollout_probs_diff_max.detach().item(),
-                                "training/rollout_probs_diff_mean": rollout_probs_diff_mean.detach().item(),
-                                "training/rollout_probs_diff_std": rollout_probs_diff_std.detach().item(),
-                            }
-                        )
-
-                if self.use_reference_policy:
-                    # compute reference log_prob
-                    with marked_timer("ref", timing_raw, color="olive"):
-                        if not self.ref_in_actor:
-                            ref_log_prob = self.ref_policy_wg.compute_ref_log_prob(batch)
-                        else:
-                            ref_log_prob = self.actor_wg.compute_ref_log_prob(batch)
-                        batch = batch.union(ref_log_prob)
-
-                # compute values
-                if self.use_critic:
-                    with marked_timer("values", timing_raw, color="cyan"):
-                        values = self.critic_wg.compute_values(batch)
-                        batch = batch.union(values)
-
-                with marked_timer("adv", timing_raw, color="brown"):
-                    # we combine with rule-based rm
-                    reward_extra_infos_dict: dict[str, list]
-                    if self.config.reward_model.launch_reward_fn_async:
-                        reward_tensor, reward_extra_infos_dict = ray.get(future_reward)
-                    batch.batch["token_level_scores"] = reward_tensor
-
-                    if reward_extra_infos_dict:
-                        batch.non_tensor_batch.update({k: np.array(v) for k, v in reward_extra_infos_dict.items()})
-
-                    # compute rewards. apply_kl_penalty if available
-                    if self.config.algorithm.use_kl_in_reward:
-                        batch, kl_metrics = apply_kl_penalty(
-                            batch, kl_ctrl=self.kl_ctrl_in_reward, kl_penalty=self.config.algorithm.kl_penalty
-                        )
-                        metrics.update(kl_metrics)
-                    else:
-                        batch.batch["token_level_rewards"] = batch.batch["token_level_scores"]
-
-                    # compute advantages, executed on the driver process
-
-                    norm_adv_by_std_in_grpo = self.config.algorithm.get(
-                        "norm_adv_by_std_in_grpo", True
-                    )  # GRPO adv normalization factor
-
-                    batch = compute_advantage(
-                        batch,
-                        adv_estimator=self.config.algorithm.adv_estimator,
-                        gamma=self.config.algorithm.gamma,
-                        lam=self.config.algorithm.lam,
-                        num_repeat=self.config.actor_rollout_ref.rollout.n,
-                        norm_adv_by_std_in_grpo=norm_adv_by_std_in_grpo,
-                        config=self.config.algorithm,
-                    )
-
-                # update critic
-                if self.use_critic:
-                    with marked_timer("update_critic", timing_raw, color="pink"):
-                        critic_output = self.critic_wg.update_critic(batch)
-                    critic_output_metrics = reduce_metrics(critic_output.meta_info["metrics"])
-                    metrics.update(critic_output_metrics)
-
-                # implement critic warmup
-                if self.config.trainer.critic_warmup <= self.global_steps:
-                    # update actor
-                    with marked_timer("update_actor", timing_raw, color="red"):
-                        batch.meta_info["multi_turn"] = self.config.actor_rollout_ref.rollout.multi_turn.enable
-                        actor_output = self.actor_wg.update_actor(batch)
-                    actor_output_metrics = reduce_metrics(actor_output.meta_info["metrics"])
-                    metrics.update(actor_output_metrics)
-
-                # Log rollout generations if enabled
-                rollout_data_dir = self.config.trainer.get("rollout_data_dir", None)
-                if rollout_data_dir:
-                    with marked_timer("dump_rollout_generations", timing_raw, color="green"):
-                        inputs = self.tokenizer.batch_decode(batch.batch["prompts"], skip_special_tokens=True)
-                        outputs = self.tokenizer.batch_decode(batch.batch["responses"], skip_special_tokens=True)
-                        scores = batch.batch["token_level_scores"].sum(-1).cpu().tolist()
-                        self._dump_generations(
-                            inputs=inputs,
-                            outputs=outputs,
-                            scores=scores,
-                            reward_extra_infos_dict=reward_extra_infos_dict,
-                            dump_path=rollout_data_dir,
-                        )
-
-                # validate
-                if (
-                    self.val_reward_fn is not None
-                    and self.config.trainer.test_freq > 0
-                    and (is_last_step or self.global_steps % self.config.trainer.test_freq == 0)
-                ):
-                    with marked_timer("testing", timing_raw, color="green"):
-                        val_metrics: dict = self._validate()
-                        if is_last_step:
-                            last_val_metrics = val_metrics
-                    metrics.update(val_metrics)
-
-                if self.config.trainer.save_freq > 0 and (
-                    is_last_step or self.global_steps % self.config.trainer.save_freq == 0
-                ):
-                    with marked_timer("save_checkpoint", timing_raw, color="green"):
-                        self._save_checkpoint()
-
-            # training metrics
-            metrics.update(
-                {
-                    "training/global_step": self.global_steps,
-                    "training/epoch": epoch,
-                }
-            )
-            # collect metrics
-            metrics.update(compute_data_metrics(batch=batch, use_critic=self.use_critic))
-            metrics.update(compute_timing_metrics(batch=batch, timing_raw=timing_raw))
-            # TODO: implement actual tflpo and theoretical tflpo
-            n_gpus = self.resource_pool_manager.get_n_gpus()
-            metrics.update(compute_throughout_metrics(batch=batch, timing_raw=timing_raw, n_gpus=n_gpus))
+                batch = self._post_generate_batch(batch, gen_batch_output, metrics)
+                batch, reward_extra_infos_dict = self._process_batch_common(batch, metrics, timing_raw)
+                self._log_rollout(batch, reward_extra_infos_dict, timing_raw)
+                last_val_metrics = self._validate_metrics(is_last_step, last_val_metrics, metrics, timing_raw)
+                self._check_save_checkpoint(is_last_step, timing_raw)
+
+            self._stop_profiling(do_profile, timing_raw)
+            self._collect_metrics(batch, epoch, metrics, timing_raw)
+            self._post_batch_processing(batch)
 
             # TODO: make a canonical logger that supports various backend
             logger.log(data=metrics, step=self.global_steps)
@@ -610,17 +399,6 @@ def fit(self):
             progress_bar.update(1)
             self.global_steps += 1
 
-            if do_profile:
-                self.actor_wg.stop_profile()
-                if not self.hybrid_engine:
-                    self.rollout_wg.stop_profile()
-                if self.use_reference_policy:
-                    self.ref_policy_wg.stop_profile()
-                if self.use_critic:
-                    self.critic_wg.stop_profile()
-                if self.use_rm:
-                    self.rm_wg.stop_profile()
-
             if is_last_step:
                 pprint(f"Final validation metrics: {last_val_metrics}")
                 progress_bar.close()
diff --git a/verl/trainer/ppo/ray_trainer.py b/verl/trainer/ppo/ray_trainer.py
index 6a82a4bcf2b..49334db6bcd 100644
--- a/verl/trainer/ppo/ray_trainer.py
+++ b/verl/trainer/ppo/ray_trainer.py
@@ -989,27 +989,29 @@ def _load_checkpoint(self):
         else:
             print(f"Warning: No dataloader state found at {dataloader_local_path}, will start from scratch")
 
-    def _start_profiling(self, do_profile: bool) -> None:
+    def _start_profiling(self, do_profile: bool, timing_raw) -> None:
         """Start profiling for all worker groups if profiling is enabled."""
-        if do_profile:
-            self.actor_rollout_wg.start_profile(role="e2e", profile_step=self.global_steps)
-            if self.use_reference_policy:
-                self.ref_policy_wg.start_profile()
-            if self.use_critic:
-                self.critic_wg.start_profile()
-            if self.use_rm:
-                self.rm_wg.start_profile()
-
-    def _stop_profiling(self, do_profile: bool) -> None:
+        with marked_timer("start_profile", timing_raw):
+            if do_profile:
+                self.actor_rollout_wg.start_profile(role="e2e", profile_step=self.global_steps)
+                if self.use_reference_policy:
+                    self.ref_policy_wg.start_profile()
+                if self.use_critic:
+                    self.critic_wg.start_profile()
+                if self.use_rm:
+                    self.rm_wg.start_profile()
+
+    def _stop_profiling(self, do_profile: bool, timing_raw) -> None:
         """Stop profiling for all worker groups if profiling is enabled."""
-        if do_profile:
-            self.actor_rollout_wg.stop_profile()
-            if self.use_reference_policy:
-                self.ref_policy_wg.stop_profile()
-            if self.use_critic:
-                self.critic_wg.stop_profile()
-            if self.use_rm:
-                self.rm_wg.stop_profile()
+        with marked_timer("stop_profile", timing_raw):
+            if do_profile:
+                self.actor_rollout_wg.stop_profile()
+                if self.use_reference_policy:
+                    self.ref_policy_wg.stop_profile()
+                if self.use_critic:
+                    self.critic_wg.stop_profile()
+                if self.use_rm:
+                    self.rm_wg.stop_profile()
 
     def _balance_batch(self, batch: DataProto, metrics, logging_prefix="global_seqlen"):
         """Reorder the data on single controller such that each dp rank gets similar total tokens"""
@@ -1079,35 +1081,9 @@ def fit(self):
                     if self.config.trainer.profile_steps is not None
                     else False
                 )
-                with marked_timer("start_profile", timing_raw):
-                    self._start_profiling(do_profile)
-
-                batch: DataProto = DataProto.from_single_dict(batch_dict)
-
-                # pop those keys for generation
-                batch_keys_to_pop = ["input_ids", "attention_mask", "position_ids"]
-                non_tensor_batch_keys_to_pop = ["raw_prompt_ids"]
-                if "multi_modal_data" in batch.non_tensor_batch:
-                    non_tensor_batch_keys_to_pop.append("multi_modal_data")
-                if "raw_prompt" in batch.non_tensor_batch:
-                    non_tensor_batch_keys_to_pop.append("raw_prompt")
-                if "tools_kwargs" in batch.non_tensor_batch:
-                    non_tensor_batch_keys_to_pop.append("tools_kwargs")
-                if "interaction_kwargs" in batch.non_tensor_batch:
-                    non_tensor_batch_keys_to_pop.append("interaction_kwargs")
-                if "index" in batch.non_tensor_batch:
-                    non_tensor_batch_keys_to_pop.append("index")
-                if "agent_name" in batch.non_tensor_batch:
-                    non_tensor_batch_keys_to_pop.append("agent_name")
-
-                gen_batch = batch.pop(
-                    batch_keys=batch_keys_to_pop,
-                    non_tensor_batch_keys=non_tensor_batch_keys_to_pop,
-                )
+                self._start_profiling(do_profile, timing_raw)
 
-                # pass global_steps to trace
-                gen_batch.meta_info["global_steps"] = self.global_steps
-                gen_batch = gen_batch.repeat(repeat_times=self.config.actor_rollout_ref.rollout.n, interleave=True)
+                batch, gen_batch = self._prepare_generate_batch(batch_dict)
 
                 is_last_step = self.global_steps >= self.total_training_steps
 
@@ -1139,216 +1115,15 @@ def fit(self):
 
                             del gen_baseline_batch, gen_baseline_output
 
-                    batch.non_tensor_batch["uid"] = np.array(
-                        [str(uuid.uuid4()) for _ in range(len(batch.batch))], dtype=object
-                    )
-                    # repeat to align with repeated responses in rollout
-                    batch = batch.repeat(repeat_times=self.config.actor_rollout_ref.rollout.n, interleave=True)
-                    batch = batch.union(gen_batch_output)
-
-                    if "response_mask" not in batch.batch.keys():
-                        batch.batch["response_mask"] = compute_response_mask(batch)
-                    # Balance the number of valid tokens across DP ranks.
-                    # NOTE: This usually changes the order of data in the `batch`,
-                    # which won't affect the advantage calculation (since it's based on uid),
-                    # but might affect the loss calculation (due to the change of mini-batching).
-                    # TODO: Decouple the DP balancing and mini-batching.
-                    if self.config.trainer.balance_batch:
-                        self._balance_batch(batch, metrics=metrics)
-
-                    # compute global_valid tokens
-                    batch.meta_info["global_token_num"] = torch.sum(batch.batch["attention_mask"], dim=-1).tolist()
-
-                    with marked_timer("reward", timing_raw, color="yellow"):
-                        # compute reward model score
-                        if self.use_rm:
-                            reward_tensor = self.rm_wg.compute_rm_score(batch)
-                            batch = batch.union(reward_tensor)
-
-                        if self.config.reward_model.launch_reward_fn_async:
-                            future_reward = compute_reward_async.remote(data=batch, reward_fn=self.reward_fn)
-                        else:
-                            reward_tensor, reward_extra_infos_dict = compute_reward(batch, self.reward_fn)
-
-                    # recompute old_log_probs
-                    with marked_timer("old_log_prob", timing_raw, color="blue"):
-                        old_log_prob = self.actor_rollout_wg.compute_log_prob(batch)
-                        entropys = old_log_prob.batch["entropys"]
-                        response_masks = batch.batch["response_mask"]
-                        loss_agg_mode = self.config.actor_rollout_ref.actor.loss_agg_mode
-                        entropy_agg = agg_loss(loss_mat=entropys, loss_mask=response_masks, loss_agg_mode=loss_agg_mode)
-                        old_log_prob_metrics = {"actor/entropy": entropy_agg.detach().item()}
-                        metrics.update(old_log_prob_metrics)
-                        old_log_prob.batch.pop("entropys")
-                        batch = batch.union(old_log_prob)
-
-                        if "rollout_log_probs" in batch.batch.keys():
-                            # TODO: we may want to add diff of probs too.
-                            rollout_old_log_probs = batch.batch["rollout_log_probs"]
-                            actor_old_log_probs = batch.batch["old_log_probs"]
-                            attention_mask = batch.batch["attention_mask"]
-                            responses = batch.batch["responses"]
-                            response_length = responses.size(1)
-                            response_mask = attention_mask[:, -response_length:]
-
-                            rollout_probs = torch.exp(rollout_old_log_probs)
-                            actor_probs = torch.exp(actor_old_log_probs)
-                            rollout_probs_diff = torch.abs(rollout_probs - actor_probs)
-                            rollout_probs_diff = torch.masked_select(rollout_probs_diff, response_mask.bool())
-                            rollout_probs_diff_max = torch.max(rollout_probs_diff)
-                            rollout_probs_diff_mean = torch.mean(rollout_probs_diff)
-                            rollout_probs_diff_std = torch.std(rollout_probs_diff)
-                            metrics.update(
-                                {
-                                    "training/rollout_probs_diff_max": rollout_probs_diff_max.detach().item(),
-                                    "training/rollout_probs_diff_mean": rollout_probs_diff_mean.detach().item(),
-                                    "training/rollout_probs_diff_std": rollout_probs_diff_std.detach().item(),
-                                }
-                            )
-
-                    if self.use_reference_policy:
-                        # compute reference log_prob
-                        with marked_timer("ref", timing_raw, color="olive"):
-                            if not self.ref_in_actor:
-                                ref_log_prob = self.ref_policy_wg.compute_ref_log_prob(batch)
-                            else:
-                                ref_log_prob = self.actor_rollout_wg.compute_ref_log_prob(batch)
-                            batch = batch.union(ref_log_prob)
-
-                    # compute values
-                    if self.use_critic:
-                        with marked_timer("values", timing_raw, color="cyan"):
-                            values = self.critic_wg.compute_values(batch)
-                            batch = batch.union(values)
-
-                    with marked_timer("adv", timing_raw, color="brown"):
-                        # we combine with rule-based rm
-                        reward_extra_infos_dict: dict[str, list]
-                        if self.config.reward_model.launch_reward_fn_async:
-                            reward_tensor, reward_extra_infos_dict = ray.get(future_reward)
-                        batch.batch["token_level_scores"] = reward_tensor
-
-                        if reward_extra_infos_dict:
-                            batch.non_tensor_batch.update({k: np.array(v) for k, v in reward_extra_infos_dict.items()})
-
-                        # compute rewards. apply_kl_penalty if available
-                        if self.config.algorithm.use_kl_in_reward:
-                            batch, kl_metrics = apply_kl_penalty(
-                                batch, kl_ctrl=self.kl_ctrl_in_reward, kl_penalty=self.config.algorithm.kl_penalty
-                            )
-                            metrics.update(kl_metrics)
-                        else:
-                            batch.batch["token_level_rewards"] = batch.batch["token_level_scores"]
-
-                        # compute advantages, executed on the driver process
-
-                        norm_adv_by_std_in_grpo = self.config.algorithm.get(
-                            "norm_adv_by_std_in_grpo", True
-                        )  # GRPO adv normalization factor
-
-                        batch = compute_advantage(
-                            batch,
-                            adv_estimator=self.config.algorithm.adv_estimator,
-                            gamma=self.config.algorithm.gamma,
-                            lam=self.config.algorithm.lam,
-                            num_repeat=self.config.actor_rollout_ref.rollout.n,
-                            norm_adv_by_std_in_grpo=norm_adv_by_std_in_grpo,
-                            config=self.config.algorithm,
-                        )
-
-                    # update critic
-                    if self.use_critic:
-                        with marked_timer("update_critic", timing_raw, color="pink"):
-                            critic_output = self.critic_wg.update_critic(batch)
-                        critic_output_metrics = reduce_metrics(critic_output.meta_info["metrics"])
-                        metrics.update(critic_output_metrics)
-
-                    # implement critic warmup
-                    if self.config.trainer.critic_warmup <= self.global_steps:
-                        # update actor
-                        with marked_timer("update_actor", timing_raw, color="red"):
-                            batch.meta_info["multi_turn"] = self.config.actor_rollout_ref.rollout.multi_turn.enable
-                            actor_output = self.actor_rollout_wg.update_actor(batch)
-                        actor_output_metrics = reduce_metrics(actor_output.meta_info["metrics"])
-                        metrics.update(actor_output_metrics)
-
-                    # Log rollout generations if enabled
-                    rollout_data_dir = self.config.trainer.get("rollout_data_dir", None)
-                    if rollout_data_dir:
-                        with marked_timer("dump_rollout_generations", timing_raw, color="green"):
-                            inputs = self.tokenizer.batch_decode(batch.batch["prompts"], skip_special_tokens=True)
-                            outputs = self.tokenizer.batch_decode(batch.batch["responses"], skip_special_tokens=True)
-                            scores = batch.batch["token_level_scores"].sum(-1).cpu().tolist()
-                            if "request_id" in batch.non_tensor_batch:
-                                reward_extra_infos_dict.setdefault(
-                                    "request_id",
-                                    batch.non_tensor_batch["request_id"].tolist(),
-                                )
-                            self._dump_generations(
-                                inputs=inputs,
-                                outputs=outputs,
-                                scores=scores,
-                                reward_extra_infos_dict=reward_extra_infos_dict,
-                                dump_path=rollout_data_dir,
-                            )
-
-                    # validate
-                    if (
-                        self.val_reward_fn is not None
-                        and self.config.trainer.test_freq > 0
-                        and (is_last_step or self.global_steps % self.config.trainer.test_freq == 0)
-                    ):
-                        with marked_timer("testing", timing_raw, color="green"):
-                            val_metrics: dict = self._validate()
-                            if is_last_step:
-                                last_val_metrics = val_metrics
-                        metrics.update(val_metrics)
-
-                    # Check if the ESI (Elastic Server Instance)/training plan is close to expiration.
-                    esi_close_to_expiration = should_save_ckpt_esi(
-                        max_steps_duration=self.max_steps_duration,
-                        redundant_time=self.config.trainer.esi_redundant_time,
-                    )
-                    # Check if the conditions for saving a checkpoint are met.
-                    # The conditions include a mandatory condition (1) and
-                    # one of the following optional conditions (2/3/4):
-                    # 1. The save frequency is set to a positive value.
-                    # 2. It's the last training step.
-                    # 3. The current step number is a multiple of the save frequency.
-                    # 4. The ESI(Elastic Server Instance)/training plan is close to expiration.
-                    if self.config.trainer.save_freq > 0 and (
-                        is_last_step
-                        or self.global_steps % self.config.trainer.save_freq == 0
-                        or esi_close_to_expiration
-                    ):
-                        if esi_close_to_expiration:
-                            print("Force saving checkpoint: ESI instance expiration approaching.")
-                        with marked_timer("save_checkpoint", timing_raw, color="green"):
-                            self._save_checkpoint()
-
-                with marked_timer("stop_profile", timing_raw):
-                    self._stop_profiling(do_profile)
-
-                steps_duration = timing_raw["step"]
-                self.max_steps_duration = max(self.max_steps_duration, steps_duration)
-
-                # training metrics
-                metrics.update(
-                    {
-                        "training/global_step": self.global_steps,
-                        "training/epoch": epoch,
-                    }
-                )
-                # collect metrics
-                metrics.update(compute_data_metrics(batch=batch, use_critic=self.use_critic))
-                metrics.update(compute_timing_metrics(batch=batch, timing_raw=timing_raw))
-                # TODO: implement actual tflpo and theoretical tflpo
-                n_gpus = self.resource_pool_manager.get_n_gpus()
-                metrics.update(compute_throughout_metrics(batch=batch, timing_raw=timing_raw, n_gpus=n_gpus))
+                    batch = self._post_generate_batch(batch, gen_batch_output, metrics)
+                    batch, reward_extra_infos_dict = self._process_batch_common(batch, metrics, timing_raw)
+                    self._log_rollout(batch, reward_extra_infos_dict, timing_raw)
+                    last_val_metrics = self._validate_metrics(is_last_step, last_val_metrics, metrics, timing_raw)
+                    self._check_save_checkpoint(is_last_step, timing_raw)
 
-                # this is experimental and may be changed/removed in the future in favor of a general-purpose one
-                if isinstance(self.train_dataloader.sampler, AbstractCurriculumSampler):
-                    self.train_dataloader.sampler.update(batch=batch)
+                self._stop_profiling(do_profile, timing_raw)
+                self._collect_metrics(batch, epoch, metrics, timing_raw)
+                self._post_batch_processing(batch)
 
                 # TODO: make a canonical logger that supports various backend
                 logger.log(data=metrics, step=self.global_steps)
@@ -1361,8 +1136,238 @@ def fit(self):
                     progress_bar.close()
                     return
 
-                # this is experimental and may be changed/removed in the future
-                # in favor of a general-purpose data buffer pool
-                if hasattr(self.train_dataset, "on_batch_end"):
-                    # The dataset may be changed after each training batch
-                    self.train_dataset.on_batch_end(batch=batch)
+    def _prepare_generate_batch(self, batch_dict):
+        batch: DataProto = DataProto.from_single_dict(batch_dict)
+        # pop those keys for generation
+        batch_keys_to_pop = ["input_ids", "attention_mask", "position_ids"]
+        non_tensor_batch_keys_to_pop = ["raw_prompt_ids"]
+        if "multi_modal_data" in batch.non_tensor_batch:
+            non_tensor_batch_keys_to_pop.append("multi_modal_data")
+        if "raw_prompt" in batch.non_tensor_batch:
+            non_tensor_batch_keys_to_pop.append("raw_prompt")
+        if "tools_kwargs" in batch.non_tensor_batch:
+            non_tensor_batch_keys_to_pop.append("tools_kwargs")
+        if "interaction_kwargs" in batch.non_tensor_batch:
+            non_tensor_batch_keys_to_pop.append("interaction_kwargs")
+        if "index" in batch.non_tensor_batch:
+            non_tensor_batch_keys_to_pop.append("index")
+        if "agent_name" in batch.non_tensor_batch:
+            non_tensor_batch_keys_to_pop.append("agent_name")
+        gen_batch = batch.pop(
+            batch_keys=batch_keys_to_pop,
+            non_tensor_batch_keys=non_tensor_batch_keys_to_pop,
+        )
+        # pass global_steps to trace
+        gen_batch.meta_info["global_steps"] = self.global_steps
+        gen_batch = gen_batch.repeat(repeat_times=self.config.actor_rollout_ref.rollout.n, interleave=True)
+        return batch, gen_batch
+
+    def _post_generate_batch(self, batch, gen_batch_output, metrics):
+        batch.non_tensor_batch["uid"] = np.array([str(uuid.uuid4()) for _ in range(len(batch.batch))], dtype=object)
+        # repeat to align with repeated responses in rollout
+        batch = batch.repeat(repeat_times=self.config.actor_rollout_ref.rollout.n, interleave=True)
+        batch = batch.union(gen_batch_output)
+        if "response_mask" not in batch.batch.keys():
+            batch.batch["response_mask"] = compute_response_mask(batch)
+        # Balance the number of valid tokens across DP ranks.
+        # NOTE: This usually changes the order of data in the `batch`,
+        # which won't affect the advantage calculation (since it's based on uid),
+        # but might affect the loss calculation (due to the change of mini-batching).
+        # TODO: Decouple the DP balancing and mini-batching.
+        if self.config.trainer.balance_batch:
+            self._balance_batch(batch, metrics=metrics)
+        # compute global_valid tokens
+        batch.meta_info["global_token_num"] = torch.sum(batch.batch["attention_mask"], dim=-1).tolist()
+        return batch
+
+    def _process_batch_common(self, batch, metrics, timing_raw):
+        with marked_timer("reward", timing_raw, color="yellow"):
+            # compute reward model score
+            if self.use_rm:
+                reward_tensor = self.rm_wg.compute_rm_score(batch)
+                batch = batch.union(reward_tensor)
+
+            if self.config.reward_model.launch_reward_fn_async:
+                future_reward = compute_reward_async.remote(data=batch, reward_fn=self.reward_fn)
+            else:
+                reward_tensor, reward_extra_infos_dict = compute_reward(batch, self.reward_fn)
+        # recompute old_log_probs
+        with marked_timer("old_log_prob", timing_raw, color="blue"):
+            old_log_prob = self.actor_rollout_wg.compute_log_prob(batch)
+            entropys = old_log_prob.batch["entropys"]
+            response_masks = batch.batch["response_mask"]
+            loss_agg_mode = self.config.actor_rollout_ref.actor.loss_agg_mode
+            entropy_agg = agg_loss(loss_mat=entropys, loss_mask=response_masks, loss_agg_mode=loss_agg_mode)
+            old_log_prob_metrics = {"actor/entropy": entropy_agg.detach().item()}
+            metrics.update(old_log_prob_metrics)
+            old_log_prob.batch.pop("entropys")
+            batch = batch.union(old_log_prob)
+
+            if "rollout_log_probs" in batch.batch.keys():
+                # TODO: we may want to add diff of probs too.
+                rollout_old_log_probs = batch.batch["rollout_log_probs"]
+                actor_old_log_probs = batch.batch["old_log_probs"]
+                attention_mask = batch.batch["attention_mask"]
+                responses = batch.batch["responses"]
+                response_length = responses.size(1)
+                response_mask = attention_mask[:, -response_length:]
+
+                rollout_probs = torch.exp(rollout_old_log_probs)
+                actor_probs = torch.exp(actor_old_log_probs)
+                rollout_probs_diff = torch.abs(rollout_probs - actor_probs)
+                rollout_probs_diff = torch.masked_select(rollout_probs_diff, response_mask.bool())
+                rollout_probs_diff_max = torch.max(rollout_probs_diff)
+                rollout_probs_diff_mean = torch.mean(rollout_probs_diff)
+                rollout_probs_diff_std = torch.std(rollout_probs_diff)
+                metrics.update(
+                    {
+                        "training/rollout_probs_diff_max": rollout_probs_diff_max.detach().item(),
+                        "training/rollout_probs_diff_mean": rollout_probs_diff_mean.detach().item(),
+                        "training/rollout_probs_diff_std": rollout_probs_diff_std.detach().item(),
+                    }
+                )
+        if self.use_reference_policy:
+            # compute reference log_prob
+            with marked_timer("ref", timing_raw, color="olive"):
+                if not self.ref_in_actor:
+                    ref_log_prob = self.ref_policy_wg.compute_ref_log_prob(batch)
+                else:
+                    ref_log_prob = self.actor_rollout_wg.compute_ref_log_prob(batch)
+                batch = batch.union(ref_log_prob)
+        # compute values
+        if self.use_critic:
+            with marked_timer("values", timing_raw, color="cyan"):
+                values = self.critic_wg.compute_values(batch)
+                batch = batch.union(values)
+        with marked_timer("adv", timing_raw, color="brown"):
+            # we combine with rule-based rm
+            reward_extra_infos_dict: dict[str, list]
+            if self.config.reward_model.launch_reward_fn_async:
+                reward_tensor, reward_extra_infos_dict = ray.get(future_reward)
+            batch.batch["token_level_scores"] = reward_tensor
+
+            if reward_extra_infos_dict:
+                batch.non_tensor_batch.update({k: np.array(v) for k, v in reward_extra_infos_dict.items()})
+
+            # compute rewards. apply_kl_penalty if available
+            if self.config.algorithm.use_kl_in_reward:
+                batch, kl_metrics = apply_kl_penalty(
+                    batch, kl_ctrl=self.kl_ctrl_in_reward, kl_penalty=self.config.algorithm.kl_penalty
+                )
+                metrics.update(kl_metrics)
+            else:
+                batch.batch["token_level_rewards"] = batch.batch["token_level_scores"]
+
+            # compute advantages, executed on the driver process
+
+            norm_adv_by_std_in_grpo = self.config.algorithm.get(
+                "norm_adv_by_std_in_grpo", True
+            )  # GRPO adv normalization factor
+
+            batch = compute_advantage(
+                batch,
+                adv_estimator=self.config.algorithm.adv_estimator,
+                gamma=self.config.algorithm.gamma,
+                lam=self.config.algorithm.lam,
+                num_repeat=self.config.actor_rollout_ref.rollout.n,
+                norm_adv_by_std_in_grpo=norm_adv_by_std_in_grpo,
+                config=self.config.algorithm,
+            )
+        # update critic
+        if self.use_critic:
+            with marked_timer("update_critic", timing_raw, color="pink"):
+                critic_output = self.critic_wg.update_critic(batch)
+            critic_output_metrics = reduce_metrics(critic_output.meta_info["metrics"])
+            metrics.update(critic_output_metrics)
+        # implement critic warmup
+        if self.config.trainer.critic_warmup <= self.global_steps:
+            # update actor
+            with marked_timer("update_actor", timing_raw, color="red"):
+                batch.meta_info["multi_turn"] = self.config.actor_rollout_ref.rollout.multi_turn.enable
+                actor_output = self.actor_rollout_wg.update_actor(batch)
+            actor_output_metrics = reduce_metrics(actor_output.meta_info["metrics"])
+            metrics.update(actor_output_metrics)
+        return batch, reward_extra_infos_dict
+
+    def _log_rollout(self, batch, reward_extra_infos_dict, timing_raw):
+        """Log rollout generations if enabled"""
+        rollout_data_dir = self.config.trainer.get("rollout_data_dir", None)
+        if rollout_data_dir:
+            with marked_timer("dump_rollout_generations", timing_raw, color="green"):
+                inputs = self.tokenizer.batch_decode(batch.batch["prompts"], skip_special_tokens=True)
+                outputs = self.tokenizer.batch_decode(batch.batch["responses"], skip_special_tokens=True)
+                scores = batch.batch["token_level_scores"].sum(-1).cpu().tolist()
+                if "request_id" in batch.non_tensor_batch:
+                    reward_extra_infos_dict.setdefault(
+                        "request_id",
+                        batch.non_tensor_batch["request_id"].tolist(),
+                    )
+                self._dump_generations(
+                    inputs=inputs,
+                    outputs=outputs,
+                    scores=scores,
+                    reward_extra_infos_dict=reward_extra_infos_dict,
+                    dump_path=rollout_data_dir,
+                )
+
+    def _validate_metrics(self, is_last_step, last_val_metrics, metrics, timing_raw):
+        if (
+            self.val_reward_fn is not None
+            and self.config.trainer.test_freq > 0
+            and (is_last_step or self.global_steps % self.config.trainer.test_freq == 0)
+        ):
+            with marked_timer("testing", timing_raw, color="green"):
+                val_metrics: dict = self._validate()
+                if is_last_step:
+                    last_val_metrics = val_metrics
+            metrics.update(val_metrics)
+        return last_val_metrics
+
+    def _check_save_checkpoint(self, is_last_step, timing_raw):
+        # Check if the ESI (Elastic Server Instance)/training plan is close to expiration.
+        esi_close_to_expiration = should_save_ckpt_esi(
+            max_steps_duration=self.max_steps_duration,
+            redundant_time=self.config.trainer.esi_redundant_time,
+        )
+        # Check if the conditions for saving a checkpoint are met.
+        # The conditions include a mandatory condition (1) and
+        # one of the following optional conditions (2/3/4):
+        # 1. The save frequency is set to a positive value.
+        # 2. It's the last training step.
+        # 3. The current step number is a multiple of the save frequency.
+        # 4. The ESI(Elastic Server Instance)/training plan is close to expiration.
+        if self.config.trainer.save_freq > 0 and (
+            is_last_step or self.global_steps % self.config.trainer.save_freq == 0 or esi_close_to_expiration
+        ):
+            if esi_close_to_expiration:
+                print("Force saving checkpoint: ESI instance expiration approaching.")
+            with marked_timer("save_checkpoint", timing_raw, color="green"):
+                self._save_checkpoint()
+
+    def _collect_metrics(self, batch, epoch, metrics, timing_raw):
+        steps_duration = timing_raw["step"]
+        self.max_steps_duration = max(self.max_steps_duration, steps_duration)
+        # training metrics
+        metrics.update(
+            {
+                "training/global_step": self.global_steps,
+                "training/epoch": epoch,
+            }
+        )
+        # collect metrics
+        metrics.update(compute_data_metrics(batch=batch, use_critic=self.use_critic))
+        metrics.update(compute_timing_metrics(batch=batch, timing_raw=timing_raw))
+        # TODO: implement actual tflpo and theoretical tflpo
+        n_gpus = self.resource_pool_manager.get_n_gpus()
+        metrics.update(compute_throughout_metrics(batch=batch, timing_raw=timing_raw, n_gpus=n_gpus))
+
+    def _post_batch_processing(self, batch: DataProto):
+        # this is experimental and may be changed/removed in the future in favor of a general-purpose one
+        if isinstance(self.train_dataloader.sampler, AbstractCurriculumSampler):
+            self.train_dataloader.sampler.update(batch=batch)
+
+        # this is experimental and may be changed/removed in the future
+        # in favor of a general-purpose data buffer pool
+        if hasattr(self.train_dataset, "on_batch_end"):
+            # The dataset may be changed after each training batch
+            self.train_dataset.on_batch_end(batch=batch)

From 33ed01fe68dba9d715a4c67c8f680edd166b41c1 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Fri, 1 Aug 2025 11:48:34 +0800
Subject: [PATCH 015/182] refactor init worker

---
 recipe/fully_async_policy/fully_async_main.py |   1 -
 .../fully_async_rollouter.py                  | 129 ++++---
 .../fully_async_policy/fully_async_trainer.py | 334 +++---------------
 recipe/fully_async_policy/message_queue.py    |  12 +-
 recipe/fully_async_policy/unittest/test_mq.py |  68 +---
 recipe/one_step_off_policy/ray_trainer.py     | 113 ++----
 verl/trainer/ppo/ray_trainer.py               |  76 +++-
 7 files changed, 228 insertions(+), 505 deletions(-)

diff --git a/recipe/fully_async_policy/fully_async_main.py b/recipe/fully_async_policy/fully_async_main.py
index 3773d90d8d7..e8053e74647 100644
--- a/recipe/fully_async_policy/fully_async_main.py
+++ b/recipe/fully_async_policy/fully_async_main.py
@@ -343,7 +343,6 @@ def _run_training_loop(self):
 
         logger.info("Starting Rollouter in background...")
         rollouter_future = self.components["rollouter"].fit.remote()
-        time.sleep(2.0)
         trainer_future = self.components["trainer"].fit.remote()
         self._monitor_components()
         ray.get(rollouter_future)
diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py
index 06380803aee..6b41d635013 100644
--- a/recipe/fully_async_policy/fully_async_rollouter.py
+++ b/recipe/fully_async_policy/fully_async_rollouter.py
@@ -28,7 +28,7 @@
 from verl import DataProto
 from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup
 from verl.single_controller.ray.base import create_colocated_worker_cls
-from verl.trainer.ppo.ray_trainer import ResourcePoolManager, Role, WorkerType
+from verl.trainer.ppo.ray_trainer import ResourcePoolManager, Role, WorkerType, RayPPOTrainer
 from verl.utils.debug import marked_timer
 
 logger = logging.getLogger(__name__)
@@ -116,7 +116,7 @@ def get_status(self) -> dict:
 
 
 @ray.remote
-class FullyAsyncRollouter:
+class FullyAsyncRollouter(RayPPOTrainer):
     """
     异步样本生成器，负责持续生成训练样本并放入MessageQueue
     基于OneStepOffRayTrainer的成熟实现改进
@@ -130,23 +130,78 @@ def __init__(
         resource_pool_manager: ResourcePoolManager,
         ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup,
         processor=None,
-        train_dataset: Dataset | None = None,
+        reward_fn=None,
+        val_reward_fn=None,
+        train_dataset: Optional[Dataset] = None,
+        val_dataset: Optional[Dataset] = None,
         collate_fn=None,
-        train_sampler: Sampler | None = None,
-        device_name="cuda",
+        train_sampler: Optional[Sampler] = None,
+        device_name=None,
     ):
-        self.config = config
+        """
+        Initialize distributed PPO trainer with Ray backend.
+        Note that this trainer runs on the driver process on a single CPU/GPU node.
+
+        Args:
+            config: Configuration object containing training parameters.
+            tokenizer: Tokenizer used for encoding and decoding text.
+            role_worker_mapping (dict[Role, WorkerType]): Mapping from roles to worker classes.
+            resource_pool_manager (ResourcePoolManager): Manager for Ray resource pools.
+            ray_worker_group_cls (RayWorkerGroup, optional): Class for Ray worker groups. Defaults to RayWorkerGroup.
+            processor: Optional data processor, used for multimodal data
+            reward_fn: Function for computing rewards during training.
+            val_reward_fn: Function for computing rewards during validation.
+            train_dataset (Optional[Dataset], optional): Training dataset. Defaults to None.
+            val_dataset (Optional[Dataset], optional): Validation dataset. Defaults to None.
+            collate_fn: Function to collate data samples into batches.
+            train_sampler (Optional[Sampler], optional): Sampler for the training dataset. Defaults to None.
+            device_name (str, optional): Device name for training (e.g., "cuda", "cpu"). Defaults to None.
+        """
+
+        # Store the tokenizer for text processing
         self.tokenizer = tokenizer
         self.processor = processor
+        self.config = config
+        self.reward_fn = reward_fn
+        self.val_reward_fn = val_reward_fn
+
+        self.hybrid_engine = config.actor_rollout_ref.hybrid_engine
+        assert not self.hybrid_engine
+
         self.role_worker_mapping = role_worker_mapping
         self.resource_pool_manager = resource_pool_manager
+        self.use_reference_policy = Role.RefPolicy in role_worker_mapping
+        self.use_rm = Role.RewardModel in role_worker_mapping
         self.ray_worker_group_cls = ray_worker_group_cls
-        self.device_name = device_name
+        self.device_name = device_name if device_name else self.config.trainer.device
+        self.validation_generations_logger = ValidationGenerationsLogger(
+            project_name=self.config.trainer.project_name,
+            experiment_name=self.config.trainer.experiment_name,
+        )
 
-        # 数据相关
-        self.train_dataset = train_dataset
-        self.collate_fn = collate_fn
-        self.train_sampler = train_sampler
+        # if ref_in_actor is True, the reference policy will be actor without lora applied
+        self.ref_in_actor = config.actor_rollout_ref.model.get("lora_rank", 0) > 0
+
+        # define in-reward KL control
+        # kl loss control currently not suppoorted
+        if self.config.algorithm.use_kl_in_reward:
+            self.kl_ctrl_in_reward = core_algos.get_kl_controller(self.config.algorithm.kl_ctrl)
+
+        if config.critic.enable is not None:
+            self.use_critic = bool(config.critic.enable)
+        elif self.config.algorithm.adv_estimator == AdvantageEstimator.GAE:
+            self.use_critic = True
+        else:
+            warnings.warn(
+                "Disabled critic as algorithm.adv_estimator != gae. "
+                "If it is not intended, please set critic.enable=True",
+                stacklevel=2,
+            )
+            self.use_critic = False
+
+        self._validate_config()
+        self._create_dataloader(train_dataset, val_dataset, collate_fn, train_sampler)
+        self.message_queue_client = None
 
         # Rollout控制
         self.rollout_controller = RolloutController()
@@ -180,10 +235,13 @@ def __init__(
         self.sync_in_progress = False
         self.sync_lock = threading.Lock()
 
-        # 异步rollout模式
-        self.async_rollout_mode = config.actor_rollout_ref.rollout.mode == "async"
+    def set_message_queue_client(self, message_queue_client: MessageQueueClient):
+        """设置消息队列客户端"""
+        self.message_queue_client = message_queue_client
 
-        self._validate_config()
+    def _validate(self):
+        """执行验证 - 参考OneStepOffRayTrainer的验证逻辑"""
+        return None
 
     def _validate_config(self):
         """验证配置"""
@@ -263,10 +321,6 @@ def _init_async_rollout_manager(self):
             logger.warning(f"Failed to initialize async rollout manager: {e}")
             self.async_rollout_mode = False
 
-    def set_message_queue_client(self, message_queue_client: MessageQueueClient):
-        """设置消息队列客户端"""
-        self.message_queue_client = message_queue_client
-
     def set_parameter_synchronizer(self, param_synchronizer):
         """设置参数同步器"""
         self.param_synchronizer = param_synchronizer
@@ -370,39 +424,14 @@ def _execute_parameter_sync(self, param_version: int) -> bool:
             logger.error(f"Parameter sync execution failed: {e}")
             return False
 
-    def _create_dataloader(self):
-        """创建数据加载器"""
-        from torch.utils.data import DataLoader
-
-        if self.train_dataset is None:
-            raise ValueError("Training dataset not provided")
-
-        return DataLoader(
-            self.train_dataset,
-            batch_size=self.config.data.train_batch_size,
-            sampler=self.train_sampler,
-            collate_fn=self.collate_fn,
-            num_workers=self.config.data.get("dataloader_num_workers", 0),
-            drop_last=True,
-            pin_memory=True,  # 改进内存管理
-        )
-
     def _create_continuous_iterator(self):
-        """创建连续的数据迭代器"""
-        dataloader = self._create_dataloader()
-
-        epoch = 0
-        while self.running:
-            try:
-                for batch_dict in dataloader:
-                    if not self.running:
-                        return
-                    yield epoch, batch_dict
-                epoch += 1
-            except Exception as e:
-                logger.error(f"Error in data iterator: {e}")
-                time.sleep(1.0)  # 避免快速重试
-                continue
+        """
+        Create a continuous data iterator across epoch
+        """
+        for epoch in range(self.config.trainer.total_epochs):
+            iterator = iter(self.train_dataloader)
+            for batch_dict in iterator:
+                yield epoch, batch_dict
 
     def _should_pause_generation(self) -> bool:
         """
diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py
index 9122a97c8fa..16354313e4e 100644
--- a/recipe/fully_async_policy/fully_async_trainer.py
+++ b/recipe/fully_async_policy/fully_async_trainer.py
@@ -16,6 +16,7 @@
 import time
 import warnings
 from pprint import pprint
+from typing import Optional
 
 import numpy as np
 import ray
@@ -60,20 +61,20 @@ class FullyAsyncTrainer(RayPPOTrainer):
     """
 
     def __init__(
-            self,
-            config,
-            tokenizer,
-            role_worker_mapping: dict[Role, WorkerType],
-            resource_pool_manager: ResourcePoolManager,
-            ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup,
-            processor=None,
-            reward_fn=None,
-            val_reward_fn=None,
-            train_dataset: Optional[Dataset] = None,
-            val_dataset: Optional[Dataset] = None,
-            collate_fn=None,
-            train_sampler: Optional[Sampler] = None,
-            device_name=None,
+        self,
+        config,
+        tokenizer,
+        role_worker_mapping: dict[Role, WorkerType],
+        resource_pool_manager: ResourcePoolManager,
+        ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup,
+        processor=None,
+        reward_fn=None,
+        val_reward_fn=None,
+        train_dataset: Optional[Dataset] = None,
+        val_dataset: Optional[Dataset] = None,
+        collate_fn=None,
+        train_sampler: Optional[Sampler] = None,
+        device_name=None,
     ):
         """
         Initialize distributed PPO trainer with Ray backend.
@@ -138,6 +139,7 @@ def __init__(
 
         self._validate_config()
         self._create_dataloader(train_dataset, val_dataset, collate_fn, train_sampler)
+        self.message_queue_client = None
 
     def set_message_queue_client(self, message_queue_client: MessageQueueClient):
         """设置消息队列客户端"""
@@ -234,63 +236,6 @@ def init_workers(self):
 
         logger.info("FullyAsyncTrainer workers initialized successfully")
 
-    def _sync_parameters_to_rollouter(self):
-        """同步参数到Rollouter - 改进的同步机制"""
-        if self.rollouter_actor is None:
-            logger.warning("Rollouter actor not set, skipping parameter sync")
-            return
-
-        self.current_param_version += 1
-
-        try:
-            # 通知MessageQueue更新参数版本
-            self.message_queue_client.update_param_version(self.current_param_version)
-
-            # 同步参数到Rollouter
-            sync_future = self.rollouter_actor.update_rollout_weights.remote(self.current_param_version)
-            ray.get(sync_future)
-
-            self.param_sync_count += 1
-            logger.info(f"Parameter sync completed, version: {self.current_param_version}")
-
-        except Exception as e:
-            logger.error(f"Failed to sync parameters: {e}")
-            self.current_param_version -= 1  # 回滚版本号
-            raise
-
-    def _process_batch_samples(self, batch_samples: list[QueueSample]) -> DataProto:
-        """处理从队列获取的batch样本 - 改进的批处理逻辑"""
-        if not batch_samples:
-            raise ValueError("Empty batch samples")
-
-        if len(batch_samples) == 1:
-            return batch_samples[0].data
-
-        # 合并多个batch - 使用DataProto的concat方法
-        try:
-            all_batches = [sample.data for sample in batch_samples]
-            merged_batch = DataProto.concat(all_batches)
-            logger.debug(f"Successfully merged {len(batch_samples)} batches")
-            return merged_batch
-        except Exception as e:
-            logger.error(f"Failed to merge batch samples: {e}")
-            raise
-
-    def _compute_sample_freshness_metrics(self, batch_samples: list[QueueSample]) -> dict:
-        """计算样本新鲜度指标"""
-        sample_ages = [self.current_param_version - sample.param_version for sample in batch_samples]
-        current_time = time.time()
-        sample_latencies = [current_time - sample.timestamp for sample in batch_samples]
-
-        return {
-            "freshness/avg_sample_age": np.mean(sample_ages),
-            "freshness/max_sample_age": max(sample_ages),
-            "freshness/min_sample_age": min(sample_ages),
-            "freshness/avg_sample_latency": np.mean(sample_latencies),
-            "freshness/max_sample_latency": max(sample_latencies),
-            "freshness/stale_samples_ratio": sum(1 for age in sample_ages if age > 1) / len(sample_ages),
-        }
-
     def fit(self):
         """
         The training loop of PPO.
@@ -298,6 +243,11 @@ def fit(self):
         to construct the PPO dataflow.
         The light-weight advantage computation is done on the driver process.
         """
+        logger.info("Starting Trainer...")
+
+        if self.message_queue_client is None:
+            raise ValueError("MessageQueue client not set. Call set_message_queue_client() first.")
+
         from omegaconf import OmegaConf
 
         from verl.utils.tracking import Tracking
@@ -330,6 +280,7 @@ def fit(self):
         # we start from step 1
         self.global_steps += 1
         last_val_metrics = None
+        self.max_steps_duration = 0
 
         # across epoch iterator
         continuous_iterator = self._create_continuous_iterator()
@@ -346,8 +297,7 @@ def fit(self):
                 if self.config.trainer.profile_steps is not None
                 else False
             )
-            with marked_timer("start_profile", timing_raw):
-                self._start_profiling(do_profile)
+            self._start_profiling(do_profile, timing_raw)
 
             is_last_step = self.global_steps >= self.total_training_steps
 
@@ -363,216 +313,15 @@ def fit(self):
                     if not is_last_step:
                         batch_data_future = self._async_gen_next_batch(continuous_iterator)
 
-                batch.non_tensor_batch["uid"] = np.array(
-                    [str(uuid.uuid4()) for _ in range(len(batch.batch))], dtype=object
-                )
-                # repeat to align with repeated responses in rollout
-                batch = batch.repeat(repeat_times=self.config.actor_rollout_ref.rollout.n, interleave=True)
-                batch = batch.union(gen_batch_output)
-
-                if "response_mask" not in batch.batch.keys():
-                    batch.batch["response_mask"] = compute_response_mask(batch)
-                # Balance the number of valid tokens across DP ranks.
-                # NOTE: This usually changes the order of data in the `batch`,
-                # which won't affect the advantage calculation (since it's based on uid),
-                # but might affect the loss calculation (due to the change of mini-batching).
-                # TODO: Decouple the DP balancing and mini-batching.
-                if self.config.trainer.balance_batch:
-                    self._balance_batch(batch, metrics=metrics)
-
-                # compute global_valid tokens
-                batch.meta_info["global_token_num"] = torch.sum(batch.batch["attention_mask"], dim=-1).tolist()
-
-                with marked_timer("reward", timing_raw, color="yellow"):
-                    # compute reward model score
-                    if self.use_rm:
-                        reward_tensor = self.rm_wg.compute_rm_score(batch)
-                        batch = batch.union(reward_tensor)
-
-                    if self.config.reward_model.launch_reward_fn_async:
-                        future_reward = compute_reward_async.remote(batch, self.config, self.tokenizer)
-                    else:
-                        reward_tensor, reward_extra_infos_dict = compute_reward(batch, self.reward_fn)
-
-                # recompute old_log_probs
-                with marked_timer("old_log_prob", timing_raw, color="blue"):
-                    old_log_prob = self.actor_wg.compute_log_prob(batch)
-                    entropys = old_log_prob.batch["entropys"]
-                    response_masks = batch.batch["response_mask"]
-                    loss_agg_mode = self.config.actor_rollout_ref.actor.loss_agg_mode
-                    entropy_agg = agg_loss(loss_mat=entropys, loss_mask=response_masks, loss_agg_mode=loss_agg_mode)
-                    old_log_prob_metrics = {"actor/entropy": entropy_agg.detach().item()}
-                    metrics.update(old_log_prob_metrics)
-                    old_log_prob.batch.pop("entropys")
-                    batch = batch.union(old_log_prob)
-
-                    if "rollout_log_probs" in batch.batch.keys():
-                        # TODO: we may want to add diff of probs too.
-                        rollout_old_log_probs = batch.batch["rollout_log_probs"]
-                        actor_old_log_probs = batch.batch["old_log_probs"]
-                        attention_mask = batch.batch["attention_mask"]
-                        responses = batch.batch["responses"]
-                        response_length = responses.size(1)
-                        response_mask = attention_mask[:, -response_length:]
-
-                        rollout_probs = torch.exp(rollout_old_log_probs)
-                        actor_probs = torch.exp(actor_old_log_probs)
-                        rollout_probs_diff = torch.abs(rollout_probs - actor_probs)
-                        rollout_probs_diff = torch.masked_select(rollout_probs_diff, response_mask.bool())
-                        rollout_probs_diff_max = torch.max(rollout_probs_diff)
-                        rollout_probs_diff_mean = torch.mean(rollout_probs_diff)
-                        rollout_probs_diff_std = torch.std(rollout_probs_diff)
-                        metrics.update(
-                            {
-                                "training/rollout_probs_diff_max": rollout_probs_diff_max.detach().item(),
-                                "training/rollout_probs_diff_mean": rollout_probs_diff_mean.detach().item(),
-                                "training/rollout_probs_diff_std": rollout_probs_diff_std.detach().item(),
-                            }
-                        )
-
-                if self.use_reference_policy:
-                    # compute reference log_prob
-                    with marked_timer("ref", timing_raw, color="olive"):
-                        if not self.ref_in_actor:
-                            ref_log_prob = self.ref_policy_wg.compute_ref_log_prob(batch)
-                        else:
-                            ref_log_prob = self.actor_wg.compute_ref_log_prob(batch)
-                        batch = batch.union(ref_log_prob)
-
-                # compute values
-                if self.use_critic:
-                    with marked_timer("values", timing_raw, color="cyan"):
-                        values = self.critic_wg.compute_values(batch)
-                        batch = batch.union(values)
-
-                with marked_timer("adv", timing_raw, color="brown"):
-                    # we combine with rule-based rm
-                    reward_extra_infos_dict: dict[str, list]
-                    if self.config.reward_model.launch_reward_fn_async:
-                        reward_tensor, reward_extra_infos_dict = ray.get(future_reward)
-                    batch.batch["token_level_scores"] = reward_tensor
-
-                    if reward_extra_infos_dict:
-                        batch.non_tensor_batch.update({k: np.array(v) for k, v in reward_extra_infos_dict.items()})
-
-                    # compute rewards. apply_kl_penalty if available
-                    if self.config.algorithm.use_kl_in_reward:
-                        batch, kl_metrics = apply_kl_penalty(
-                            batch, kl_ctrl=self.kl_ctrl_in_reward, kl_penalty=self.config.algorithm.kl_penalty
-                        )
-                        metrics.update(kl_metrics)
-                    else:
-                        batch.batch["token_level_rewards"] = batch.batch["token_level_scores"]
-
-                    # compute advantages, executed on the driver process
-
-                    norm_adv_by_std_in_grpo = self.config.algorithm.get(
-                        "norm_adv_by_std_in_grpo", True
-                    )  # GRPO adv normalization factor
-
-                    batch = compute_advantage(
-                        batch,
-                        adv_estimator=self.config.algorithm.adv_estimator,
-                        gamma=self.config.algorithm.gamma,
-                        lam=self.config.algorithm.lam,
-                        num_repeat=self.config.actor_rollout_ref.rollout.n,
-                        norm_adv_by_std_in_grpo=norm_adv_by_std_in_grpo,
-                        config=self.config.algorithm,
-                    )
-
-                # update critic
-                if self.use_critic:
-                    with marked_timer("update_critic", timing_raw, color="pink"):
-                        critic_output = self.critic_wg.update_critic(batch)
-                    critic_output_metrics = reduce_metrics(critic_output.meta_info["metrics"])
-                    metrics.update(critic_output_metrics)
-
-                # implement critic warmup
-                if self.config.trainer.critic_warmup <= self.global_steps:
-                    # update actor
-                    with marked_timer("update_actor", timing_raw, color="red"):
-                        batch.meta_info["multi_turn"] = self.config.actor_rollout_ref.rollout.multi_turn.enable
-                        actor_output = self.actor_wg.update_actor(batch)
-                    actor_output_metrics = reduce_metrics(actor_output.meta_info["metrics"])
-                    metrics.update(actor_output_metrics)
-
-                # Log rollout generations if enabled
-                rollout_data_dir = self.config.trainer.get("rollout_data_dir", None)
-                if rollout_data_dir:
-                    with marked_timer("dump_rollout_generations", timing_raw, color="green"):
-                        inputs = self.tokenizer.batch_decode(batch.batch["prompts"], skip_special_tokens=True)
-                        outputs = self.tokenizer.batch_decode(batch.batch["responses"], skip_special_tokens=True)
-                        scores = batch.batch["token_level_scores"].sum(-1).cpu().tolist()
-                        if "request_id" in batch.non_tensor_batch:
-                            reward_extra_infos_dict.setdefault(
-                                "request_id",
-                                batch.non_tensor_batch["request_id"].tolist(),
-                            )
-                        self._dump_generations(
-                            inputs=inputs,
-                            outputs=outputs,
-                            scores=scores,
-                            reward_extra_infos_dict=reward_extra_infos_dict,
-                            dump_path=rollout_data_dir,
-                        )
-
-                # validate
-                if (
-                        self.val_reward_fn is not None
-                        and self.config.trainer.test_freq > 0
-                        and (is_last_step or self.global_steps % self.config.trainer.test_freq == 0)
-                ):
-                    with marked_timer("testing", timing_raw, color="green"):
-                        val_metrics: dict = self._validate()
-                        if is_last_step:
-                            last_val_metrics = val_metrics
-                    metrics.update(val_metrics)
-
-                # Check if the ESI (Elastic Server Instance)/training plan is close to expiration.
-                esi_close_to_expiration = should_save_ckpt_esi(
-                    max_steps_duration=self.max_steps_duration,
-                    redundant_time=self.config.trainer.esi_redundant_time,
-                )
-                # Check if the conditions for saving a checkpoint are met.
-                # The conditions include a mandatory condition (1) and
-                # one of the following optional conditions (2/3/4):
-                # 1. The save frequency is set to a positive value.
-                # 2. It's the last training step.
-                # 3. The current step number is a multiple of the save frequency.
-                # 4. The ESI(Elastic Server Instance)/training plan is close to expiration. q
-                if self.config.trainer.save_freq > 0 and (
-                        is_last_step
-                        or self.global_steps % self.config.trainer.save_freq == 0
-                        or esi_close_to_expiration
-                ):
-                    if esi_close_to_expiration:
-                        print("Force saving checkpoint: ESI instance expiration approaching.")
-                    with marked_timer("save_checkpoint", timing_raw, color="green"):
-                        self._save_checkpoint()
-
-            with marked_timer("stop_profile", timing_raw):
-                self._stop_profiling(do_profile)
-
-            steps_duration = timing_raw["step"]
-            self.max_steps_duration = max(self.max_steps_duration, steps_duration)
-
-            # training metrics
-            metrics.update(
-                {
-                    "training/global_step": self.global_steps,
-                    "training/epoch": epoch,
-                }
-            )
-            # collect metrics
-            metrics.update(compute_data_metrics(batch=batch, use_critic=self.use_critic))
-            metrics.update(compute_timing_metrics(batch=batch, timing_raw=timing_raw))
-            # TODO: implement actual tflpo and theoretical tflpo
-            n_gpus = self.resource_pool_manager.get_n_gpus()
-            metrics.update(compute_throughout_metrics(batch=batch, timing_raw=timing_raw, n_gpus=n_gpus))
+                batch = self._post_generate_batch(batch, gen_batch_output, metrics)
+                batch, reward_extra_infos_dict = self._process_batch_common(batch, metrics, timing_raw)
+                self._log_rollout(batch, reward_extra_infos_dict, timing_raw)
+                last_val_metrics = self._validate_metrics(is_last_step, last_val_metrics, metrics, timing_raw)
+                self._check_save_checkpoint(is_last_step, timing_raw)
 
-            # this is experimental and may be changed/removed in the future in favor of a general-purpose one
-            if isinstance(self.train_dataloader.sampler, AbstractCurriculumSampler):
-                self.train_dataloader.sampler.update(batch=batch)
+            self._stop_profiling(do_profile, timing_raw)
+            self._collect_metrics(batch, epoch, metrics, timing_raw)
+            self._post_batch_processing(batch)
 
             # TODO: make a canonical logger that supports various backend
             logger.log(data=metrics, step=self.global_steps)
@@ -585,12 +334,6 @@ def fit(self):
                 progress_bar.close()
                 return
 
-            # this is experimental and may be changed/removed in the future
-            # in favor of a general-purpose data buffer pool
-            if hasattr(self.train_dataset, "on_batch_end"):
-                # The dataset may be changed after each training batch
-                self.train_dataset.on_batch_end(batch=batch)
-
     def get_statistics(self) -> dict:
         """获取训练统计信息"""
         queue_stats = self.message_queue_client.get_statistics() if self.message_queue_client else {}
@@ -605,3 +348,18 @@ def get_statistics(self) -> dict:
             "queue_total_consumed": queue_stats.get("total_consumed", 0),
             "queue_dropped_samples": queue_stats.get("dropped_samples", 0),
         }
+
+    def _compute_sample_freshness_metrics(self, batch_samples: list[QueueSample]) -> dict:
+        """计算样本新鲜度指标"""
+        sample_ages = [self.current_param_version - sample.param_version for sample in batch_samples]
+        current_time = time.time()
+        sample_latencies = [current_time - sample.timestamp for sample in batch_samples]
+
+        return {
+            "freshness/avg_sample_age": np.mean(sample_ages),
+            "freshness/max_sample_age": max(sample_ages),
+            "freshness/min_sample_age": min(sample_ages),
+            "freshness/avg_sample_latency": np.mean(sample_latencies),
+            "freshness/max_sample_latency": max(sample_latencies),
+            "freshness/stale_samples_ratio": sum(1 for age in sample_ages if age > 1) / len(sample_ages),
+        }
diff --git a/recipe/fully_async_policy/message_queue.py b/recipe/fully_async_policy/message_queue.py
index 5866dcfd4a9..8e686e9a471 100644
--- a/recipe/fully_async_policy/message_queue.py
+++ b/recipe/fully_async_policy/message_queue.py
@@ -76,8 +76,9 @@ def __init__(self, config: DictConfig, max_queue_size: int = 1000):
             "staleness_threshold={self.staleness_threshold}"
         )
 
-    def put_samples(self, epoch: int, samples: List[Any], param_version: int,
-                    rollout_metadata_list: List[dict[str, Any]] = None) -> bool:
+    def put_samples(
+        self, epoch: int, samples: List[Any], param_version: int, rollout_metadata_list: List[dict[str, Any]] = None
+    ) -> bool:
         """
         放入一个batch样本到队列
 
@@ -103,8 +104,7 @@ def put_samples(self, epoch: int, samples: List[Any], param_version: int,
                 rollout_metadata_list = [{}] * len(samples)
 
             if len(rollout_metadata_list) != len(samples):
-                logger.warning(
-                    f"len(rollout_metadata_list):{len(rollout_metadata_list)} != len(samples:{len(samples)}")
+                logger.warning(f"len(rollout_metadata_list):{len(rollout_metadata_list)} != len(samples:{len(samples)}")
                 return False
 
             for sample, meta in zip(samples, rollout_metadata_list):
@@ -237,7 +237,9 @@ class MessageQueueClient:
     def __init__(self, queue_actor: Any):
         self.queue_actor = queue_actor
 
-    def put_batch(self, epoch: int, batch: List[Any], param_version: int, rollout_metadata_list: List[dict[str, Any]] = None) -> bool:
+    def put_batch(
+        self, epoch: int, batch: List[Any], param_version: int, rollout_metadata_list: List[dict[str, Any]] = None
+    ) -> bool:
         """放入batch到队列"""
         return ray.get(self.queue_actor.put_samples.remote(epoch, batch, param_version, rollout_metadata_list))
 
diff --git a/recipe/fully_async_policy/unittest/test_mq.py b/recipe/fully_async_policy/unittest/test_mq.py
index dbc29c3e9ce..36172d02640 100644
--- a/recipe/fully_async_policy/unittest/test_mq.py
+++ b/recipe/fully_async_policy/unittest/test_mq.py
@@ -67,10 +67,7 @@ def test_put_samples_success(self, message_queue_client, mock_data_proto):
         metadata_list = [{"test": "data1"}, {"test": "data2"}]
 
         result = message_queue_client.put_batch(
-            epoch=1,
-            batch=samples,
-            param_version=1,
-            rollout_metadata_list=metadata_list
+            epoch=1, batch=samples, param_version=1, rollout_metadata_list=metadata_list
         )
 
         assert result is True
@@ -88,12 +85,7 @@ def test_put_samples_without_metadata(self, message_queue_client, mock_data_prot
         """测试不提供metadata时的处理"""
         samples = [mock_data_proto, mock_data_proto]
 
-        result = message_queue_client.put_batch(
-            epoch=1,
-            batch=samples,
-            param_version=1,
-            rollout_metadata_list=None
-        )
+        result = message_queue_client.put_batch(epoch=1, batch=samples, param_version=1, rollout_metadata_list=None)
 
         assert result is True
         queue_size = message_queue_client.get_queue_size()
@@ -105,10 +97,7 @@ def test_put_samples_metadata_mismatch(self, message_queue_client, mock_data_pro
         metadata_list = [{"test": "data1"}]  # 长度不匹配
 
         result = message_queue_client.put_batch(
-            epoch=1,
-            batch=samples,
-            param_version=1,
-            rollout_metadata_list=metadata_list
+            epoch=1, batch=samples, param_version=1, rollout_metadata_list=metadata_list
         )
 
         assert result is False  # 应该失败
@@ -126,7 +115,7 @@ def test_put_samples_staleness_check(self, message_queue_client, mock_data_proto
             epoch=1,
             batch=samples,
             param_version=2,  # 5-2=3, 达到阈值
-            rollout_metadata_list=None
+            rollout_metadata_list=None,
         )
 
         assert result is False
@@ -140,12 +129,7 @@ def test_put_samples_queue_overflow(self, message_queue_client, mock_data_proto)
         # 填满队列（最大容量10）
         for i in range(6):  # 每次放入2个，总共12个，超过最大容量10
             samples = [mock_data_proto, mock_data_proto]
-            message_queue_client.put_batch(
-                epoch=1,
-                batch=samples,
-                param_version=1,
-                rollout_metadata_list=None
-            )
+            message_queue_client.put_batch(epoch=1, batch=samples, param_version=1, rollout_metadata_list=None)
 
         # 队列大小应该保持在最大值
         queue_size = message_queue_client.get_queue_size()
@@ -160,12 +144,7 @@ def test_get_samples_success(self, message_queue_client, mock_data_proto):
         # 先放入一些samples
         samples = [mock_data_proto, mock_data_proto, mock_data_proto]
         metadata_list = [{"index": 0}, {"index": 1}, {"index": 2}]
-        message_queue_client.put_batch(
-            epoch=1,
-            batch=samples,
-            param_version=1,
-            rollout_metadata_list=metadata_list
-        )
+        message_queue_client.put_batch(epoch=1, batch=samples, param_version=1, rollout_metadata_list=metadata_list)
 
         # 获取2个samples
         retrieved_samples = message_queue_client.get_batch(min_batch_count=2)
@@ -194,12 +173,7 @@ def get_samples():
         def put_samples_later():
             time.sleep(0.5)  # 延迟放入
             samples = [mock_data_proto, mock_data_proto]
-            message_queue_client.put_batch(
-                epoch=1,
-                batch=samples,
-                param_version=1,
-                rollout_metadata_list=None
-            )
+            message_queue_client.put_batch(epoch=1, batch=samples, param_version=1, rollout_metadata_list=None)
 
         # 启动消费者线程
         consumer_thread = threading.Thread(target=get_samples)
@@ -225,12 +199,7 @@ def test_clear_queue(self, message_queue_client, mock_data_proto):
         """测试清空队列"""
         # 先添加一些样本
         samples = [mock_data_proto, mock_data_proto, mock_data_proto]
-        message_queue_client.put_batch(
-            epoch=1,
-            batch=samples,
-            param_version=1,
-            rollout_metadata_list=None
-        )
+        message_queue_client.put_batch(epoch=1, batch=samples, param_version=1, rollout_metadata_list=None)
 
         # 清空队列
         message_queue_client.clear_queue()
@@ -244,12 +213,7 @@ def test_get_queue_size(self, message_queue_client, mock_data_proto):
         assert message_queue_client.get_queue_size() == 0
 
         samples = [mock_data_proto]
-        message_queue_client.put_batch(
-            epoch=1,
-            batch=samples,
-            param_version=1,
-            rollout_metadata_list=None
-        )
+        message_queue_client.put_batch(epoch=1, batch=samples, param_version=1, rollout_metadata_list=None)
         assert message_queue_client.get_queue_size() == 1
 
     def test_get_statistics(self, message_queue_client):
@@ -274,12 +238,7 @@ def test_get_memory_usage(self, message_queue_client, mock_data_proto):
         """测试获取内存使用统计"""
         # 添加一些样本
         samples = [mock_data_proto, mock_data_proto]
-        message_queue_client.put_batch(
-            epoch=1,
-            batch=samples,
-            param_version=1,
-            rollout_metadata_list=None
-        )
+        message_queue_client.put_batch(epoch=1, batch=samples, param_version=1, rollout_metadata_list=None)
 
         memory_stats = message_queue_client.get_memory_usage()
 
@@ -328,12 +287,7 @@ def test_concurrent_put_get(self, mock_data_proto):
             def producer():
                 for i in range(50):
                     samples = [mock_data_proto, mock_data_proto]
-                    result = client.put_batch(
-                        epoch=i,
-                        batch=samples,
-                        param_version=1,
-                        rollout_metadata_list=None
-                    )
+                    result = client.put_batch(epoch=i, batch=samples, param_version=1, rollout_metadata_list=None)
                     results.append(("put", result))
                     time.sleep(0.1)
 
diff --git a/recipe/one_step_off_policy/ray_trainer.py b/recipe/one_step_off_policy/ray_trainer.py
index c1687561d01..c5b7a71225e 100644
--- a/recipe/one_step_off_policy/ray_trainer.py
+++ b/recipe/one_step_off_policy/ray_trainer.py
@@ -27,7 +27,6 @@
 from tqdm import tqdm
 
 from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup
-from verl.single_controller.ray.base import create_colocated_worker_cls
 from verl.trainer.ppo import core_algos
 from verl.trainer.ppo.core_algos import AdvantageEstimator
 from verl.trainer.ppo.ray_trainer import (
@@ -163,94 +162,35 @@ def _validate(self):
         self.actor_rollout_wg = self.actor_wg
         return ret
 
-    def init_workers(self):
-        """Initialize distributed training workers using Ray backend.
-
-        Creates:
-        1. Ray resource pools from configuration
-        2. Worker groups for each role (actor, critic, etc.)
-        """
-        self.resource_pool_manager.create_resource_pool()
-
-        self.resource_pool_to_cls = {pool: {} for pool in self.resource_pool_manager.resource_pool_dict.values()}
-
+    def _create_actor_rollout_classes(self):
         # create actor and rollout
-        for role, role_name in [(Role.Actor, "actor"), (Role.Rollout, "rollout")]:
-            resource_pool = self.resource_pool_manager.get_resource_pool(role)
-            role_cls = RayClassWithInitArgs(
-                cls=self.role_worker_mapping[role],
-                config=self.config.actor_rollout_ref,
-                role=role_name,
-            )
-            self.resource_pool_to_cls[resource_pool][role_name] = role_cls
-
-        # create critic
-        if self.use_critic:
-            resource_pool = self.resource_pool_manager.get_resource_pool(Role.Critic)
-            critic_cls = RayClassWithInitArgs(cls=self.role_worker_mapping[Role.Critic], config=self.config.critic)
-            self.resource_pool_to_cls[resource_pool]["critic"] = critic_cls
-
-        # create reference policy if needed
-        if self.use_reference_policy:
-            resource_pool = self.resource_pool_manager.get_resource_pool(Role.RefPolicy)
-            ref_policy_cls = RayClassWithInitArgs(
-                self.role_worker_mapping[Role.RefPolicy],
-                config=self.config.actor_rollout_ref,
-                role="ref",
-                profile_option=self.config.trainer.npu_profile.options,
-            )
-            self.resource_pool_to_cls[resource_pool]["ref"] = ref_policy_cls
-
-        # create a reward model if reward_fn is None
-        if self.use_rm:
-            # we create a RM here
-            resource_pool = self.resource_pool_manager.get_resource_pool(Role.RewardModel)
-            rm_cls = RayClassWithInitArgs(self.role_worker_mapping[Role.RewardModel], config=self.config.reward_model)
-            self.resource_pool_to_cls[resource_pool]["rm"] = rm_cls
-
-        # initialize WorkerGroup
-        # NOTE: if you want to use a different resource pool for each role, which can support different parallel size,
-        # you should not use `create_colocated_worker_cls`.
-        # Instead, directly pass different resource pool to different worker groups.
-        # See https://github.com/volcengine/verl/blob/master/examples/ray/tutorial.ipynb for more information.
-        all_wg = {}
-        wg_kwargs = {}  # Setting up kwargs for RayWorkerGroup
-        if OmegaConf.select(self.config.trainer, "ray_wait_register_center_timeout") is not None:
-            wg_kwargs["ray_wait_register_center_timeout"] = self.config.trainer.ray_wait_register_center_timeout
-        if OmegaConf.select(self.config.trainer, "profile_steps") is not None:
-            wg_kwargs["profile_steps"] = OmegaConf.select(self.config.trainer, "profile_steps")
-            assert OmegaConf.select(self.config.trainer, "worker_nsight_options") is not None, (
-                "worker_nsight_options must be set when profile_steps is set"
-            )
-            wg_kwargs["worker_nsight_options"] = OmegaConf.to_container(
-                OmegaConf.select(self.config.trainer, "worker_nsight_options")
-            )
-
-        for resource_pool, class_dict in self.resource_pool_to_cls.items():
-            worker_dict_cls = create_colocated_worker_cls(class_dict=class_dict)
-            wg_dict = self.ray_worker_group_cls(
-                resource_pool=resource_pool,
-                ray_cls_with_init=worker_dict_cls,
-                device_name=self.device_name,
-                **wg_kwargs,
-            )
-            spawn_wg = wg_dict.spawn(prefix_set=class_dict.keys())
-            all_wg.update(spawn_wg)
+        if not self.hybrid_engine:
+            for role in [Role.Actor, Role.Rollout]:
+                resource_pool = self.resource_pool_manager.get_resource_pool(role)
+                role_cls = RayClassWithInitArgs(
+                    cls=self.role_worker_mapping[role],
+                    config=self.config.actor_rollout_ref,
+                    role=str(role),
+                )
+                self.resource_pool_to_cls[resource_pool][str(role)] = role_cls
+        else:
+            raise NotImplementedError
 
+    def _init_models(self):
         if self.use_critic:
-            self.critic_wg = all_wg["critic"]
+            self.critic_wg = self.all_wg[str(Role.Critic)]
             self.critic_wg.init_model()
 
         if self.use_reference_policy and not self.ref_in_actor:
-            self.ref_policy_wg = all_wg["ref"]
+            self.ref_policy_wg = self.all_wg[str(Role.RefPolicy)]
             self.ref_policy_wg.init_model()
 
         if self.use_rm:
-            self.rm_wg = all_wg["rm"]
+            self.rm_wg = self.all_wg[str(Role.RewardModel)]
             self.rm_wg.init_model()
 
-        self.actor_wg = all_wg["actor"]
-        self.rollout_wg = all_wg["rollout"]
+        self.actor_wg = self.all_wg[str(Role.Actor)]
+        self.rollout_wg = self.all_wg[str(Role.Rollout)]
         self.actor_wg.init_model()
         self.rollout_wg.init_model()
         self.actor_rollout_wg = self.actor_wg  # to be compatible with the functions that not be modified
@@ -268,21 +208,9 @@ def init_workers(self):
         )
         self.sync_rollout_weights()
 
-        # create async rollout manager and request scheduler
-        self.async_rollout_mode = False
-        if self.config.actor_rollout_ref.rollout.mode == "async":
-            from verl.workers.rollout.async_server import AsyncLLMServerManager
-
-            self.async_rollout_mode = True
-            self.async_rollout_manager = AsyncLLMServerManager(
-                config=self.config,
-                worker_group=self.rollout_wg,
-            )
-
     def sync_rollout_weights(self):
-        if not self.hybrid_engine:
-            self.actor_wg.sync_rollout_weights()
-            ray.get(self.rollout_wg.sync_rollout_weights())
+        self.actor_wg.sync_rollout_weights()
+        ray.get(self.rollout_wg.sync_rollout_weights())
 
     def _create_continuous_iterator(self):
         """
@@ -318,7 +246,6 @@ def fit(self):
         to construct the PPO dataflow.
         The light-weight advantage computation is done on the driver process.
         """
-        from omegaconf import OmegaConf
 
         from verl.utils.tracking import Tracking
 
diff --git a/verl/trainer/ppo/ray_trainer.py b/verl/trainer/ppo/ray_trainer.py
index 49334db6bcd..26150cc631d 100644
--- a/verl/trainer/ppo/ray_trainer.py
+++ b/verl/trainer/ppo/ray_trainer.py
@@ -79,6 +79,40 @@ class Role(Enum):
     RewardModel = 5
     ActorRolloutRef = 6
 
+    def __str__(self):
+        """返回与代码中一致的字符串表示"""
+        return self._get_role_string()
+
+    def _get_role_string(self):
+        """获取角色对应的字符串名称"""
+        role_mapping = {
+            Role.Actor: "actor",
+            Role.Rollout: "rollout",
+            Role.ActorRollout: "actor_rollout",
+            Role.Critic: "critic",
+            Role.RefPolicy: "ref",
+            Role.RewardModel: "rm",
+            Role.ActorRolloutRef: "actor_rollout_ref",
+        }
+        return role_mapping.get(self, self.name.lower())
+
+    @classmethod
+    def from_string(cls, name: str):
+        """从字符串创建Role实例"""
+        string_mapping = {
+            "actor": cls.Actor,
+            "rollout": cls.Rollout,
+            "actor_rollout": cls.ActorRollout,
+            "critic": cls.Critic,
+            "ref": cls.RefPolicy,
+            "rm": cls.RewardModel,
+            "actor_rollout_ref": cls.ActorRolloutRef,
+        }
+        role = string_mapping.get(name.lower())
+        if role is None:
+            raise ValueError(f"No Role found for string: {name}")
+        return role
+
 
 @dataclass
 class ResourcePoolManager:
@@ -776,48 +810,65 @@ def init_workers(self):
         1. Ray resource pools from configuration
         2. Worker groups for each role (actor, critic, etc.)
         """
-        self.resource_pool_manager.create_resource_pool()
+        self._init_resource_pools()
+        self._create_worker_classes()
+        self._init_worker_groups()
+        self._init_models()
+        self._init_async_rollout_manager()
 
+    def _init_resource_pools(self):
+        self.resource_pool_manager.create_resource_pool()
         self.resource_pool_to_cls = {pool: {} for pool in self.resource_pool_manager.resource_pool_dict.values()}
 
+    def _create_worker_classes(self):
+        self._create_actor_rollout_classes()
+        self._create_critic_class()
+        self._create_reference_policy_class()
+        self._create_reward_model_class()
+
+    def _create_actor_rollout_classes(self):
         # create actor and rollout
         if self.hybrid_engine:
             resource_pool = self.resource_pool_manager.get_resource_pool(Role.ActorRollout)
             actor_rollout_cls = RayClassWithInitArgs(
                 cls=self.role_worker_mapping[Role.ActorRollout],
                 config=self.config.actor_rollout_ref,
-                role="actor_rollout",
+                role=str(Role.ActorRollout),
                 profile_option=self.config.trainer.npu_profile.options,
             )
-            self.resource_pool_to_cls[resource_pool]["actor_rollout"] = actor_rollout_cls
+            self.resource_pool_to_cls[resource_pool][str(Role.ActorRollout)] = actor_rollout_cls
         else:
             raise NotImplementedError
 
+    def _create_critic_class(self):
         # create critic
         if self.use_critic:
             resource_pool = self.resource_pool_manager.get_resource_pool(Role.Critic)
             critic_cfg = omega_conf_to_dataclass(self.config.critic)
             critic_cls = RayClassWithInitArgs(cls=self.role_worker_mapping[Role.Critic], config=critic_cfg)
-            self.resource_pool_to_cls[resource_pool]["critic"] = critic_cls
+            self.resource_pool_to_cls[resource_pool][str(Role.Critic)] = critic_cls
 
+    def _create_reference_policy_class(self):
         # create reference policy if needed
         if self.use_reference_policy:
             resource_pool = self.resource_pool_manager.get_resource_pool(Role.RefPolicy)
             ref_policy_cls = RayClassWithInitArgs(
                 self.role_worker_mapping[Role.RefPolicy],
                 config=self.config.actor_rollout_ref,
-                role="ref",
+                role=str(Role.RefPolicy),
                 profile_option=self.config.trainer.npu_profile.options,
             )
-            self.resource_pool_to_cls[resource_pool]["ref"] = ref_policy_cls
+            self.resource_pool_to_cls[resource_pool][str(Role.RefPolicy)] = ref_policy_cls
 
+    def _create_reward_model_class(self):
         # create a reward model if reward_fn is None
         if self.use_rm:
             # we create a RM here
             resource_pool = self.resource_pool_manager.get_resource_pool(Role.RewardModel)
             rm_cls = RayClassWithInitArgs(self.role_worker_mapping[Role.RewardModel], config=self.config.reward_model)
-            self.resource_pool_to_cls[resource_pool]["rm"] = rm_cls
+            self.resource_pool_to_cls[resource_pool][str(Role.RewardModel)] = rm_cls
 
+    def _init_worker_groups(self):
         # initialize WorkerGroup
         # NOTE: if you want to use a different resource pool for each role, which can support different parallel size,
         # you should not use `create_colocated_worker_cls`.
@@ -846,23 +897,26 @@ def init_workers(self):
             )
             spawn_wg = wg_dict.spawn(prefix_set=class_dict.keys())
             all_wg.update(spawn_wg)
+        self.all_wg = all_wg
 
+    def _init_models(self):
         if self.use_critic:
-            self.critic_wg = all_wg["critic"]
+            self.critic_wg = self.all_wg[str(Role.Critic)]
             self.critic_wg.init_model()
 
         if self.use_reference_policy and not self.ref_in_actor:
-            self.ref_policy_wg = all_wg["ref"]
+            self.ref_policy_wg = self.all_wg[str(Role.RefPolicy)]
             self.ref_policy_wg.init_model()
 
         if self.use_rm:
-            self.rm_wg = all_wg["rm"]
+            self.rm_wg = self.all_wg[str(Role.RewardModel)]
             self.rm_wg.init_model()
 
         # we should create rollout at the end so that vllm can have a better estimation of kv cache memory
-        self.actor_rollout_wg = all_wg["actor_rollout"]
+        self.actor_rollout_wg = self.all_wg[Role.ActorRollout]
         self.actor_rollout_wg.init_model()
 
+    def _init_async_rollout_manager(self):
         # create async rollout manager and request scheduler
         self.async_rollout_mode = False
         if self.config.actor_rollout_ref.rollout.mode == "async":

From 9e8b596271574776088dc5e2e8778fbf955d62c0 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Fri, 1 Aug 2025 15:34:42 +0800
Subject: [PATCH 016/182] init worker

---
 .../fully_async_rollouter.py                  | 386 +++++++-----------
 .../fully_async_policy/fully_async_trainer.py |  23 +-
 recipe/fully_async_policy/message_queue.py    |   8 +-
 recipe/fully_async_policy/unittest/test_mq.py |   4 +-
 recipe/one_step_off_policy/ray_trainer.py     |  47 +--
 5 files changed, 182 insertions(+), 286 deletions(-)

diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py
index 6b41d635013..f1248441594 100644
--- a/recipe/fully_async_policy/fully_async_rollouter.py
+++ b/recipe/fully_async_policy/fully_async_rollouter.py
@@ -11,25 +11,21 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 import logging
 import threading
 import time
-import uuid
 from concurrent.futures import ThreadPoolExecutor
-from typing import Optional
 
-import numpy as np
 import ray
 from omegaconf import OmegaConf
 from torch.utils.data import Dataset, Sampler
+from tqdm import tqdm
 
 from recipe.fully_async_policy.message_queue import MessageQueueClient
-from verl import DataProto
 from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup
-from verl.single_controller.ray.base import create_colocated_worker_cls
-from verl.trainer.ppo.ray_trainer import ResourcePoolManager, Role, WorkerType, RayPPOTrainer
+from verl.trainer.ppo.ray_trainer import RayPPOTrainer, ResourcePoolManager, Role, WorkerType
 from verl.utils.debug import marked_timer
+from verl.utils.tracking import ValidationGenerationsLogger
 
 logger = logging.getLogger(__name__)
 
@@ -46,7 +42,7 @@ def __init__(self):
         self.lock = threading.RLock()
         self.pause_count = 0
 
-    def pause(self, timeout: Optional[float] = None) -> bool:
+    def pause(self, timeout: float | None = None) -> bool:
         """
         暂停rollout
 
@@ -115,7 +111,7 @@ def get_status(self) -> dict:
             }
 
 
-@ray.remote
+@ray.remote(num_cpus=10, max_concurrency=10)
 class FullyAsyncRollouter(RayPPOTrainer):
     """
     异步样本生成器，负责持续生成训练样本并放入MessageQueue
@@ -123,20 +119,20 @@ class FullyAsyncRollouter(RayPPOTrainer):
     """
 
     def __init__(
-        self,
-        config,
-        tokenizer,
-        role_worker_mapping: dict[Role, WorkerType],
-        resource_pool_manager: ResourcePoolManager,
-        ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup,
-        processor=None,
-        reward_fn=None,
-        val_reward_fn=None,
-        train_dataset: Optional[Dataset] = None,
-        val_dataset: Optional[Dataset] = None,
-        collate_fn=None,
-        train_sampler: Optional[Sampler] = None,
-        device_name=None,
+            self,
+            config,
+            tokenizer,
+            role_worker_mapping: dict[Role, WorkerType],
+            resource_pool_manager: ResourcePoolManager,
+            ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup,
+            processor=None,
+            reward_fn=None,
+            val_reward_fn=None,
+            train_dataset: Dataset | None = None,
+            val_dataset: Dataset | None = None,
+            collate_fn=None,
+            train_sampler: Sampler | None = None,
+            device_name=None,
     ):
         """
         Initialize distributed PPO trainer with Ray backend.
@@ -170,8 +166,6 @@ def __init__(
 
         self.role_worker_mapping = role_worker_mapping
         self.resource_pool_manager = resource_pool_manager
-        self.use_reference_policy = Role.RefPolicy in role_worker_mapping
-        self.use_rm = Role.RewardModel in role_worker_mapping
         self.ray_worker_group_cls = ray_worker_group_cls
         self.device_name = device_name if device_name else self.config.trainer.device
         self.validation_generations_logger = ValidationGenerationsLogger(
@@ -179,25 +173,11 @@ def __init__(
             experiment_name=self.config.trainer.experiment_name,
         )
 
-        # if ref_in_actor is True, the reference policy will be actor without lora applied
-        self.ref_in_actor = config.actor_rollout_ref.model.get("lora_rank", 0) > 0
-
-        # define in-reward KL control
-        # kl loss control currently not suppoorted
-        if self.config.algorithm.use_kl_in_reward:
-            self.kl_ctrl_in_reward = core_algos.get_kl_controller(self.config.algorithm.kl_ctrl)
-
-        if config.critic.enable is not None:
-            self.use_critic = bool(config.critic.enable)
-        elif self.config.algorithm.adv_estimator == AdvantageEstimator.GAE:
-            self.use_critic = True
-        else:
-            warnings.warn(
-                "Disabled critic as algorithm.adv_estimator != gae. "
-                "If it is not intended, please set critic.enable=True",
-                stacklevel=2,
-            )
-            self.use_critic = False
+        self.ref_in_actor = False
+        self.kl_ctrl_in_reward = False
+        self.use_critic = False
+        self.use_reference_policy = False
+        self.use_rm = False
 
         self._validate_config()
         self._create_dataloader(train_dataset, val_dataset, collate_fn, train_sampler)
@@ -212,7 +192,6 @@ def __init__(
         self.staleness_threshold = async_config.get("staleness_threshold", 3)
         self.max_staleness_allowed = async_config.get("max_staleness_allowed", 5)
         self.generation_timeout = async_config.get("generation_timeout", 30.0)
-        self.batch_generation_interval = async_config.get("batch_generation_interval", 0.1)
 
         # 统计信息
         self.total_generated_samples = 0
@@ -239,91 +218,50 @@ def set_message_queue_client(self, message_queue_client: MessageQueueClient):
         """设置消息队列客户端"""
         self.message_queue_client = message_queue_client
 
-    def _validate(self):
-        """执行验证 - 参考OneStepOffRayTrainer的验证逻辑"""
-        return None
+    def set_parameter_synchronizer(self, param_synchronizer):
+        """设置参数同步器"""
+        self.param_synchronizer = param_synchronizer
 
     def _validate_config(self):
-        """验证配置"""
-        required_configs = [
-            "data.train_batch_size",
-            "actor_rollout_ref.rollout.n",
-            "async_training.staleness_threshold",
-        ]
-
-        for config_path in required_configs:
-            if not OmegaConf.select(self.config, config_path):
-                logger.warning(f"Missing recommended config: {config_path}")
-
         # 验证异步训练配置
         if not hasattr(self.config, "async_training"):
             raise ValueError("Missing async_training configuration")
 
     def init_workers(self):
-        """初始化rollout workers - 参考OneStepOffRayTrainer的实现"""
+        """初始化rollout workers"""
         logger.info("Initializing Rollouter workers...")
-
-        self.resource_pool_manager.create_resource_pool()
-        self.resource_pool_to_cls = {pool: {} for pool in self.resource_pool_manager.resource_pool_dict.values()}
-
-        # 只创建rollout worker
-        resource_pool = self.resource_pool_manager.get_resource_pool(Role.Rollout)
-        role_cls = RayClassWithInitArgs(
-            cls=self.role_worker_mapping[Role.Rollout],
-            config=self.config.actor_rollout_ref,
-            role="rollout",
-        )
-        self.resource_pool_to_cls[resource_pool]["rollout"] = role_cls
-
-        # 初始化WorkerGroup
-        all_wg = {}
-        wg_kwargs = {}
-        if OmegaConf.select(self.config.trainer, "ray_wait_register_center_timeout") is not None:
-            wg_kwargs["ray_wait_register_center_timeout"] = self.config.trainer.ray_wait_register_center_timeout
-        if OmegaConf.select(self.config.trainer, "profile_steps") is not None:
-            wg_kwargs["profile_steps"] = OmegaConf.select(self.config.trainer, "profile_steps")
-            if OmegaConf.select(self.config.trainer, "worker_nsight_options") is not None:
-                wg_kwargs["worker_nsight_options"] = OmegaConf.to_container(
-                    OmegaConf.select(self.config.trainer, "worker_nsight_options")
-                )
-
-        for resource_pool, class_dict in self.resource_pool_to_cls.items():
-            worker_dict_cls = create_colocated_worker_cls(class_dict=class_dict)
-            wg_dict = self.ray_worker_group_cls(
-                resource_pool=resource_pool,
-                ray_cls_with_init=worker_dict_cls,
-                device_name=self.device_name,
-                **wg_kwargs,
-            )
-            spawn_wg = wg_dict.spawn(prefix_set=class_dict.keys())
-            all_wg.update(spawn_wg)
+        self._init_resource_pools()
 
         self.rollout_wg = all_wg["rollout"]
         self.rollout_wg.init_model()
 
-        # 初始化异步rollout管理器（如果需要）
-        if self.async_rollout_mode:
-            self._init_async_rollout_manager()
+    def _create_actor_rollout_classes(self):
+        # only create rollout
+        for role in [Role.Rollout]:
+            resource_pool = self.resource_pool_manager.get_resource_pool(role)
+            role_cls = RayClassWithInitArgs(
+                cls=self.role_worker_mapping[role],
+                config=self.config.actor_rollout_ref,
+                role=str(role),
+            )
+            self.resource_pool_to_cls[resource_pool][str(role)] = role_cls
 
-        logger.info("Rollouter workers initialized successfully")
+    def _init_models(self):
+        self.rollout_wg = self.all_wg[str(Role.Rollout)]
+        self.rollout_wg.init_model()
+        self.actor_rollout_wg = self.rollout_wg
 
     def _init_async_rollout_manager(self):
-        """初始化异步rollout管理器"""
-        try:
-            from verl.workers.rollout.async_server import AsyncLLMServerManager
+        # create async rollout manager and request scheduler
+        self.async_rollout_mode = False
+        if self.config.actor_rollout_ref.rollout.mode == "async":
+            from verl.experimental.agent_loop import AgentLoopManager
 
-            self.async_rollout_manager = AsyncLLMServerManager(
+            self.async_rollout_mode = True
+            self.async_rollout_manager = AgentLoopManager(
                 config=self.config,
-                worker_group=self.rollout_wg,
+                worker_group=self.actor_rollout_wg,
             )
-            logger.info("Async rollout manager initialized")
-        except Exception as e:
-            logger.warning(f"Failed to initialize async rollout manager: {e}")
-            self.async_rollout_mode = False
-
-    def set_parameter_synchronizer(self, param_synchronizer):
-        """设置参数同步器"""
-        self.param_synchronizer = param_synchronizer
 
     def update_rollout_weights(self, param_version: int) -> bool:
         """
@@ -468,143 +406,117 @@ def _should_pause_generation(self) -> bool:
             logger.error(f"Error checking pause conditions: {e}")
             return True  # 出错时暂停生成
 
-    def _generate_batch(self, epoch: int, batch_dict: dict) -> Optional[DataProto]:
-        """生成单个batch的样本 - 改进的生成逻辑"""
-        try:
-            batch = DataProto.from_single_dict(batch_dict)
-
-            # 处理batch用于生成 - 参考OneStepOffRayTrainer的处理逻辑
-            batch_keys_to_pop = ["input_ids", "attention_mask", "position_ids"]
-            non_tensor_batch_keys_to_pop = ["raw_prompt_ids"]
-
-            # 处理多模态数据和其他可选字段
-            optional_keys = ["multi_modal_data", "raw_prompt", "tools_kwargs", "interaction_kwargs"]
-            for key in optional_keys:
-                if key in batch.non_tensor_batch:
-                    non_tensor_batch_keys_to_pop.append(key)
-
-            gen_batch = batch.pop(
-                batch_keys=batch_keys_to_pop,
-                non_tensor_batch_keys=non_tensor_batch_keys_to_pop,
-            )
-
-            # 重复生成多个响应 - 参考OneStepOffRayTrainer
-            n_repeats = self.config.actor_rollout_ref.rollout.n
-            gen_batch = gen_batch.repeat(repeat_times=n_repeats, interleave=True)
-
-            # 执行生成
-            if self.async_rollout_mode:
-                # 异步生成
-                gen_batch_output = ray.get(
-                    self.rollout_wg.async_generate_sequences.remote(gen_batch), timeout=self.generation_timeout
-                )
-            else:
-                # 同步生成
-                gen_batch_output = ray.get(
-                    self.rollout_wg.generate_sequences.remote(gen_batch), timeout=self.generation_timeout
-                )
-
-            # 添加UID - 确保每个样本有唯一标识
-            batch.non_tensor_batch["uid"] = np.array([str(uuid.uuid4()) for _ in range(len(batch.batch))], dtype=object)
-
-            # 重复原始batch以对齐生成的响应
-            batch = batch.repeat(repeat_times=n_repeats, interleave=True)
-
-            # 合并数据
-            final_batch = batch.union(gen_batch_output)
-
-            # 添加rollout metadata
-            final_batch.meta_info["rollout_param_version"] = self.current_param_version
-            final_batch.meta_info["generation_timestamp"] = time.time()
-
-            return final_batch
-
-        except Exception as e:
-            logger.error(f"Error generating batch: {e}")
-            self.generation_errors += 1
-            return None
-
-    def _generation_loop(self):
-        """主要的生成循环 - 改进的循环逻辑"""
-        logger.info("Starting generation loop...")
+    def fit(self):
+        """开始异步生成样本 - 改进的主运行逻辑
+        主要的生成循环
 
-        try:
-            continuous_iterator = self._create_continuous_iterator()
+        循环入口，需要
+        1. running 判断
+        4. 中断判断
+        3. 新鲜度判断
 
-            for epoch, batch_dict in continuous_iterator:
-                if not self.running:
-                    break
-
-                # 等待如果被暂停
-                if not self.rollout_controller.wait_if_paused(timeout=1.0):
-                    if not self.running:
-                        break
-                    continue
-
-                # 检查是否应该暂停生成
-                if self._should_pause_generation():
-                    time.sleep(self.batch_generation_interval)
-                    continue
-
-                # 生成样本
-                timing_raw = {}
-                with marked_timer("generate_batch", timing_raw):
-                    generated_batch = self._generate_batch(epoch, batch_dict)
-
-                if generated_batch is not None:
-                    # 准备rollout metadata
-                    rollout_metadata = {
-                        "timing": timing_raw,
-                        "generation_timestamp": time.time(),
-                        "rollout_param_version": self.current_param_version,
-                        "epoch": epoch,
-                    }
-
-                    # 放入队列
-                    success = self.message_queue_client.put_samples(
-                        epoch=epoch,
-                        sample=generated_batch,
-                        param_version=self.current_param_version,
-                        rollout_metadata=rollout_metadata,
-                    )
-
-                    if success:
-                        self.total_generated_samples += 1
-                        if self.total_generated_samples % 10 == 0:
-                            logger.info(
-                                f"Generated {self.total_generated_samples} batches, "
-                                f"param_version={self.current_param_version}, "
-                                f"errors={self.generation_errors}"
-                            )
-                    else:
-                        self.dropped_stale_samples += 1
-                        if self.dropped_stale_samples % 5 == 0:
-                            logger.warning(f"Dropped stale samples: {self.dropped_stale_samples}")
-
-                # 控制生成频率
-                if self.batch_generation_interval > 0:
-                    time.sleep(self.batch_generation_interval)
+        生成样本过程中，需要
+        1. running 判断
+        2. 中断判断
+        """
 
-        except Exception as e:
-            logger.error(f"Generation loop error: {e}")
-        finally:
-            logger.info("Generation loop finished")
+        from verl.utils.tracking import Tracking
 
-    def fit(self):
-        """开始异步生成样本 - 改进的主运行逻辑"""
+        logger = Tracking(
+            project_name=self.config.trainer.project_name,
+            experiment_name=self.config.trainer.experiment_name,
+            default_backend=self.config.trainer.logger,
+            config=OmegaConf.to_container(self.config, resolve=True),
+        )
         logger.info("Starting Rollouter...")
-
         if self.message_queue_client is None:
             raise ValueError("MessageQueue client not set. Call set_message_queue_client() first.")
-
+        if self.param_synchronizer is None:
+            raise ValueError("param_synchronizer client not set. Call set_parameter_synchronizer() first.")
         self.running = True
 
         # 在单独的线程中运行生成循环
-        self.generation_thread = threading.Thread(target=self._generation_loop, daemon=True)
-        self.generation_thread.start()
+        self.report_thread = threading.Thread(target=self._report_loop, daemon=True)
+        self.report_thread.start()
+
+        self.global_steps = 0
+
+        # load checkpoint before doing anything
+        self._load_checkpoint()
+
+        # perform validation before training
+        # currently, we only support validation using the reward_function.
+        if self.val_reward_fn is not None and self.config.trainer.get("val_before_train", True):
+            val_metrics = self._validate()
+            assert val_metrics, f"{val_metrics=}"
+            pprint(f"Initial validation metrics: {val_metrics}")
+            logger.log(data=val_metrics, step=self.global_steps)
+            if self.config.trainer.get("val_only", False):
+                return
+
+        # add tqdm
+        progress_bar = tqdm(total=self.total_training_steps, initial=self.global_steps, desc="Training Progress")
+
+        # we start from step 1
+        self.global_steps += 1
+        last_val_metrics = None
+        self.max_steps_duration = 0
+
+        continuous_iterator = self._create_continuous_iterator()
+        for epoch, batch_dict in continuous_iterator:
+            if not self.running:
+                break
+            # 等待如果被暂停
+            if not self.rollout_controller.wait_if_paused(timeout=1.0):
+                if not self.running:
+                    break
+
+            # 检查是否应该暂停生成
+            self._should_pause_generation()
+
+            metrics = {}
+            timing_raw = {}
+            batch, gen_batch = self._prepare_generate_batch(batch_dict)
+            is_last_step = self.global_steps >= self.total_training_steps
 
-        logger.info("Rollouter started successfully")
+            # generate a batch
+            with marked_timer("gen", timing_raw, color="red"):
+                if not self.async_rollout_mode:
+                    gen_batch_output = self.actor_rollout_wg.generate_sequences(gen_batch)
+                else:
+                    gen_batch_output = self.async_rollout_manager.generate_sequences(gen_batch)
+                timing_raw.update(gen_batch_output.meta_info["timing"])
+                gen_batch_output.meta_info.pop("timing", None)
+
+            if gen_batch_output is not None:
+                # 准备rollout metadata
+                rollout_metadata = {
+                    "timing": timing_raw,
+                    "generation_timestamp": time.time(),
+                    "rollout_param_version": self.current_param_version,
+                    "epoch": epoch,
+                }
+                # 放入队列
+                success = self.message_queue_client.put_samples(
+                    epoch=epoch,
+                    sample=gen_batch_output,
+                    param_version=self.current_param_version,
+                    rollout_metadata=rollout_metadata,
+                )
+                if success:
+                    self.total_generated_samples += 1
+                    if self.total_generated_samples % 10 == 0:
+                        logger.info(
+                            f"Generated {self.total_generated_samples} batches, "
+                            f"param_version={self.current_param_version}, "
+                            f"errors={self.generation_errors}"
+                        )
+                else:
+                    self.dropped_stale_samples += 1
+                    if self.dropped_stale_samples % 5 == 0:
+                        logger.warning(f"Dropped stale samples: {self.dropped_stale_samples}")
 
+    def _report_loop(self):
         try:
             # 主线程保持运行，处理控制信号和状态监控
             last_stats_time = time.time()
diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py
index 16354313e4e..97567527b97 100644
--- a/recipe/fully_async_policy/fully_async_trainer.py
+++ b/recipe/fully_async_policy/fully_async_trainer.py
@@ -16,38 +16,25 @@
 import time
 import warnings
 from pprint import pprint
-from typing import Optional
 
 import numpy as np
 import ray
-import torch
 from omegaconf import OmegaConf
 from torch.utils.data import Dataset, Sampler
 from tqdm import tqdm
 
-from recipe.fully_async_policy.message_queue import QueueSample, MessageQueueClient
-from verl import DataProto
+from recipe.fully_async_policy.message_queue import MessageQueueClient, QueueSample
 from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup
 from verl.single_controller.ray.base import create_colocated_worker_cls
 from verl.trainer.ppo import core_algos
-from verl.trainer.ppo.core_algos import AdvantageEstimator, agg_loss
-from verl.trainer.ppo.metric_utils import (
-    compute_data_metrics,
-    compute_throughout_metrics,
-    compute_timing_metrics,
-)
+from verl.trainer.ppo.core_algos import AdvantageEstimator
 from verl.trainer.ppo.ray_trainer import (
     RayPPOTrainer,
     ResourcePoolManager,
     Role,
     WorkerType,
-    apply_kl_penalty,
-    compute_advantage,
-    compute_response_mask,
 )
-from verl.trainer.ppo.reward import compute_reward, compute_reward_async
 from verl.utils.debug import marked_timer
-from verl.utils.metric import reduce_metrics
 from verl.utils.tracking import ValidationGenerationsLogger
 
 logger = logging.getLogger(__name__)
@@ -70,10 +57,10 @@ def __init__(
         processor=None,
         reward_fn=None,
         val_reward_fn=None,
-        train_dataset: Optional[Dataset] = None,
-        val_dataset: Optional[Dataset] = None,
+            train_dataset: Dataset | None = None,
+            val_dataset: Dataset | None = None,
         collate_fn=None,
-        train_sampler: Optional[Sampler] = None,
+            train_sampler: Sampler | None = None,
         device_name=None,
     ):
         """
diff --git a/recipe/fully_async_policy/message_queue.py b/recipe/fully_async_policy/message_queue.py
index 8e686e9a471..06f0d2cbbe9 100644
--- a/recipe/fully_async_policy/message_queue.py
+++ b/recipe/fully_async_policy/message_queue.py
@@ -18,7 +18,7 @@
 import uuid
 from collections import deque
 from dataclasses import dataclass
-from typing import Any, Optional, List
+from typing import Any
 
 import ray
 from omegaconf import DictConfig
@@ -77,7 +77,7 @@ def __init__(self, config: DictConfig, max_queue_size: int = 1000):
         )
 
     def put_samples(
-        self, epoch: int, samples: List[Any], param_version: int, rollout_metadata_list: List[dict[str, Any]] = None
+            self, epoch: int, samples: list[Any], param_version: int, rollout_metadata_list: list[dict[str, Any]] = None
     ) -> bool:
         """
         放入一个batch样本到队列
@@ -107,7 +107,7 @@ def put_samples(
                 logger.warning(f"len(rollout_metadata_list):{len(rollout_metadata_list)} != len(samples:{len(samples)}")
                 return False
 
-            for sample, meta in zip(samples, rollout_metadata_list):
+            for sample, meta in zip(samples, rollout_metadata_list, strict=False):
                 queue_sample = QueueSample(
                     id=str(uuid.uuid4()),
                     epoch=epoch,
@@ -238,7 +238,7 @@ def __init__(self, queue_actor: Any):
         self.queue_actor = queue_actor
 
     def put_batch(
-        self, epoch: int, batch: List[Any], param_version: int, rollout_metadata_list: List[dict[str, Any]] = None
+            self, epoch: int, batch: list[Any], param_version: int, rollout_metadata_list: list[dict[str, Any]] = None
     ) -> bool:
         """放入batch到队列"""
         return ray.get(self.queue_actor.put_samples.remote(epoch, batch, param_version, rollout_metadata_list))
diff --git a/recipe/fully_async_policy/unittest/test_mq.py b/recipe/fully_async_policy/unittest/test_mq.py
index 36172d02640..52a9f17d8ae 100644
--- a/recipe/fully_async_policy/unittest/test_mq.py
+++ b/recipe/fully_async_policy/unittest/test_mq.py
@@ -11,16 +11,16 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import os
 import threading
 import time
 from unittest.mock import Mock
 
 import pytest
 import ray
-from recipe.fully_async_policy.message_queue import QueueSample, MessageQueue, MessageQueueClient
 from omegaconf import DictConfig
 
+from recipe.fully_async_policy.message_queue import MessageQueue, MessageQueueClient, QueueSample
+
 
 @pytest.fixture
 def mock_data_proto():
diff --git a/recipe/one_step_off_policy/ray_trainer.py b/recipe/one_step_off_policy/ray_trainer.py
index c5b7a71225e..893760965d0 100644
--- a/recipe/one_step_off_policy/ray_trainer.py
+++ b/recipe/one_step_off_policy/ray_trainer.py
@@ -76,20 +76,20 @@ class OneStepOffRayTrainer(RayPPOTrainer):
     # TODO: support each role have individual ray_worker_group_cls,
     # i.e., support different backend of different role
     def __init__(
-        self,
-        config,
-        tokenizer,
-        role_worker_mapping: dict[Role, WorkerType],
-        resource_pool_manager: ResourcePoolManager,
-        ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup,
-        processor=None,
-        reward_fn=None,
-        val_reward_fn=None,
-        train_dataset: Dataset | None = None,
-        val_dataset: Dataset | None = None,
-        collate_fn=None,
-        train_sampler: Sampler | None = None,
-        device_name=None,
+            self,
+            config,
+            tokenizer,
+            role_worker_mapping: dict[Role, WorkerType],
+            resource_pool_manager: ResourcePoolManager,
+            ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup,
+            processor=None,
+            reward_fn=None,
+            val_reward_fn=None,
+            train_dataset: Dataset | None = None,
+            val_dataset: Dataset | None = None,
+            collate_fn=None,
+            train_sampler: Sampler | None = None,
+            device_name=None,
     ):
         """
         Initialize distributed PPO trainer with Ray backend.
@@ -164,17 +164,14 @@ def _validate(self):
 
     def _create_actor_rollout_classes(self):
         # create actor and rollout
-        if not self.hybrid_engine:
-            for role in [Role.Actor, Role.Rollout]:
-                resource_pool = self.resource_pool_manager.get_resource_pool(role)
-                role_cls = RayClassWithInitArgs(
-                    cls=self.role_worker_mapping[role],
-                    config=self.config.actor_rollout_ref,
-                    role=str(role),
-                )
-                self.resource_pool_to_cls[resource_pool][str(role)] = role_cls
-        else:
-            raise NotImplementedError
+        for role in [Role.Actor, Role.Rollout]:
+            resource_pool = self.resource_pool_manager.get_resource_pool(role)
+            role_cls = RayClassWithInitArgs(
+                cls=self.role_worker_mapping[role],
+                config=self.config.actor_rollout_ref,
+                role=str(role),
+            )
+            self.resource_pool_to_cls[resource_pool][str(role)] = role_cls
 
     def _init_models(self):
         if self.use_critic:

From 8d8b99d42ce393f39ac5fbacd06b67befc2724f9 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Mon, 4 Aug 2025 17:32:48 +0800
Subject: [PATCH 017/182] add rollouter thread

---
 .../fully_async_rollouter.py                  | 461 ++++++++++--------
 .../fully_async_policy/fully_async_trainer.py |  94 +---
 2 files changed, 284 insertions(+), 271 deletions(-)

diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py
index f1248441594..0f4f624007e 100644
--- a/recipe/fully_async_policy/fully_async_rollouter.py
+++ b/recipe/fully_async_policy/fully_async_rollouter.py
@@ -15,6 +15,7 @@
 import threading
 import time
 from concurrent.futures import ThreadPoolExecutor
+from typing import Optional
 
 import ray
 from omegaconf import OmegaConf
@@ -29,88 +30,6 @@
 
 logger = logging.getLogger(__name__)
 
-
-class RolloutController:
-    """控制rollout的暂停和恢复 - 改进的控制机制"""
-
-    def __init__(self):
-        self.is_paused = False
-        self.pause_event = threading.Event()
-        self.resume_event = threading.Event()
-        self.resume_event.set()  # 初始状态为可运行
-        self.pending_requests = []
-        self.lock = threading.RLock()
-        self.pause_count = 0
-
-    def pause(self, timeout: float | None = None) -> bool:
-        """
-        暂停rollout
-
-        Args:
-            timeout: 暂停超时时间，如果为None则无限等待
-
-        Returns:
-            bool: 是否成功暂停
-        """
-        with self.lock:
-            if not self.is_paused:
-                self.is_paused = True
-                self.resume_event.clear()
-                self.pause_event.set()
-                self.pause_count += 1
-                logger.info(f"Rollout paused (count: {self.pause_count})")
-                return True
-            else:
-                logger.debug("Rollout already paused")
-                return True
-
-    def resume(self) -> bool:
-        """
-        恢复rollout
-
-        Returns:
-            bool: 是否成功恢复
-        """
-        with self.lock:
-            if self.is_paused:
-                self.is_paused = False
-                self.pause_event.clear()
-                self.resume_event.set()
-                logger.info("Rollout resumed")
-                return True
-            else:
-                logger.debug("Rollout already running")
-                return True
-
-    def wait_if_paused(self, timeout: float = None) -> bool:
-        """
-        如果被暂停则等待恢复
-
-        Args:
-            timeout: 等待超时时间
-
-        Returns:
-            bool: 是否成功等待（未超时）
-        """
-        if self.is_paused:
-            logger.debug(f"Waiting for resume (timeout: {timeout})")
-            return self.resume_event.wait(timeout)
-        return True
-
-    def is_pause_requested(self) -> bool:
-        """检查是否有暂停请求"""
-        return self.pause_event.is_set()
-
-    def get_status(self) -> dict:
-        """获取控制器状态"""
-        with self.lock:
-            return {
-                "is_paused": self.is_paused,
-                "pause_count": self.pause_count,
-                "has_pending_requests": len(self.pending_requests) > 0,
-            }
-
-
 @ray.remote(num_cpus=10, max_concurrency=10)
 class FullyAsyncRollouter(RayPPOTrainer):
     """
@@ -181,10 +100,10 @@ def __init__(
 
         self._validate_config()
         self._create_dataloader(train_dataset, val_dataset, collate_fn, train_sampler)
+
+        # rollouter 参数配置
         self.message_queue_client = None
 
-        # Rollout控制
-        self.rollout_controller = RolloutController()
         self.current_param_version = 0
 
         # 新鲜度控制 - 改进的配置管理
@@ -203,10 +122,19 @@ def __init__(
         self.rollout_wg = None
         self.message_queue_client = None
 
-        # 运行状态
+        # 并发控制
         self.running = False
+        self.paused = False
         self.generation_thread = None
         self.thread_executor = ThreadPoolExecutor(max_workers=2)
+        self.lock = threading.RLock()
+        self.condition = threading.Condition(self.lock)
+
+        # 暂停/恢复统计信息
+        self.pause_count = 0
+        self.resume_count = 0
+        self.total_pause_time = 0.0
+        self.last_pause_time = None
 
         # 参数同步相关
         self.param_synchronizer = None
@@ -216,11 +144,13 @@ def __init__(
 
     def set_message_queue_client(self, message_queue_client: MessageQueueClient):
         """设置消息队列客户端"""
-        self.message_queue_client = message_queue_client
+        with self.lock:
+            self.message_queue_client = message_queue_client
 
     def set_parameter_synchronizer(self, param_synchronizer):
         """设置参数同步器"""
-        self.param_synchronizer = param_synchronizer
+        with self.lock:
+            self.param_synchronizer = param_synchronizer
 
     def _validate_config(self):
         # 验证异步训练配置
@@ -229,11 +159,11 @@ def _validate_config(self):
 
     def init_workers(self):
         """初始化rollout workers"""
-        logger.info("Initializing Rollouter workers...")
-        self._init_resource_pools()
-
-        self.rollout_wg = all_wg["rollout"]
-        self.rollout_wg.init_model()
+        with self.lock:
+            logger.info("Initializing Rollouter workers...")
+            self._init_resource_pools()
+            self.rollout_wg = self.all_wg["rollout"]
+            self.rollout_wg.init_model()
 
     def _create_actor_rollout_classes(self):
         # only create rollout
@@ -371,43 +301,43 @@ def _create_continuous_iterator(self):
             for batch_dict in iterator:
                 yield epoch, batch_dict
 
-    def _should_pause_generation(self) -> bool:
-        """
-        判断是否应该暂停生成，基于新鲜度控制 - 改进的判断逻辑
-        """
-        if self.message_queue_client is None:
-            return False
+    def fit(self):
+        """开始异步生成样本 - 改进的主运行逻辑"""
+        from verl.utils.tracking import Tracking
 
-        try:
-            queue_stats = self.message_queue_client.get_statistics()
-            queue_size = queue_stats["queue_size"]
-            current_trainer_version = queue_stats["current_param_version"]
+        logger = Tracking(
+            project_name=self.config.trainer.project_name,
+            experiment_name=self.config.trainer.experiment_name,
+            default_backend=self.config.trainer.logger,
+            config=OmegaConf.to_container(self.config, resolve=True),
+        )
+        logger.info("Starting Rollouter...")
+        if self.message_queue_client is None:
+            raise ValueError("MessageQueue client not set. Call set_message_queue_client() first.")
+        if self.param_synchronizer is None:
+            raise ValueError("param_synchronizer client not set. Call set_parameter_synchronizer() first.")
 
-            # 计算参数版本差异
-            version_diff = self.current_param_version - current_trainer_version
+        # 设置运行状态
+        with self.lock:
+            self.running = True
+            self.paused = False
 
-            # 如果版本差异过大，暂停生成
-            if version_diff >= self.max_staleness_allowed:
-                logger.debug(
-                    f"Should pause due to staleness: rollout_version={self.current_param_version}, "
-                    f"trainer_version={current_trainer_version}, diff={version_diff}"
-                )
-                return True
+        # 创建并启动生成线程
+        self.generation_thread = threading.Thread(target=self._generation_loop, daemon=True)
+        self.generation_thread.start()
 
-            # 如果队列太满，也暂停生成
-            max_queue_size = self.staleness_threshold * self.config.data.train_batch_size
-            if queue_size >= max_queue_size:
-                logger.debug(f"Should pause due to full queue: size={queue_size}, max={max_queue_size}")
-                return True
+        # 创建并启动监控线程
+        self.monitor_thread = threading.Thread(target=self._monitor_loop, daemon=True)
+        self.monitor_thread.start()
 
-            return False
+        # 等待线程完成
+        self.generation_thread.join()
+        self.monitor_thread.join()
 
-        except Exception as e:
-            logger.error(f"Error checking pause conditions: {e}")
-            return True  # 出错时暂停生成
+        logger.info("Rollouter fit completed")
 
-    def fit(self):
-        """开始异步生成样本 - 改进的主运行逻辑
+    def _generation_loop(self):
+        """
         主要的生成循环
 
         循环入口，需要
@@ -428,16 +358,6 @@ def fit(self):
             default_backend=self.config.trainer.logger,
             config=OmegaConf.to_container(self.config, resolve=True),
         )
-        logger.info("Starting Rollouter...")
-        if self.message_queue_client is None:
-            raise ValueError("MessageQueue client not set. Call set_message_queue_client() first.")
-        if self.param_synchronizer is None:
-            raise ValueError("param_synchronizer client not set. Call set_parameter_synchronizer() first.")
-        self.running = True
-
-        # 在单独的线程中运行生成循环
-        self.report_thread = threading.Thread(target=self._report_loop, daemon=True)
-        self.report_thread.start()
 
         self.global_steps = 0
 
@@ -462,17 +382,40 @@ def fit(self):
         last_val_metrics = None
         self.max_steps_duration = 0
 
+        """
+        主要的生成循环
+
+        循环入口，需要
+        1. running 判断
+        4. 中断判断
+        3. 新鲜度判断
+
+        生成样本过程中，需要
+        1. running 判断
+        2. 中断判断
+        """
+
         continuous_iterator = self._create_continuous_iterator()
         for epoch, batch_dict in continuous_iterator:
-            if not self.running:
-                break
-            # 等待如果被暂停
-            if not self.rollout_controller.wait_if_paused(timeout=1.0):
+            with self.lock:
+                if not self.running:
+                    break
+
+                # 如果被暂停，等待恢复
+                while self.paused and self.running:
+                    logger.debug("Generation thread paused, waiting...")
+                    self.condition.wait()
+
+                # 再次检查运行状态
                 if not self.running:
                     break
 
             # 检查是否应该暂停生成
-            self._should_pause_generation()
+            while True:
+                if self._should_pause_generation():
+                    with self.lock:
+                        self.paused = True
+                        logger.info("Generation paused due to staleness or queue size")
 
             metrics = {}
             timing_raw = {}
@@ -503,18 +446,54 @@ def fit(self):
                     param_version=self.current_param_version,
                     rollout_metadata=rollout_metadata,
                 )
-                if success:
-                    self.total_generated_samples += 1
-                    if self.total_generated_samples % 10 == 0:
-                        logger.info(
-                            f"Generated {self.total_generated_samples} batches, "
-                            f"param_version={self.current_param_version}, "
-                            f"errors={self.generation_errors}"
-                        )
-                else:
-                    self.dropped_stale_samples += 1
-                    if self.dropped_stale_samples % 5 == 0:
-                        logger.warning(f"Dropped stale samples: {self.dropped_stale_samples}")
+
+                with self.lock:
+                    if success:
+                        self.total_generated_samples += 1
+                        if self.total_generated_samples % 10 == 0:
+                            logger.info(
+                                f"Generated {self.total_generated_samples} batches, "
+                                f"param_version={self.current_param_version}, "
+                                f"errors={self.generation_errors}"
+                            )
+                    else:
+                        self.dropped_stale_samples += 1
+                        if self.dropped_stale_samples % 5 == 0:
+                            logger.warning(f"Dropped stale samples: {self.dropped_stale_samples}")
+
+    def _monitor_loop(self):
+        """监控线程 - 监控状态并处理控制信号"""
+        try:
+            # 主线程保持运行，处理控制信号和状态监控
+            last_stats_time = time.time()
+            stats_interval = 30.0  # 30秒报告一次统计
+            check_interval = 5.0  # 5秒检查一次状态
+
+            while True:
+                with self.lock:
+                    if not self.running:
+                        break
+
+                time.sleep(check_interval)
+
+                # 定期打印统计信息
+                current_time = time.time()
+                if current_time - last_stats_time >= stats_interval:
+                    self._log_statistics()
+                    last_stats_time = current_time
+
+                # 检查是否应该恢复生成
+                if self._should_resume_generation():
+                    with self.lock:
+                        if self.paused:
+                            self.paused = False
+                            self.condition.notify_all()
+                            logger.info("Generation resumed")
+
+        except Exception as e:
+            logger.error(f"Error in monitor loop: {e}")
+        finally:
+            logger.info("Monitor thread exiting")
 
     def _report_loop(self):
         try:
@@ -544,32 +523,116 @@ def _report_loop(self):
         finally:
             self.shutdown()
 
-    def _log_statistics(self):
-        """记录统计信息"""
+
+    def _should_pause_generation(self) -> bool:
+        """
+        判断是否应该暂停生成，基于新鲜度控制 - 改进的判断逻辑
+        """
         try:
-            controller_status = self.rollout_controller.get_status()
             queue_stats = self.message_queue_client.get_statistics()
+            queue_size = queue_stats["queue_size"]
+            current_trainer_version = queue_stats["current_param_version"]
+
+            # 计算参数版本差异
+            version_diff = self.current_param_version - current_trainer_version
+
+            # 如果版本差异过大，暂停生成
+            if version_diff >= self.max_staleness_allowed:
+                logger.debug(
+                    f"Should pause due to staleness: rollout_version={self.current_param_version}, "
+                    f"trainer_version={current_trainer_version}, diff={version_diff}"
+                )
+                return True
+
+            # 如果队列太满，也暂停生成
+            max_queue_size = self.staleness_threshold * self.config.data.train_batch_size
+            if queue_size >= max_queue_size:
+                logger.debug(f"Should pause due to full queue: size={queue_size}, max={max_queue_size}")
+                return True
+
+            return False
 
-            logger.info(
-                f"Rollouter stats - Generated: {self.total_generated_samples}, "
-                f"Dropped: {self.dropped_stale_samples}, "
-                f"Errors: {self.generation_errors}, "
-                f"Queue size: {queue_stats['queue_size']}, "
-                f"Param version: {self.current_param_version}, "
-                f"Paused: {controller_status['is_paused']}, "
-                f"Sync requests: {self.param_sync_requests}"
-            )
         except Exception as e:
-            logger.error(f"Error logging statistics: {e}")
+            logger.error(f"Error checking pause conditions: {e}")
+            return True  # 出错时暂停生成
+
+    def _should_resume_generation(self) -> bool:
+        """判断是否应该恢复生成"""
+        if self.message_queue_client is None:
+            return False
+
+        try:
+            with self.lock:
+                if not self.paused:
+                    return False
+
+            queue_stats = self.message_queue_client.get_statistics()
+            queue_size = queue_stats["queue_size"]
+            current_trainer_version = queue_stats["current_param_version"]
+
+            # 计算参数版本差异
+            version_diff = self.current_param_version - current_trainer_version
+
+            # 如果版本差异减小，可以恢复生成
+            if version_diff < self.max_staleness_allowed - 1:
+                logger.debug(
+                    f"Can resume due to reduced staleness: rollout_version={self.current_param_version}, "
+                    f"trainer_version={current_trainer_version}, diff={version_diff}"
+                )
+                return True
+
+            # 如果队列不太满，也可以恢复生成
+            resume_queue_size = (self.staleness_threshold * self.config.data.train_batch_size) // 2
+            if queue_size <= resume_queue_size:
+                logger.debug(
+                    f"Can resume due to reduced queue: size={queue_size}, resume_threshold={resume_queue_size}")
+                return True
+
+            return False
+
+        except Exception as e:
+            logger.error(f"Error checking resume conditions: {e}")
+            return False
+
+    def pause(self) -> bool:
+        """暂停生成 - 供外部调用"""
+        with self.lock:
+            if not self.running:
+                logger.warning("Cannot pause: not running")
+                return False
+
+            if self.paused:
+                logger.debug("Already paused")
+                return True
+
+            self.paused = True
+            logger.info("Generation paused")
+            return True
+
+    def resume(self) -> bool:
+        """恢复生成 - 供外部调用"""
+        with self.lock:
+            if not self.running:
+                logger.warning("Cannot resume: not running")
+                return False
+
+            if not self.paused:
+                logger.debug("Not paused")
+                return True
+
+            self.paused = False
+            self.condition.notify_all()
+            logger.info("Generation resumed")
+            return True
 
     def shutdown(self):
         """关闭Rollouter - 改进的关闭逻辑"""
         logger.info("Shutting down Rollouter...")
 
-        self.running = False
-
-        # 恢复可能被暂停的生成线程
-        self.rollout_controller.resume()
+        with self.lock:
+            self.running = False
+            self.paused = False
+            self.condition.notify_all()
 
         # 等待生成线程结束
         if self.generation_thread and self.generation_thread.is_alive():
@@ -579,6 +642,14 @@ def shutdown(self):
             if self.generation_thread.is_alive():
                 logger.warning("Generation thread did not finish within timeout")
 
+        # 等待监控线程结束
+        if self.monitor_thread and self.monitor_thread.is_alive():
+            logger.info("Waiting for monitor thread to finish...")
+            self.monitor_thread.join(timeout=5.0)
+
+            if self.monitor_thread.is_alive():
+                logger.warning("Monitor thread did not finish within timeout")
+
         # 关闭线程池
         if self.thread_executor:
             self.thread_executor.shutdown(wait=True)
@@ -593,31 +664,35 @@ def shutdown(self):
 
         logger.info("Rollouter shutdown complete")
 
-    def get_statistics(self) -> dict:
-        """获取统计信息 - 改进的统计信息"""
-        controller_status = self.rollout_controller.get_status()
-
-        stats = {
-            "total_generated_samples": self.total_generated_samples,
-            "dropped_stale_samples": self.dropped_stale_samples,
-            "generation_errors": self.generation_errors,
-            "current_param_version": self.current_param_version,
-            "param_sync_requests": self.param_sync_requests,
-            "last_sync_time": self.last_sync_time,
-            "is_running": self.running,
-            "sync_in_progress": self.sync_in_progress,
-        }
-
-        stats.update(controller_status)
-
-        # 添加队列统计（如果可用）
-        if self.message_queue_client:
-            try:
-                queue_stats = self.message_queue_client.get_statistics()
-                stats["queue_size"] = queue_stats.get("queue_size", 0)
-                stats["queue_total_produced"] = queue_stats.get("total_produced", 0)
-                stats["queue_dropped_samples"] = queue_stats.get("dropped_samples", 0)
-            except Exception as e:
-                logger.debug(f"Error getting queue statistics: {e}")
 
-        return stats
+    def _log_statistics(self):
+        """记录统计信息"""
+        try:
+            controller_status = self.rollout_controller.get_status()
+            queue_stats = self.message_queue_client.get_statistics()
+
+            logger.info(
+                f"Rollouter stats - Generated: {self.total_generated_samples}, "
+                f"Dropped: {self.dropped_stale_samples}, "
+                f"Errors: {self.generation_errors}, "
+                f"Queue size: {queue_stats['queue_size']}, "
+                f"Param version: {self.current_param_version}, "
+                f"Paused: {controller_status['is_paused']}, "
+                f"Sync requests: {self.param_sync_requests}"
+            )
+        except Exception as e:
+            logger.error(f"Error logging statistics: {e}")
+
+    def get_statistics(self) -> dict:
+        with self.lock:
+            stats = {
+                "total_generated_samples": self.total_generated_samples,
+                "dropped_stale_samples": self.dropped_stale_samples,
+                "generation_errors": self.generation_errors,
+                "current_param_version": self.current_param_version,
+                "param_sync_requests": self.param_sync_requests,
+                "last_sync_time": self.last_sync_time,
+                "is_running": self.running,
+                "sync_in_progress": self.sync_in_progress,
+            }
+            return stats
\ No newline at end of file
diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py
index 97567527b97..0dd90127d7d 100644
--- a/recipe/fully_async_policy/fully_async_trainer.py
+++ b/recipe/fully_async_policy/fully_async_trainer.py
@@ -132,96 +132,34 @@ def set_message_queue_client(self, message_queue_client: MessageQueueClient):
         """设置消息队列客户端"""
         self.message_queue_client = message_queue_client
 
-    def _validate(self):
-        """执行验证 - 参考OneStepOffRayTrainer的验证逻辑"""
-        return None
-
-    def init_workers(self):
-        """Initialize distributed training workers using Ray backend.
-
-        Creates:
-        1. Ray resource pools from configuration
-        2. Worker groups for each role (actor, critic, etc.)
-        """
-        self.resource_pool_manager.create_resource_pool()
-
-        self.resource_pool_to_cls = {pool: {} for pool in self.resource_pool_manager.resource_pool_dict.values()}
-
-        # 创建actor worker
-        resource_pool = self.resource_pool_manager.get_resource_pool(Role.Actor)
-        actor_cls = RayClassWithInitArgs(
-            cls=self.role_worker_mapping[Role.Actor],
-            config=self.config.actor_rollout_ref,
-            role="actor",
-        )
-        self.resource_pool_to_cls[resource_pool]["actor"] = actor_cls
-
-        # 创建critic worker
-        if self.use_critic:
-            resource_pool = self.resource_pool_manager.get_resource_pool(Role.Critic)
-            critic_cls = RayClassWithInitArgs(cls=self.role_worker_mapping[Role.Critic], config=self.config.critic)
-            self.resource_pool_to_cls[resource_pool]["critic"] = critic_cls
-
-        # 创建reference policy worker
-        if self.use_reference_policy:
-            resource_pool = self.resource_pool_manager.get_resource_pool(Role.RefPolicy)
-            ref_policy_cls = RayClassWithInitArgs(
-                cls=self.role_worker_mapping[Role.RefPolicy],
+    def _create_actor_rollout_classes(self):
+        # create actor
+        for role in [Role.Actor]:
+            resource_pool = self.resource_pool_manager.get_resource_pool(role)
+            role_cls = RayClassWithInitArgs(
+                cls=self.role_worker_mapping[role],
                 config=self.config.actor_rollout_ref,
-                role="ref",
+                role=str(role),
             )
-            self.resource_pool_to_cls[resource_pool]["ref"] = ref_policy_cls
-
-        # 创建reward model worker
-        if self.use_rm:
-            resource_pool = self.resource_pool_manager.get_resource_pool(Role.RewardModel)
-            rm_cls = RayClassWithInitArgs(
-                cls=self.role_worker_mapping[Role.RewardModel], config=self.config.reward_model
-            )
-            self.resource_pool_to_cls[resource_pool]["rm"] = rm_cls
-
-        # 初始化WorkerGroup - 参考OneStepOffRayTrainer的实现
-        all_wg = {}
-        wg_kwargs = {}
-        if OmegaConf.select(self.config.trainer, "ray_wait_register_center_timeout") is not None:
-            wg_kwargs["ray_wait_register_center_timeout"] = self.config.trainer.ray_wait_register_center_timeout
-        if OmegaConf.select(self.config.trainer, "profile_steps") is not None:
-            wg_kwargs["profile_steps"] = OmegaConf.select(self.config.trainer, "profile_steps")
-            assert OmegaConf.select(self.config.trainer, "worker_nsight_options") is not None, (
-                "worker_nsight_options must be set when profile_steps is set"
-            )
-            wg_kwargs["worker_nsight_options"] = OmegaConf.to_container(
-                OmegaConf.select(self.config.trainer, "worker_nsight_options")
-            )
-
-        for resource_pool, class_dict in self.resource_pool_to_cls.items():
-            worker_dict_cls = create_colocated_worker_cls(class_dict=class_dict)
-            wg_dict = self.ray_worker_group_cls(
-                resource_pool=resource_pool,
-                ray_cls_with_init=worker_dict_cls,
-                device_name=self.device_name,
-                **wg_kwargs,
-            )
-            spawn_wg = wg_dict.spawn(prefix_set=class_dict.keys())
-            all_wg.update(spawn_wg)
-
-        # 分配worker groups
-        self.actor_wg = all_wg["actor"]
-        self.actor_wg.init_model()
+            self.resource_pool_to_cls[resource_pool][str(role)] = role_cls
 
+    def _init_models(self):
         if self.use_critic:
-            self.critic_wg = all_wg["critic"]
+            self.critic_wg = self.all_wg[str(Role.Critic)]
             self.critic_wg.init_model()
 
         if self.use_reference_policy and not self.ref_in_actor:
-            self.ref_policy_wg = all_wg["ref"]
+            self.ref_policy_wg = self.all_wg[str(Role.RefPolicy)]
             self.ref_policy_wg.init_model()
 
         if self.use_rm:
-            self.rm_wg = all_wg["rm"]
+            self.rm_wg = self.all_wg[str(Role.RewardModel)]
             self.rm_wg.init_model()
 
-        logger.info("FullyAsyncTrainer workers initialized successfully")
+        self.actor_wg = self.all_wg[str(Role.Actor)]
+        self.actor_wg.init_model()
+        self.actor_rollout_wg = self.actor_wg  # to be compatible with the functions that not be modified
+
 
     def fit(self):
         """

From ba8f1ce51ff7b8dee6eb911bbf68d6cbffb371b8 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Mon, 4 Aug 2025 18:56:59 +0800
Subject: [PATCH 018/182] lock

---
 .../fully_async_rollouter.py                  | 98 +++----------------
 1 file changed, 14 insertions(+), 84 deletions(-)

diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py
index 0f4f624007e..6274237c6a8 100644
--- a/recipe/fully_async_policy/fully_async_rollouter.py
+++ b/recipe/fully_async_policy/fully_async_rollouter.py
@@ -30,6 +30,7 @@
 
 logger = logging.getLogger(__name__)
 
+
 @ray.remote(num_cpus=10, max_concurrency=10)
 class FullyAsyncRollouter(RayPPOTrainer):
     """
@@ -126,6 +127,7 @@ def __init__(
         self.running = False
         self.paused = False
         self.generation_thread = None
+        self.monitor_thread = None
         self.thread_executor = ThreadPoolExecutor(max_workers=2)
         self.lock = threading.RLock()
         self.condition = threading.Condition(self.lock)
@@ -303,14 +305,6 @@ def _create_continuous_iterator(self):
 
     def fit(self):
         """开始异步生成样本 - 改进的主运行逻辑"""
-        from verl.utils.tracking import Tracking
-
-        logger = Tracking(
-            project_name=self.config.trainer.project_name,
-            experiment_name=self.config.trainer.experiment_name,
-            default_backend=self.config.trainer.logger,
-            config=OmegaConf.to_container(self.config, resolve=True),
-        )
         logger.info("Starting Rollouter...")
         if self.message_queue_client is None:
             raise ValueError("MessageQueue client not set. Call set_message_queue_client() first.")
@@ -401,6 +395,9 @@ def _generation_loop(self):
                 if not self.running:
                     break
 
+                if self._should_pause_generation():
+                    self.pause()
+
                 # 如果被暂停，等待恢复
                 while self.paused and self.running:
                     logger.debug("Generation thread paused, waiting...")
@@ -410,13 +407,6 @@ def _generation_loop(self):
                 if not self.running:
                     break
 
-            # 检查是否应该暂停生成
-            while True:
-                if self._should_pause_generation():
-                    with self.lock:
-                        self.paused = True
-                        logger.info("Generation paused due to staleness or queue size")
-
             metrics = {}
             timing_raw = {}
             batch, gen_batch = self._prepare_generate_batch(batch_dict)
@@ -499,7 +489,7 @@ def _report_loop(self):
         try:
             # 主线程保持运行，处理控制信号和状态监控
             last_stats_time = time.time()
-            stats_interval = 30.0  # 30秒报告一次统计
+            stats_interval = 10.0
 
             while self.running:
                 time.sleep(1.0)
@@ -507,14 +497,16 @@ def _report_loop(self):
                 # 定期打印统计信息
                 current_time = time.time()
                 if current_time - last_stats_time >= stats_interval:
-                    self._log_statistics()
+                    self.get_statistics()
                     last_stats_time = current_time
+                    if not self._should_pause_generation():
+                        self.resume()
 
                 # 检查生成线程状态
                 if not self.generation_thread.is_alive():
                     logger.error("Generation thread died, restarting...")
-                    self.generation_thread = threading.Thread(target=self._generation_loop, daemon=True)
-                    self.generation_thread.start()
+                    raise RuntimeError("generation_thread not alive")
+
 
         except KeyboardInterrupt:
             logger.info("Received interrupt signal, shutting down...")
@@ -523,7 +515,6 @@ def _report_loop(self):
         finally:
             self.shutdown()
 
-
     def _should_pause_generation(self) -> bool:
         """
         判断是否应该暂停生成，基于新鲜度控制 - 改进的判断逻辑
@@ -556,68 +547,25 @@ def _should_pause_generation(self) -> bool:
             logger.error(f"Error checking pause conditions: {e}")
             return True  # 出错时暂停生成
 
-    def _should_resume_generation(self) -> bool:
-        """判断是否应该恢复生成"""
-        if self.message_queue_client is None:
-            return False
-
-        try:
-            with self.lock:
-                if not self.paused:
-                    return False
-
-            queue_stats = self.message_queue_client.get_statistics()
-            queue_size = queue_stats["queue_size"]
-            current_trainer_version = queue_stats["current_param_version"]
-
-            # 计算参数版本差异
-            version_diff = self.current_param_version - current_trainer_version
-
-            # 如果版本差异减小，可以恢复生成
-            if version_diff < self.max_staleness_allowed - 1:
-                logger.debug(
-                    f"Can resume due to reduced staleness: rollout_version={self.current_param_version}, "
-                    f"trainer_version={current_trainer_version}, diff={version_diff}"
-                )
-                return True
-
-            # 如果队列不太满，也可以恢复生成
-            resume_queue_size = (self.staleness_threshold * self.config.data.train_batch_size) // 2
-            if queue_size <= resume_queue_size:
-                logger.debug(
-                    f"Can resume due to reduced queue: size={queue_size}, resume_threshold={resume_queue_size}")
-                return True
-
-            return False
-
-        except Exception as e:
-            logger.error(f"Error checking resume conditions: {e}")
-            return False
-
     def pause(self) -> bool:
         """暂停生成 - 供外部调用"""
         with self.lock:
             if not self.running:
-                logger.warning("Cannot pause: not running")
                 return False
 
             if self.paused:
-                logger.debug("Already paused")
                 return True
 
             self.paused = True
-            logger.info("Generation paused")
             return True
 
     def resume(self) -> bool:
         """恢复生成 - 供外部调用"""
         with self.lock:
             if not self.running:
-                logger.warning("Cannot resume: not running")
                 return False
 
             if not self.paused:
-                logger.debug("Not paused")
                 return True
 
             self.paused = False
@@ -664,35 +612,17 @@ def shutdown(self):
 
         logger.info("Rollouter shutdown complete")
 
-
-    def _log_statistics(self):
-        """记录统计信息"""
-        try:
-            controller_status = self.rollout_controller.get_status()
-            queue_stats = self.message_queue_client.get_statistics()
-
-            logger.info(
-                f"Rollouter stats - Generated: {self.total_generated_samples}, "
-                f"Dropped: {self.dropped_stale_samples}, "
-                f"Errors: {self.generation_errors}, "
-                f"Queue size: {queue_stats['queue_size']}, "
-                f"Param version: {self.current_param_version}, "
-                f"Paused: {controller_status['is_paused']}, "
-                f"Sync requests: {self.param_sync_requests}"
-            )
-        except Exception as e:
-            logger.error(f"Error logging statistics: {e}")
-
     def get_statistics(self) -> dict:
         with self.lock:
+            queue_stats = self.message_queue_client.get_statistics()
             stats = {
                 "total_generated_samples": self.total_generated_samples,
                 "dropped_stale_samples": self.dropped_stale_samples,
-                "generation_errors": self.generation_errors,
                 "current_param_version": self.current_param_version,
                 "param_sync_requests": self.param_sync_requests,
                 "last_sync_time": self.last_sync_time,
                 "is_running": self.running,
                 "sync_in_progress": self.sync_in_progress,
+                "queue_size": f"{queue_stats['queue_size']}",
             }
-            return stats
\ No newline at end of file
+            return stats

From 8e5edeb8884b76134e574444657065cba147ee16 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Mon, 4 Aug 2025 19:22:45 +0800
Subject: [PATCH 019/182] test

---
 recipe/fully_async_policy/run_benchmark.sh    | 307 ++++++++++++
 .../unittest/test_fully_async_components.py   | 444 ++++++++++++++++++
 tests/special_e2e/run_fully_async_policy.sh   | 196 ++++++++
 3 files changed, 947 insertions(+)
 create mode 100644 recipe/fully_async_policy/run_benchmark.sh
 create mode 100644 recipe/fully_async_policy/unittest/test_fully_async_components.py
 create mode 100644 tests/special_e2e/run_fully_async_policy.sh

diff --git a/recipe/fully_async_policy/run_benchmark.sh b/recipe/fully_async_policy/run_benchmark.sh
new file mode 100644
index 00000000000..f9bfaceaa32
--- /dev/null
+++ b/recipe/fully_async_policy/run_benchmark.sh
@@ -0,0 +1,307 @@
+#!/usr/bin/env bash
+set -xeuo pipefail
+
+# Benchmark script for fully_async_policy performance testing
+# This script runs various performance tests to evaluate the async training system
+
+NUM_GPUS=${NUM_GPUS:-8}
+ACTOR_STRATEGY=${ACTOR_STRATEGY:-"fsdp2"}
+
+# Download model if not exists
+MODEL_ID=${MODEL_ID:-Qwen/Qwen2.5-0.5B-Instruct}
+MODEL_PATH=${MODEL_PATH:-${HOME}/models/${MODEL_ID}}
+huggingface-cli download "${MODEL_ID}" --local-dir "${MODEL_PATH}"
+
+# Create benchmark results directory
+BENCHMARK_DIR="benchmark_results_$(date +%Y%m%d_%H%M%S)"
+mkdir -p "${BENCHMARK_DIR}"
+
+echo "Starting fully_async_policy performance benchmark..."
+echo "Results will be saved to: ${BENCHMARK_DIR}"
+
+# Benchmark parameters
+n_gpus_rollout=2
+n_gpus_training=$((NUM_GPUS - n_gpus_rollout))
+
+# Common parameters
+train_prompt_bsz=16
+n_resp_per_prompt=4
+train_prompt_mini_bsz=4
+max_prompt_length=512
+max_response_length=1024
+
+# Benchmark Test 1: Different staleness thresholds
+echo "=== Benchmark Test 1: Staleness Threshold Impact ==="
+staleness_values=(1 3 5 10)
+
+for staleness in "${staleness_values[@]}"; do
+    echo "Testing staleness threshold: ${staleness}"
+
+    exp_name="benchmark-staleness-${staleness}"
+    log_file="${BENCHMARK_DIR}/staleness_${staleness}.log"
+
+    timeout 300 python3 -m recipe.fully_async_policy.fully_async_main \
+        data.train_files="${HOME}/data/gsm8k/train.parquet" \
+        data.val_files="${HOME}/data/gsm8k/test.parquet" \
+        data.prompt_key=prompt \
+        data.truncation='left' \
+        data.max_prompt_length=${max_prompt_length} \
+        data.max_response_length=${max_response_length} \
+        data.train_batch_size=${train_prompt_bsz} \
+        actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
+        actor_rollout_ref.model.path="${MODEL_PATH}" \
+        trainer.logger=['console'] \
+        trainer.project_name='verl-benchmark' \
+        trainer.experiment_name="${exp_name}" \
+        trainer.val_before_train=False \
+        trainer.test_freq=-1 \
+        trainer.save_freq=-1 \
+        trainer.total_epochs=1 \
+        trainer.total_training_steps=10 \
+        trainer.n_gpus_per_node=${n_gpus_training} \
+        rollout.n_gpus_per_node=${n_gpus_rollout} \
+        async_training.staleness_threshold=${staleness} \
+        async_training.max_staleness_allowed=$((staleness + 2)) \
+        > "${log_file}" 2>&1 || echo "Test with staleness ${staleness} timed out or failed"
+
+    # Extract key metrics from log
+    if [ -f "${log_file}" ]; then
+        echo "=== Metrics for staleness=${staleness} ===" >> "${BENCHMARK_DIR}/summary.txt"
+        grep -E "(Generated.*batches|Dropped.*samples|param_version|Queue size)" "${log_file}" | tail -5 >> "${BENCHMARK_DIR}/summary.txt" || true
+        echo "" >> "${BENCHMARK_DIR}/summary.txt"
+    fi
+done
+
+# Benchmark Test 2: Different queue sizes
+echo "=== Benchmark Test 2: Queue Size Impact ==="
+queue_sizes=(50 100 500 1000)
+
+for queue_size in "${queue_sizes[@]}"; do
+    echo "Testing queue size: ${queue_size}"
+
+    exp_name="benchmark-queue-${queue_size}"
+    log_file="${BENCHMARK_DIR}/queue_${queue_size}.log"
+
+    timeout 300 python3 -m recipe.fully_async_policy.fully_async_main \
+        data.train_files="${HOME}/data/gsm8k/train.parquet" \
+        data.val_files="${HOME}/data/gsm8k/test.parquet" \
+        data.prompt_key=prompt \
+        data.truncation='left' \
+        data.max_prompt_length=${max_prompt_length} \
+        data.max_response_length=${max_response_length} \
+        data.train_batch_size=${train_prompt_bsz} \
+        actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
+        actor_rollout_ref.model.path="${MODEL_PATH}" \
+        trainer.logger=['console'] \
+        trainer.project_name='verl-benchmark' \
+        trainer.experiment_name="${exp_name}" \
+        trainer.val_before_train=False \
+        trainer.test_freq=-1 \
+        trainer.save_freq=-1 \
+        trainer.total_epochs=1 \
+        trainer.total_training_steps=10 \
+        trainer.n_gpus_per_node=${n_gpus_training} \
+        rollout.n_gpus_per_node=${n_gpus_rollout} \
+        async_training.max_queue_size=${queue_size} \
+        > "${log_file}" 2>&1 || echo "Test with queue size ${queue_size} timed out or failed"
+
+    # Extract key metrics from log
+    if [ -f "${log_file}" ]; then
+        echo "=== Metrics for queue_size=${queue_size} ===" >> "${BENCHMARK_DIR}/summary.txt"
+        grep -E "(Generated.*batches|Queue size|memory)" "${log_file}" | tail -5 >> "${BENCHMARK_DIR}/summary.txt" || true
+        echo "" >> "${BENCHMARK_DIR}/summary.txt"
+    fi
+done
+
+# Benchmark Test 3: Different batch generation intervals
+echo "=== Benchmark Test 3: Generation Interval Impact ==="
+intervals=(0.0 0.1 0.5 1.0)
+
+for interval in "${intervals[@]}"; do
+    echo "Testing batch generation interval: ${interval}s"
+
+    exp_name="benchmark-interval-${interval}"
+    log_file="${BENCHMARK_DIR}/interval_${interval}.log"
+
+    timeout 300 python3 -m recipe.fully_async_policy.fully_async_main \
+        data.train_files="${HOME}/data/gsm8k/train.parquet" \
+        data.val_files="${HOME}/data/gsm8k/test.parquet" \
+        data.prompt_key=prompt \
+        data.truncation='left' \
+        data.max_prompt_length=${max_prompt_length} \
+        data.max_response_length=${max_response_length} \
+        data.train_batch_size=${train_prompt_bsz} \
+        actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
+        actor_rollout_ref.model.path="${MODEL_PATH}" \
+        trainer.logger=['console'] \
+        trainer.project_name='verl-benchmark' \
+        trainer.experiment_name="${exp_name}" \
+        trainer.val_before_train=False \
+        trainer.test_freq=-1 \
+        trainer.save_freq=-1 \
+        trainer.total_epochs=1 \
+        trainer.total_training_steps=10 \
+        trainer.n_gpus_per_node=${n_gpus_training} \
+        rollout.n_gpus_per_node=${n_gpus_rollout} \
+        async_training.batch_generation_interval=${interval} \
+        > "${log_file}" 2>&1 || echo "Test with interval ${interval} timed out or failed"
+
+    # Extract key metrics from log
+    if [ -f "${log_file}" ]; then
+        echo "=== Metrics for interval=${interval}s ===" >> "${BENCHMARK_DIR}/summary.txt"
+        grep -E "(Generated.*batches|generation_timestamp)" "${log_file}" | tail -5 >> "${BENCHMARK_DIR}/summary.txt" || true
+        echo "" >> "${BENCHMARK_DIR}/summary.txt"
+    fi
+done
+
+# Benchmark Test 4: Resource allocation comparison
+echo "=== Benchmark Test 4: Resource Allocation Comparison ==="
+
+# Test different rollout/training GPU distributions
+if [ "${NUM_GPUS}" -ge "6" ]; then
+    gpu_configs=(
+        "1,$((NUM_GPUS - 1))"  # 1 rollout, rest training
+        "2,$((NUM_GPUS - 2))"  # 2 rollout, rest training
+        "3,$((NUM_GPUS - 3))"  # 3 rollout, rest training
+    )
+
+    for config in "${gpu_configs[@]}"; do
+        IFS=',' read -r rollout_gpus training_gpus <<< "$config"
+
+        echo "Testing GPU allocation: ${rollout_gpus} rollout, ${training_gpus} training"
+
+        exp_name="benchmark-gpu-${rollout_gpus}r-${training_gpus}t"
+        log_file="${BENCHMARK_DIR}/gpu_${rollout_gpus}_${training_gpus}.log"
+
+        timeout 300 python3 -m recipe.fully_async_policy.fully_async_main \
+            data.train_files="${HOME}/data/gsm8k/train.parquet" \
+            data.val_files="${HOME}/data/gsm8k/test.parquet" \
+            data.prompt_key=prompt \
+            data.truncation='left' \
+            data.max_prompt_length=${max_prompt_length} \
+            data.max_response_length=${max_response_length} \
+            data.train_batch_size=${train_prompt_bsz} \
+            actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
+            actor_rollout_ref.model.path="${MODEL_PATH}" \
+            trainer.logger=['console'] \
+            trainer.project_name='verl-benchmark' \
+            trainer.experiment_name="${exp_name}" \
+            trainer.val_before_train=False \
+            trainer.test_freq=-1 \
+            trainer.save_freq=-1 \
+            trainer.total_epochs=1 \
+            trainer.total_training_steps=10 \
+            trainer.n_gpus_per_node=${training_gpus} \
+            rollout.n_gpus_per_node=${rollout_gpus} \
+            > "${log_file}" 2>&1 || echo "Test with GPU config ${config} timed out or failed"
+
+        # Extract key metrics from log
+        if [ -f "${log_file}" ]; then
+            echo "=== Metrics for ${rollout_gpus}r/${training_gpus}t GPUs ===" >> "${BENCHMARK_DIR}/summary.txt"
+            grep -E "(Generated.*batches|training.*steps|GPU)" "${log_file}" | tail -5 >> "${BENCHMARK_DIR}/summary.txt" || true
+            echo "" >> "${BENCHMARK_DIR}/summary.txt"
+        fi
+    done
+fi
+
+# Benchmark Test 5: Pause/Resume Performance
+echo "=== Benchmark Test 5: Pause/Resume Performance Test ==="
+log_file="${BENCHMARK_DIR}/pause_resume.log"
+
+# Start the training in background
+python3 -m recipe.fully_async_policy.fully_async_main \
+    data.train_files="${HOME}/data/gsm8k/train.parquet" \
+    data.val_files="${HOME}/data/gsm8k/test.parquet" \
+    data.prompt_key=prompt \
+    data.truncation='left' \
+    data.max_prompt_length=${max_prompt_length} \
+    data.max_response_length=${max_response_length} \
+    data.train_batch_size=${train_prompt_bsz} \
+    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
+    actor_rollout_ref.model.path="${MODEL_PATH}" \
+    trainer.logger=['console'] \
+    trainer.project_name='verl-benchmark-pause' \
+    trainer.experiment_name='pause-resume-test' \
+    trainer.val_before_train=False \
+    trainer.test_freq=-1 \
+    trainer.save_freq=-1 \
+    trainer.total_epochs=1 \
+    trainer.total_training_steps=20 \
+    trainer.n_gpus_per_node=${n_gpus_training} \
+    rollout.n_gpus_per_node=${n_gpus_rollout} \
+    > "${log_file}" 2>&1 &
+
+TRAINING_PID=$!
+
+# Note: In actual implementation, we would need a way to remotely control pause/resume
+# This is a placeholder for testing the pause/resume functionality
+echo "Training started with PID: ${TRAINING_PID}"
+echo "Pause/resume testing would require remote control interface" >> "${BENCHMARK_DIR}/summary.txt"
+
+# Wait a bit and then kill the training (simulating early termination)
+sleep 60
+if kill -0 $TRAINING_PID 2>/dev/null; then
+    echo "Stopping training process..."
+    kill $TRAINING_PID
+fi
+
+# Generate performance report
+echo "=== Generating Performance Report ==="
+report_file="${BENCHMARK_DIR}/performance_report.md"
+
+cat > "${report_file}" << EOF
+# Fully Async Policy Performance Benchmark Report
+
+**Date:** $(date)
+**Hardware:** ${NUM_GPUS} GPUs
+**Strategy:** ${ACTOR_STRATEGY}
+**Model:** ${MODEL_ID}
+
+## Test Configuration
+- Training Batch Size: ${train_prompt_bsz}
+- Responses per Prompt: ${n_resp_per_prompt}
+- Max Prompt Length: ${max_prompt_length}
+- Max Response Length: ${max_response_length}
+
+## Results Summary
+$(cat "${BENCHMARK_DIR}/summary.txt" 2>/dev/null || echo "No summary available")
+
+## Log Files
+EOF
+
+# List all log files
+for log_file in "${BENCHMARK_DIR}"/*.log; do
+    if [ -f "$log_file" ]; then
+        echo "- $(basename "${log_file}")" >> "${report_file}"
+    fi
+done
+
+cat >> "${report_file}" << EOF
+
+## Key Findings
+- **Staleness Impact:** Lower staleness thresholds may increase sample dropping but improve freshness
+- **Queue Size Impact:** Larger queues provide better buffering but use more memory
+- **Generation Interval:** Shorter intervals increase throughput but may stress the system
+- **GPU Allocation:** Balance between generation and training capacity is crucial
+- **Pause/Resume:** System should handle interruptions gracefully
+
+## Recommendations
+1. Start with staleness_threshold=3 for good balance
+2. Use queue_size=500-1000 for most workloads
+3. Set generation_interval=0.1s for good performance
+4. Allocate 2-3 GPUs for rollout in typical 8-GPU setups
+5. Monitor queue utilization and adjust based on workload
+
+EOF
+
+echo "Benchmark completed!"
+echo "Results saved to: ${BENCHMARK_DIR}/"
+echo "Performance report: ${report_file}"
+
+# Print summary to console
+if [ -f "${BENCHMARK_DIR}/summary.txt" ]; then
+    echo ""
+    echo "=== BENCHMARK SUMMARY ==="
+    cat "${BENCHMARK_DIR}/summary.txt"
+fi
+
diff --git a/recipe/fully_async_policy/unittest/test_fully_async_components.py b/recipe/fully_async_policy/unittest/test_fully_async_components.py
new file mode 100644
index 00000000000..8e5279b84bb
--- /dev/null
+++ b/recipe/fully_async_policy/unittest/test_fully_async_components.py
@@ -0,0 +1,444 @@
+# Copyright 2025 Meituan Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+单元测试文件，用于测试完全异步PPO训练系统的各个组件
+"""
+
+import os
+
+# Import components to test
+import sys
+import time
+import unittest
+from unittest.mock import Mock
+
+import ray
+from omegaconf import OmegaConf
+
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from fully_async_rollouter import FullyAsyncRollouter
+from fully_async_trainer import FullyAsyncTrainer
+from message_queue import MessageQueueClient
+from param_sync import ParameterSynchronizer
+
+
+class TestMessageQueue(unittest.TestCase):
+    """测试MessageQueue的功能"""
+
+    def setUp(self):
+        """设置测试环境"""
+        if not ray.is_initialized():
+            ray.init(ignore_reinit_error=True)
+
+        # 创建MessageQueue客户端
+        self.message_queue = MessageQueueClient.remote(max_queue_size=100, max_staleness=3)
+
+    def tearDown(self):
+        """清理测试环境"""
+        if hasattr(self, "message_queue"):
+            ray.kill(self.message_queue)
+
+    def test_put_and_get_samples(self):
+        """测试放入和获取样本的基本功能"""
+        # 创建模拟样本数据
+        mock_sample = Mock()
+        mock_sample.batch_size = 4
+
+        # 测试放入样本
+        success = ray.get(
+            self.message_queue.put_samples.remote(
+                epoch=1, sample=mock_sample, param_version=1, rollout_metadata={"timestamp": time.time()}
+            )
+        )
+        self.assertTrue(success)
+
+        # 测试获取样本
+        result = ray.get(self.message_queue.get_samples.remote(min_batch_count=1, timeout=5.0, current_param_version=1))
+
+        self.assertIsNotNone(result)
+        samples, metadata_list = result
+        self.assertEqual(len(samples), 1)
+        self.assertEqual(len(metadata_list), 1)
+
+    def test_staleness_control(self):
+        """测试新鲜度控制功能"""
+        mock_sample = Mock()
+        mock_sample.batch_size = 4
+
+        # 放入一个参数版本较老的样本
+        success = ray.get(
+            self.message_queue.put_samples.remote(
+                epoch=1, sample=mock_sample, param_version=1, rollout_metadata={"timestamp": time.time()}
+            )
+        )
+        self.assertTrue(success)
+
+        # 尝试用较新的参数版本获取样本（应该被拒绝）
+        result = ray.get(
+            self.message_queue.get_samples.remote(
+                min_batch_count=1,
+                timeout=5.0,
+                current_param_version=5,  # 版本差距为4 > max_staleness(3)
+            )
+        )
+
+        # 应该返回空结果，因为样本过期
+        self.assertIsNone(result)
+
+    def test_queue_statistics(self):
+        """测试队列统计功能"""
+        # 获取初始统计
+        stats = ray.get(self.message_queue.get_statistics.remote())
+        initial_queue_size = stats["queue_size"]
+
+        # 添加一些样本
+        mock_sample = Mock()
+        mock_sample.batch_size = 4
+
+        for i in range(3):
+            ray.get(
+                self.message_queue.put_samples.remote(
+                    epoch=1, sample=mock_sample, param_version=1, rollout_metadata={"timestamp": time.time()}
+                )
+            )
+
+        # 检查统计是否更新
+        stats = ray.get(self.message_queue.get_statistics.remote())
+        self.assertEqual(stats["queue_size"], initial_queue_size + 3)
+        self.assertEqual(stats["total_produced"], 3)
+
+
+class TestParameterSynchronizer(unittest.TestCase):
+    """测试参数同步器的功能"""
+
+    def setUp(self):
+        """设置测试环境"""
+        if not ray.is_initialized():
+            ray.init(ignore_reinit_error=True)
+
+        self.config = OmegaConf.create(
+            {"async_training": {"max_sync_retries": 3, "sync_timeout": 10.0, "sync_retry_delay": 0.1}}
+        )
+
+    def test_sync_with_retry(self):
+        """测试带重试机制的参数同步"""
+        # 创建模拟的worker groups
+        mock_actor_wg = Mock()
+        mock_rollout_wg = Mock()
+
+        # 模拟同步操作
+        mock_actor_wg.get_weights.return_value = ray.put({"param1": "value1"})
+        mock_rollout_wg.set_weights.return_value = []
+
+        synchronizer = ParameterSynchronizer.remote(
+            config=self.config, actor_wg=mock_actor_wg, rollout_wg=mock_rollout_wg
+        )
+
+        # 测试成功同步
+        result = ray.get(synchronizer.sync_weights.remote())
+        self.assertTrue(result)
+
+    def test_sync_failure_and_retry(self):
+        """测试同步失败和重试机制"""
+        mock_actor_wg = Mock()
+        mock_rollout_wg = Mock()
+
+        # 模拟同步失败
+        mock_actor_wg.get_weights.side_effect = Exception("Sync failed")
+
+        synchronizer = ParameterSynchronizer.remote(
+            config=self.config, actor_wg=mock_actor_wg, rollout_wg=mock_rollout_wg
+        )
+
+        # 测试失败时的重试
+        result = ray.get(synchronizer.sync_weights.remote())
+        self.assertFalse(result)
+
+
+class TestFullyAsyncRollouter(unittest.TestCase):
+    """测试异步Rollouter的功能"""
+
+    def setUp(self):
+        """设置测试环境"""
+        if not ray.is_initialized():
+            ray.init(ignore_reinit_error=True)
+
+    def test_pause_resume_functionality(self):
+        """测试暂停和恢复功能"""
+        # 创建配置
+        config = OmegaConf.create(
+            {
+                "actor_rollout_ref": {"hybrid_engine": False, "model": {"lora_rank": 0}, "rollout": {"n": 2}},
+                "algorithm": {"use_kl_in_reward": False},
+                "critic": {"enable": False},
+                "trainer": {"device": "cpu", "project_name": "test", "experiment_name": "test"},
+                "async_training": {
+                    "staleness_threshold": 3,
+                    "max_staleness_allowed": 5,
+                    "generation_timeout": 10.0,
+                    "batch_generation_interval": 0.1,
+                },
+            }
+        )
+
+        # 创建模拟的依赖
+        mock_tokenizer = Mock()
+        mock_role_worker_mapping = Mock()
+        mock_resource_pool_manager = Mock()
+
+        # 创建Rollouter实例
+        rollouter = FullyAsyncRollouter.remote(
+            config=config,
+            tokenizer=mock_tokenizer,
+            role_worker_mapping=mock_role_worker_mapping,
+            resource_pool_manager=mock_resource_pool_manager,
+        )
+
+        # 测试暂停功能
+        result = ray.get(rollouter.pause_rollout.remote())
+        self.assertTrue(result)
+
+        # 检查暂停状态
+        is_paused = ray.get(rollouter.is_rollout_paused.remote())
+        self.assertTrue(is_paused)
+
+        # 测试恢复功能
+        result = ray.get(rollouter.resume_rollout.remote())
+        self.assertTrue(result)
+
+        # 检查恢复状态
+        is_paused = ray.get(rollouter.is_rollout_paused.remote())
+        self.assertFalse(is_paused)
+
+    def test_statistics_collection(self):
+        """测试统计信息收集功能"""
+        config = OmegaConf.create(
+            {
+                "actor_rollout_ref": {"hybrid_engine": False, "model": {"lora_rank": 0}, "rollout": {"n": 2}},
+                "algorithm": {"use_kl_in_reward": False},
+                "critic": {"enable": False},
+                "trainer": {"device": "cpu", "project_name": "test", "experiment_name": "test"},
+                "async_training": {"staleness_threshold": 3, "max_staleness_allowed": 5, "generation_timeout": 10.0},
+            }
+        )
+
+        mock_tokenizer = Mock()
+        mock_role_worker_mapping = Mock()
+        mock_resource_pool_manager = Mock()
+
+        rollouter = FullyAsyncRollouter.remote(
+            config=config,
+            tokenizer=mock_tokenizer,
+            role_worker_mapping=mock_role_worker_mapping,
+            resource_pool_manager=mock_resource_pool_manager,
+        )
+
+        # 获取统计信息
+        stats = ray.get(rollouter.get_statistics.remote())
+
+        # 验证统计信息包含必要的字段
+        expected_keys = [
+            "total_generated_samples",
+            "dropped_stale_samples",
+            "generation_errors",
+            "current_param_version",
+            "is_paused",
+            "pause_count",
+            "resume_count",
+        ]
+
+        for key in expected_keys:
+            self.assertIn(key, stats)
+
+
+class TestFullyAsyncTrainer(unittest.TestCase):
+    """测试异步Trainer的功能"""
+
+    def setUp(self):
+        """设置测试环境"""
+        if not ray.is_initialized():
+            ray.init(ignore_reinit_error=True)
+
+    def test_freshness_metrics_calculation(self):
+        """测试新鲜度指标计算"""
+        # 创建基本配置
+        config = OmegaConf.create(
+            {
+                "trainer": {
+                    "device": "cpu",
+                    "project_name": "test",
+                    "experiment_name": "test",
+                    "total_epochs": 1,
+                    "total_training_steps": 2,
+                },
+                "async_training": {"staleness_threshold": 3, "max_staleness_allowed": 5, "batch_timeout": 10.0},
+                "data": {"train_batch_size": 4},
+                "actor_rollout_ref": {"hybrid_engine": False, "model": {"lora_rank": 0}},
+                "algorithm": {"use_kl_in_reward": False},
+                "critic": {"enable": False},
+            }
+        )
+
+        # 创建模拟的依赖
+        mock_tokenizer = Mock()
+        mock_role_worker_mapping = Mock()
+        mock_resource_pool_manager = Mock()
+
+        trainer = FullyAsyncTrainer.remote(
+            config=config,
+            tokenizer=mock_tokenizer,
+            role_worker_mapping=mock_role_worker_mapping,
+            resource_pool_manager=mock_resource_pool_manager,
+        )
+
+        # 测试新鲜度指标计算
+        current_time = time.time()
+        metadata_list = [
+            {"generation_timestamp": current_time - 5, "rollout_param_version": 1},
+            {"generation_timestamp": current_time - 10, "rollout_param_version": 2},
+            {"generation_timestamp": current_time - 15, "rollout_param_version": 1},
+        ]
+
+        freshness_metrics = ray.get(trainer._calculate_freshness_metrics.remote(metadata_list, current_param_version=3))
+
+        # 验证新鲜度指标
+        self.assertIn("avg_sample_age", freshness_metrics)
+        self.assertIn("max_sample_age", freshness_metrics)
+        self.assertIn("min_sample_age", freshness_metrics)
+        self.assertIn("version_diversity", freshness_metrics)
+        self.assertIn("staleness_ratio", freshness_metrics)
+
+
+class TestIntegrationScenarios(unittest.TestCase):
+    """测试组件集成场景"""
+
+    def setUp(self):
+        """设置测试环境"""
+        if not ray.is_initialized():
+            ray.init(ignore_reinit_error=True)
+
+    def test_message_queue_trainer_integration(self):
+        """测试MessageQueue与Trainer的集成"""
+        # 创建MessageQueue
+        message_queue = MessageQueueClient.remote(max_queue_size=10, max_staleness=3)
+
+        # 放入一些测试样本
+        mock_sample = Mock()
+        mock_sample.batch_size = 4
+
+        ray.get(
+            message_queue.put_samples.remote(
+                epoch=1, sample=mock_sample, param_version=1, rollout_metadata={"timestamp": time.time()}
+            )
+        )
+
+        # 验证Trainer能够获取样本
+        result = ray.get(message_queue.get_samples.remote(min_batch_count=1, timeout=5.0, current_param_version=1))
+
+        self.assertIsNotNone(result)
+        samples, metadata_list = result
+        self.assertEqual(len(samples), 1)
+
+    def test_rollouter_message_queue_integration(self):
+        """测试Rollouter与MessageQueue的集成"""
+        # 这个测试需要更多的模拟设置，因为涉及到实际的模型生成
+        # 在实际实现中，可以使用更多的Mock对象来模拟这种集成
+        pass
+
+
+class TestErrorHandling(unittest.TestCase):
+    """测试错误处理和边界情况"""
+
+    def setUp(self):
+        """设置测试环境"""
+        if not ray.is_initialized():
+            ray.init(ignore_reinit_error=True)
+
+    def test_message_queue_overflow(self):
+        """测试消息队列溢出处理"""
+        # 创建小容量的队列
+        message_queue = MessageQueueClient.remote(max_queue_size=2, max_staleness=3)
+
+        mock_sample = Mock()
+        mock_sample.batch_size = 4
+
+        # 填满队列
+        for i in range(2):
+            result = ray.get(
+                message_queue.put_samples.remote(
+                    epoch=1, sample=mock_sample, param_version=1, rollout_metadata={"timestamp": time.time()}
+                )
+            )
+            self.assertTrue(result)
+
+        # 尝试再放入一个样本（应该失败或者覆盖旧样本）
+        result = ray.get(
+            message_queue.put_samples.remote(
+                epoch=1, sample=mock_sample, param_version=1, rollout_metadata={"timestamp": time.time()}
+            )
+        )
+
+        # 根据实现，这里可能是False（拒绝）或True（覆盖）
+        self.assertIsInstance(result, bool)
+
+    def test_timeout_handling(self):
+        """测试超时处理"""
+        message_queue = MessageQueueClient.remote(max_queue_size=10, max_staleness=3)
+
+        # 尝试从空队列获取样本，应该超时
+        start_time = time.time()
+        result = ray.get(
+            message_queue.get_samples.remote(
+                min_batch_count=1,
+                timeout=1.0,  # 1秒超时
+                current_param_version=1,
+            )
+        )
+        elapsed = time.time() - start_time
+
+        # 应该返回None并且大约在1秒后返回
+        self.assertIsNone(result)
+        self.assertGreater(elapsed, 0.9)  # 允许一些误差
+        self.assertLess(elapsed, 2.0)
+
+
+if __name__ == "__main__":
+    # 设置测试套件
+    test_suite = unittest.TestSuite()
+
+    # 添加测试用例
+    test_classes = [
+        TestMessageQueue,
+        TestParameterSynchronizer,
+        TestFullyAsyncRollouter,
+        TestFullyAsyncTrainer,
+        TestIntegrationScenarios,
+        TestErrorHandling,
+    ]
+
+    for test_class in test_classes:
+        tests = unittest.TestLoader().loadTestsFromTestCase(test_class)
+        test_suite.addTests(tests)
+
+    # 运行测试
+    runner = unittest.TextTestRunner(verbosity=2)
+    result = runner.run(test_suite)
+
+    # 清理Ray
+    if ray.is_initialized():
+        ray.shutdown()
+
+    # 退出
+    exit(0 if result.wasSuccessful() else 1)
diff --git a/tests/special_e2e/run_fully_async_policy.sh b/tests/special_e2e/run_fully_async_policy.sh
new file mode 100644
index 00000000000..9692aab0d44
--- /dev/null
+++ b/tests/special_e2e/run_fully_async_policy.sh
@@ -0,0 +1,196 @@
+#!/usr/bin/env bash
+set -xeuo pipefail
+
+# Test script for fully_async_policy E2E regression testing
+# This script runs fully async PPO training with both FSDP2 and Megatron backends
+# to ensure the asynchronous training mechanism works correctly
+
+NUM_GPUS=${NUM_GPUS:-8}
+ACTOR_STRATEGY=${ACTOR_STRATEGY:-"fsdp2"}  # fsdp2 or megatron
+
+# Download model if not exists
+MODEL_ID=${MODEL_ID:-Qwen/Qwen2.5-0.5B-Instruct}
+MODEL_PATH=${MODEL_PATH:-${HOME}/models/${MODEL_ID}}
+huggingface-cli download "${MODEL_ID}" --local-dir "${MODEL_PATH}"
+
+# Algorithm parameters
+adv_estimator=grpo
+
+use_kl_in_reward=False
+kl_coef=0.0
+use_kl_loss=False
+kl_loss_coef=0.0
+
+clip_ratio_low=0.2
+clip_ratio_high=0.28
+
+# Response length parameters
+max_prompt_length=1024
+max_response_length=2048
+enable_overlong_buffer=True
+overlong_buffer_len=128
+overlong_penalty_factor=1.0
+
+# Training parameters
+loss_agg_mode="token-mean"
+train_prompt_bsz=8
+n_resp_per_prompt=3
+train_prompt_mini_bsz=4
+
+# Temperature parameters
+temperature=1.0
+top_p=1.0
+top_k=-1
+val_top_p=0.7
+
+# Fully async specific parameters
+# Allocate 2 GPUs for rollout, remaining for training
+n_gpus_rollout=2
+n_gpus_training=$((NUM_GPUS - n_gpus_rollout))
+
+# Async training specific configurations
+staleness_threshold=3
+max_staleness_allowed=5
+max_queue_size=1000
+min_batch_count=1
+batch_timeout=30.0
+generation_timeout=30.0
+batch_generation_interval=0.1
+max_sync_retries=3
+sync_timeout=30.0
+sync_retry_delay=1.0
+
+exp_name="$(basename "${MODEL_ID,,}")-fully-async-policy-${ACTOR_STRATEGY}-minimal"
+
+echo "Running fully_async_policy with ${ACTOR_STRATEGY} strategy"
+echo "Total GPUs: ${NUM_GPUS}, Rollout GPUs: ${n_gpus_rollout}, Training GPUs: ${n_gpus_training}"
+
+# Common parameters for both FSDP2 and Megatron
+common_params=(
+    data.train_files="${HOME}/data/gsm8k/train.parquet"
+    data.val_files="${HOME}/data/gsm8k/test.parquet"
+    data.prompt_key=prompt
+    data.truncation='left'
+    data.max_prompt_length=${max_prompt_length}
+    data.max_response_length=${max_response_length}
+    data.train_batch_size=${train_prompt_bsz}
+    actor_rollout_ref.rollout.n=${n_resp_per_prompt}
+    algorithm.adv_estimator=${adv_estimator}
+    algorithm.use_kl_in_reward=${use_kl_in_reward}
+    algorithm.kl_ctrl.kl_coef=${kl_coef}
+    actor_rollout_ref.hybrid_engine=False
+    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss}
+    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef}
+    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low}
+    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high}
+    actor_rollout_ref.actor.clip_ratio_c=10.0
+    actor_rollout_ref.model.path="${MODEL_PATH}"
+    actor_rollout_ref.model.enable_gradient_checkpointing=True
+    actor_rollout_ref.actor.optim.lr=1e-6
+    actor_rollout_ref.actor.optim.lr_warmup_steps=-1
+    actor_rollout_ref.actor.optim.weight_decay=0.1
+    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz}
+    actor_rollout_ref.actor.entropy_coeff=0
+    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode}
+    actor_rollout_ref.rollout.gpu_memory_utilization=0.80
+    actor_rollout_ref.rollout.temperature=${temperature}
+    actor_rollout_ref.rollout.top_p=${top_p}
+    actor_rollout_ref.rollout.top_k=${top_k}
+    actor_rollout_ref.rollout.val_kwargs.temperature=${temperature}
+    actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p}
+    actor_rollout_ref.rollout.val_kwargs.top_k=${top_k}
+    actor_rollout_ref.rollout.val_kwargs.do_sample=True
+    actor_rollout_ref.rollout.val_kwargs.n=1
+    actor_rollout_ref.rollout.enable_chunked_prefill=True
+    reward_model.reward_manager=dapo
+    +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer}
+    +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len}
+    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor}
+    +reward_model.reward_kwargs.overlong_buffer_cfg.log=False
+    +reward_model.reward_kwargs.max_resp_len=${max_response_length}
+    trainer.logger=['console']
+    trainer.project_name='verl-test-fully-async'
+    trainer.experiment_name="${exp_name}"
+    trainer.val_before_train=False
+    trainer.test_freq=-1
+    trainer.save_freq=-1
+    trainer.total_epochs=2
+    trainer.total_training_steps=4
+    trainer.resume_mode=disable
+    trainer.nnodes=1
+    trainer.n_gpus_per_node=${n_gpus_training}
+    rollout.nnodes=1
+    rollout.n_gpus_per_node=${n_gpus_rollout}
+    # Fully async specific configurations
+    async_training.staleness_threshold=${staleness_threshold}
+    async_training.max_staleness_allowed=${max_staleness_allowed}
+    async_training.max_queue_size=${max_queue_size}
+    async_training.min_batch_count=${min_batch_count}
+    async_training.batch_timeout=${batch_timeout}
+    async_training.generation_timeout=${generation_timeout}
+    async_training.batch_generation_interval=${batch_generation_interval}
+    async_training.max_sync_retries=${max_sync_retries}
+    async_training.sync_timeout=${sync_timeout}
+    async_training.sync_retry_delay=${sync_retry_delay}
+)
+
+if [ "${ACTOR_STRATEGY}" == "fsdp2" ]; then
+    echo "Running fully async training with FSDP2 strategy..."
+    # FSDP2 specific parameters
+    gen_tp=2
+    sp_size=2
+    fsdp_size=2
+    ref_offload=True
+    actor_offload=False
+
+    python3 -m recipe.fully_async_policy.fully_async_main \
+        "${common_params[@]}" \
+        actor_rollout_ref.actor.strategy=fsdp2 \
+        critic.strategy=fsdp2 \
+        actor_rollout_ref.actor.grad_clip=1.0 \
+        actor_rollout_ref.model.use_remove_padding=True \
+        actor_rollout_ref.actor.use_dynamic_bsz=True \
+        actor_rollout_ref.ref.log_prob_use_dynamic_bsz=True \
+        actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=True \
+        actor_rollout_ref.actor.fsdp_config.param_offload=${actor_offload} \
+        actor_rollout_ref.actor.fsdp_config.optimizer_offload=${actor_offload} \
+        actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \
+        actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
+        actor_rollout_ref.ref.fsdp_config.param_offload=${ref_offload} \
+        actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \
+        actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} $@
+
+elif [ "${ACTOR_STRATEGY}" == "megatron" ]; then
+    echo "Running fully async training with Megatron strategy..."
+    # Megatron specific parameters
+    gen_tp=2
+    train_tp=1
+    train_pp=2
+    ref_offload=True
+    actor_offload=False
+
+    python3 -m recipe.fully_async_policy.fully_async_main \
+        --config-path=config \
+        --config-name='fully_async_ppo_megatron_trainer.yaml' \
+        "${common_params[@]}" \
+        actor_rollout_ref.actor.strategy=megatron \
+        critic.strategy=megatron \
+        actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \
+        actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
+        actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
+        actor_rollout_ref.actor.megatron.param_offload=${actor_offload} \
+        actor_rollout_ref.actor.megatron.optimizer_offload=${actor_offload} \
+        actor_rollout_ref.actor.megatron.grad_offload=${actor_offload} \
+        actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${train_pp} \
+        actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${train_tp} \
+        actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
+        actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=${train_pp} \
+        actor_rollout_ref.ref.megatron.tensor_model_parallel_size=${train_tp} \
+        actor_rollout_ref.ref.megatron.param_offload=${ref_offload} $@
+else
+    echo "Error: Unknown strategy ${ACTOR_STRATEGY}. Please use 'fsdp2' or 'megatron'"
+    exit 1
+fi
+
+echo "Fully async policy E2E test completed successfully with ${ACTOR_STRATEGY} strategy"
+

From 941c3dea8ccb39981ef53f17a5a9bb117f67b702 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Tue, 5 Aug 2025 11:53:26 +0800
Subject: [PATCH 020/182] init models

---
 recipe/fully_async_policy/fully_async_main.py | 159 ++++++------------
 .../fully_async_rollouter.py                  | 108 ++++++------
 .../fully_async_policy/fully_async_trainer.py |   5 -
 3 files changed, 100 insertions(+), 172 deletions(-)

diff --git a/recipe/fully_async_policy/fully_async_main.py b/recipe/fully_async_policy/fully_async_main.py
index e8053e74647..54460b3611d 100644
--- a/recipe/fully_async_policy/fully_async_main.py
+++ b/recipe/fully_async_policy/fully_async_main.py
@@ -32,17 +32,6 @@
 from verl.trainer.ppo.reward import load_reward_manager
 from verl.utils.fs import copy_to_local
 
-logger = logging.getLogger(__name__)
-
-
-def setup_logging():
-    """设置日志配置"""
-    logging.basicConfig(
-        level=logging.INFO,
-        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
-        handlers=[logging.StreamHandler(), logging.FileHandler("fully_async_training.log")],
-    )
-
 
 def create_resource_pool_manager(config, roles: list) -> ResourcePoolManager:
     """
@@ -81,9 +70,6 @@ def create_resource_pool_manager(config, roles: list) -> ResourcePoolManager:
         resource_pool_spec["rollout_pool"] = rollout_pool
         mapping[Role.Rollout] = "rollout_pool"
 
-    logger.info(f"Resource pool specification: {resource_pool_spec}")
-    logger.info(f"Role mapping: {mapping}")
-
     return ResourcePoolManager(resource_pool_spec=resource_pool_spec, mapping=mapping)
 
 
@@ -168,22 +154,22 @@ def __init__(self):
 
     def run(self, config):
         """运行完全异步的PPO训练"""
-        setup_logging()
-        logger.info("Starting fully async PPO training...")
+        print("Starting fully async PPO training...")
         # 设置信号处理
         self._setup_signal_handlers()
         # 初始化基础组件
         self._initialize_components(config)
+        time.sleep(60)
         # 启动训练流程
-        self._run_training_loop()
+        # self._run_training_loop()
 
-        self._cleanup_resources()
+        # self._cleanup_resources()
 
     def _setup_signal_handlers(self):
         """设置信号处理器"""
 
         def signal_handler(signum, frame):
-            logger.info(f"Received signal {signum}, initiating shutdown...")
+            print(f"Received signal {signum}, initiating shutdown...")
             self.running = False
             self.shutdown_event.set()
 
@@ -206,7 +192,7 @@ def _initialize_components(self, config) -> None:
         OmegaConf.resolve(config)
 
         # 初始化模型路径和tokenizer
-        logger.info("Initializing model and tokenizer...")
+        print("Initializing model and tokenizer...")
         local_path = copy_to_local(
             config.actor_rollout_ref.model.path, use_shm=config.actor_rollout_ref.model.get("use_shm", False)
         )
@@ -222,27 +208,13 @@ def _initialize_components(self, config) -> None:
         self.components["processor"] = processor
 
         # 创建worker映射和资源池
-        logger.info("Creating worker mapping and resource pools...")
+        print("Creating worker mapping and resource pools...")
         role_worker_mapping, ray_worker_group_cls = create_role_worker_mapping(config)
         self.components["role_worker_mapping"] = role_worker_mapping
         self.components["ray_worker_group_cls"] = ray_worker_group_cls
 
-        # 创建数据集
-        logger.info("Creating datasets...")
-        from verl.trainer.main_ppo import create_rl_dataset, create_rl_sampler
-        from verl.utils.dataset.rl_dataset import collate_fn
-
-        train_dataset = create_rl_dataset(config.data.train_files, config.data, tokenizer, processor)
-        val_dataset = create_rl_dataset(config.data.val_files, config.data, tokenizer, processor)
-        train_sampler = create_rl_sampler(config.data, train_dataset)
-
-        self.components["train_dataset"] = train_dataset
-        self.components["val_dataset"] = val_dataset
-        self.components["train_sampler"] = train_sampler
-        self.components["collate_fn"] = collate_fn
-
         # 创建奖励函数
-        logger.info("Loading reward functions...")
+        print("Loading reward functions...")
         reward_fn = load_reward_manager(
             config, tokenizer, num_examine=0, **config.reward_model.get("reward_kwargs", {})
         )
@@ -253,7 +225,7 @@ def _initialize_components(self, config) -> None:
         self.components["val_reward_fn"] = val_reward_fn
 
         # 创建MessageQueue
-        logger.info("Creating MessageQueue...")
+        print("Creating MessageQueue...")
         max_queue_size = config.async_training.get("max_queue_size", 1000)
         message_queue = MessageQueue.remote(config, max_queue_size)
         message_queue_client = MessageQueueClient(message_queue)
@@ -262,25 +234,26 @@ def _initialize_components(self, config) -> None:
         self.components["message_queue_client"] = message_queue_client
 
         # 创建Rollouter
-        logger.info("Creating Rollouter...")
+        print("Creating Rollouter...")
         self._create_rollouter(config)
 
         # 创建Trainer
-        logger.info("Creating FullyAsyncTrainer...")
+        print("Creating FullyAsyncTrainer...")
         self._create_trainer(config)
 
         # 设置参数同步
-        logger.info("Setting up parameter synchronization...")
-        param_synchronizer = AsyncParameterSynchronizer(
-            config=config,
-            actor_wg=self.components["trainer"].actor_wg,
-            rollouter=self.components["rollouter"],
-        )
-        self.components["param_synchronizer"] = param_synchronizer
-        logger.info("All components initialized successfully")
+        # print("Setting up parameter synchronization...")
+        # param_synchronizer = AsyncParameterSynchronizer(
+        #     config=config,
+        #     actor_wg=self.components["trainer"].actor_wg,
+        #     rollouter=self.components["rollouter"],
+        # )
+        # self.components["param_synchronizer"] = param_synchronizer
+        # print("All components initialized successfully")
 
     def _create_rollouter(self, config) -> None:
         """创建Rollouter"""
+        pprint(self.components)
         rollouter = FullyAsyncRollouter.remote(
             config=config,
             tokenizer=self.components["tokenizer"],
@@ -288,21 +261,19 @@ def _create_rollouter(self, config) -> None:
             resource_pool_manager=create_resource_pool_manager(config, roles=[Role.Rollout]),
             ray_worker_group_cls=self.components["ray_worker_group_cls"],
             processor=self.components["processor"],
-            train_dataset=self.components["train_dataset"],
-            collate_fn=self.components["collate_fn"],
-            train_sampler=self.components["train_sampler"],
             device_name=config.trainer.device,
         )
+        print(rollouter)
+
+        print("========== rollouter init workers ======")
 
         # 初始化Rollouter
-        init_future = rollouter.init_workers.remote()
-        ray.get(init_future, timeout=60.0)
+        ray.get(rollouter.init_workers.remote())
 
-        set_queue_future = rollouter.set_message_queue_client.remote(self.components["message_queue_client"])
-        ray.get(set_queue_future, timeout=10.0)
+        ray.get(rollouter.set_message_queue_client.remote(self.components["message_queue_client"]))
 
         self.components["rollouter"] = rollouter
-        logger.info("Rollouter created and initialized successfully")
+        print("Rollouter created and initialized successfully")
 
     def _create_trainer(self, config) -> None:
         """创建Trainer"""
@@ -322,39 +293,33 @@ def _create_trainer(self, config) -> None:
             processor=self.components["processor"],
             reward_fn=self.components["reward_fn"],
             val_reward_fn=self.components["val_reward_fn"],
-            train_dataset=self.components["train_dataset"],
-            val_dataset=self.components["val_dataset"],
-            collate_fn=self.components["collate_fn"],
-            train_sampler=self.components["train_sampler"],
             device_name=config.trainer.device,
         )
 
         # 初始化Trainer
-        trainer.init_workers()
-        trainer.set_message_queue_client(self.components["message_queue_client"])
-        trainer.set_rollouter(self.components["rollouter"])
-
+        ray.get(trainer.init_workers.remote())
+        ray.get(trainer.set_message_queue_client.remote(self.components["message_queue_client"]))
         self.components["trainer"] = trainer
-        logger.info("FullyAsyncTrainer created and initialized successfully")
+        print("FullyAsyncTrainer created and initialized successfully")
 
     def _run_training_loop(self):
         """运行训练循环"""
         self.running = True
 
-        logger.info("Starting Rollouter in background...")
+        print("Starting Rollouter in background...")
         rollouter_future = self.components["rollouter"].fit.remote()
         trainer_future = self.components["trainer"].fit.remote()
         self._monitor_components()
         ray.get(rollouter_future)
         ray.get(trainer_future)
 
-        logger.info("Training completed or interrupted")
+        print("Training completed or interrupted")
 
     def _run_rollouter(self):
         try:
             ray.get(self.components["rollouter"].fit.remote())
         except Exception as e:
-            logger.error(f"Rollouter error: {e}")
+            print(f"Rollouter error: {e}")
             self.running = False
             self.shutdown_event.set()
 
@@ -363,14 +328,14 @@ def _run_trainer(self):
         try:
             self.components["trainer"].fit()
         except Exception as e:
-            logger.error(f"Trainer error: {e}")
+            print(f"Trainer error: {e}")
         finally:
             self.running = False
             self.shutdown_event.set()
 
     def _monitor_components(self):
         """监控组件状态"""
-        logger.info("Starting component monitoring...")
+        print("Starting component monitoring...")
 
         last_stats_time = time.time()
         stats_interval = 60.0  # 60秒报告一次统计
@@ -391,9 +356,9 @@ def _monitor_components(self):
                 self._check_component_health()
 
             except Exception as e:
-                logger.error(f"Error in component monitoring: {e}")
+                print(f"Error in component monitoring: {e}")
 
-        logger.info("Component monitoring stopped")
+        print("Component monitoring stopped")
 
     def _log_component_statistics(self):
         """记录组件统计信息"""
@@ -407,27 +372,27 @@ def _log_component_statistics(self):
             # 获取队列统计
             queue_stats = self.components["message_queue_client"].get_statistics()
 
-            logger.info("=== Component Statistics ===")
-            logger.info(
+            print("=== Component Statistics ===")
+            print(
                 f"Trainer - Steps: {trainer_stats['global_steps']}, "
                 f"Samples: {trainer_stats['processed_samples']}, "
                 f"Param version: {trainer_stats['current_param_version']}"
             )
 
-            logger.info(
+            print(
                 f"Rollouter - Generated: {rollouter_stats['total_generated_samples']}, "
                 f"Dropped: {rollouter_stats['dropped_stale_samples']}, "
                 f"Errors: {rollouter_stats['generation_errors']}"
             )
 
-            logger.info(
+            print(
                 f"Queue - Size: {queue_stats['queue_size']}, "
                 f"Produced: {queue_stats['total_produced']}, "
                 f"Consumed: {queue_stats['total_consumed']}"
             )
 
         except Exception as e:
-            logger.error(f"Error getting component statistics: {e}")
+            print(f"Error getting component statistics: {e}")
 
     def _check_component_health(self):
         """检查组件健康状态"""
@@ -442,43 +407,43 @@ def _check_component_health(self):
             rollouter_stats = ray.get(self.components["rollouter"].get_statistics.remote(), timeout=5.0)
 
             if not rollouter_stats["is_running"]:
-                logger.warning("Rollouter is not running!")
+                print("Rollouter is not running!")
                 # 可以尝试重启或报告错误
 
         except Exception as e:
-            logger.warning(f"Health check failed: {e}")
+            print(f"Health check failed: {e}")
 
     def _cleanup_resources(self):
         """清理资源"""
-        logger.info("Cleaning up resources...")
+        print("Cleaning up resources...")
 
         try:
             # 停止Rollouter
             if "rollouter" in self.components:
-                logger.info("Shutting down Rollouter...")
+                print("Shutting down Rollouter...")
                 try:
                     shutdown_future = self.components["rollouter"].shutdown.remote()
                     ray.get(shutdown_future, timeout=10.0)
                 except Exception as e:
-                    logger.warning(f"Error shutting down Rollouter: {e}")
+                    print(f"Error shutting down Rollouter: {e}")
 
             # 清理MessageQueue
             if "message_queue_client" in self.components:
-                logger.info("Cleaning up MessageQueue...")
+                print("Cleaning up MessageQueue...")
                 try:
                     self.components["message_queue_client"].shutdown()
                 except Exception as e:
-                    logger.warning(f"Error cleaning up MessageQueue: {e}")
+                    print(f"Error cleaning up MessageQueue: {e}")
 
             # 清理参数同步器
             if "param_synchronizer" in self.components:
-                logger.info("Cleaning up parameter synchronizer...")
+                print("Cleaning up parameter synchronizer...")
                 # TODO: 添加参数同步器的清理逻辑
 
-            logger.info("Resource cleanup completed")
+            print("Resource cleanup completed")
 
         except Exception as e:
-            logger.error(f"Error during cleanup: {e}")
+            print(f"Error during cleanup: {e}")
 
     def get_training_status(self) -> dict:
         """获取训练状态"""
@@ -495,7 +460,7 @@ def get_training_status(self) -> dict:
                 "rollouter_stats": rollouter_stats,
             }
         except Exception as e:
-            logger.error(f"Error getting training status: {e}")
+            print(f"Error getting training status: {e}")
             return {"status": "error", "error": str(e)}
 
 
@@ -503,27 +468,9 @@ def get_training_status(self) -> dict:
 def main(config):
     """主入口函数"""
     from verl.trainer.main_ppo import run_ppo
-
     # 确保异步训练配置存在
     if not hasattr(config, "async_training"):
-        # 设置默认异步训练配置
-        config.async_training = OmegaConf.create(
-            {
-                "staleness_threshold": 3,
-                "max_staleness_allowed": 5,
-                "max_queue_size": 1000,
-                "min_batch_count": 1,
-                "batch_timeout": 30.0,
-                "generation_timeout": 30.0,
-                "batch_generation_interval": 0.1,
-                "max_sync_retries": 3,
-                "sync_timeout": 30.0,
-                "sync_retry_delay": 1.0,
-            }
-        )
-        logger.info("Using default async training configuration")
-
-    logger.info("Starting fully async PPO training with improved architecture")
+        raise RuntimeError("must set async_training config")
     run_ppo(config, task_runner_class=FullyAsyncTaskRunner)
 
 
diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py
index 6274237c6a8..59047db95f7 100644
--- a/recipe/fully_async_policy/fully_async_rollouter.py
+++ b/recipe/fully_async_policy/fully_async_rollouter.py
@@ -15,6 +15,7 @@
 import threading
 import time
 from concurrent.futures import ThreadPoolExecutor
+from pprint import pprint
 from typing import Optional
 
 import ray
@@ -28,8 +29,6 @@
 from verl.utils.debug import marked_timer
 from verl.utils.tracking import ValidationGenerationsLogger
 
-logger = logging.getLogger(__name__)
-
 
 @ray.remote(num_cpus=10, max_concurrency=10)
 class FullyAsyncRollouter(RayPPOTrainer):
@@ -48,10 +47,6 @@ def __init__(
             processor=None,
             reward_fn=None,
             val_reward_fn=None,
-            train_dataset: Dataset | None = None,
-            val_dataset: Dataset | None = None,
-            collate_fn=None,
-            train_sampler: Sampler | None = None,
             device_name=None,
     ):
         """
@@ -73,7 +68,6 @@ def __init__(
             train_sampler (Optional[Sampler], optional): Sampler for the training dataset. Defaults to None.
             device_name (str, optional): Device name for training (e.g., "cuda", "cpu"). Defaults to None.
         """
-
         # Store the tokenizer for text processing
         self.tokenizer = tokenizer
         self.processor = processor
@@ -99,7 +93,18 @@ def __init__(
         self.use_reference_policy = False
         self.use_rm = False
 
+
+        # 创建数据集
+        print("Creating datasets...")
+        from verl.trainer.main_ppo import create_rl_dataset, create_rl_sampler
+        from verl.utils.dataset.rl_dataset import collate_fn
+
+        train_dataset = create_rl_dataset(config.data.train_files, config.data, tokenizer, processor)
+        val_dataset = create_rl_dataset(config.data.val_files, config.data, tokenizer, processor)
+        train_sampler = create_rl_sampler(config.data, train_dataset)
+
         self._validate_config()
+        pprint(f"Rollouter _create_dataloader...\n{train_dataset}\n{val_dataset}")
         self._create_dataloader(train_dataset, val_dataset, collate_fn, train_sampler)
 
         # rollouter 参数配置
@@ -159,14 +164,6 @@ def _validate_config(self):
         if not hasattr(self.config, "async_training"):
             raise ValueError("Missing async_training configuration")
 
-    def init_workers(self):
-        """初始化rollout workers"""
-        with self.lock:
-            logger.info("Initializing Rollouter workers...")
-            self._init_resource_pools()
-            self.rollout_wg = self.all_wg["rollout"]
-            self.rollout_wg.init_model()
-
     def _create_actor_rollout_classes(self):
         # only create rollout
         for role in [Role.Rollout]:
@@ -183,17 +180,6 @@ def _init_models(self):
         self.rollout_wg.init_model()
         self.actor_rollout_wg = self.rollout_wg
 
-    def _init_async_rollout_manager(self):
-        # create async rollout manager and request scheduler
-        self.async_rollout_mode = False
-        if self.config.actor_rollout_ref.rollout.mode == "async":
-            from verl.experimental.agent_loop import AgentLoopManager
-
-            self.async_rollout_mode = True
-            self.async_rollout_manager = AgentLoopManager(
-                config=self.config,
-                worker_group=self.actor_rollout_wg,
-            )
 
     def update_rollout_weights(self, param_version: int) -> bool:
         """
@@ -206,11 +192,11 @@ def update_rollout_weights(self, param_version: int) -> bool:
         Returns:
             bool: 是否成功更新参数
         """
-        logger.info(f"Updating rollout weights to version {param_version}")
+        self.logger.info(f"Updating rollout weights to version {param_version}")
 
         with self.sync_lock:
             if self.sync_in_progress:
-                logger.warning(f"Sync already in progress, skipping version {param_version}")
+                self.logger.warning(f"Sync already in progress, skipping version {param_version}")
                 return False
 
             self.sync_in_progress = True
@@ -218,7 +204,7 @@ def update_rollout_weights(self, param_version: int) -> bool:
         try:
             # 暂停rollout - 带超时机制
             if not self.rollout_controller.pause(timeout=10.0):
-                logger.error("Failed to pause rollout within timeout")
+                print("Failed to pause rollout within timeout")
                 return False
 
             # 等待当前generation完成（如果有的话）
@@ -231,12 +217,12 @@ def update_rollout_weights(self, param_version: int) -> bool:
                 self.current_param_version = param_version
                 self.param_sync_requests += 1
                 self.last_sync_time = time.time()
-                logger.info(f"Successfully updated rollout weights to version {param_version}")
+                self.logger.info(f"Successfully updated rollout weights to version {param_version}")
             else:
-                logger.error(f"Failed to sync parameters to version {param_version}")
+                print(f"Failed to sync parameters to version {param_version}")
 
         except Exception as e:
-            logger.error(f"Error during parameter sync: {e}")
+            print(f"Error during parameter sync: {e}")
             sync_success = False
         finally:
             # 恢复rollout
@@ -268,15 +254,15 @@ def _execute_parameter_sync(self, param_version: int) -> bool:
             # 执行参数同步
             if self.param_synchronizer:
                 self.param_synchronizer.sync_weights()
-                logger.debug("Parameter synchronization completed via synchronizer")
+                self.logger.debug("Parameter synchronization completed via synchronizer")
             else:
                 # 直接使用rollout worker group的同步机制
                 if hasattr(self.rollout_wg, "sync_rollout_weights"):
                     sync_futures = self.rollout_wg.sync_rollout_weights()
                     ray.get(sync_futures)
-                    logger.debug("Parameter synchronization completed via rollout worker group")
+                    self.logger.debug("Parameter synchronization completed via rollout worker group")
                 else:
-                    logger.warning("No parameter synchronization mechanism available")
+                    self.logger.warning("No parameter synchronization mechanism available")
                     return False
 
             # 恢复推理引擎
@@ -291,7 +277,7 @@ def _execute_parameter_sync(self, param_version: int) -> bool:
             return True
 
         except Exception as e:
-            logger.error(f"Parameter sync execution failed: {e}")
+            print(f"Parameter sync execution failed: {e}")
             return False
 
     def _create_continuous_iterator(self):
@@ -305,7 +291,7 @@ def _create_continuous_iterator(self):
 
     def fit(self):
         """开始异步生成样本 - 改进的主运行逻辑"""
-        logger.info("Starting Rollouter...")
+        self.logger.info("Starting Rollouter...")
         if self.message_queue_client is None:
             raise ValueError("MessageQueue client not set. Call set_message_queue_client() first.")
         if self.param_synchronizer is None:
@@ -328,7 +314,7 @@ def fit(self):
         self.generation_thread.join()
         self.monitor_thread.join()
 
-        logger.info("Rollouter fit completed")
+        self.logger.info("Rollouter fit completed")
 
     def _generation_loop(self):
         """
@@ -346,10 +332,10 @@ def _generation_loop(self):
 
         from verl.utils.tracking import Tracking
 
-        logger = Tracking(
+        self.logger = Tracking(
             project_name=self.config.trainer.project_name,
             experiment_name=self.config.trainer.experiment_name,
-            default_backend=self.config.trainer.logger,
+            default_backend=self.config.trainer.self.logger,
             config=OmegaConf.to_container(self.config, resolve=True),
         )
 
@@ -364,7 +350,7 @@ def _generation_loop(self):
             val_metrics = self._validate()
             assert val_metrics, f"{val_metrics=}"
             pprint(f"Initial validation metrics: {val_metrics}")
-            logger.log(data=val_metrics, step=self.global_steps)
+            self.logger.log(data=val_metrics, step=self.global_steps)
             if self.config.trainer.get("val_only", False):
                 return
 
@@ -400,7 +386,7 @@ def _generation_loop(self):
 
                 # 如果被暂停，等待恢复
                 while self.paused and self.running:
-                    logger.debug("Generation thread paused, waiting...")
+                    self.logger.debug("Generation thread paused, waiting...")
                     self.condition.wait()
 
                 # 再次检查运行状态
@@ -441,7 +427,7 @@ def _generation_loop(self):
                     if success:
                         self.total_generated_samples += 1
                         if self.total_generated_samples % 10 == 0:
-                            logger.info(
+                            self.logger.info(
                                 f"Generated {self.total_generated_samples} batches, "
                                 f"param_version={self.current_param_version}, "
                                 f"errors={self.generation_errors}"
@@ -449,7 +435,7 @@ def _generation_loop(self):
                     else:
                         self.dropped_stale_samples += 1
                         if self.dropped_stale_samples % 5 == 0:
-                            logger.warning(f"Dropped stale samples: {self.dropped_stale_samples}")
+                            self.logger.warning(f"Dropped stale samples: {self.dropped_stale_samples}")
 
     def _monitor_loop(self):
         """监控线程 - 监控状态并处理控制信号"""
@@ -478,12 +464,12 @@ def _monitor_loop(self):
                         if self.paused:
                             self.paused = False
                             self.condition.notify_all()
-                            logger.info("Generation resumed")
+                            self.logger.info("Generation resumed")
 
         except Exception as e:
-            logger.error(f"Error in monitor loop: {e}")
+            print(f"Error in monitor loop: {e}")
         finally:
-            logger.info("Monitor thread exiting")
+            self.logger.info("Monitor thread exiting")
 
     def _report_loop(self):
         try:
@@ -504,14 +490,14 @@ def _report_loop(self):
 
                 # 检查生成线程状态
                 if not self.generation_thread.is_alive():
-                    logger.error("Generation thread died, restarting...")
+                    print("Generation thread died, restarting...")
                     raise RuntimeError("generation_thread not alive")
 
 
         except KeyboardInterrupt:
-            logger.info("Received interrupt signal, shutting down...")
+            self.logger.info("Received interrupt signal, shutting down...")
         except Exception as e:
-            logger.error(f"Error in main loop: {e}")
+            print(f"Error in main loop: {e}")
         finally:
             self.shutdown()
 
@@ -529,7 +515,7 @@ def _should_pause_generation(self) -> bool:
 
             # 如果版本差异过大，暂停生成
             if version_diff >= self.max_staleness_allowed:
-                logger.debug(
+                self.logger.debug(
                     f"Should pause due to staleness: rollout_version={self.current_param_version}, "
                     f"trainer_version={current_trainer_version}, diff={version_diff}"
                 )
@@ -538,13 +524,13 @@ def _should_pause_generation(self) -> bool:
             # 如果队列太满，也暂停生成
             max_queue_size = self.staleness_threshold * self.config.data.train_batch_size
             if queue_size >= max_queue_size:
-                logger.debug(f"Should pause due to full queue: size={queue_size}, max={max_queue_size}")
+                self.logger.debug(f"Should pause due to full queue: size={queue_size}, max={max_queue_size}")
                 return True
 
             return False
 
         except Exception as e:
-            logger.error(f"Error checking pause conditions: {e}")
+            print(f"Error checking pause conditions: {e}")
             return True  # 出错时暂停生成
 
     def pause(self) -> bool:
@@ -570,12 +556,12 @@ def resume(self) -> bool:
 
             self.paused = False
             self.condition.notify_all()
-            logger.info("Generation resumed")
+            self.logger.info("Generation resumed")
             return True
 
     def shutdown(self):
         """关闭Rollouter - 改进的关闭逻辑"""
-        logger.info("Shutting down Rollouter...")
+        self.logger.info("Shutting down Rollouter...")
 
         with self.lock:
             self.running = False
@@ -584,19 +570,19 @@ def shutdown(self):
 
         # 等待生成线程结束
         if self.generation_thread and self.generation_thread.is_alive():
-            logger.info("Waiting for generation thread to finish...")
+            self.logger.info("Waiting for generation thread to finish...")
             self.generation_thread.join(timeout=10.0)
 
             if self.generation_thread.is_alive():
-                logger.warning("Generation thread did not finish within timeout")
+                self.logger.warning("Generation thread did not finish within timeout")
 
         # 等待监控线程结束
         if self.monitor_thread and self.monitor_thread.is_alive():
-            logger.info("Waiting for monitor thread to finish...")
+            self.logger.info("Waiting for monitor thread to finish...")
             self.monitor_thread.join(timeout=5.0)
 
             if self.monitor_thread.is_alive():
-                logger.warning("Monitor thread did not finish within timeout")
+                self.logger.warning("Monitor thread did not finish within timeout")
 
         # 关闭线程池
         if self.thread_executor:
@@ -608,9 +594,9 @@ def shutdown(self):
                 # TODO: 添加异步rollout管理器的清理逻辑
                 pass
             except Exception as e:
-                logger.warning(f"Error cleaning up async rollout manager: {e}")
+                self.logger.warning(f"Error cleaning up async rollout manager: {e}")
 
-        logger.info("Rollouter shutdown complete")
+        self.logger.info("Rollouter shutdown complete")
 
     def get_statistics(self) -> dict:
         with self.lock:
diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py
index 0dd90127d7d..2a507e41efa 100644
--- a/recipe/fully_async_policy/fully_async_trainer.py
+++ b/recipe/fully_async_policy/fully_async_trainer.py
@@ -57,10 +57,6 @@ def __init__(
         processor=None,
         reward_fn=None,
         val_reward_fn=None,
-            train_dataset: Dataset | None = None,
-            val_dataset: Dataset | None = None,
-        collate_fn=None,
-            train_sampler: Sampler | None = None,
         device_name=None,
     ):
         """
@@ -125,7 +121,6 @@ def __init__(
             self.use_critic = False
 
         self._validate_config()
-        self._create_dataloader(train_dataset, val_dataset, collate_fn, train_sampler)
         self.message_queue_client = None
 
     def set_message_queue_client(self, message_queue_client: MessageQueueClient):

From 274883a81ee9bf9c5beb9f4b51dc721c16eaa416 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Tue, 5 Aug 2025 16:10:32 +0800
Subject: [PATCH 021/182] gen data

---
 recipe/fully_async_policy/fully_async_main.py | 33 ++-----
 .../fully_async_rollouter.py                  | 88 +++++++++----------
 .../fully_async_policy/fully_async_trainer.py |  5 --
 recipe/fully_async_policy/message_queue.py    | 13 ++-
 .../{ => unittest}/test_fully_async.py        | 14 ++-
 recipe/fully_async_policy/unittest/test_mq.py | 35 ++++----
 6 files changed, 73 insertions(+), 115 deletions(-)
 rename recipe/fully_async_policy/{ => unittest}/test_fully_async.py (91%)

diff --git a/recipe/fully_async_policy/fully_async_main.py b/recipe/fully_async_policy/fully_async_main.py
index 54460b3611d..dd544e9b49a 100644
--- a/recipe/fully_async_policy/fully_async_main.py
+++ b/recipe/fully_async_policy/fully_async_main.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import logging
 import os
 import signal
 import socket
@@ -27,7 +26,6 @@
 from recipe.fully_async_policy.fully_async_rollouter import FullyAsyncRollouter
 from recipe.fully_async_policy.fully_async_trainer import FullyAsyncTrainer
 from recipe.fully_async_policy.message_queue import MessageQueue, MessageQueueClient
-from recipe.fully_async_policy.param_sync import AsyncParameterSynchronizer
 from verl.trainer.ppo.ray_trainer import ResourcePoolManager, Role
 from verl.trainer.ppo.reward import load_reward_manager
 from verl.utils.fs import copy_to_local
@@ -159,9 +157,9 @@ def run(self, config):
         self._setup_signal_handlers()
         # 初始化基础组件
         self._initialize_components(config)
-        time.sleep(60)
+        # time.sleep(60)
         # 启动训练流程
-        # self._run_training_loop()
+        self._run_training_loop()
 
         # self._cleanup_resources()
 
@@ -239,7 +237,7 @@ def _initialize_components(self, config) -> None:
 
         # 创建Trainer
         print("Creating FullyAsyncTrainer...")
-        self._create_trainer(config)
+        # self._create_trainer(config)
 
         # 设置参数同步
         # print("Setting up parameter synchronization...")
@@ -308,31 +306,13 @@ def _run_training_loop(self):
 
         print("Starting Rollouter in background...")
         rollouter_future = self.components["rollouter"].fit.remote()
-        trainer_future = self.components["trainer"].fit.remote()
-        self._monitor_components()
+        # trainer_future = self.components["trainer"].fit.remote()
+        # self._monitor_components()
         ray.get(rollouter_future)
-        ray.get(trainer_future)
+        # ray.get(trainer_future)
 
         print("Training completed or interrupted")
 
-    def _run_rollouter(self):
-        try:
-            ray.get(self.components["rollouter"].fit.remote())
-        except Exception as e:
-            print(f"Rollouter error: {e}")
-            self.running = False
-            self.shutdown_event.set()
-
-    def _run_trainer(self):
-        """运行trainer"""
-        try:
-            self.components["trainer"].fit()
-        except Exception as e:
-            print(f"Trainer error: {e}")
-        finally:
-            self.running = False
-            self.shutdown_event.set()
-
     def _monitor_components(self):
         """监控组件状态"""
         print("Starting component monitoring...")
@@ -468,6 +448,7 @@ def get_training_status(self) -> dict:
 def main(config):
     """主入口函数"""
     from verl.trainer.main_ppo import run_ppo
+
     # 确保异步训练配置存在
     if not hasattr(config, "async_training"):
         raise RuntimeError("must set async_training config")
diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py
index 59047db95f7..f273b3a45a4 100644
--- a/recipe/fully_async_policy/fully_async_rollouter.py
+++ b/recipe/fully_async_policy/fully_async_rollouter.py
@@ -11,16 +11,13 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import logging
 import threading
 import time
 from concurrent.futures import ThreadPoolExecutor
 from pprint import pprint
-from typing import Optional
 
 import ray
 from omegaconf import OmegaConf
-from torch.utils.data import Dataset, Sampler
 from tqdm import tqdm
 
 from recipe.fully_async_policy.message_queue import MessageQueueClient
@@ -38,16 +35,16 @@ class FullyAsyncRollouter(RayPPOTrainer):
     """
 
     def __init__(
-            self,
-            config,
-            tokenizer,
-            role_worker_mapping: dict[Role, WorkerType],
-            resource_pool_manager: ResourcePoolManager,
-            ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup,
-            processor=None,
-            reward_fn=None,
-            val_reward_fn=None,
-            device_name=None,
+        self,
+        config,
+        tokenizer,
+        role_worker_mapping: dict[Role, WorkerType],
+        resource_pool_manager: ResourcePoolManager,
+        ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup,
+        processor=None,
+        reward_fn=None,
+        val_reward_fn=None,
+        device_name=None,
     ):
         """
         Initialize distributed PPO trainer with Ray backend.
@@ -93,7 +90,6 @@ def __init__(
         self.use_reference_policy = False
         self.use_rm = False
 
-
         # 创建数据集
         print("Creating datasets...")
         from verl.trainer.main_ppo import create_rl_dataset, create_rl_sampler
@@ -180,7 +176,6 @@ def _init_models(self):
         self.rollout_wg.init_model()
         self.actor_rollout_wg = self.rollout_wg
 
-
     def update_rollout_weights(self, param_version: int) -> bool:
         """
         更新rollout模型参数 - 改进的参数同步实现
@@ -192,11 +187,11 @@ def update_rollout_weights(self, param_version: int) -> bool:
         Returns:
             bool: 是否成功更新参数
         """
-        self.logger.info(f"Updating rollout weights to version {param_version}")
+        print(f"Updating rollout weights to version {param_version}")
 
         with self.sync_lock:
             if self.sync_in_progress:
-                self.logger.warning(f"Sync already in progress, skipping version {param_version}")
+                print(f"Sync already in progress, skipping version {param_version}")
                 return False
 
             self.sync_in_progress = True
@@ -217,7 +212,7 @@ def update_rollout_weights(self, param_version: int) -> bool:
                 self.current_param_version = param_version
                 self.param_sync_requests += 1
                 self.last_sync_time = time.time()
-                self.logger.info(f"Successfully updated rollout weights to version {param_version}")
+                print(f"Successfully updated rollout weights to version {param_version}")
             else:
                 print(f"Failed to sync parameters to version {param_version}")
 
@@ -254,15 +249,15 @@ def _execute_parameter_sync(self, param_version: int) -> bool:
             # 执行参数同步
             if self.param_synchronizer:
                 self.param_synchronizer.sync_weights()
-                self.logger.debug("Parameter synchronization completed via synchronizer")
+                print("Parameter synchronization completed via synchronizer")
             else:
                 # 直接使用rollout worker group的同步机制
                 if hasattr(self.rollout_wg, "sync_rollout_weights"):
                     sync_futures = self.rollout_wg.sync_rollout_weights()
                     ray.get(sync_futures)
-                    self.logger.debug("Parameter synchronization completed via rollout worker group")
+                    print("Parameter synchronization completed via rollout worker group")
                 else:
-                    self.logger.warning("No parameter synchronization mechanism available")
+                    print("No parameter synchronization mechanism available")
                     return False
 
             # 恢复推理引擎
@@ -291,11 +286,11 @@ def _create_continuous_iterator(self):
 
     def fit(self):
         """开始异步生成样本 - 改进的主运行逻辑"""
-        self.logger.info("Starting Rollouter...")
+        print("Starting Rollouter...")
         if self.message_queue_client is None:
             raise ValueError("MessageQueue client not set. Call set_message_queue_client() first.")
-        if self.param_synchronizer is None:
-            raise ValueError("param_synchronizer client not set. Call set_parameter_synchronizer() first.")
+        # if self.param_synchronizer is None:
+        #     raise ValueError("param_synchronizer client not set. Call set_parameter_synchronizer() first.")
 
         # 设置运行状态
         with self.lock:
@@ -314,7 +309,7 @@ def fit(self):
         self.generation_thread.join()
         self.monitor_thread.join()
 
-        self.logger.info("Rollouter fit completed")
+        print("Rollouter fit completed")
 
     def _generation_loop(self):
         """
@@ -335,7 +330,7 @@ def _generation_loop(self):
         self.logger = Tracking(
             project_name=self.config.trainer.project_name,
             experiment_name=self.config.trainer.experiment_name,
-            default_backend=self.config.trainer.self.logger,
+            default_backend=self.config.trainer.logger,
             config=OmegaConf.to_container(self.config, resolve=True),
         )
 
@@ -386,7 +381,7 @@ def _generation_loop(self):
 
                 # 如果被暂停，等待恢复
                 while self.paused and self.running:
-                    self.logger.debug("Generation thread paused, waiting...")
+                    print("Generation thread paused, waiting...")
                     self.condition.wait()
 
                 # 再次检查运行状态
@@ -413,21 +408,19 @@ def _generation_loop(self):
                     "timing": timing_raw,
                     "generation_timestamp": time.time(),
                     "rollout_param_version": self.current_param_version,
-                    "epoch": epoch,
                 }
                 # 放入队列
                 success = self.message_queue_client.put_samples(
-                    epoch=epoch,
-                    sample=gen_batch_output,
+                    samples=gen_batch_output,
                     param_version=self.current_param_version,
-                    rollout_metadata=rollout_metadata,
+                    rollout_metadata_list=rollout_metadata,
                 )
 
                 with self.lock:
                     if success:
                         self.total_generated_samples += 1
                         if self.total_generated_samples % 10 == 0:
-                            self.logger.info(
+                            print(
                                 f"Generated {self.total_generated_samples} batches, "
                                 f"param_version={self.current_param_version}, "
                                 f"errors={self.generation_errors}"
@@ -435,7 +428,7 @@ def _generation_loop(self):
                     else:
                         self.dropped_stale_samples += 1
                         if self.dropped_stale_samples % 5 == 0:
-                            self.logger.warning(f"Dropped stale samples: {self.dropped_stale_samples}")
+                            print(f"Dropped stale samples: {self.dropped_stale_samples}")
 
     def _monitor_loop(self):
         """监控线程 - 监控状态并处理控制信号"""
@@ -459,17 +452,17 @@ def _monitor_loop(self):
                     last_stats_time = current_time
 
                 # 检查是否应该恢复生成
-                if self._should_resume_generation():
+                if not self._should_pause_generation():
                     with self.lock:
                         if self.paused:
                             self.paused = False
                             self.condition.notify_all()
-                            self.logger.info("Generation resumed")
+                            print("Generation resumed")
 
         except Exception as e:
             print(f"Error in monitor loop: {e}")
         finally:
-            self.logger.info("Monitor thread exiting")
+            print("Monitor thread exiting")
 
     def _report_loop(self):
         try:
@@ -493,9 +486,8 @@ def _report_loop(self):
                     print("Generation thread died, restarting...")
                     raise RuntimeError("generation_thread not alive")
 
-
         except KeyboardInterrupt:
-            self.logger.info("Received interrupt signal, shutting down...")
+            print("Received interrupt signal, shutting down...")
         except Exception as e:
             print(f"Error in main loop: {e}")
         finally:
@@ -515,7 +507,7 @@ def _should_pause_generation(self) -> bool:
 
             # 如果版本差异过大，暂停生成
             if version_diff >= self.max_staleness_allowed:
-                self.logger.debug(
+                print(
                     f"Should pause due to staleness: rollout_version={self.current_param_version}, "
                     f"trainer_version={current_trainer_version}, diff={version_diff}"
                 )
@@ -524,7 +516,7 @@ def _should_pause_generation(self) -> bool:
             # 如果队列太满，也暂停生成
             max_queue_size = self.staleness_threshold * self.config.data.train_batch_size
             if queue_size >= max_queue_size:
-                self.logger.debug(f"Should pause due to full queue: size={queue_size}, max={max_queue_size}")
+                print(f"Should pause due to full queue: size={queue_size}, max={max_queue_size}")
                 return True
 
             return False
@@ -556,12 +548,12 @@ def resume(self) -> bool:
 
             self.paused = False
             self.condition.notify_all()
-            self.logger.info("Generation resumed")
+            print("Generation resumed")
             return True
 
     def shutdown(self):
         """关闭Rollouter - 改进的关闭逻辑"""
-        self.logger.info("Shutting down Rollouter...")
+        print("Shutting down Rollouter...")
 
         with self.lock:
             self.running = False
@@ -570,19 +562,19 @@ def shutdown(self):
 
         # 等待生成线程结束
         if self.generation_thread and self.generation_thread.is_alive():
-            self.logger.info("Waiting for generation thread to finish...")
+            print("Waiting for generation thread to finish...")
             self.generation_thread.join(timeout=10.0)
 
             if self.generation_thread.is_alive():
-                self.logger.warning("Generation thread did not finish within timeout")
+                print("Generation thread did not finish within timeout")
 
         # 等待监控线程结束
         if self.monitor_thread and self.monitor_thread.is_alive():
-            self.logger.info("Waiting for monitor thread to finish...")
+            print("Waiting for monitor thread to finish...")
             self.monitor_thread.join(timeout=5.0)
 
             if self.monitor_thread.is_alive():
-                self.logger.warning("Monitor thread did not finish within timeout")
+                print("Monitor thread did not finish within timeout")
 
         # 关闭线程池
         if self.thread_executor:
@@ -594,9 +586,9 @@ def shutdown(self):
                 # TODO: 添加异步rollout管理器的清理逻辑
                 pass
             except Exception as e:
-                self.logger.warning(f"Error cleaning up async rollout manager: {e}")
+                print(f"Error cleaning up async rollout manager: {e}")
 
-        self.logger.info("Rollouter shutdown complete")
+        print("Rollouter shutdown complete")
 
     def get_statistics(self) -> dict:
         with self.lock:
diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py
index 2a507e41efa..1a3076f19c2 100644
--- a/recipe/fully_async_policy/fully_async_trainer.py
+++ b/recipe/fully_async_policy/fully_async_trainer.py
@@ -20,12 +20,10 @@
 import numpy as np
 import ray
 from omegaconf import OmegaConf
-from torch.utils.data import Dataset, Sampler
 from tqdm import tqdm
 
 from recipe.fully_async_policy.message_queue import MessageQueueClient, QueueSample
 from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup
-from verl.single_controller.ray.base import create_colocated_worker_cls
 from verl.trainer.ppo import core_algos
 from verl.trainer.ppo.core_algos import AdvantageEstimator
 from verl.trainer.ppo.ray_trainer import (
@@ -155,7 +153,6 @@ def _init_models(self):
         self.actor_wg.init_model()
         self.actor_rollout_wg = self.actor_wg  # to be compatible with the functions that not be modified
 
-
     def fit(self):
         """
         The training loop of PPO.
@@ -168,8 +165,6 @@ def fit(self):
         if self.message_queue_client is None:
             raise ValueError("MessageQueue client not set. Call set_message_queue_client() first.")
 
-        from omegaconf import OmegaConf
-
         from verl.utils.tracking import Tracking
 
         logger = Tracking(
diff --git a/recipe/fully_async_policy/message_queue.py b/recipe/fully_async_policy/message_queue.py
index 06f0d2cbbe9..dcd3c27ed15 100644
--- a/recipe/fully_async_policy/message_queue.py
+++ b/recipe/fully_async_policy/message_queue.py
@@ -31,7 +31,6 @@ class QueueSample:
     """单个batch样本，包含参数版本和新鲜度信息"""
 
     id: str
-    epoch: int
     data: Any
     param_version: int
     timestamp: float
@@ -77,13 +76,12 @@ def __init__(self, config: DictConfig, max_queue_size: int = 1000):
         )
 
     def put_samples(
-            self, epoch: int, samples: list[Any], param_version: int, rollout_metadata_list: list[dict[str, Any]] = None
+        self, samples: list[Any], param_version: int, rollout_metadata_list: list[dict[str, Any]] = None
     ) -> bool:
         """
         放入一个batch样本到队列
 
         Args:
-            epoch: 当前epoch
             samples: 样本数据
             param_version: 参数版本号
             rollout_metadata_list: rollout相关的元数据
@@ -110,7 +108,6 @@ def put_samples(
             for sample, meta in zip(samples, rollout_metadata_list, strict=False):
                 queue_sample = QueueSample(
                     id=str(uuid.uuid4()),
-                    epoch=epoch,
                     data=sample,
                     param_version=param_version,
                     timestamp=time.time(),
@@ -237,13 +234,13 @@ class MessageQueueClient:
     def __init__(self, queue_actor: Any):
         self.queue_actor = queue_actor
 
-    def put_batch(
-            self, epoch: int, batch: list[Any], param_version: int, rollout_metadata_list: list[dict[str, Any]] = None
+    def put_samples(
+        self, samples: list[Any], param_version: int, rollout_metadata_list: list[dict[str, Any]] = None
     ) -> bool:
         """放入batch到队列"""
-        return ray.get(self.queue_actor.put_samples.remote(epoch, batch, param_version, rollout_metadata_list))
+        return ray.get(self.queue_actor.put_samples.remote(samples, param_version, rollout_metadata_list))
 
-    def get_batch(self, min_batch_count: int = 1) -> list[QueueSample]:
+    def get_samples(self, min_batch_count: int = 1) -> list[QueueSample]:
         """从队列获取batch，一直等待直到有足够样本"""
         return ray.get(self.queue_actor.get_samples.remote(min_batch_count))
 
diff --git a/recipe/fully_async_policy/test_fully_async.py b/recipe/fully_async_policy/unittest/test_fully_async.py
similarity index 91%
rename from recipe/fully_async_policy/test_fully_async.py
rename to recipe/fully_async_policy/unittest/test_fully_async.py
index c138debcaa0..a6646b17575 100644
--- a/recipe/fully_async_policy/test_fully_async.py
+++ b/recipe/fully_async_policy/unittest/test_fully_async.py
@@ -61,14 +61,13 @@ def test_basic_put_get(self):
         mock_batch = Mock(spec=DataProto)
 
         # 放入样本
-        success = self.client.put_batch(epoch=0, batch=mock_batch, param_version=1, rollout_metadata={"test": "data"})
+        success = self.client.put_samples(samples=mock_batch, param_version=1, rollout_metadata={"test": "data"})
         self.assertTrue(success)
 
         # 获取样本
-        samples = self.client.get_batch(min_batch_count=1, timeout=5.0)
+        samples = self.client.get_samples(min_batch_count=1, timeout=5.0)
         self.assertIsNotNone(samples)
         self.assertEqual(len(samples), 1)
-        self.assertEqual(samples[0].epoch, 0)
         self.assertEqual(samples[0].param_version, 1)
 
     def test_freshness_control(self):
@@ -79,9 +78,8 @@ def test_freshness_control(self):
         self.client.update_param_version(10)
 
         # 尝试放入过期样本
-        success = self.client.put_batch(
-            epoch=0,
-            batch=mock_batch,
+        success = self.client.put_samples(
+            samples=mock_batch,
             param_version=5,  # 版本差异为5，超过阈值3
             rollout_metadata={},
         )
@@ -161,11 +159,11 @@ def test_integration():
 
         # 生产样本
         for i in range(5):
-            success = client.put_batch(epoch=i, batch=mock_batch, param_version=i, rollout_metadata={"batch_id": i})
+            success = client.put_samples(samples=mock_batch, param_version=i, rollout_metadata={"batch_id": i})
             assert success, f"Failed to put batch {i}"
 
         # 消费样本
-        samples = client.get_batch(min_batch_count=3, timeout=10.0)
+        samples = client.get_samples(min_batch_count=3, timeout=10.0)
         assert samples is not None, "Failed to get samples"
         assert len(samples) == 3, f"Expected 3 samples, got {len(samples)}"
 
diff --git a/recipe/fully_async_policy/unittest/test_mq.py b/recipe/fully_async_policy/unittest/test_mq.py
index 52a9f17d8ae..02e9839bcfd 100644
--- a/recipe/fully_async_policy/unittest/test_mq.py
+++ b/recipe/fully_async_policy/unittest/test_mq.py
@@ -66,9 +66,7 @@ def test_put_samples_success(self, message_queue_client, mock_data_proto):
         samples = [mock_data_proto, mock_data_proto]
         metadata_list = [{"test": "data1"}, {"test": "data2"}]
 
-        result = message_queue_client.put_batch(
-            epoch=1, batch=samples, param_version=1, rollout_metadata_list=metadata_list
-        )
+        result = message_queue_client.put_samples(samples=samples, param_version=1, rollout_metadata_list=metadata_list)
 
         assert result is True
 
@@ -85,7 +83,7 @@ def test_put_samples_without_metadata(self, message_queue_client, mock_data_prot
         """测试不提供metadata时的处理"""
         samples = [mock_data_proto, mock_data_proto]
 
-        result = message_queue_client.put_batch(epoch=1, batch=samples, param_version=1, rollout_metadata_list=None)
+        result = message_queue_client.put_samples(samples=samples, param_version=1, rollout_metadata_list=None)
 
         assert result is True
         queue_size = message_queue_client.get_queue_size()
@@ -96,9 +94,7 @@ def test_put_samples_metadata_mismatch(self, message_queue_client, mock_data_pro
         samples = [mock_data_proto, mock_data_proto]
         metadata_list = [{"test": "data1"}]  # 长度不匹配
 
-        result = message_queue_client.put_batch(
-            epoch=1, batch=samples, param_version=1, rollout_metadata_list=metadata_list
-        )
+        result = message_queue_client.put_samples(samples=samples, param_version=1, rollout_metadata_list=metadata_list)
 
         assert result is False  # 应该失败
         queue_size = message_queue_client.get_queue_size()
@@ -111,9 +107,8 @@ def test_put_samples_staleness_check(self, message_queue_client, mock_data_proto
 
         # 尝试放入版本过旧的batch（版本差异>=3会被拒绝）
         samples = [mock_data_proto]
-        result = message_queue_client.put_batch(
-            epoch=1,
-            batch=samples,
+        result = message_queue_client.put_samples(
+            samples=samples,
             param_version=2,  # 5-2=3, 达到阈值
             rollout_metadata_list=None,
         )
@@ -129,7 +124,7 @@ def test_put_samples_queue_overflow(self, message_queue_client, mock_data_proto)
         # 填满队列（最大容量10）
         for i in range(6):  # 每次放入2个，总共12个，超过最大容量10
             samples = [mock_data_proto, mock_data_proto]
-            message_queue_client.put_batch(epoch=1, batch=samples, param_version=1, rollout_metadata_list=None)
+            message_queue_client.put_samples(samples=samples, param_version=1, rollout_metadata_list=None)
 
         # 队列大小应该保持在最大值
         queue_size = message_queue_client.get_queue_size()
@@ -144,10 +139,10 @@ def test_get_samples_success(self, message_queue_client, mock_data_proto):
         # 先放入一些samples
         samples = [mock_data_proto, mock_data_proto, mock_data_proto]
         metadata_list = [{"index": 0}, {"index": 1}, {"index": 2}]
-        message_queue_client.put_batch(epoch=1, batch=samples, param_version=1, rollout_metadata_list=metadata_list)
+        message_queue_client.put_samples(samples=samples, param_version=1, rollout_metadata_list=metadata_list)
 
         # 获取2个samples
-        retrieved_samples = message_queue_client.get_batch(min_batch_count=2)
+        retrieved_samples = message_queue_client.get_samples(min_batch_count=2)
 
         assert retrieved_samples is not None
         assert len(retrieved_samples) == 2
@@ -167,13 +162,13 @@ def test_get_samples_blocking_behavior(self, message_queue_client, mock_data_pro
 
         def get_samples():
             # 这会阻塞直到有足够样本
-            samples = message_queue_client.get_batch(min_batch_count=2)
+            samples = message_queue_client.get_samples(min_batch_count=2)
             result.append(samples)
 
         def put_samples_later():
             time.sleep(0.5)  # 延迟放入
             samples = [mock_data_proto, mock_data_proto]
-            message_queue_client.put_batch(epoch=1, batch=samples, param_version=1, rollout_metadata_list=None)
+            message_queue_client.put_samples(samples=samples, param_version=1, rollout_metadata_list=None)
 
         # 启动消费者线程
         consumer_thread = threading.Thread(target=get_samples)
@@ -199,7 +194,7 @@ def test_clear_queue(self, message_queue_client, mock_data_proto):
         """测试清空队列"""
         # 先添加一些样本
         samples = [mock_data_proto, mock_data_proto, mock_data_proto]
-        message_queue_client.put_batch(epoch=1, batch=samples, param_version=1, rollout_metadata_list=None)
+        message_queue_client.put_samples(samples=samples, param_version=1, rollout_metadata_list=None)
 
         # 清空队列
         message_queue_client.clear_queue()
@@ -213,7 +208,7 @@ def test_get_queue_size(self, message_queue_client, mock_data_proto):
         assert message_queue_client.get_queue_size() == 0
 
         samples = [mock_data_proto]
-        message_queue_client.put_batch(epoch=1, batch=samples, param_version=1, rollout_metadata_list=None)
+        message_queue_client.put_samples(samples=samples, param_version=1, rollout_metadata_list=None)
         assert message_queue_client.get_queue_size() == 1
 
     def test_get_statistics(self, message_queue_client):
@@ -238,7 +233,7 @@ def test_get_memory_usage(self, message_queue_client, mock_data_proto):
         """测试获取内存使用统计"""
         # 添加一些样本
         samples = [mock_data_proto, mock_data_proto]
-        message_queue_client.put_batch(epoch=1, batch=samples, param_version=1, rollout_metadata_list=None)
+        message_queue_client.put_samples(samples=samples, param_version=1, rollout_metadata_list=None)
 
         memory_stats = message_queue_client.get_memory_usage()
 
@@ -287,14 +282,14 @@ def test_concurrent_put_get(self, mock_data_proto):
             def producer():
                 for i in range(50):
                     samples = [mock_data_proto, mock_data_proto]
-                    result = client.put_batch(epoch=i, batch=samples, param_version=1, rollout_metadata_list=None)
+                    result = client.put_samples(samples=samples, param_version=1, rollout_metadata_list=None)
                     results.append(("put", result))
                     time.sleep(0.1)
 
             def consumer():
                 for _ in range(100):
                     try:
-                        retrieved_samples = client.get_batch(min_batch_count=1)
+                        retrieved_samples = client.get_samples(min_batch_count=1)
                         results.append(("get", len(retrieved_samples) > 0))
                     except Exception as e:
                         print(e)

From f653a8eb39d5c39b11d8d7aa45074af420e2bf32 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Tue, 5 Aug 2025 17:32:58 +0800
Subject: [PATCH 022/182] gen data to queue

---
 .../config/fully_async_ppo_trainer.yaml       |  12 +-
 recipe/fully_async_policy/fully_async_main.py |   2 +-
 .../fully_async_rollouter.py                  | 258 +++++++++---------
 .../fully_async_policy/fully_async_trainer.py |   4 -
 recipe/fully_async_policy/message_queue.py    |  18 +-
 .../run_fully_async_example.sh                |   2 -
 6 files changed, 141 insertions(+), 155 deletions(-)

diff --git a/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml b/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml
index d97484d88f4..f9aa06cd4b6 100644
--- a/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml
+++ b/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml
@@ -11,16 +11,6 @@ defaults:
 async_training:
   # 新鲜度控制 (Freshness Control)
   staleness_threshold: 3              # 样本新鲜度阈值
-  max_staleness_allowed: 5            # 最大允许的样本陈旧度
-
-  # 队列管理 (Queue Management)
-  max_queue_size: 1000               # 消息队列最大大小
-  min_batch_count: 1                 # 每次获取的最小batch数量
-  batch_timeout: 30.0                # 获取batch的超时时间(秒)
-
-  # 生成控制 (Generation Control)
-  generation_timeout: 30.0           # 单次生成的超时时间(秒)
-  batch_generation_interval: 0.1     # batch生成间隔(秒)
 
   # 参数同步 (Parameter Synchronization)
   max_sync_retries: 3                # 参数同步最大重试次数
@@ -35,3 +25,5 @@ rollout:
   name: vllm                         # rollout引擎: vllm, sglang
   n: 4                               # 每个prompt生成的响应数量
 
+data:
+  gen_batch_size: 32
diff --git a/recipe/fully_async_policy/fully_async_main.py b/recipe/fully_async_policy/fully_async_main.py
index dd544e9b49a..d7079d4af2b 100644
--- a/recipe/fully_async_policy/fully_async_main.py
+++ b/recipe/fully_async_policy/fully_async_main.py
@@ -224,7 +224,7 @@ def _initialize_components(self, config) -> None:
 
         # 创建MessageQueue
         print("Creating MessageQueue...")
-        max_queue_size = config.async_training.get("max_queue_size", 1000)
+        max_queue_size = config.async_training.staleness_threshold * config.data.train_batch_size
         message_queue = MessageQueue.remote(config, max_queue_size)
         message_queue_client = MessageQueueClient(message_queue)
 
diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py
index f273b3a45a4..9196dc08e94 100644
--- a/recipe/fully_async_policy/fully_async_rollouter.py
+++ b/recipe/fully_async_policy/fully_async_rollouter.py
@@ -21,6 +21,7 @@
 from tqdm import tqdm
 
 from recipe.fully_async_policy.message_queue import MessageQueueClient
+from verl import DataProto
 from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup
 from verl.trainer.ppo.ray_trainer import RayPPOTrainer, ResourcePoolManager, Role, WorkerType
 from verl.utils.debug import marked_timer
@@ -35,16 +36,16 @@ class FullyAsyncRollouter(RayPPOTrainer):
     """
 
     def __init__(
-        self,
-        config,
-        tokenizer,
-        role_worker_mapping: dict[Role, WorkerType],
-        resource_pool_manager: ResourcePoolManager,
-        ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup,
-        processor=None,
-        reward_fn=None,
-        val_reward_fn=None,
-        device_name=None,
+            self,
+            config,
+            tokenizer,
+            role_worker_mapping: dict[Role, WorkerType],
+            resource_pool_manager: ResourcePoolManager,
+            ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup,
+            processor=None,
+            reward_fn=None,
+            val_reward_fn=None,
+            device_name=None,
     ):
         """
         Initialize distributed PPO trainer with Ray backend.
@@ -111,8 +112,6 @@ def __init__(
         # 新鲜度控制 - 改进的配置管理
         async_config = config.async_training
         self.staleness_threshold = async_config.get("staleness_threshold", 3)
-        self.max_staleness_allowed = async_config.get("max_staleness_allowed", 5)
-        self.generation_timeout = async_config.get("generation_timeout", 30.0)
 
         # 统计信息
         self.total_generated_samples = 0
@@ -145,6 +144,8 @@ def __init__(
         self.sync_in_progress = False
         self.sync_lock = threading.Lock()
 
+        self.max_queue_size = self.staleness_threshold * self.config.data.train_batch_size * self.config.actor_rollout_ref.rollout.n
+
     def set_message_queue_client(self, message_queue_client: MessageQueueClient):
         """设置消息队列客户端"""
         with self.lock:
@@ -176,105 +177,6 @@ def _init_models(self):
         self.rollout_wg.init_model()
         self.actor_rollout_wg = self.rollout_wg
 
-    def update_rollout_weights(self, param_version: int) -> bool:
-        """
-        更新rollout模型参数 - 改进的参数同步实现
-        这个方法由外部Trainer调用
-
-        Args:
-            param_version: 新的参数版本号
-
-        Returns:
-            bool: 是否成功更新参数
-        """
-        print(f"Updating rollout weights to version {param_version}")
-
-        with self.sync_lock:
-            if self.sync_in_progress:
-                print(f"Sync already in progress, skipping version {param_version}")
-                return False
-
-            self.sync_in_progress = True
-
-        try:
-            # 暂停rollout - 带超时机制
-            if not self.rollout_controller.pause(timeout=10.0):
-                print("Failed to pause rollout within timeout")
-                return False
-
-            # 等待当前generation完成（如果有的话）
-            time.sleep(0.1)
-
-            # 执行参数同步
-            sync_success = self._execute_parameter_sync(param_version)
-
-            if sync_success:
-                self.current_param_version = param_version
-                self.param_sync_requests += 1
-                self.last_sync_time = time.time()
-                print(f"Successfully updated rollout weights to version {param_version}")
-            else:
-                print(f"Failed to sync parameters to version {param_version}")
-
-        except Exception as e:
-            print(f"Error during parameter sync: {e}")
-            sync_success = False
-        finally:
-            # 恢复rollout
-            self.rollout_controller.resume()
-            self.sync_in_progress = False
-
-        return sync_success
-
-    def _execute_parameter_sync(self, param_version: int) -> bool:
-        """
-        执行实际的参数同步 - 改进的同步逻辑
-
-        Args:
-            param_version: 目标参数版本
-
-        Returns:
-            bool: 是否同步成功
-        """
-        try:
-            # 暂停推理引擎
-            if self.async_rollout_mode and hasattr(self, "async_rollout_manager"):
-                # 对于异步模式，暂停服务器
-                pass  # 异步服务器的暂停在 pause() 中已经处理
-            else:
-                # 对于同步模式，使用sleep/wake_up机制
-                sleep_futures = self.rollout_wg.sleep()
-                ray.get(sleep_futures)
-
-            # 执行参数同步
-            if self.param_synchronizer:
-                self.param_synchronizer.sync_weights()
-                print("Parameter synchronization completed via synchronizer")
-            else:
-                # 直接使用rollout worker group的同步机制
-                if hasattr(self.rollout_wg, "sync_rollout_weights"):
-                    sync_futures = self.rollout_wg.sync_rollout_weights()
-                    ray.get(sync_futures)
-                    print("Parameter synchronization completed via rollout worker group")
-                else:
-                    print("No parameter synchronization mechanism available")
-                    return False
-
-            # 恢复推理引擎
-            if self.async_rollout_mode and hasattr(self, "async_rollout_manager"):
-                # 对于异步模式，恢复服务器
-                pass  # 异步服务器的恢复在 resume() 中已经处理
-            else:
-                # 对于同步模式，唤醒workers
-                wake_futures = self.rollout_wg.wake_up()
-                ray.get(wake_futures)
-
-            return True
-
-        except Exception as e:
-            print(f"Parameter sync execution failed: {e}")
-            return False
-
     def _create_continuous_iterator(self):
         """
         Create a continuous data iterator across epoch
@@ -349,9 +251,6 @@ def _generation_loop(self):
             if self.config.trainer.get("val_only", False):
                 return
 
-        # add tqdm
-        progress_bar = tqdm(total=self.total_training_steps, initial=self.global_steps, desc="Training Progress")
-
         # we start from step 1
         self.global_steps += 1
         last_val_metrics = None
@@ -409,26 +308,36 @@ def _generation_loop(self):
                     "generation_timestamp": time.time(),
                     "rollout_param_version": self.current_param_version,
                 }
+
+                gen_batch_output: DataProto = gen_batch_output
+                print(gen_batch_output)
+                for i in gen_batch_output:
+                    print(i)
+
                 # 放入队列
                 success = self.message_queue_client.put_samples(
                     samples=gen_batch_output,
                     param_version=self.current_param_version,
                     rollout_metadata_list=rollout_metadata,
                 )
-
+                print(f"put samples {success}")
                 with self.lock:
                     if success:
                         self.total_generated_samples += 1
-                        if self.total_generated_samples % 10 == 0:
-                            print(
-                                f"Generated {self.total_generated_samples} batches, "
-                                f"param_version={self.current_param_version}, "
-                                f"errors={self.generation_errors}"
-                            )
                     else:
                         self.dropped_stale_samples += 1
-                        if self.dropped_stale_samples % 5 == 0:
-                            print(f"Dropped stale samples: {self.dropped_stale_samples}")
+
+                if self.global_steps % 1 == 0:
+                    print(f"Generated {self.total_generated_samples} batches, \n"
+                          f"param_version={self.current_param_version}, \n"
+                          f"errors={self.generation_errors}, \n"
+                          f"Dropped stale samples: {self.dropped_stale_samples}\n")
+
+            self.global_steps += 1
+
+            if is_last_step:
+                pprint(f"Final validation metrics: {last_val_metrics}")
+                return
 
     def _monitor_loop(self):
         """监控线程 - 监控状态并处理控制信号"""
@@ -506,7 +415,7 @@ def _should_pause_generation(self) -> bool:
             version_diff = self.current_param_version - current_trainer_version
 
             # 如果版本差异过大，暂停生成
-            if version_diff >= self.max_staleness_allowed:
+            if version_diff >= self.staleness_threshold:
                 print(
                     f"Should pause due to staleness: rollout_version={self.current_param_version}, "
                     f"trainer_version={current_trainer_version}, diff={version_diff}"
@@ -514,7 +423,7 @@ def _should_pause_generation(self) -> bool:
                 return True
 
             # 如果队列太满，也暂停生成
-            max_queue_size = self.staleness_threshold * self.config.data.train_batch_size
+
             if queue_size >= max_queue_size:
                 print(f"Should pause due to full queue: size={queue_size}, max={max_queue_size}")
                 return True
@@ -604,3 +513,102 @@ def get_statistics(self) -> dict:
                 "queue_size": f"{queue_stats['queue_size']}",
             }
             return stats
+
+    def update_rollout_weights(self, param_version: int) -> bool:
+        """
+        更新rollout模型参数 - 改进的参数同步实现
+        这个方法由外部Trainer调用
+
+        Args:
+            param_version: 新的参数版本号
+
+        Returns:
+            bool: 是否成功更新参数
+        """
+        print(f"Updating rollout weights to version {param_version}")
+
+        with self.sync_lock:
+            if self.sync_in_progress:
+                print(f"Sync already in progress, skipping version {param_version}")
+                return False
+
+            self.sync_in_progress = True
+
+        try:
+            # 暂停rollout - 带超时机制
+            if not self.rollout_controller.pause(timeout=10.0):
+                print("Failed to pause rollout within timeout")
+                return False
+
+            # 等待当前generation完成（如果有的话）
+            time.sleep(0.1)
+
+            # 执行参数同步
+            sync_success = self._execute_parameter_sync(param_version)
+
+            if sync_success:
+                self.current_param_version = param_version
+                self.param_sync_requests += 1
+                self.last_sync_time = time.time()
+                print(f"Successfully updated rollout weights to version {param_version}")
+            else:
+                print(f"Failed to sync parameters to version {param_version}")
+
+        except Exception as e:
+            print(f"Error during parameter sync: {e}")
+            sync_success = False
+        finally:
+            # 恢复rollout
+            self.rollout_controller.resume()
+            self.sync_in_progress = False
+
+        return sync_success
+
+    def _execute_parameter_sync(self, param_version: int) -> bool:
+        """
+        执行实际的参数同步 - 改进的同步逻辑
+
+        Args:
+            param_version: 目标参数版本
+
+        Returns:
+            bool: 是否同步成功
+        """
+        try:
+            # 暂停推理引擎
+            if self.async_rollout_mode and hasattr(self, "async_rollout_manager"):
+                # 对于异步模式，暂停服务器
+                pass  # 异步服务器的暂停在 pause() 中已经处理
+            else:
+                # 对于同步模式，使用sleep/wake_up机制
+                sleep_futures = self.rollout_wg.sleep()
+                ray.get(sleep_futures)
+
+            # 执行参数同步
+            if self.param_synchronizer:
+                self.param_synchronizer.sync_weights()
+                print("Parameter synchronization completed via synchronizer")
+            else:
+                # 直接使用rollout worker group的同步机制
+                if hasattr(self.rollout_wg, "sync_rollout_weights"):
+                    sync_futures = self.rollout_wg.sync_rollout_weights()
+                    ray.get(sync_futures)
+                    print("Parameter synchronization completed via rollout worker group")
+                else:
+                    print("No parameter synchronization mechanism available")
+                    return False
+
+            # 恢复推理引擎
+            if self.async_rollout_mode and hasattr(self, "async_rollout_manager"):
+                # 对于异步模式，恢复服务器
+                pass  # 异步服务器的恢复在 resume() 中已经处理
+            else:
+                # 对于同步模式，唤醒workers
+                wake_futures = self.rollout_wg.wake_up()
+                ray.get(wake_futures)
+
+            return True
+
+        except Exception as e:
+            print(f"Parameter sync execution failed: {e}")
+            return False
diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py
index 1a3076f19c2..db6bdfeaebc 100644
--- a/recipe/fully_async_policy/fully_async_trainer.py
+++ b/recipe/fully_async_policy/fully_async_trainer.py
@@ -93,10 +93,6 @@ def __init__(
         self.use_rm = Role.RewardModel in role_worker_mapping
         self.ray_worker_group_cls = ray_worker_group_cls
         self.device_name = device_name if device_name else self.config.trainer.device
-        self.validation_generations_logger = ValidationGenerationsLogger(
-            project_name=self.config.trainer.project_name,
-            experiment_name=self.config.trainer.experiment_name,
-        )
 
         # if ref_in_actor is True, the reference policy will be actor without lora applied
         self.ref_in_actor = config.actor_rollout_ref.model.get("lora_rank", 0) > 0
diff --git a/recipe/fully_async_policy/message_queue.py b/recipe/fully_async_policy/message_queue.py
index dcd3c27ed15..61723cde953 100644
--- a/recipe/fully_async_policy/message_queue.py
+++ b/recipe/fully_async_policy/message_queue.py
@@ -76,7 +76,7 @@ def __init__(self, config: DictConfig, max_queue_size: int = 1000):
         )
 
     def put_samples(
-        self, samples: list[Any], param_version: int, rollout_metadata_list: list[dict[str, Any]] = None
+            self, samples: list[Any] | Any, param_version: int, rollout_metadata: dict[str, Any] = None
     ) -> bool:
         """
         放入一个batch样本到队列
@@ -84,7 +84,7 @@ def put_samples(
         Args:
             samples: 样本数据
             param_version: 参数版本号
-            rollout_metadata_list: rollout相关的元数据
+            rollout_metadata: rollout相关的元数据
 
         Returns:
             bool: 是否成功放入队列
@@ -97,21 +97,13 @@ def put_samples(
                 logger.debug(f"Dropped stale sample: staleness={staleness}, threshold={self.staleness_threshold}")
                 return False
 
-            # 处理 rollout_metadatas 为 None 的情况
-            if rollout_metadata_list is None:
-                rollout_metadata_list = [{}] * len(samples)
-
-            if len(rollout_metadata_list) != len(samples):
-                logger.warning(f"len(rollout_metadata_list):{len(rollout_metadata_list)} != len(samples:{len(samples)}")
-                return False
-
-            for sample, meta in zip(samples, rollout_metadata_list, strict=False):
+            for sample in samples:
                 queue_sample = QueueSample(
                     id=str(uuid.uuid4()),
                     data=sample,
                     param_version=param_version,
                     timestamp=time.time(),
-                    rollout_metadata=meta or {},
+                    rollout_metadata=rollout_metadata or {},
                 )
 
                 # 如果队列满了，移除最旧的样本，一般不会发生
@@ -235,7 +227,7 @@ def __init__(self, queue_actor: Any):
         self.queue_actor = queue_actor
 
     def put_samples(
-        self, samples: list[Any], param_version: int, rollout_metadata_list: list[dict[str, Any]] = None
+            self, samples: list[Any], param_version: int, rollout_metadata_list: list[dict[str, Any]] = None
     ) -> bool:
         """放入batch到队列"""
         return ray.get(self.queue_actor.put_samples.remote(samples, param_version, rollout_metadata_list))
diff --git a/recipe/fully_async_policy/run_fully_async_example.sh b/recipe/fully_async_policy/run_fully_async_example.sh
index 180071318a1..cd2265cde0d 100644
--- a/recipe/fully_async_policy/run_fully_async_example.sh
+++ b/recipe/fully_async_policy/run_fully_async_example.sh
@@ -55,7 +55,6 @@ max_response_length=1024
 
 # 异步训练参数
 staleness_threshold=3
-max_staleness_allowed=5
 max_queue_size=1000
 min_batch_count=1
 batch_timeout=30.0
@@ -121,7 +120,6 @@ python -m recipe.one_step_off_policy.fully_async_main \
     \
     # 异步训练配置
     async_training.staleness_threshold=$staleness_threshold \
-    async_training.max_staleness_allowed=$max_staleness_allowed \
     async_training.max_queue_size=$max_queue_size \
     async_training.min_batch_count=$min_batch_count \
     async_training.batch_timeout=$batch_timeout \

From 352066c12fe9362f3bc974e60a474dd913b6d19c Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Tue, 5 Aug 2025 17:48:54 +0800
Subject: [PATCH 023/182] gen data to queue

---
 tests/special_e2e/run_fully_async_policy.sh | 15 ++++-----------
 1 file changed, 4 insertions(+), 11 deletions(-)

diff --git a/tests/special_e2e/run_fully_async_policy.sh b/tests/special_e2e/run_fully_async_policy.sh
index 9692aab0d44..2949316228a 100644
--- a/tests/special_e2e/run_fully_async_policy.sh
+++ b/tests/special_e2e/run_fully_async_policy.sh
@@ -33,7 +33,8 @@ overlong_penalty_factor=1.0
 
 # Training parameters
 loss_agg_mode="token-mean"
-train_prompt_bsz=8
+train_prompt_bsz=32
+gen_prompt_bsz=4
 n_resp_per_prompt=3
 train_prompt_mini_bsz=4
 
@@ -50,8 +51,6 @@ n_gpus_training=$((NUM_GPUS - n_gpus_rollout))
 
 # Async training specific configurations
 staleness_threshold=3
-max_staleness_allowed=5
-max_queue_size=1000
 min_batch_count=1
 batch_timeout=30.0
 generation_timeout=30.0
@@ -74,6 +73,7 @@ common_params=(
     data.max_prompt_length=${max_prompt_length}
     data.max_response_length=${max_response_length}
     data.train_batch_size=${train_prompt_bsz}
+    data.gen_batch_size=${gen_prompt_bsz}
     actor_rollout_ref.rollout.n=${n_resp_per_prompt}
     algorithm.adv_estimator=${adv_estimator}
     algorithm.use_kl_in_reward=${use_kl_in_reward}
@@ -115,7 +115,7 @@ common_params=(
     trainer.test_freq=-1
     trainer.save_freq=-1
     trainer.total_epochs=2
-    trainer.total_training_steps=4
+    trainer.total_training_steps=10
     trainer.resume_mode=disable
     trainer.nnodes=1
     trainer.n_gpus_per_node=${n_gpus_training}
@@ -123,13 +123,6 @@ common_params=(
     rollout.n_gpus_per_node=${n_gpus_rollout}
     # Fully async specific configurations
     async_training.staleness_threshold=${staleness_threshold}
-    async_training.max_staleness_allowed=${max_staleness_allowed}
-    async_training.max_queue_size=${max_queue_size}
-    async_training.min_batch_count=${min_batch_count}
-    async_training.batch_timeout=${batch_timeout}
-    async_training.generation_timeout=${generation_timeout}
-    async_training.batch_generation_interval=${batch_generation_interval}
-    async_training.max_sync_retries=${max_sync_retries}
     async_training.sync_timeout=${sync_timeout}
     async_training.sync_retry_delay=${sync_retry_delay}
 )

From 5fac1d8441cebc77ef5f353739e8abde69374edf Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Tue, 5 Aug 2025 20:16:21 +0800
Subject: [PATCH 024/182] train get data

---
 .../README_async_trainer.md                   |  92 +++++
 recipe/fully_async_policy/TEST_GUIDE.md       | 312 +++++++++++++++++
 .../fully_async_rollouter.py                  |  40 +--
 .../fully_async_policy/fully_async_trainer.py | 210 ++++++++++--
 recipe/fully_async_policy/message_queue.py    |  14 +-
 recipe/fully_async_policy/run_benchmark.sh    |   0
 .../test_components_pytest.py                 | 315 ++++++++++++++++++
 7 files changed, 937 insertions(+), 46 deletions(-)
 create mode 100644 recipe/fully_async_policy/README_async_trainer.md
 create mode 100644 recipe/fully_async_policy/TEST_GUIDE.md
 mode change 100644 => 100755 recipe/fully_async_policy/run_benchmark.sh
 create mode 100644 recipe/fully_async_policy/test_components_pytest.py

diff --git a/recipe/fully_async_policy/README_async_trainer.md b/recipe/fully_async_policy/README_async_trainer.md
new file mode 100644
index 00000000000..9fbaa336be6
--- /dev/null
+++ b/recipe/fully_async_policy/README_async_trainer.md
@@ -0,0 +1,92 @@
+# FullyAsyncTrainer 队列数据获取实现
+
+## 概述
+
+本实现为 `FullyAsyncTrainer` 类添加了从消息队列获取样本并组成 `gen_batch_output` 的功能，实现了完全异步的训练流程。
+
+## 核心功能
+
+### 1. 样本计算逻辑
+
+```python
+# 计算需要获取的样本数量
+n_responses_per_prompt = self.config.actor_rollout_ref.rollout.n
+batch_size = self.config.data.train_batch_size
+required_samples = n_responses_per_prompt * batch_size
+```
+
+训练器会根据配置自动计算需要从队列获取的样本数量：
+- `rollout.n`: 每个prompt生成的响应数量
+- `train_batch_size`: 训练批次大小
+- 总样本数 = n × batch_size
+
+### 2. 主要方法
+
+#### `_get_samples_from_queue()`
+- 从消息队列获取指定数量的样本
+- 组装成 `gen_batch_output` 格式
+- 提取原始batch信息构造 `batch_dict`
+
+#### `_assemble_gen_batch_output_from_queue_samples()`
+- 将队列中的多个样本重新组装成 `DataProto` 对象
+- 处理tensor和non-tensor数据
+- 合并timing信息和metadata
+
+#### `_extract_batch_dict_from_sample()`
+- 从样本数据中提取原始输入信息
+- 过滤掉生成的输出，保留prompt相关数据
+
+#### `_async_get_next_batch_from_queue()`
+- 异步获取下一批队列数据
+- 使用线程池实现非阻塞操作
+
+### 3. 数据流程
+
+1. **样本生成**: Rollouter生成样本并放入MessageQueue
+2. **样本获取**: Trainer从队列异步获取 `n × batch_size` 个样本
+3. **数据重组**: 将队列样本重新组装成标准的 `gen_batch_output` 格式
+4. **训练处理**: 样本进入标准的PPO训练流程
+
+### 4. 使用示例
+
+```python
+# 初始化trainer
+trainer = FullyAsyncTrainer(config, tokenizer, role_worker_mapping, resource_pool_manager)
+
+# 设置消息队列客户端
+trainer.set_message_queue_client(message_queue_client)
+
+# 开始训练（自动从队列获取数据）
+trainer.fit()
+```
+
+## 配置要求
+
+确保配置中包含以下参数：
+
+```yaml
+data:
+  train_batch_size: 128  # 训练批次大小
+
+actor_rollout_ref:
+  rollout:
+    n: 4  # 每个prompt的响应数量
+```
+
+## 特性
+
+- **异步处理**: 使用异步方式从队列获取数据，不阻塞训练流程
+- **数据完整性**: 保持原有的tensor和non-tensor数据结构
+- **元数据保留**: 保留timing、参数版本等重要信息
+- **兼容性**: 与现有的PPO训练流程完全兼容
+
+## 监控指标
+
+训练器提供以下统计指标：
+- `queue_sample_count`: 当前批次的样本数量
+- `rollout_param_versions`: 样本对应的参数版本
+- `sample_timestamps`: 样本生成时间戳
+- timing信息的平均值
+
+通过 `trainer.get_statistics()` 可以获取详细的训练统计信息。
+
diff --git a/recipe/fully_async_policy/TEST_GUIDE.md b/recipe/fully_async_policy/TEST_GUIDE.md
new file mode 100644
index 00000000000..558920e5e84
--- /dev/null
+++ b/recipe/fully_async_policy/TEST_GUIDE.md
@@ -0,0 +1,312 @@
+# Fully Async Policy 测试指南
+
+本文档介绍如何测试完全异步PPO训练系统的各种功能和性能。
+
+## 📋 测试概览
+
+我们提供了多种类型的测试，涵盖从单元测试到端到端测试的完整测试套件：
+
+### 测试类型
+1. **单元测试** - 测试各个组件的独立功能
+2. **集成测试** - 测试组件间的协作
+3. **端到端测试** - 测试完整的训练流程
+4. **性能基准测试** - 评估系统性能特征
+5. **压力测试** - 测试系统在极限条件下的表现
+
+## 🚀 快速开始
+
+### 1. 端到端测试
+最简单的方式是运行端到端测试，验证系统基本功能：
+
+```bash
+# 基本E2E测试
+./run_e2e_test.sh
+
+# 使用环境变量自定义配置
+NUM_GPUS=4 MODEL_ID=Qwen/Qwen2.5-0.5B-Instruct ./run_e2e_test.sh
+```
+
+### 2. 单元测试
+运行组件级别的单元测试：
+
+```bash
+# 运行所有单元测试
+cd unittest/
+python test_fully_async_components.py
+
+# 或者使用pytest（如果安装）
+pytest test_components_pytest.py -v
+```
+
+### 3. 性能基准测试
+评估系统性能特征：
+
+```bash
+# 运行完整的性能基准测试
+./run_benchmark.sh
+
+# 自定义GPU数量和策略
+NUM_GPUS=8 ACTOR_STRATEGY=fsdp2 ./run_benchmark.sh
+```
+
+## 📊 测试脚本详解
+
+### run_e2e_test.sh
+- **目的**: 端到端功能验证
+- **配置**: 最小化配置，快速验证基本功能
+- **时长**: 约5-10分钟
+- **用法**: `./run_e2e_test.sh`
+
+**环境变量**:
+- `NUM_GPUS`: GPU数量 (默认: 4)
+- `MODEL_ID`: 使用的模型ID (默认: Qwen/Qwen2.5-0.5B-Instruct)
+- `MODEL_PATH`: 模型存储路径
+
+### run_benchmark.sh
+- **目的**: 性能基准测试
+- **配置**: 多种配置组合，评估性能影响
+- **时长**: 约30-60分钟
+- **用法**: `./run_benchmark.sh`
+
+**测试覆盖**:
+1. 不同新鲜度阈值的影响
+2. 不同队列大小的性能表现
+3. 生成间隔对吞吐量的影响
+4. GPU资源分配的优化
+5. 暂停/恢复功能测试
+
+### test_fully_async_components.py
+- **目的**: 单元和集成测试
+- **配置**: 使用Mock对象的孤立测试
+- **时长**: 约2-5分钟
+- **用法**: `python unittest/test_fully_async_components.py`
+
+**测试覆盖**:
+- MessageQueue的基本功能
+- 参数同步器的重试机制
+- Rollouter的暂停/恢复
+- 新鲜度指标计算
+- 错误处理和超时机制
+
+## 🔧 测试配置
+
+### 最小化测试配置
+用于快速验证功能：
+
+```yaml
+# 基本配置
+data:
+  train_batch_size: 4
+  max_prompt_length: 512
+  max_response_length: 1024
+
+trainer:
+  total_training_steps: 2
+  n_gpus_per_node: 2
+
+rollout:
+  n_gpus_per_node: 2
+
+async_training:
+  staleness_threshold: 3
+  max_queue_size: 100
+```
+
+### 性能测试配置
+用于评估系统性能：
+
+```yaml
+# 性能配置
+data:
+  train_batch_size: 16
+  max_prompt_length: 512
+  max_response_length: 1024
+
+trainer:
+  total_training_steps: 10
+  n_gpus_per_node: 6
+
+rollout:
+  n_gpus_per_node: 2
+
+async_training:
+  staleness_threshold: 3
+  max_queue_size: 1000
+  generation_timeout: 30.0
+```
+
+## 📈 测试结果分析
+
+### 成功指标
+测试成功应满足以下条件：
+
+1. **功能正确性**:
+   - 样本成功生成和消费
+   - 参数同步正常工作
+   - 暂停/恢复功能响应
+
+2. **性能表现**:
+   - 样本生成速率 > 目标吞吐量
+   - 队列利用率在合理范围(50-80%)
+   - 新鲜度指标符合预期
+
+3. **稳定性**:
+   - 无内存泄漏
+   - 无死锁或竞争条件
+   - 优雅处理错误情况
+
+### 失败排查
+常见问题及解决方案：
+
+1. **Ray连接失败**:
+   ```bash
+   # 重新初始化Ray
+   ray stop
+   ray start --head
+   ```
+
+2. **GPU内存不足**:
+   ```bash
+   # 减少批大小或使用梯度检查点
+   data.train_batch_size=2
+   actor_rollout_ref.model.enable_gradient_checkpointing=True
+   ```
+
+3. **队列阻塞**:
+   ```bash
+   # 调整队列大小和新鲜度阈值
+   async_training.max_queue_size=500
+   async_training.staleness_threshold=5
+   ```
+
+## 🎯 特定功能测试
+
+### 测试暂停/恢复功能
+```python
+# 在Python脚本中测试
+import ray
+from fully_async_rollouter import FullyAsyncRollouter
+
+rollouter = FullyAsyncRollouter.remote(config, ...)
+
+# 测试暂停
+result = ray.get(rollouter.pause_rollout.remote())
+assert result == True
+
+# 测试恢复
+result = ray.get(rollouter.resume_rollout.remote())
+assert result == True
+```
+
+### 测试新鲜度控制
+```python
+# 测试样本过期机制
+queue = MessageQueueClient.remote(max_staleness=3)
+
+# 放入旧版本样本
+queue.put_samples.remote(sample, param_version=1)
+
+# 用新版本获取（应该被拒绝）
+result = ray.get(queue.get_samples.remote(current_param_version=5))
+assert result is None
+```
+
+### 测试参数同步
+```python
+# 测试同步重试机制
+sync = ParameterSynchronizer.remote(config, actor_wg, rollout_wg)
+
+# 测试成功同步
+result = ray.get(sync.sync_weights.remote())
+assert result == True
+```
+
+## 📝 测试报告
+
+### 基准测试报告
+运行`./run_benchmark.sh`后，会在`benchmark_results_*/`目录下生成：
+
+- `performance_report.md` - 详细的性能报告
+- `summary.txt` - 关键指标摘要
+- `*.log` - 各项测试的详细日志
+
+### 关键指标
+需要关注的性能指标：
+
+1. **吞吐量指标**:
+   - 样本生成速率 (samples/second)
+   - 训练步数完成速率 (steps/second)
+
+2. **延迟指标**:
+   - 样本平均年龄 (average sample age)
+   - 参数同步延迟 (sync latency)
+
+3. **资源利用率**:
+   - GPU利用率 (GPU utilization)
+   - 内存使用量 (memory usage)
+   - 队列利用率 (queue utilization)
+
+## 🔄 CI/CD 集成
+
+### GitHub Actions 示例
+```yaml
+name: Fully Async Policy Tests
+on: [push, pull_request]
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v2
+    - name: Setup Python
+      uses: actions/setup-python@v2
+      with:
+        python-version: 3.9
+
+    - name: Install dependencies
+      run: |
+        pip install -r requirements.txt
+        pip install pytest
+
+    - name: Run unit tests
+      run: |
+        cd recipe/fully_async_policy/unittest/
+        python test_fully_async_components.py
+
+    - name: Run E2E test (if GPUs available)
+      run: |
+        if nvidia-smi; then
+          cd recipe/fully_async_policy/
+          ./run_e2e_test.sh
+        fi
+```
+
+## 🛠️ 开发者测试
+
+### 添加新测试
+1. **单元测试**: 在`unittest/test_fully_async_components.py`中添加新的测试类
+2. **集成测试**: 在相应的集成测试类中添加新方法
+3. **性能测试**: 在`run_benchmark.sh`中添加新的基准测试场景
+
+### 测试最佳实践
+1. **隔离性**: 每个测试应该独立，不依赖其他测试
+2. **可重现性**: 使用固定的随机种子和确定性配置
+3. **清理**: 测试结束后清理资源，避免影响后续测试
+4. **文档**: 为新测试添加清晰的文档说明
+
+## ❓ 常见问题
+
+**Q: 测试失败，提示Ray连接错误**
+A: 确保Ray集群正常运行，或重新启动Ray
+
+**Q: 内存不足错误**
+A: 减少批大小或在测试配置中启用参数卸载
+
+**Q: 测试运行时间过长**
+A: 使用更小的模型或减少训练步数进行快速测试
+
+**Q: 如何添加自定义测试？**
+A: 参考现有测试模式，在对应的测试文件中添加新的测试方法
+
+通过这套完整的测试系统，可以确保fully async policy系统的可靠性、性能和稳定性。
+
diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py
index 9196dc08e94..57d3eed243a 100644
--- a/recipe/fully_async_policy/fully_async_rollouter.py
+++ b/recipe/fully_async_policy/fully_async_rollouter.py
@@ -18,7 +18,6 @@
 
 import ray
 from omegaconf import OmegaConf
-from tqdm import tqdm
 
 from recipe.fully_async_policy.message_queue import MessageQueueClient
 from verl import DataProto
@@ -36,16 +35,16 @@ class FullyAsyncRollouter(RayPPOTrainer):
     """
 
     def __init__(
-            self,
-            config,
-            tokenizer,
-            role_worker_mapping: dict[Role, WorkerType],
-            resource_pool_manager: ResourcePoolManager,
-            ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup,
-            processor=None,
-            reward_fn=None,
-            val_reward_fn=None,
-            device_name=None,
+        self,
+        config,
+        tokenizer,
+        role_worker_mapping: dict[Role, WorkerType],
+        resource_pool_manager: ResourcePoolManager,
+        ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup,
+        processor=None,
+        reward_fn=None,
+        val_reward_fn=None,
+        device_name=None,
     ):
         """
         Initialize distributed PPO trainer with Ray backend.
@@ -144,7 +143,9 @@ def __init__(
         self.sync_in_progress = False
         self.sync_lock = threading.Lock()
 
-        self.max_queue_size = self.staleness_threshold * self.config.data.train_batch_size * self.config.actor_rollout_ref.rollout.n
+        self.max_queue_size = (
+            self.staleness_threshold * self.config.data.train_batch_size * self.config.actor_rollout_ref.rollout.n
+        )
 
     def set_message_queue_client(self, message_queue_client: MessageQueueClient):
         """设置消息队列客户端"""
@@ -287,7 +288,6 @@ def _generation_loop(self):
                 if not self.running:
                     break
 
-            metrics = {}
             timing_raw = {}
             batch, gen_batch = self._prepare_generate_batch(batch_dict)
             is_last_step = self.global_steps >= self.total_training_steps
@@ -328,10 +328,12 @@ def _generation_loop(self):
                         self.dropped_stale_samples += 1
 
                 if self.global_steps % 1 == 0:
-                    print(f"Generated {self.total_generated_samples} batches, \n"
-                          f"param_version={self.current_param_version}, \n"
-                          f"errors={self.generation_errors}, \n"
-                          f"Dropped stale samples: {self.dropped_stale_samples}\n")
+                    print(
+                        f"Generated {self.total_generated_samples} batches, \n"
+                        f"param_version={self.current_param_version}, \n"
+                        f"errors={self.generation_errors}, \n"
+                        f"Dropped stale samples: {self.dropped_stale_samples}\n"
+                    )
 
             self.global_steps += 1
 
@@ -424,8 +426,8 @@ def _should_pause_generation(self) -> bool:
 
             # 如果队列太满，也暂停生成
 
-            if queue_size >= max_queue_size:
-                print(f"Should pause due to full queue: size={queue_size}, max={max_queue_size}")
+            if queue_size >= self.max_queue_size:
+                print(f"Should pause due to full queue: size={queue_size}, max={self.max_queue_size}")
                 return True
 
             return False
diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py
index db6bdfeaebc..5db63c9fab9 100644
--- a/recipe/fully_async_policy/fully_async_trainer.py
+++ b/recipe/fully_async_policy/fully_async_trainer.py
@@ -13,9 +13,11 @@
 # limitations under the License.
 
 import logging
+import threading
 import time
 import warnings
 from pprint import pprint
+from typing import Any
 
 import numpy as np
 import ray
@@ -33,7 +35,6 @@
     WorkerType,
 )
 from verl.utils.debug import marked_timer
-from verl.utils.tracking import ValidationGenerationsLogger
 
 logger = logging.getLogger(__name__)
 
@@ -115,11 +116,192 @@ def __init__(
             self.use_critic = False
 
         self._validate_config()
+
+        self.lock = threading.RLock()
         self.message_queue_client = None
+        self.param_synchronizer = None
 
     def set_message_queue_client(self, message_queue_client: MessageQueueClient):
         """设置消息队列客户端"""
-        self.message_queue_client = message_queue_client
+        with self.lock:
+            self.message_queue_client = message_queue_client
+
+    def set_parameter_synchronizer(self, param_synchronizer):
+        """设置参数同步器"""
+        with self.lock:
+            self.param_synchronizer = param_synchronizer
+
+    def _get_samples_from_queue(self) -> tuple[None, None, None] | tuple[int, dict, Any]:
+        """
+        从消息队列获取样本并组成gen_batch_output
+
+        Returns:
+            tuple: (epoch, batch_dict, gen_batch_output)
+        """
+        if self.message_queue_client is None:
+            raise ValueError("MessageQueue client not set. Call set_message_queue_client() first.")
+
+        # 计算需要获取的样本数量
+        n_responses_per_prompt = self.config.actor_rollout_ref.rollout.n
+        batch_size = self.config.data.train_batch_size
+        required_samples = n_responses_per_prompt * batch_size
+
+        logger.info(
+            f"Requesting {required_samples} samples from queue (n={n_responses_per_prompt}, batch_size={batch_size})"
+        )
+
+        # 从队列获取样本
+        queue_samples = self.message_queue_client.get_samples(min_batch_count=required_samples)
+
+        if not queue_samples or len(queue_samples) == 0:
+            logger.warning("required_samples is empty")
+            return None, None, None
+
+        logger.info(f"Retrieved {len(queue_samples)} samples from queue")
+
+        # 组装gen_batch_output
+        gen_batch_output = self._assemble_gen_batch_output_from_queue_samples(
+            queue_samples, n_responses_per_prompt, batch_size
+        )
+
+        # 从第一个样本中提取原始batch信息来构造batch_dict
+        first_sample = queue_samples[0].data
+        batch_dict = self._extract_batch_dict_from_sample(first_sample, batch_size)
+
+        return 0, batch_dict, gen_batch_output
+
+    def _assemble_gen_batch_output_from_queue_samples(
+        self, queue_samples: list[QueueSample], n_responses_per_prompt: int, batch_size: int
+    ):
+        """
+        从队列样本中组装gen_batch_output
+
+        Args:
+            queue_samples: 队列中的样本列表
+            n_responses_per_prompt: 每个prompt的响应数量
+            batch_size: 批次大小
+
+        Returns:
+            DataProto: 组装好的gen_batch_output
+        """
+        import numpy as np
+        import torch
+
+        from verl.protocol import DataProto
+
+        # 提取所有样本的数据
+        sample_data_list = []
+        rollout_metadata_list = []
+        timing_info = {}
+
+        for sample in queue_samples:
+            sample_data_list.append(sample.data)
+            rollout_metadata_list.append(sample.rollout_metadata)
+
+        # 假设所有样本具有相同的数据结构，从第一个样本推断结构
+        first_sample_data = sample_data_list[0]
+
+        # 组装tensor数据
+        tensor_dict = {}
+        non_tensor_dict = {}
+
+        # 获取第一个样本的结构来初始化
+        if hasattr(first_sample_data, "batch") and first_sample_data.batch is not None:
+            # 处理tensor数据
+            for key in first_sample_data.batch.keys():
+                tensor_list = []
+                for sample_data in sample_data_list:
+                    if hasattr(sample_data, "batch") and sample_data.batch is not None and key in sample_data.batch:
+                        tensor_list.append(sample_data.batch[key])
+                    else:
+                        logger.warning(f"Missing key '{key}' in sample batch data")
+
+                if tensor_list:
+                    # 连接所有tensor
+                    tensor_dict[key] = torch.cat(tensor_list, dim=0)
+
+        if hasattr(first_sample_data, "non_tensor_batch") and first_sample_data.non_tensor_batch:
+            # 处理non_tensor数据
+            for key in first_sample_data.non_tensor_batch.keys():
+                non_tensor_list = []
+                for sample_data in sample_data_list:
+                    if (
+                        hasattr(sample_data, "non_tensor_batch")
+                        and sample_data.non_tensor_batch
+                        and key in sample_data.non_tensor_batch
+                    ):
+                        non_tensor_list.extend(sample_data.non_tensor_batch[key])
+                    else:
+                        logger.warning(f"Missing key '{key}' in sample non_tensor_batch data")
+
+                if non_tensor_list:
+                    non_tensor_dict[key] = np.array(non_tensor_list, dtype=object)
+
+        # 收集timing信息和metadata
+        for sample, metadata in zip(queue_samples, rollout_metadata_list, strict=False):
+            if "timing" in metadata:
+                for timing_key, timing_value in metadata["timing"].items():
+                    if timing_key not in timing_info:
+                        timing_info[timing_key] = []
+                    timing_info[timing_key].append(timing_value)
+
+        # 计算平均timing
+        avg_timing = {}
+        for key, values in timing_info.items():
+            if values:
+                avg_timing[key] = sum(values) / len(values)
+
+        # 创建meta_info
+        meta_info = {
+            "timing": avg_timing,
+            "queue_sample_count": len(queue_samples),
+            "rollout_param_versions": [sample.param_version for sample in queue_samples],
+            "sample_timestamps": [sample.timestamp for sample in queue_samples],
+        }
+
+        # 创建DataProto对象
+        if tensor_dict or non_tensor_dict:
+            gen_batch_output = DataProto.from_dict(
+                tensors=tensor_dict if tensor_dict else None,
+                non_tensors=non_tensor_dict if non_tensor_dict else None,
+                meta_info=meta_info,
+            )
+        else:
+            # 如果没有数据，创建空的DataProto
+            logger.warning("No tensor or non_tensor data found in samples, creating empty DataProto")
+            gen_batch_output = DataProto.from_dict(meta_info=meta_info)
+
+        logger.info(f"Assembled gen_batch_output with {len(gen_batch_output)} samples")
+        return gen_batch_output
+
+    def _extract_batch_dict_from_sample(self, sample_data, batch_size: int) -> dict:
+        """
+        从样本数据中提取batch_dict信息
+
+        Args:
+            sample_data: 样本数据
+            batch_size: 批次大小
+
+        Returns:
+            dict: batch字典
+        """
+        batch_dict = {}
+
+        # 从样本中提取原始输入信息
+        if hasattr(sample_data, "batch") and sample_data.batch is not None:
+            for key, value in sample_data.batch.items():
+                # 只保留输入相关的key，去掉生成的输出
+                if key in ["input_ids", "attention_mask", "position_ids"]:
+                    # 由于我们有多个响应，需要取出原始prompt部分
+                    batch_dict[key] = value[:batch_size] if len(value) >= batch_size else value
+
+        if hasattr(sample_data, "non_tensor_batch") and sample_data.non_tensor_batch:
+            for key, value in sample_data.non_tensor_batch.items():
+                # 保留非tensor的批次数据
+                if key in ["raw_prompt_ids", "raw_prompt", "multi_modal_data", "tools_kwargs", "interaction_kwargs"]:
+                    batch_dict[key] = np.array(value[:batch_size]) if len(value) >= batch_size else np.array(value)
+
+        return batch_dict
 
     def _create_actor_rollout_classes(self):
         # create actor
@@ -156,7 +338,6 @@ def fit(self):
         to construct the PPO dataflow.
         The light-weight advantage computation is done on the driver process.
         """
-        logger.info("Starting Trainer...")
 
         if self.message_queue_client is None:
             raise ValueError("MessageQueue client not set. Call set_message_queue_client() first.")
@@ -193,13 +374,13 @@ def fit(self):
         last_val_metrics = None
         self.max_steps_duration = 0
 
-        # across epoch iterator
-        continuous_iterator = self._create_continuous_iterator()
-
-        # Start the first asynchronous generation task.
-        batch_data_future = self._async_gen_next_batch(continuous_iterator)
+        # 使用队列模式，不需要传统的dataloader迭代器
+        # 初始化获取第一批数据
+        while True:
+            epoch, batch, gen_batch_output = self._get_samples_from_queue()
+            if gen_batch_output is None:
+                break
 
-        while batch_data_future is not None:
             metrics = {}
             timing_raw = {}
 
@@ -213,17 +394,6 @@ def fit(self):
             is_last_step = self.global_steps >= self.total_training_steps
 
             with marked_timer("step", timing_raw):
-                # wait for the previous batch
-                with marked_timer("wait_prev_gen", timing_raw, color="red"):
-                    epoch, batch, gen_batch_output = batch_data_future.get()
-                    timing_raw.update(gen_batch_output.meta_info["timing"])
-                    gen_batch_output.meta_info.pop("timing", None)
-
-                # asys next generation (with syns weights from actor to rollout)
-                with marked_timer("sync_rollout_weights", timing_raw, color="purple"):
-                    if not is_last_step:
-                        batch_data_future = self._async_gen_next_batch(continuous_iterator)
-
                 batch = self._post_generate_batch(batch, gen_batch_output, metrics)
                 batch, reward_extra_infos_dict = self._process_batch_common(batch, metrics, timing_raw)
                 self._log_rollout(batch, reward_extra_infos_dict, timing_raw)
diff --git a/recipe/fully_async_policy/message_queue.py b/recipe/fully_async_policy/message_queue.py
index 61723cde953..b72b9482e09 100644
--- a/recipe/fully_async_policy/message_queue.py
+++ b/recipe/fully_async_policy/message_queue.py
@@ -76,7 +76,7 @@ def __init__(self, config: DictConfig, max_queue_size: int = 1000):
         )
 
     def put_samples(
-            self, samples: list[Any] | Any, param_version: int, rollout_metadata: dict[str, Any] = None
+        self, samples: list[Any] | Any, param_version: int, rollout_metadata: dict[str, Any] = None
     ) -> bool:
         """
         放入一个batch样本到队列
@@ -123,26 +123,26 @@ def put_samples(
 
             return True
 
-    def get_samples(self, min_batch: int = 1) -> list[QueueSample]:
+    def get_samples(self, min_batch_count: int = 1) -> list[QueueSample]:
         """
         从队列获取batch样本，一直等待直到有足够样本
 
         Args:
-            min_batch: sample数量满足min_batch，一次性获取
+            min_batch_count: sample数量满足min_batch，一次性获取
 
         Returns:
             List[QueueSample]: 获取的样本列表
         """
         with self.lock:
-            while len(self.queue) < min_batch and self.running:
+            while len(self.queue) < min_batch_count and self.running:
                 self.consumer_condition.wait()
 
             # 如果队列已关闭且没有足够样本，返回空列表
-            if not self.running and len(self.queue) < min_batch:
+            if not self.running and len(self.queue) < min_batch_count:
                 return []
 
             # 获取指定数量的样本
-            batch_count = min(min_batch, len(self.queue))
+            batch_count = min(min_batch_count, len(self.queue))
             samples = []
             for _ in range(batch_count):
                 if self.queue:
@@ -227,7 +227,7 @@ def __init__(self, queue_actor: Any):
         self.queue_actor = queue_actor
 
     def put_samples(
-            self, samples: list[Any], param_version: int, rollout_metadata_list: list[dict[str, Any]] = None
+        self, samples: list[Any], param_version: int, rollout_metadata_list: list[dict[str, Any]] = None
     ) -> bool:
         """放入batch到队列"""
         return ray.get(self.queue_actor.put_samples.remote(samples, param_version, rollout_metadata_list))
diff --git a/recipe/fully_async_policy/run_benchmark.sh b/recipe/fully_async_policy/run_benchmark.sh
old mode 100644
new mode 100755
diff --git a/recipe/fully_async_policy/test_components_pytest.py b/recipe/fully_async_policy/test_components_pytest.py
new file mode 100644
index 00000000000..d887e17fc12
--- /dev/null
+++ b/recipe/fully_async_policy/test_components_pytest.py
@@ -0,0 +1,315 @@
+# Copyright 2025 Meituan Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Pytest测试文件，用于测试完全异步PPO训练系统的各个组件
+"""
+
+import time
+from unittest.mock import Mock
+
+import pytest
+import ray
+from omegaconf import OmegaConf
+
+
+@pytest.fixture
+def ray_setup():
+    """Ray初始化fixture"""
+    if not ray.is_initialized():
+        ray.init(ignore_reinit_error=True, num_cpus=2)
+    yield
+    # 测试后不关闭Ray，因为其他测试可能还需要
+
+
+@pytest.fixture
+def basic_config():
+    """基本配置fixture"""
+    return OmegaConf.create(
+        {
+            "actor_rollout_ref": {"hybrid_engine": False, "model": {"lora_rank": 0}, "rollout": {"n": 2}},
+            "algorithm": {"use_kl_in_reward": False},
+            "critic": {"enable": False},
+            "trainer": {
+                "device": "cpu",
+                "project_name": "test",
+                "experiment_name": "test",
+                "total_epochs": 1,
+                "total_training_steps": 2,
+            },
+            "async_training": {
+                "staleness_threshold": 3,
+                "max_staleness_allowed": 5,
+                "generation_timeout": 10.0,
+                "batch_timeout": 5.0,
+            },
+            "data": {"train_batch_size": 4},
+        }
+    )
+
+
+class TestMessageQueue:
+    """测试MessageQueue功能"""
+
+    def test_message_queue_creation(self, ray_setup):
+        """测试MessageQueue创建"""
+        try:
+            from message_queue import MessageQueueClient
+
+            queue = MessageQueueClient.remote(max_queue_size=10, max_staleness=3)
+
+            # 测试基本功能
+            stats = ray.get(queue.get_statistics.remote())
+            assert "queue_size" in stats
+            assert stats["queue_size"] == 0
+
+            ray.kill(queue)
+
+        except ImportError:
+            pytest.skip("MessageQueue not available")
+
+    def test_queue_put_get(self, ray_setup):
+        """测试队列的put/get操作"""
+        try:
+            from message_queue import MessageQueueClient
+
+            queue = MessageQueueClient.remote(max_queue_size=10, max_staleness=3)
+
+            # 创建模拟样本
+            mock_sample = Mock()
+            mock_sample.batch_size = 4
+
+            # 测试放入样本
+            success = ray.get(
+                queue.put_samples.remote(
+                    epoch=1, sample=mock_sample, param_version=1, rollout_metadata={"timestamp": time.time()}
+                )
+            )
+            assert success
+
+            # 测试获取样本
+            result = ray.get(queue.get_samples.remote(min_batch_count=1, timeout=2.0, current_param_version=1))
+            assert result is not None
+
+            ray.kill(queue)
+
+        except ImportError:
+            pytest.skip("MessageQueue not available")
+
+
+class TestRollouter:
+    """测试Rollouter功能"""
+
+    def test_rollouter_pause_resume(self, ray_setup, basic_config):
+        """测试Rollouter的暂停恢复功能"""
+        try:
+            from fully_async_rollouter import FullyAsyncRollouter
+
+            # 创建模拟依赖
+            mock_tokenizer = Mock()
+            mock_role_worker_mapping = {}
+            mock_resource_pool_manager = Mock()
+
+            # 创建Rollouter
+            rollouter = FullyAsyncRollouter.remote(
+                config=basic_config,
+                tokenizer=mock_tokenizer,
+                role_worker_mapping=mock_role_worker_mapping,
+                resource_pool_manager=mock_resource_pool_manager,
+            )
+
+            # 测试暂停
+            result = ray.get(rollouter.pause_rollout.remote())
+            assert result is True
+
+            # 检查状态
+            is_paused = ray.get(rollouter.is_rollout_paused.remote())
+            assert is_paused is True
+
+            # 测试恢复
+            result = ray.get(rollouter.resume_rollout.remote())
+            assert result is True
+
+            # 检查状态
+            is_paused = ray.get(rollouter.is_rollout_paused.remote())
+            assert is_paused is False
+
+            ray.kill(rollouter)
+
+        except ImportError:
+            pytest.skip("FullyAsyncRollouter not available")
+
+    def test_rollouter_statistics(self, ray_setup, basic_config):
+        """测试Rollouter统计功能"""
+        try:
+            from fully_async_rollouter import FullyAsyncRollouter
+
+            mock_tokenizer = Mock()
+            mock_role_worker_mapping = {}
+            mock_resource_pool_manager = Mock()
+
+            rollouter = FullyAsyncRollouter.remote(
+                config=basic_config,
+                tokenizer=mock_tokenizer,
+                role_worker_mapping=mock_role_worker_mapping,
+                resource_pool_manager=mock_resource_pool_manager,
+            )
+
+            # 获取统计信息
+            stats = ray.get(rollouter.get_statistics.remote())
+
+            # 验证必要字段存在
+            required_fields = [
+                "total_generated_samples",
+                "dropped_stale_samples",
+                "generation_errors",
+                "current_param_version",
+                "is_paused",
+                "pause_count",
+            ]
+
+            for field in required_fields:
+                assert field in stats
+
+            ray.kill(rollouter)
+
+        except ImportError:
+            pytest.skip("FullyAsyncRollouter not available")
+
+
+class TestTrainer:
+    """测试Trainer功能"""
+
+    def test_trainer_creation(self, ray_setup, basic_config):
+        """测试Trainer创建"""
+        try:
+            from fully_async_trainer import FullyAsyncTrainer
+
+            mock_tokenizer = Mock()
+            mock_role_worker_mapping = {}
+            mock_resource_pool_manager = Mock()
+
+            trainer = FullyAsyncTrainer.remote(
+                config=basic_config,
+                tokenizer=mock_tokenizer,
+                role_worker_mapping=mock_role_worker_mapping,
+                resource_pool_manager=mock_resource_pool_manager,
+            )
+
+            # 基本验证
+            assert trainer is not None
+
+            ray.kill(trainer)
+
+        except ImportError:
+            pytest.skip("FullyAsyncTrainer not available")
+
+
+class TestParameterSync:
+    """测试参数同步功能"""
+
+    def test_param_sync_creation(self, ray_setup):
+        """测试参数同步器创建"""
+        try:
+            from param_sync import ParameterSynchronizer
+
+            config = OmegaConf.create(
+                {"async_training": {"max_sync_retries": 3, "sync_timeout": 10.0, "sync_retry_delay": 0.1}}
+            )
+
+            mock_actor_wg = Mock()
+            mock_rollout_wg = Mock()
+
+            synchronizer = ParameterSynchronizer.remote(
+                config=config, actor_wg=mock_actor_wg, rollout_wg=mock_rollout_wg
+            )
+
+            assert synchronizer is not None
+
+            ray.kill(synchronizer)
+
+        except ImportError:
+            pytest.skip("ParameterSynchronizer not available")
+
+
+class TestIntegration:
+    """集成测试"""
+
+    def test_basic_workflow_simulation(self, ray_setup):
+        """测试基本工作流模拟"""
+        # 这是一个简化的集成测试，模拟基本的工作流
+        try:
+            from message_queue import MessageQueueClient
+
+            # 创建消息队列
+            queue = MessageQueueClient.remote(max_queue_size=5, max_staleness=2)
+
+            # 模拟生产者（Rollouter）
+            mock_sample = Mock()
+            mock_sample.batch_size = 2
+
+            # 放入样本
+            success = ray.get(
+                queue.put_samples.remote(
+                    epoch=1, sample=mock_sample, param_version=1, rollout_metadata={"timestamp": time.time()}
+                )
+            )
+            assert success
+
+            # 模拟消费者（Trainer）
+            result = ray.get(queue.get_samples.remote(min_batch_count=1, timeout=2.0, current_param_version=1))
+            assert result is not None
+
+            samples, metadata_list = result
+            assert len(samples) == 1
+            assert len(metadata_list) == 1
+
+            ray.kill(queue)
+
+        except ImportError:
+            pytest.skip("Integration test components not available")
+
+
+class TestErrorHandling:
+    """错误处理测试"""
+
+    def test_timeout_handling(self, ray_setup):
+        """测试超时处理"""
+        try:
+            from message_queue import MessageQueueClient
+
+            queue = MessageQueueClient.remote(max_queue_size=5, max_staleness=2)
+
+            # 测试从空队列超时获取
+            start_time = time.time()
+            result = ray.get(
+                queue.get_samples.remote(
+                    min_batch_count=1,
+                    timeout=1.0,  # 1秒超时
+                    current_param_version=1,
+                )
+            )
+            elapsed = time.time() - start_time
+
+            assert result is None
+            assert 0.9 <= elapsed <= 2.0  # 允许一些误差
+
+            ray.kill(queue)
+
+        except ImportError:
+            pytest.skip("MessageQueue not available")
+
+
+if __name__ == "__main__":
+    # 如果直接运行此文件，执行所有测试
+    pytest.main([__file__, "-v"])

From 459aa7157c2abc71e53a69d357b0f52e5d0c8ccd Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Wed, 6 Aug 2025 10:46:54 +0800
Subject: [PATCH 025/182] put data to queue

---
 .../fully_async_policy/README_fully_async.md  |  28 +++
 recipe/fully_async_policy/TEST_GUIDE.md       |   3 +-
 recipe/fully_async_policy/fully_async_main.py |  11 +-
 .../fully_async_rollouter.py                  | 159 ++++++------------
 recipe/fully_async_policy/message_queue.py    |  40 ++---
 .../test_components_pytest.py                 |   4 +-
 .../unittest/test_fully_async.py              |   8 +-
 .../unittest/test_fully_async_components.py   |  12 +-
 recipe/fully_async_policy/unittest/test_mq.py |  26 +--
 9 files changed, 127 insertions(+), 164 deletions(-)

diff --git a/recipe/fully_async_policy/README_fully_async.md b/recipe/fully_async_policy/README_fully_async.md
index 1708be5ae34..916633a4a81 100644
--- a/recipe/fully_async_policy/README_fully_async.md
+++ b/recipe/fully_async_policy/README_fully_async.md
@@ -306,3 +306,31 @@ def custom_monitor(trainer_stats, rollouter_stats):
 - 简单的消息队列实现
 - 基本的参数同步功能
 
+
+```python
+DataProtoItem(
+    batch=TensorDict(
+        fields={
+            attention_mask: Tensor(shape=torch.Size([3072]), device=cpu, dtype=torch.int64, is_shared=False),
+            input_ids: Tensor(shape=torch.Size([3072]), device=cpu, dtype=torch.int64, is_shared=False),
+            position_ids: Tensor(shape=torch.Size([3072]), device=cpu, dtype=torch.int64, is_shared=False),
+            prompts: Tensor(shape=torch.Size([1024]), device=cpu, dtype=torch.int64, is_shared=False),
+            response_mask: Tensor(shape=torch.Size([2048]), device=cpu, dtype=torch.int64, is_shared=False),
+            responses: Tensor(shape=torch.Size([2048]), device=cpu, dtype=torch.int64, is_shared=False)},
+        batch_size=torch.Size([]),
+        device=None,
+        is_shared=False), 
+    non_tensor_batch={'data_source': 'openai/gsm8k',
+                      'ability': 'math', 
+                      'reward_model': {'ground_truth': '35', 'style': 'rule'},
+                      'extra_info': {
+                          'answer': 'The total number of green and red plates is 28 + 21 = <<28+21=49>>49.\nXavier should buy 84 − 49 = 35 more plates.\n#### 35',
+                          'index': 1421, 
+                          'question': 'Xavier needs 84 paper plates for a housewarming party. He already has 21 green plates and 28 red plates. How many more plates should Xavier buy?', 'split': 'train'},
+                      'uid': 'fab3e910-67b3-4653-bc69-377250049267', 
+                      'tools_kwargs': {}, 
+                      'interaction_kwargs': {}, 
+                      'index': 1421},
+    meta_info={'global_token_num': [2141, 2141, 2161, 2151, 2151, 2130, 2141, 2161, 2161, 2151, 2130, 2130]})
+```
+
diff --git a/recipe/fully_async_policy/TEST_GUIDE.md b/recipe/fully_async_policy/TEST_GUIDE.md
index 558920e5e84..3933998cd84 100644
--- a/recipe/fully_async_policy/TEST_GUIDE.md
+++ b/recipe/fully_async_policy/TEST_GUIDE.md
@@ -199,12 +199,13 @@ assert result == True
 ```
 
 ### 测试新鲜度控制
+
 ```python
 # 测试样本过期机制
 queue = MessageQueueClient.remote(max_staleness=3)
 
 # 放入旧版本样本
-queue.put_samples.remote(sample, param_version=1)
+queue.put_sample.remote(sample, param_version=1)
 
 # 用新版本获取（应该被拒绝）
 result = ray.get(queue.get_samples.remote(current_param_version=5))
diff --git a/recipe/fully_async_policy/fully_async_main.py b/recipe/fully_async_policy/fully_async_main.py
index d7079d4af2b..c17b2d8dbdd 100644
--- a/recipe/fully_async_policy/fully_async_main.py
+++ b/recipe/fully_async_policy/fully_async_main.py
@@ -223,9 +223,13 @@ def _initialize_components(self, config) -> None:
         self.components["val_reward_fn"] = val_reward_fn
 
         # 创建MessageQueue
+        self.max_queue_size = (
+            config.async_training.staleness_threshold
+            * config.data.train_batch_size
+            * config.actor_rollout_ref.rollout.n
+        )
         print("Creating MessageQueue...")
-        max_queue_size = config.async_training.staleness_threshold * config.data.train_batch_size
-        message_queue = MessageQueue.remote(config, max_queue_size)
+        message_queue = MessageQueue.remote(config, self.max_queue_size)
         message_queue_client = MessageQueueClient(message_queue)
 
         self.components["message_queue"] = message_queue
@@ -260,6 +264,7 @@ def _create_rollouter(self, config) -> None:
             ray_worker_group_cls=self.components["ray_worker_group_cls"],
             processor=self.components["processor"],
             device_name=config.trainer.device,
+            max_queue_size=self.max_queue_size,
         )
         print(rollouter)
 
@@ -311,6 +316,8 @@ def _run_training_loop(self):
         ray.get(rollouter_future)
         # ray.get(trainer_future)
 
+        self.components['message_queue_client'].clear_queue.remote()
+        
         print("Training completed or interrupted")
 
     def _monitor_components(self):
diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py
index 57d3eed243a..61b21b43fd5 100644
--- a/recipe/fully_async_policy/fully_async_rollouter.py
+++ b/recipe/fully_async_policy/fully_async_rollouter.py
@@ -19,8 +19,7 @@
 import ray
 from omegaconf import OmegaConf
 
-from recipe.fully_async_policy.message_queue import MessageQueueClient
-from verl import DataProto
+from recipe.fully_async_policy.message_queue import MessageQueueClient, QueueSample
 from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup
 from verl.trainer.ppo.ray_trainer import RayPPOTrainer, ResourcePoolManager, Role, WorkerType
 from verl.utils.debug import marked_timer
@@ -45,6 +44,7 @@ def __init__(
         reward_fn=None,
         val_reward_fn=None,
         device_name=None,
+        max_queue_size=1000,
     ):
         """
         Initialize distributed PPO trainer with Ray backend.
@@ -59,10 +59,6 @@ def __init__(
             processor: Optional data processor, used for multimodal data
             reward_fn: Function for computing rewards during training.
             val_reward_fn: Function for computing rewards during validation.
-            train_dataset (Optional[Dataset], optional): Training dataset. Defaults to None.
-            val_dataset (Optional[Dataset], optional): Validation dataset. Defaults to None.
-            collate_fn: Function to collate data samples into batches.
-            train_sampler (Optional[Sampler], optional): Sampler for the training dataset. Defaults to None.
             device_name (str, optional): Device name for training (e.g., "cuda", "cpu"). Defaults to None.
         """
         # Store the tokenizer for text processing
@@ -115,7 +111,6 @@ def __init__(
         # 统计信息
         self.total_generated_samples = 0
         self.dropped_stale_samples = 0
-        self.generation_errors = 0
         self.param_sync_requests = 0
 
         # Worker groups
@@ -143,9 +138,7 @@ def __init__(
         self.sync_in_progress = False
         self.sync_lock = threading.Lock()
 
-        self.max_queue_size = (
-            self.staleness_threshold * self.config.data.train_batch_size * self.config.actor_rollout_ref.rollout.n
-        )
+        self.max_queue_size = max_queue_size
 
     def set_message_queue_client(self, message_queue_client: MessageQueueClient):
         """设置消息队列客户端"""
@@ -257,19 +250,6 @@ def _generation_loop(self):
         last_val_metrics = None
         self.max_steps_duration = 0
 
-        """
-        主要的生成循环
-
-        循环入口，需要
-        1. running 判断
-        4. 中断判断
-        3. 新鲜度判断
-
-        生成样本过程中，需要
-        1. running 判断
-        2. 中断判断
-        """
-
         continuous_iterator = self._create_continuous_iterator()
         for epoch, batch_dict in continuous_iterator:
             with self.lock:
@@ -288,6 +268,7 @@ def _generation_loop(self):
                 if not self.running:
                     break
 
+            metrics = {}
             timing_raw = {}
             batch, gen_batch = self._prepare_generate_batch(batch_dict)
             is_last_step = self.global_steps >= self.total_training_steps
@@ -308,101 +289,65 @@ def _generation_loop(self):
                     "generation_timestamp": time.time(),
                     "rollout_param_version": self.current_param_version,
                 }
+                batch = self._post_generate_batch(batch, gen_batch_output, metrics)
 
-                gen_batch_output: DataProto = gen_batch_output
-                print(gen_batch_output)
-                for i in gen_batch_output:
-                    print(i)
-
-                # 放入队列
-                success = self.message_queue_client.put_samples(
-                    samples=gen_batch_output,
-                    param_version=self.current_param_version,
-                    rollout_metadata_list=rollout_metadata,
-                )
-                print(f"put samples {success}")
-                with self.lock:
-                    if success:
-                        self.total_generated_samples += 1
-                    else:
-                        self.dropped_stale_samples += 1
-
-                if self.global_steps % 1 == 0:
-                    print(
-                        f"Generated {self.total_generated_samples} batches, \n"
-                        f"param_version={self.current_param_version}, \n"
-                        f"errors={self.generation_errors}, \n"
-                        f"Dropped stale samples: {self.dropped_stale_samples}\n"
+                for sample in batch:
+                    # for sample in samples:
+                    queue_sample = QueueSample(
+                        data=sample,
+                        rollout_metadata=rollout_metadata,
+                    )
+                    # 放入队列
+                    success = self.message_queue_client.put_sample(
+                        sample=ray.cloudpickle.dumps(queue_sample),
+                        param_version=self.current_param_version,
                     )
+                    print(f"put samples {success}")
+                    with self.lock:
+                        if success:
+                            self.total_generated_samples += 1
+                        else:
+                            self.dropped_stale_samples += 1
+
+                    if self.global_steps % 1 == 0:
+                        print(
+                            f"Generated {self.total_generated_samples} batches, \n"
+                            f"param_version={self.current_param_version}, \n"
+                            f"Dropped stale samples: {self.dropped_stale_samples}\n"
+                        )
 
             self.global_steps += 1
 
             if is_last_step:
                 pprint(f"Final validation metrics: {last_val_metrics}")
-                return
+                break
+
+        with self.lock:
+            self.running = False
 
     def _monitor_loop(self):
         """监控线程 - 监控状态并处理控制信号"""
-        try:
-            # 主线程保持运行，处理控制信号和状态监控
-            last_stats_time = time.time()
-            stats_interval = 30.0  # 30秒报告一次统计
-            check_interval = 5.0  # 5秒检查一次状态
-
-            while True:
+        # 主线程保持运行，处理控制信号和状态监控
+        last_stats_time = time.time()
+        stats_interval = 30.0  # 30秒报告一次统计
+        check_interval = 5.0  # 5秒检查一次状态
+        while True:
+            with self.lock:
+                if not self.running:
+                    break
+            time.sleep(check_interval)
+            # 定期打印统计信息
+            current_time = time.time()
+            if current_time - last_stats_time >= stats_interval:
+                print(self.get_statistics())
+                last_stats_time = current_time
+            # 检查是否应该恢复生成
+            if not self._should_pause_generation():
                 with self.lock:
-                    if not self.running:
-                        break
-
-                time.sleep(check_interval)
-
-                # 定期打印统计信息
-                current_time = time.time()
-                if current_time - last_stats_time >= stats_interval:
-                    self._log_statistics()
-                    last_stats_time = current_time
-
-                # 检查是否应该恢复生成
-                if not self._should_pause_generation():
-                    with self.lock:
-                        if self.paused:
-                            self.paused = False
-                            self.condition.notify_all()
-                            print("Generation resumed")
-
-        except Exception as e:
-            print(f"Error in monitor loop: {e}")
-        finally:
-            print("Monitor thread exiting")
-
-    def _report_loop(self):
-        try:
-            # 主线程保持运行，处理控制信号和状态监控
-            last_stats_time = time.time()
-            stats_interval = 10.0
-
-            while self.running:
-                time.sleep(1.0)
-
-                # 定期打印统计信息
-                current_time = time.time()
-                if current_time - last_stats_time >= stats_interval:
-                    self.get_statistics()
-                    last_stats_time = current_time
-                    if not self._should_pause_generation():
-                        self.resume()
-
-                # 检查生成线程状态
-                if not self.generation_thread.is_alive():
-                    print("Generation thread died, restarting...")
-                    raise RuntimeError("generation_thread not alive")
-
-        except KeyboardInterrupt:
-            print("Received interrupt signal, shutting down...")
-        except Exception as e:
-            print(f"Error in main loop: {e}")
-        finally:
-            self.shutdown()
+                    if self.paused:
+                        self.paused = False
+                        self.condition.notify_all()
+                        print("Generation resumed")
 
     def _should_pause_generation(self) -> bool:
         """
diff --git a/recipe/fully_async_policy/message_queue.py b/recipe/fully_async_policy/message_queue.py
index b72b9482e09..f4dcd1f522d 100644
--- a/recipe/fully_async_policy/message_queue.py
+++ b/recipe/fully_async_policy/message_queue.py
@@ -30,10 +30,7 @@
 class QueueSample:
     """单个batch样本，包含参数版本和新鲜度信息"""
 
-    id: str
     data: Any
-    param_version: int
-    timestamp: float
     rollout_metadata: dict[str, Any]
 
 
@@ -75,16 +72,13 @@ def __init__(self, config: DictConfig, max_queue_size: int = 1000):
             "staleness_threshold={self.staleness_threshold}"
         )
 
-    def put_samples(
-        self, samples: list[Any] | Any, param_version: int, rollout_metadata: dict[str, Any] = None
-    ) -> bool:
+    def put_sample(self, sample: Any, param_version: int) -> bool:
         """
         放入一个batch样本到队列
 
         Args:
-            samples: 样本数据
+            sample: 样本数据
             param_version: 参数版本号
-            rollout_metadata: rollout相关的元数据
 
         Returns:
             bool: 是否成功放入队列
@@ -97,23 +91,13 @@ def put_samples(
                 logger.debug(f"Dropped stale sample: staleness={staleness}, threshold={self.staleness_threshold}")
                 return False
 
-            for sample in samples:
-                queue_sample = QueueSample(
-                    id=str(uuid.uuid4()),
-                    data=sample,
-                    param_version=param_version,
-                    timestamp=time.time(),
-                    rollout_metadata=rollout_metadata or {},
-                )
-
-                # 如果队列满了，移除最旧的样本，一般不会发生
-                if len(self.queue) >= self.max_queue_size:
-                    removed = self.queue.popleft()
-                    self.dropped_samples += 1
-                    logger.warning(f"Queue full, dropped sample {removed.id}")
-
-                self.queue.append(queue_sample)
-                self.total_produced += 1
+            # 如果队列满了，移除最旧的样本，一般不会发生
+            if len(self.queue) >= self.max_queue_size:
+                removed = self.queue.popleft()
+                self.dropped_samples += 1
+                logger.warning(f"Queue full, dropped sample {removed.id}")
+            self.queue.append(sample)
+            self.total_produced += 1
 
             # 通知等待的消费者
             self.consumer_condition.notify()
@@ -226,11 +210,9 @@ class MessageQueueClient:
     def __init__(self, queue_actor: Any):
         self.queue_actor = queue_actor
 
-    def put_samples(
-        self, samples: list[Any], param_version: int, rollout_metadata_list: list[dict[str, Any]] = None
-    ) -> bool:
+    def put_sample(self, sample: Any, param_version: int) -> bool:
         """放入batch到队列"""
-        return ray.get(self.queue_actor.put_samples.remote(samples, param_version, rollout_metadata_list))
+        return ray.get(self.queue_actor.put_sample.remote(sample, param_version))
 
     def get_samples(self, min_batch_count: int = 1) -> list[QueueSample]:
         """从队列获取batch，一直等待直到有足够样本"""
diff --git a/recipe/fully_async_policy/test_components_pytest.py b/recipe/fully_async_policy/test_components_pytest.py
index d887e17fc12..fd2e207cbe4 100644
--- a/recipe/fully_async_policy/test_components_pytest.py
+++ b/recipe/fully_async_policy/test_components_pytest.py
@@ -91,7 +91,7 @@ def test_queue_put_get(self, ray_setup):
 
             # 测试放入样本
             success = ray.get(
-                queue.put_samples.remote(
+                queue.put_sample.remote(
                     epoch=1, sample=mock_sample, param_version=1, rollout_metadata={"timestamp": time.time()}
                 )
             )
@@ -260,7 +260,7 @@ def test_basic_workflow_simulation(self, ray_setup):
 
             # 放入样本
             success = ray.get(
-                queue.put_samples.remote(
+                queue.put_sample.remote(
                     epoch=1, sample=mock_sample, param_version=1, rollout_metadata={"timestamp": time.time()}
                 )
             )
diff --git a/recipe/fully_async_policy/unittest/test_fully_async.py b/recipe/fully_async_policy/unittest/test_fully_async.py
index a6646b17575..126ff489bf2 100644
--- a/recipe/fully_async_policy/unittest/test_fully_async.py
+++ b/recipe/fully_async_policy/unittest/test_fully_async.py
@@ -61,7 +61,7 @@ def test_basic_put_get(self):
         mock_batch = Mock(spec=DataProto)
 
         # 放入样本
-        success = self.client.put_samples(samples=mock_batch, param_version=1, rollout_metadata={"test": "data"})
+        success = self.client.put_sample(sample=mock_batch, param_version=1, rollout_metadata={"test": "data"})
         self.assertTrue(success)
 
         # 获取样本
@@ -78,8 +78,8 @@ def test_freshness_control(self):
         self.client.update_param_version(10)
 
         # 尝试放入过期样本
-        success = self.client.put_samples(
-            samples=mock_batch,
+        success = self.client.put_sample(
+            sample=mock_batch,
             param_version=5,  # 版本差异为5，超过阈值3
             rollout_metadata={},
         )
@@ -159,7 +159,7 @@ def test_integration():
 
         # 生产样本
         for i in range(5):
-            success = client.put_samples(samples=mock_batch, param_version=i, rollout_metadata={"batch_id": i})
+            success = client.put_sample(sample=mock_batch, param_version=i, rollout_metadata={"batch_id": i})
             assert success, f"Failed to put batch {i}"
 
         # 消费样本
diff --git a/recipe/fully_async_policy/unittest/test_fully_async_components.py b/recipe/fully_async_policy/unittest/test_fully_async_components.py
index 8e5279b84bb..8a5bc85d562 100644
--- a/recipe/fully_async_policy/unittest/test_fully_async_components.py
+++ b/recipe/fully_async_policy/unittest/test_fully_async_components.py
@@ -58,7 +58,7 @@ def test_put_and_get_samples(self):
 
         # 测试放入样本
         success = ray.get(
-            self.message_queue.put_samples.remote(
+            self.message_queue.put_sample.remote(
                 epoch=1, sample=mock_sample, param_version=1, rollout_metadata={"timestamp": time.time()}
             )
         )
@@ -79,7 +79,7 @@ def test_staleness_control(self):
 
         # 放入一个参数版本较老的样本
         success = ray.get(
-            self.message_queue.put_samples.remote(
+            self.message_queue.put_sample.remote(
                 epoch=1, sample=mock_sample, param_version=1, rollout_metadata={"timestamp": time.time()}
             )
         )
@@ -109,7 +109,7 @@ def test_queue_statistics(self):
 
         for i in range(3):
             ray.get(
-                self.message_queue.put_samples.remote(
+                self.message_queue.put_sample.remote(
                     epoch=1, sample=mock_sample, param_version=1, rollout_metadata={"timestamp": time.time()}
                 )
             )
@@ -339,7 +339,7 @@ def test_message_queue_trainer_integration(self):
         mock_sample.batch_size = 4
 
         ray.get(
-            message_queue.put_samples.remote(
+            message_queue.put_sample.remote(
                 epoch=1, sample=mock_sample, param_version=1, rollout_metadata={"timestamp": time.time()}
             )
         )
@@ -377,7 +377,7 @@ def test_message_queue_overflow(self):
         # 填满队列
         for i in range(2):
             result = ray.get(
-                message_queue.put_samples.remote(
+                message_queue.put_sample.remote(
                     epoch=1, sample=mock_sample, param_version=1, rollout_metadata={"timestamp": time.time()}
                 )
             )
@@ -385,7 +385,7 @@ def test_message_queue_overflow(self):
 
         # 尝试再放入一个样本（应该失败或者覆盖旧样本）
         result = ray.get(
-            message_queue.put_samples.remote(
+            message_queue.put_sample.remote(
                 epoch=1, sample=mock_sample, param_version=1, rollout_metadata={"timestamp": time.time()}
             )
         )
diff --git a/recipe/fully_async_policy/unittest/test_mq.py b/recipe/fully_async_policy/unittest/test_mq.py
index 02e9839bcfd..2fff49d6576 100644
--- a/recipe/fully_async_policy/unittest/test_mq.py
+++ b/recipe/fully_async_policy/unittest/test_mq.py
@@ -66,7 +66,7 @@ def test_put_samples_success(self, message_queue_client, mock_data_proto):
         samples = [mock_data_proto, mock_data_proto]
         metadata_list = [{"test": "data1"}, {"test": "data2"}]
 
-        result = message_queue_client.put_samples(samples=samples, param_version=1, rollout_metadata_list=metadata_list)
+        result = message_queue_client.put_sample(sample=samples, param_version=1, rollout_metadata=metadata_list)
 
         assert result is True
 
@@ -83,7 +83,7 @@ def test_put_samples_without_metadata(self, message_queue_client, mock_data_prot
         """测试不提供metadata时的处理"""
         samples = [mock_data_proto, mock_data_proto]
 
-        result = message_queue_client.put_samples(samples=samples, param_version=1, rollout_metadata_list=None)
+        result = message_queue_client.put_sample(sample=samples, param_version=1, rollout_metadata=None)
 
         assert result is True
         queue_size = message_queue_client.get_queue_size()
@@ -94,7 +94,7 @@ def test_put_samples_metadata_mismatch(self, message_queue_client, mock_data_pro
         samples = [mock_data_proto, mock_data_proto]
         metadata_list = [{"test": "data1"}]  # 长度不匹配
 
-        result = message_queue_client.put_samples(samples=samples, param_version=1, rollout_metadata_list=metadata_list)
+        result = message_queue_client.put_sample(sample=samples, param_version=1, rollout_metadata=metadata_list)
 
         assert result is False  # 应该失败
         queue_size = message_queue_client.get_queue_size()
@@ -107,10 +107,10 @@ def test_put_samples_staleness_check(self, message_queue_client, mock_data_proto
 
         # 尝试放入版本过旧的batch（版本差异>=3会被拒绝）
         samples = [mock_data_proto]
-        result = message_queue_client.put_samples(
-            samples=samples,
+        result = message_queue_client.put_sample(
+            sample=samples,
             param_version=2,  # 5-2=3, 达到阈值
-            rollout_metadata_list=None,
+            rollout_metadata=None,
         )
 
         assert result is False
@@ -124,7 +124,7 @@ def test_put_samples_queue_overflow(self, message_queue_client, mock_data_proto)
         # 填满队列（最大容量10）
         for i in range(6):  # 每次放入2个，总共12个，超过最大容量10
             samples = [mock_data_proto, mock_data_proto]
-            message_queue_client.put_samples(samples=samples, param_version=1, rollout_metadata_list=None)
+            message_queue_client.put_sample(sample=samples, param_version=1, rollout_metadata=None)
 
         # 队列大小应该保持在最大值
         queue_size = message_queue_client.get_queue_size()
@@ -139,7 +139,7 @@ def test_get_samples_success(self, message_queue_client, mock_data_proto):
         # 先放入一些samples
         samples = [mock_data_proto, mock_data_proto, mock_data_proto]
         metadata_list = [{"index": 0}, {"index": 1}, {"index": 2}]
-        message_queue_client.put_samples(samples=samples, param_version=1, rollout_metadata_list=metadata_list)
+        message_queue_client.put_sample(sample=samples, param_version=1, rollout_metadata=metadata_list)
 
         # 获取2个samples
         retrieved_samples = message_queue_client.get_samples(min_batch_count=2)
@@ -168,7 +168,7 @@ def get_samples():
         def put_samples_later():
             time.sleep(0.5)  # 延迟放入
             samples = [mock_data_proto, mock_data_proto]
-            message_queue_client.put_samples(samples=samples, param_version=1, rollout_metadata_list=None)
+            message_queue_client.put_sample(sample=samples, param_version=1, rollout_metadata=None)
 
         # 启动消费者线程
         consumer_thread = threading.Thread(target=get_samples)
@@ -194,7 +194,7 @@ def test_clear_queue(self, message_queue_client, mock_data_proto):
         """测试清空队列"""
         # 先添加一些样本
         samples = [mock_data_proto, mock_data_proto, mock_data_proto]
-        message_queue_client.put_samples(samples=samples, param_version=1, rollout_metadata_list=None)
+        message_queue_client.put_sample(sample=samples, param_version=1, rollout_metadata=None)
 
         # 清空队列
         message_queue_client.clear_queue()
@@ -208,7 +208,7 @@ def test_get_queue_size(self, message_queue_client, mock_data_proto):
         assert message_queue_client.get_queue_size() == 0
 
         samples = [mock_data_proto]
-        message_queue_client.put_samples(samples=samples, param_version=1, rollout_metadata_list=None)
+        message_queue_client.put_sample(sample=samples, param_version=1, rollout_metadata=None)
         assert message_queue_client.get_queue_size() == 1
 
     def test_get_statistics(self, message_queue_client):
@@ -233,7 +233,7 @@ def test_get_memory_usage(self, message_queue_client, mock_data_proto):
         """测试获取内存使用统计"""
         # 添加一些样本
         samples = [mock_data_proto, mock_data_proto]
-        message_queue_client.put_samples(samples=samples, param_version=1, rollout_metadata_list=None)
+        message_queue_client.put_sample(sample=samples, param_version=1, rollout_metadata=None)
 
         memory_stats = message_queue_client.get_memory_usage()
 
@@ -282,7 +282,7 @@ def test_concurrent_put_get(self, mock_data_proto):
             def producer():
                 for i in range(50):
                     samples = [mock_data_proto, mock_data_proto]
-                    result = client.put_samples(samples=samples, param_version=1, rollout_metadata_list=None)
+                    result = client.put_sample(sample=samples, param_version=1, rollout_metadata=None)
                     results.append(("put", result))
                     time.sleep(0.1)
 

From c65b6279b6b16c72109d60bb36ad8724fc42e906 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Wed, 6 Aug 2025 14:23:35 +0800
Subject: [PATCH 026/182] merge data proto item

---
 recipe/fully_async_policy/fully_async_main.py |   4 +-
 .../fully_async_policy/fully_async_trainer.py | 265 +++++-----
 recipe/fully_async_policy/message_queue.py    |   2 -
 .../unittest/protocol_examples.py             | 202 ++++++++
 .../unittest/test_protocol_split_merge.py     | 466 ++++++++++++++++++
 verl/protocol.py                              | 166 ++++++-
 6 files changed, 979 insertions(+), 126 deletions(-)
 create mode 100644 recipe/fully_async_policy/unittest/protocol_examples.py
 create mode 100644 recipe/fully_async_policy/unittest/test_protocol_split_merge.py

diff --git a/recipe/fully_async_policy/fully_async_main.py b/recipe/fully_async_policy/fully_async_main.py
index c17b2d8dbdd..404ffba4874 100644
--- a/recipe/fully_async_policy/fully_async_main.py
+++ b/recipe/fully_async_policy/fully_async_main.py
@@ -316,8 +316,8 @@ def _run_training_loop(self):
         ray.get(rollouter_future)
         # ray.get(trainer_future)
 
-        self.components['message_queue_client'].clear_queue.remote()
-        
+        self.components["message_queue_client"].clear_queue()
+
         print("Training completed or interrupted")
 
     def _monitor_components(self):
diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py
index 5db63c9fab9..5d69e9091ba 100644
--- a/recipe/fully_async_policy/fully_async_trainer.py
+++ b/recipe/fully_async_policy/fully_async_trainer.py
@@ -121,6 +121,12 @@ def __init__(
         self.message_queue_client = None
         self.param_synchronizer = None
 
+        # 统计信息
+        self.processed_samples = 0
+        self.stale_samples_processed = 0
+        self.current_param_version = 0
+        self.param_sync_count = 0
+
     def set_message_queue_client(self, message_queue_client: MessageQueueClient):
         """设置消息队列客户端"""
         with self.lock:
@@ -131,7 +137,7 @@ def set_parameter_synchronizer(self, param_synchronizer):
         with self.lock:
             self.param_synchronizer = param_synchronizer
 
-    def _get_samples_from_queue(self) -> tuple[None, None, None] | tuple[int, dict, Any]:
+    def _get_samples_from_queue(self) -> tuple[None, None] | tuple[int, Any]:
         """
         从消息队列获取样本并组成gen_batch_output
 
@@ -155,24 +161,16 @@ def _get_samples_from_queue(self) -> tuple[None, None, None] | tuple[int, dict,
 
         if not queue_samples or len(queue_samples) == 0:
             logger.warning("required_samples is empty")
-            return None, None, None
+            return None, None
 
         logger.info(f"Retrieved {len(queue_samples)} samples from queue")
 
-        # 组装gen_batch_output
-        gen_batch_output = self._assemble_gen_batch_output_from_queue_samples(
-            queue_samples, n_responses_per_prompt, batch_size
-        )
-
-        # 从第一个样本中提取原始batch信息来构造batch_dict
-        first_sample = queue_samples[0].data
-        batch_dict = self._extract_batch_dict_from_sample(first_sample, batch_size)
+        # 组装 batch
+        batch = self._assemble_gen_batch_output_from_queue_samples(queue_samples)
 
-        return 0, batch_dict, gen_batch_output
+        return 0, batch
 
-    def _assemble_gen_batch_output_from_queue_samples(
-        self, queue_samples: list[QueueSample], n_responses_per_prompt: int, batch_size: int
-    ):
+    def _assemble_gen_batch_output_from_queue_samples(self, queue_samples: list[QueueSample]):
         """
         从队列样本中组装gen_batch_output
 
@@ -185,123 +183,57 @@ def _assemble_gen_batch_output_from_queue_samples(
             DataProto: 组装好的gen_batch_output
         """
         import numpy as np
-        import torch
 
         from verl.protocol import DataProto
 
-        # 提取所有样本的数据
+        if not queue_samples:
+            raise ValueError("Empty queue_samples provided for batch assembly")
+
+        logger.debug(f"Assembling batch from {len(queue_samples)} queue samples")
+
+        # 提取所有样本的数据和元数据
         sample_data_list = []
         rollout_metadata_list = []
         timing_info = {}
 
-        for sample in queue_samples:
+        for i, sample in enumerate(queue_samples):
             sample_data_list.append(sample.data)
             rollout_metadata_list.append(sample.rollout_metadata)
 
-        # 假设所有样本具有相同的数据结构，从第一个样本推断结构
-        first_sample_data = sample_data_list[0]
-
-        # 组装tensor数据
-        tensor_dict = {}
-        non_tensor_dict = {}
-
-        # 获取第一个样本的结构来初始化
-        if hasattr(first_sample_data, "batch") and first_sample_data.batch is not None:
-            # 处理tensor数据
-            for key in first_sample_data.batch.keys():
-                tensor_list = []
-                for sample_data in sample_data_list:
-                    if hasattr(sample_data, "batch") and sample_data.batch is not None and key in sample_data.batch:
-                        tensor_list.append(sample_data.batch[key])
-                    else:
-                        logger.warning(f"Missing key '{key}' in sample batch data")
-
-                if tensor_list:
-                    # 连接所有tensor
-                    tensor_dict[key] = torch.cat(tensor_list, dim=0)
-
-        if hasattr(first_sample_data, "non_tensor_batch") and first_sample_data.non_tensor_batch:
-            # 处理non_tensor数据
-            for key in first_sample_data.non_tensor_batch.keys():
-                non_tensor_list = []
-                for sample_data in sample_data_list:
-                    if (
-                        hasattr(sample_data, "non_tensor_batch")
-                        and sample_data.non_tensor_batch
-                        and key in sample_data.non_tensor_batch
-                    ):
-                        non_tensor_list.extend(sample_data.non_tensor_batch[key])
-                    else:
-                        logger.warning(f"Missing key '{key}' in sample non_tensor_batch data")
-
-                if non_tensor_list:
-                    non_tensor_dict[key] = np.array(non_tensor_list, dtype=object)
+        batch = DataProto.from_items(sample_data_list)
 
         # 收集timing信息和metadata
-        for sample, metadata in zip(queue_samples, rollout_metadata_list, strict=False):
+        param_versions = []
+        sample_timestamps = []
+        for metadata in rollout_metadata_list:
+            # 提取参数版本和时间戳
+            param_versions.append(metadata.get("rollout_param_version", 0))
+            sample_timestamps.append(metadata.get("generation_timestamp", time.time()))
             if "timing" in metadata:
                 for timing_key, timing_value in metadata["timing"].items():
                     if timing_key not in timing_info:
                         timing_info[timing_key] = []
-                    timing_info[timing_key].append(timing_value)
-
+                    # if isinstance(timing_value, (int, float)):
+                    #     timing_info[timing_key].append(timing_value)
         # 计算平均timing
         avg_timing = {}
         for key, values in timing_info.items():
-            if values:
+            if values and len(values) > 0:
                 avg_timing[key] = sum(values) / len(values)
 
         # 创建meta_info
         meta_info = {
             "timing": avg_timing,
             "queue_sample_count": len(queue_samples),
-            "rollout_param_versions": [sample.param_version for sample in queue_samples],
-            "sample_timestamps": [sample.timestamp for sample in queue_samples],
+            "rollout_param_versions": param_versions,
+            "sample_timestamps": sample_timestamps,
+            "param_version_diversity": len(set(param_versions)),
+            "avg_sample_age": np.mean([time.time() - ts for ts in sample_timestamps]),
         }
 
-        # 创建DataProto对象
-        if tensor_dict or non_tensor_dict:
-            gen_batch_output = DataProto.from_dict(
-                tensors=tensor_dict if tensor_dict else None,
-                non_tensors=non_tensor_dict if non_tensor_dict else None,
-                meta_info=meta_info,
-            )
-        else:
-            # 如果没有数据，创建空的DataProto
-            logger.warning("No tensor or non_tensor data found in samples, creating empty DataProto")
-            gen_batch_output = DataProto.from_dict(meta_info=meta_info)
-
-        logger.info(f"Assembled gen_batch_output with {len(gen_batch_output)} samples")
-        return gen_batch_output
-
-    def _extract_batch_dict_from_sample(self, sample_data, batch_size: int) -> dict:
-        """
-        从样本数据中提取batch_dict信息
-
-        Args:
-            sample_data: 样本数据
-            batch_size: 批次大小
-
-        Returns:
-            dict: batch字典
-        """
-        batch_dict = {}
-
-        # 从样本中提取原始输入信息
-        if hasattr(sample_data, "batch") and sample_data.batch is not None:
-            for key, value in sample_data.batch.items():
-                # 只保留输入相关的key，去掉生成的输出
-                if key in ["input_ids", "attention_mask", "position_ids"]:
-                    # 由于我们有多个响应，需要取出原始prompt部分
-                    batch_dict[key] = value[:batch_size] if len(value) >= batch_size else value
-
-        if hasattr(sample_data, "non_tensor_batch") and sample_data.non_tensor_batch:
-            for key, value in sample_data.non_tensor_batch.items():
-                # 保留非tensor的批次数据
-                if key in ["raw_prompt_ids", "raw_prompt", "multi_modal_data", "tools_kwargs", "interaction_kwargs"]:
-                    batch_dict[key] = np.array(value[:batch_size]) if len(value) >= batch_size else np.array(value)
+        print(meta_info)
 
-        return batch_dict
+        return batch
 
     def _create_actor_rollout_classes(self):
         # create actor
@@ -377,10 +309,6 @@ def fit(self):
         # 使用队列模式，不需要传统的dataloader迭代器
         # 初始化获取第一批数据
         while True:
-            epoch, batch, gen_batch_output = self._get_samples_from_queue()
-            if gen_batch_output is None:
-                break
-
             metrics = {}
             timing_raw = {}
 
@@ -394,7 +322,41 @@ def fit(self):
             is_last_step = self.global_steps >= self.total_training_steps
 
             with marked_timer("step", timing_raw):
-                batch = self._post_generate_batch(batch, gen_batch_output, metrics)
+                with marked_timer("gen", timing_raw, color="red"):
+                    epoch, batch = self._get_samples_from_queue()
+                    if batch is None:
+                        break
+
+                # 更新统计信息
+                with self.lock:
+                    self.processed_samples += len(batch) if isinstance(batch, list) else 1
+
+                    # 从meta_info中获取参数版本信息
+                    if hasattr(batch, "meta_info") and batch.meta_info:
+                        rollout_param_versions = batch.meta_info.get("rollout_param_versions", [])
+                        if rollout_param_versions:
+                            # 统计陈旧样本
+                            stale_count = sum(1 for v in rollout_param_versions if self.current_param_version - v > 1)
+                            self.stale_samples_processed += stale_count
+
+                        # 添加新鲜度指标到metrics
+                        if rollout_param_versions:
+                            param_version_diversity = batch.meta_info.get("param_version_diversity", 0)
+                            avg_sample_age = batch.meta_info.get("avg_sample_age", 0)
+
+                            metrics.update(
+                                {
+                                    "freshness/param_version_diversity": param_version_diversity,
+                                    "freshness/avg_sample_age": avg_sample_age,
+                                    "freshness/stale_samples_ratio": stale_count / len(rollout_param_versions)
+                                    if rollout_param_versions
+                                    else 0,
+                                    "statistics/processed_samples": self.processed_samples,
+                                    "statistics/stale_samples_processed": self.stale_samples_processed,
+                                    "statistics/current_param_version": self.current_param_version,
+                                }
+                            )
+
                 batch, reward_extra_infos_dict = self._process_batch_common(batch, metrics, timing_raw)
                 self._log_rollout(batch, reward_extra_infos_dict, timing_raw)
                 last_val_metrics = self._validate_metrics(is_last_step, last_val_metrics, metrics, timing_raw)
@@ -430,17 +392,80 @@ def get_statistics(self) -> dict:
             "queue_dropped_samples": queue_stats.get("dropped_samples", 0),
         }
 
+    def update_param_version(self, param_version: int) -> bool:
+        """
+        更新trainer的参数版本，用于跟踪与rollouter的参数同步状态
+
+        Args:
+            param_version: 新的参数版本号
+
+        Returns:
+            bool: 是否成功更新
+        """
+        try:
+            with self.lock:
+                old_version = self.current_param_version
+                self.current_param_version = param_version
+                self.param_sync_count += 1
+
+                # 更新消息队列的参数版本
+                if self.message_queue_client:
+                    self.message_queue_client.update_param_version(param_version)
+
+                logger.info(f"Updated trainer param version from {old_version} to {param_version}")
+                return True
+        except Exception as e:
+            logger.error(f"Error updating param version: {e}")
+            return False
+
     def _compute_sample_freshness_metrics(self, batch_samples: list[QueueSample]) -> dict:
-        """计算样本新鲜度指标"""
-        sample_ages = [self.current_param_version - sample.param_version for sample in batch_samples]
-        current_time = time.time()
-        sample_latencies = [current_time - sample.timestamp for sample in batch_samples]
+        """
+        计算样本新鲜度指标
 
-        return {
-            "freshness/avg_sample_age": np.mean(sample_ages),
-            "freshness/max_sample_age": max(sample_ages),
-            "freshness/min_sample_age": min(sample_ages),
-            "freshness/avg_sample_latency": np.mean(sample_latencies),
-            "freshness/max_sample_latency": max(sample_latencies),
-            "freshness/stale_samples_ratio": sum(1 for age in sample_ages if age > 1) / len(sample_ages),
-        }
+        Args:
+            batch_samples: 队列样本列表
+
+        Returns:
+            dict: 新鲜度指标字典
+        """
+        if not batch_samples:
+            return {}
+
+        try:
+            # 提取参数版本和时间戳
+            sample_ages = []
+            sample_latencies = []
+            current_time = time.time()
+
+            for sample in batch_samples:
+                # 从rollout_metadata中获取信息
+                if hasattr(sample, "rollout_metadata") and sample.rollout_metadata:
+                    rollout_version = sample.rollout_metadata.get("rollout_param_version", 0)
+                    generation_time = sample.rollout_metadata.get("generation_timestamp", current_time)
+                else:
+                    rollout_version = 0
+                    generation_time = current_time
+
+                age = max(0, self.current_param_version - rollout_version)
+                latency = max(0, current_time - generation_time)
+
+                sample_ages.append(age)
+                sample_latencies.append(latency)
+
+            if not sample_ages:
+                return {}
+
+            return {
+                "freshness/avg_sample_age": np.mean(sample_ages),
+                "freshness/max_sample_age": max(sample_ages),
+                "freshness/min_sample_age": min(sample_ages),
+                "freshness/avg_sample_latency": np.mean(sample_latencies),
+                "freshness/max_sample_latency": max(sample_latencies),
+                "freshness/min_sample_latency": min(sample_latencies),
+                "freshness/stale_samples_ratio": sum(1 for age in sample_ages if age > 1) / len(sample_ages),
+                "freshness/sample_count": len(sample_ages),
+            }
+
+        except Exception as e:
+            logger.error(f"Error computing freshness metrics: {e}")
+            return {"freshness/error": str(e)}
diff --git a/recipe/fully_async_policy/message_queue.py b/recipe/fully_async_policy/message_queue.py
index f4dcd1f522d..ae4ba6c45ad 100644
--- a/recipe/fully_async_policy/message_queue.py
+++ b/recipe/fully_async_policy/message_queue.py
@@ -14,8 +14,6 @@
 
 import logging
 import threading
-import time
-import uuid
 from collections import deque
 from dataclasses import dataclass
 from typing import Any
diff --git a/recipe/fully_async_policy/unittest/protocol_examples.py b/recipe/fully_async_policy/unittest/protocol_examples.py
new file mode 100644
index 00000000000..b695c163c23
--- /dev/null
+++ b/recipe/fully_async_policy/unittest/protocol_examples.py
@@ -0,0 +1,202 @@
+# Copyright 2025 Meituan Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import torch
+
+from verl.protocol import DataProto, DataProtoItem
+
+
+def example_basic_split_merge():
+    """Basic example of splitting DataProto into DataProtoItems and merging back."""
+    print("=== Basic Split and Merge Example ===")
+
+    # Create sample data
+    batch_size = 3
+    seq_len = 5
+
+    # Create tensors
+    input_ids = torch.randint(0, 1000, (batch_size, seq_len))
+    attention_mask = torch.ones(batch_size, seq_len)
+
+    # Create non-tensor data
+    prompts = np.array(["Hello world", "How are you?", "Good morning"], dtype=object)
+    scores = np.array([0.8, 0.9, 0.7], dtype=object)
+
+    # Create DataProto
+    data_proto = DataProto.from_dict(
+        tensors={"input_ids": input_ids, "attention_mask": attention_mask},
+        non_tensors={"prompts": prompts, "scores": scores},
+        meta_info={"model_name": "test_model", "version": "1.0"},
+    )
+
+    print(f"Original DataProto length: {len(data_proto)}")
+    print(f"Input IDs shape: {data_proto.batch['input_ids'].shape}")
+    print(f"Prompts: {data_proto.non_tensor_batch['prompts']}")
+
+    # Split into DataProtoItems
+    items = data_proto.to_items()
+    print(f"\nSplit into {len(items)} items")
+
+    for i, item in enumerate(items):
+        print(f"Item {i}:")
+        print(f"  Input IDs shape: {item.batch['input_ids'].shape}")
+        print(f"  Prompt: {item.non_tensor_batch['prompts']}")
+        print(f"  Score: {item.non_tensor_batch['scores']}")
+
+    # Merge back to DataProto
+    merged_proto = DataProto.from_items(items)
+    print(f"\nMerged DataProto length: {len(merged_proto)}")
+    print(f"Merged Input IDs shape: {merged_proto.batch['input_ids'].shape}")
+    print(f"Merged prompts: {merged_proto.non_tensor_batch['prompts']}")
+
+    # Verify they're identical
+    assert torch.equal(data_proto.batch["input_ids"], merged_proto.batch["input_ids"])
+    assert torch.equal(data_proto.batch["attention_mask"], merged_proto.batch["attention_mask"])
+    assert np.array_equal(data_proto.non_tensor_batch["prompts"], merged_proto.non_tensor_batch["prompts"])
+    assert np.array_equal(data_proto.non_tensor_batch["scores"], merged_proto.non_tensor_batch["scores"])
+
+    print("\n✓ Original and merged DataProto are identical!")
+
+
+def example_item_processing():
+    """Example showing individual item processing before merging."""
+    print("\n=== Individual Item Processing Example ===")
+
+    # Create initial data
+    #    batch_size = 4
+
+    values = torch.tensor([1.0, 2.0, 3.0, 4.0]).unsqueeze(1)  # Shape: (4, 1)
+    labels = np.array(["A", "B", "C", "D"], dtype=object)
+
+    original_proto = DataProto.from_dict(
+        tensors={"values": values}, non_tensors={"labels": labels}, meta_info={"processing_step": 0}
+    )
+
+    print(f"Original values: {original_proto.batch['values'].flatten()}")
+    print(f"Original labels: {original_proto.non_tensor_batch['labels']}")
+
+    # Split and process each item individually
+    items = original_proto.to_items()
+    processed_items = []
+
+    for i, item in enumerate(items):
+        # Process the tensor data (multiply by 2)
+        processed_value = item.batch["values"] * 2
+
+        # Process the non-tensor data (add suffix)
+        processed_label = item.non_tensor_batch["labels"] + f"_processed_{i}"
+
+        # Create new processed item
+        processed_item = DataProtoItem(
+            batch=item.batch.clone(),  # Clone the TensorDict
+            non_tensor_batch=item.non_tensor_batch.copy(),
+            meta_info=item.meta_info.copy(),
+        )
+
+        # Update with processed data
+        processed_item.batch["values"] = processed_value
+        processed_item.non_tensor_batch["labels"] = processed_label
+        processed_item.meta_info["processing_step"] = 1
+
+        processed_items.append(processed_item)
+
+        print(f"Processed item {i}: value={processed_value.item()}, label='{processed_label}'")
+
+    # Merge processed items back
+    processed_proto = DataProto.from_items(processed_items)
+
+    print(f"\nProcessed values: {processed_proto.batch['values'].flatten()}")
+    print(f"Processed labels: {processed_proto.non_tensor_batch['labels']}")
+    print(f"Processing step: {processed_proto.meta_info['processing_step']}")
+
+
+def example_convenience_methods():
+    """Example showing convenience methods."""
+    print("\n=== Convenience Methods Example ===")
+
+    # Create a single DataProtoItem
+    single_tensor = torch.tensor([42]).unsqueeze(0)  # Shape: (1,)
+    single_item = DataProtoItem(
+        batch=None,  # We'll create TensorDict manually
+        non_tensor_batch={"text": "Hello"},
+        meta_info={"source": "manual"},
+    )
+
+    # Create TensorDict manually for the single item
+    from tensordict import TensorDict
+
+    single_item.batch = TensorDict({"data": single_tensor}, batch_size=(1,))
+
+    print(f"Single item data: {single_item.batch['data']}")
+    print(f"Single item text: {single_item.non_tensor_batch['text']}")
+
+    # Convert single item to DataProto using convenience method
+    single_proto = single_item.to_proto()
+    print(f"Converted to DataProto length: {len(single_proto)}")
+
+    # Create multiple items and use static convenience method
+    items = [single_item]
+    for i in range(2):
+        new_item = single_item.copy()  # Use the copy method
+        new_item.batch["data"] = torch.tensor([100 + i]).unsqueeze(0)
+        new_item.non_tensor_batch["text"] = f"Item {i + 1}"
+        items.append(new_item)
+
+    # Use DataProtoItem.from_items() convenience method
+    merged_proto = DataProtoItem.from_items(items)
+    print(f"Merged using convenience method - length: {len(merged_proto)}")
+    print(f"Data: {merged_proto.batch['data'].flatten()}")
+    print(f"Texts: {merged_proto.non_tensor_batch['text']}")
+
+
+def example_error_handling():
+    """Example showing error handling."""
+    print("\n=== Error Handling Example ===")
+
+    # Try to create DataProto from empty list
+    try:
+        DataProto.from_items([])
+        print("ERROR: Should have raised exception for empty list")
+    except ValueError as e:
+        print(f"✓ Correctly caught error for empty list: {e}")
+
+    # Try to merge items with inconsistent structure
+    try:
+        item1 = DataProtoItem(
+            batch=TensorDict({"data": torch.tensor([1]).unsqueeze(0)}, batch_size=(1,)),
+            non_tensor_batch={"text": "Hello"},
+        )
+        item2 = DataProtoItem(
+            batch=TensorDict({"different_key": torch.tensor([2]).unsqueeze(0)}, batch_size=(1,)),
+            non_tensor_batch={"text": "World"},
+        )
+
+        DataProto.from_items([item1, item2])
+        print("ERROR: Should have raised exception for inconsistent structure")
+    except ValueError as e:
+        print(f"✓ Correctly caught error for inconsistent structure: {e}")
+
+
+if __name__ == "__main__":
+    # Import tensordict for the examples
+    from tensordict import TensorDict
+
+    # Run all examples
+    example_basic_split_merge()
+    example_item_processing()
+    example_convenience_methods()
+    example_error_handling()
+
+    print("\n🎉 All examples completed successfully!")
diff --git a/recipe/fully_async_policy/unittest/test_protocol_split_merge.py b/recipe/fully_async_policy/unittest/test_protocol_split_merge.py
new file mode 100644
index 00000000000..7c959a791bb
--- /dev/null
+++ b/recipe/fully_async_policy/unittest/test_protocol_split_merge.py
@@ -0,0 +1,466 @@
+# Copyright 2025 Meituan Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import torch
+from tensordict import TensorDict
+
+from verl.protocol import DataProto
+
+
+def create_sample_dataproto():
+    """Create a DataProto similar to the provided example."""
+
+    # Create tensor data similar to the example
+    batch_size = 12
+
+    # Tensor data
+    attention_mask = torch.ones(batch_size, 3072, dtype=torch.int64)
+    input_ids = torch.randint(0, 32000, (batch_size, 3072), dtype=torch.int64)
+    position_ids = torch.arange(3072).unsqueeze(0).repeat(batch_size, 1).long()
+    prompts = torch.randint(0, 32000, (batch_size, 1024), dtype=torch.int64)
+    response_mask = torch.ones(batch_size, 2048, dtype=torch.int64)
+    responses = torch.randint(0, 32000, (batch_size, 2048), dtype=torch.int64)
+
+    # Non-tensor data similar to the example
+    data_source = np.array(["openai/gsm8k"] * batch_size, dtype=object)
+    ability = np.array(["math"] * batch_size, dtype=object)
+
+    reward_model = np.array(
+        [
+            {"ground_truth": "6", "style": "rule"},
+            {"ground_truth": "6", "style": "rule"},
+            {"ground_truth": "220000", "style": "rule"},
+            {"ground_truth": "277", "style": "rule"},
+            {"ground_truth": "277", "style": "rule"},
+            {"ground_truth": "35", "style": "rule"},
+            {"ground_truth": "6", "style": "rule"},
+            {"ground_truth": "220000", "style": "rule"},
+            {"ground_truth": "220000", "style": "rule"},
+            {"ground_truth": "277", "style": "rule"},
+            {"ground_truth": "35", "style": "rule"},
+            {"ground_truth": "35", "style": "rule"},
+        ],
+        dtype=object,
+    )
+
+    extra_info = np.array(
+        [
+            {"answer": "Answer 1", "index": 4570, "question": "Question 1", "split": "train"},
+            {"answer": "Answer 1", "index": 4570, "question": "Question 1", "split": "train"},
+            {"answer": "Answer 2", "index": 460, "question": "Question 2", "split": "train"},
+            {"answer": "Answer 3", "index": 6613, "question": "Question 3", "split": "train"},
+            {"answer": "Answer 3", "index": 6613, "question": "Question 3", "split": "train"},
+            {"answer": "Answer 4", "index": 1421, "question": "Question 4", "split": "train"},
+            {"answer": "Answer 1", "index": 4570, "question": "Question 1", "split": "train"},
+            {"answer": "Answer 2", "index": 460, "question": "Question 2", "split": "train"},
+            {"answer": "Answer 2", "index": 460, "question": "Question 2", "split": "train"},
+            {"answer": "Answer 3", "index": 6613, "question": "Question 3", "split": "train"},
+            {"answer": "Answer 4", "index": 1421, "question": "Question 4", "split": "train"},
+            {"answer": "Answer 4", "index": 1421, "question": "Question 4", "split": "train"},
+        ],
+        dtype=object,
+    )
+
+    uid = np.array(
+        [
+            "80ae1835-a8db-4faa-8b42-2ffa2ca63f28",
+            "80ae1835-a8db-4faa-8b42-2ffa2ca63f28",
+            "cc529271-c2ba-4fe1-a16e-50c5f090538d",
+            "237ea082-350f-4193-b9a2-3a153a3a38b9",
+            "237ea082-350f-4193-b9a2-3a153a3a38b9",
+            "fab3e910-67b3-4653-bc69-377250049267",
+            "80ae1835-a8db-4faa-8b42-2ffa2ca63f28",
+            "cc529271-c2ba-4fe1-a16e-50c5f090538d",
+            "cc529271-c2ba-4fe1-a16e-50c5f090538d",
+            "237ea082-350f-4193-b9a2-3a153a3a38b9",
+            "fab3e910-67b3-4653-bc69-377250049267",
+            "fab3e910-67b3-4653-bc69-377250049267",
+        ],
+        dtype=object,
+    )
+
+    tools_kwargs = np.array([{}] * batch_size, dtype=object)
+    interaction_kwargs = np.array([{}] * batch_size, dtype=object)
+    index = np.array([4570, 4570, 460, 6613, 6613, 1421, 4570, 460, 460, 6613, 1421, 1421], dtype=object)
+
+    # Create DataProto
+    data_proto = DataProto.from_dict(
+        tensors={
+            "attention_mask": attention_mask,
+            "input_ids": input_ids,
+            "position_ids": position_ids,
+            "prompts": prompts,
+            "response_mask": response_mask,
+            "responses": responses,
+        },
+        non_tensors={
+            "data_source": data_source,
+            "ability": ability,
+            "reward_model": reward_model,
+            "extra_info": extra_info,
+            "uid": uid,
+            "tools_kwargs": tools_kwargs,
+            "interaction_kwargs": interaction_kwargs,
+            "index": index,
+        },
+        meta_info={"global_token_num": [2141, 2141, 2161, 2151, 2151, 2130, 2141, 2161, 2161, 2151, 2130, 2130]},
+    )
+
+    return data_proto
+
+
+def test_basic_split_and_merge():
+    """Test basic split and merge functionality."""
+    print("=== Testing Basic Split and Merge ===")
+
+    # Create sample data
+    original_proto = create_sample_dataproto()
+    original_length = len(original_proto)
+
+    print(f"Original DataProto length: {original_length}")
+    print(f"Original tensor keys: {list(original_proto.batch.keys())}")
+    print(f"Original non_tensor keys: {list(original_proto.non_tensor_batch.keys())}")
+
+    # Test split
+    items = original_proto.to_items()
+
+    print(f"Split into {len(items)} items")
+    assert len(items) == original_length, f"Expected {original_length} items, got {len(items)}"
+
+    # Verify individual items
+    for i, item in enumerate(items):
+        print(f"Item {i}: batch_size={item.batch.batch_size}, non_tensor keys={list(item.non_tensor_batch.keys())}")
+
+        # Check that tensor shapes are correct (no batch dimension)
+        assert item.batch.batch_size == torch.Size([]), (
+            f"Item {i} should have empty batch_size, got {item.batch.batch_size}"
+        )
+
+        # Check tensor shapes
+        assert item.batch["attention_mask"].shape == torch.Size([3072]), (
+            f"Unexpected attention_mask shape: {item.batch['attention_mask'].shape}"
+        )
+        assert item.batch["input_ids"].shape == torch.Size([3072]), (
+            f"Unexpected input_ids shape: {item.batch['input_ids'].shape}"
+        )
+        assert item.batch["prompts"].shape == torch.Size([1024]), (
+            f"Unexpected prompts shape: {item.batch['prompts'].shape}"
+        )
+
+        # Check non-tensor data types
+        assert isinstance(item.non_tensor_batch["data_source"], str), (
+            f"data_source should be str, got {type(item.non_tensor_batch['data_source'])}"
+        )
+        assert isinstance(item.non_tensor_batch["reward_model"], dict), (
+            f"reward_model should be dict, got {type(item.non_tensor_batch['reward_model'])}"
+        )
+        assert isinstance(item.non_tensor_batch["extra_info"], dict), (
+            f"extra_info should be dict, got {type(item.non_tensor_batch['extra_info'])}"
+        )
+
+    # Test merge
+    merged_proto = DataProto.from_items(items)
+
+    print(f"Merged DataProto length: {len(merged_proto)}")
+    assert len(merged_proto) == original_length, f"Merged length should be {original_length}, got {len(merged_proto)}"
+
+    # Verify tensor data consistency
+    for key in original_proto.batch.keys():
+        original_tensor = original_proto.batch[key]
+        merged_tensor = merged_proto.batch[key]
+
+        assert original_tensor.shape == merged_tensor.shape, (
+            f"Shape mismatch for {key}: {original_tensor.shape} vs {merged_tensor.shape}"
+        )
+        assert torch.equal(original_tensor, merged_tensor), f"Tensor data mismatch for {key}"
+
+    # Verify non-tensor data consistency
+    for key in original_proto.non_tensor_batch.keys():
+        original_array = original_proto.non_tensor_batch[key]
+        merged_array = merged_proto.non_tensor_batch[key]
+
+        assert original_array.shape == merged_array.shape, (
+            f"Shape mismatch for {key}: {original_array.shape} vs {merged_array.shape}"
+        )
+        assert np.array_equal(original_array, merged_array), f"Non-tensor data mismatch for {key}"
+
+    # Verify meta_info consistency
+    assert original_proto.meta_info == merged_proto.meta_info, "Meta info mismatch"
+
+    print("✓ Basic split and merge test passed!")
+
+
+def test_individual_item_access():
+    """Test accessing individual items matches split results."""
+    print("\n=== Testing Individual Item Access ===")
+
+    original_proto = create_sample_dataproto()
+    items = original_proto.to_items()
+
+    # Compare direct indexing with split results
+    for i in range(len(original_proto)):
+        direct_item = original_proto[i]
+        split_item = items[i]
+
+        # Check tensor data
+        for key in original_proto.batch.keys():
+            assert torch.equal(direct_item.batch[key], split_item.batch[key]), (
+                f"Tensor mismatch at index {i}, key {key}"
+            )
+
+        # Check non-tensor data
+        for key in original_proto.non_tensor_batch.keys():
+            if isinstance(direct_item.non_tensor_batch[key], np.ndarray):
+                assert np.array_equal(direct_item.non_tensor_batch[key], split_item.non_tensor_batch[key]), (
+                    f"Non-tensor mismatch at index {i}, key {key}"
+                )
+            else:
+                assert direct_item.non_tensor_batch[key] == split_item.non_tensor_batch[key], (
+                    f"Non-tensor mismatch at index {i}, key {key}"
+                )
+
+    print("✓ Individual item access test passed!")
+
+
+def test_partial_merge():
+    """Test merging a subset of items."""
+    print("\n=== Testing Partial Merge ===")
+
+    original_proto = create_sample_dataproto()
+    items = original_proto.to_items()
+
+    # Take a subset of items
+    subset_indices = [0, 2, 4, 7, 9]
+    subset_items = [items[i] for i in subset_indices]
+
+    # Merge the subset
+    subset_proto = DataProto.from_items(subset_items)
+
+    assert len(subset_proto) == len(subset_indices), (
+        f"Subset length should be {len(subset_indices)}, got {len(subset_proto)}"
+    )
+
+    # Verify the subset contains correct data
+    for i, original_idx in enumerate(subset_indices):
+        # Compare with original data at original_idx
+        for key in original_proto.batch.keys():
+            expected_tensor = original_proto.batch[key][original_idx]
+            actual_tensor = subset_proto.batch[key][i]
+            assert torch.equal(expected_tensor, actual_tensor), f"Subset tensor mismatch at {i}, key {key}"
+
+        for key in original_proto.non_tensor_batch.keys():
+            expected_value = original_proto.non_tensor_batch[key][original_idx]
+            actual_value = subset_proto.non_tensor_batch[key][i]
+
+            if isinstance(expected_value, np.ndarray):
+                assert np.array_equal(expected_value, actual_value), f"Subset non-tensor mismatch at {i}, key {key}"
+            else:
+                assert expected_value == actual_value, f"Subset non-tensor mismatch at {i}, key {key}"
+
+    print("✓ Partial merge test passed!")
+
+
+def test_item_processing():
+    """Test processing individual items before merging."""
+    print("\n=== Testing Item Processing ===")
+
+    original_proto = create_sample_dataproto()
+    items = original_proto.to_items()
+
+    # Process each item (e.g., add a prefix to uid)
+    processed_items = []
+    for i, item in enumerate(items):
+        processed_item = item.copy()  # Create a copy to avoid modifying original
+
+        # Modify some data
+        processed_item.non_tensor_batch["uid"] = f"processed_{i}_{processed_item.non_tensor_batch['uid']}"
+        processed_item.non_tensor_batch["processing_step"] = i
+        processed_item.meta_info["processed"] = True
+
+        processed_items.append(processed_item)
+
+    # Merge processed items
+    processed_proto = DataProto.from_items(processed_items)
+
+    # Verify processing was applied
+    for i in range(len(processed_proto)):
+        expected_uid = f"processed_{i}_{items[i].non_tensor_batch['uid']}"
+        actual_uid = processed_proto.non_tensor_batch["uid"][i]
+        assert actual_uid == expected_uid, (
+            f"Processing failed for uid at {i}: expected {expected_uid}, got {actual_uid}"
+        )
+
+        expected_step = i
+        actual_step = processed_proto.non_tensor_batch["processing_step"][i]
+        assert actual_step == expected_step, (
+            f"Processing step mismatch at {i}: expected {expected_step}, got {actual_step}"
+        )
+
+    #    assert processed_proto.meta_info.get("processed") == True, "Meta info processing failed"
+
+    print("✓ Item processing test passed!")
+
+
+def test_error_conditions():
+    """Test error conditions."""
+    print("\n=== Testing Error Conditions ===")
+
+    # Test empty list
+    try:
+        DataProto.from_items([])
+    except ValueError as e:
+        print(f"✓ Correctly caught empty list error: {e}")
+
+    # Test inconsistent structure
+    try:
+        # Create items with different tensor keys
+        original_proto = create_sample_dataproto()
+        items = original_proto.to_items()
+
+        # Modify one item to have different keys
+        modified_item = items[1].copy()
+        modified_item.batch = TensorDict({"different_key": torch.randn(3072)}, batch_size=torch.Size([]))
+
+        inconsistent_items = [items[0], modified_item]
+        DataProto.from_items(inconsistent_items)
+    except ValueError as e:
+        print(f"✓ Correctly caught inconsistent structure error: {e}")
+
+    print("✓ Error conditions test passed!")
+
+
+def test_roundtrip_integrity():
+    """Test multiple split/merge cycles maintain data integrity."""
+    print("\n=== Testing Roundtrip Integrity ===")
+
+    original_proto = create_sample_dataproto()
+    current_proto = original_proto
+
+    # Perform multiple split/merge cycles
+    for cycle in range(3):
+        print(f"Cycle {cycle + 1}")
+
+        # Split
+        items = current_proto.to_items()
+
+        # Merge
+        current_proto = DataProto.from_items(items)
+
+        # Verify integrity
+        assert len(current_proto) == len(original_proto), f"Length changed in cycle {cycle + 1}"
+
+        for key in original_proto.batch.keys():
+            assert torch.equal(original_proto.batch[key], current_proto.batch[key]), (
+                f"Tensor {key} changed in cycle {cycle + 1}"
+            )
+
+        for key in original_proto.non_tensor_batch.keys():
+            assert np.array_equal(original_proto.non_tensor_batch[key], current_proto.non_tensor_batch[key]), (
+                f"Non-tensor {key} changed in cycle {cycle + 1}"
+            )
+
+        assert original_proto.meta_info == current_proto.meta_info, f"Meta info changed in cycle {cycle + 1}"
+
+    print("✓ Roundtrip integrity test passed!")
+
+
+def run_visual_comparison():
+    """Run a visual comparison similar to the user's example."""
+    print("\n=== Visual Comparison (Like User Example) ===")
+
+    original_proto = create_sample_dataproto()
+
+    print("Original DataProto:")
+    print(f"batch_size: {original_proto.batch.batch_size}")
+    print(f"tensor keys: {list(original_proto.batch.keys())}")
+    print(f"non_tensor keys: {list(original_proto.non_tensor_batch.keys())}")
+    print(f"Sample data_source: {original_proto.non_tensor_batch['data_source'][:3]}")
+    print(f"Sample uid: {original_proto.non_tensor_batch['uid'][:3]}")
+
+    print("\n" + "=" * 50)
+    print("============= SPLIT =============")
+    print("=" * 50)
+
+    items = original_proto.to_items()
+
+    # Show first few items
+    for i in range(min(3, len(items))):
+        print(f"\nDataProtoItem {i}:")
+        print(f"batch_size: {items[i].batch.batch_size}")
+        print(f"attention_mask shape: {items[i].batch['attention_mask'].shape}")
+        print(f"input_ids shape: {items[i].batch['input_ids'].shape}")
+        print(f"data_source: {items[i].non_tensor_batch['data_source']}")
+        print(f"uid: {items[i].non_tensor_batch['uid']}")
+        print(f"reward_model: {items[i].non_tensor_batch['reward_model']}")
+        print("-" * 30)
+
+    print("\n" + "=" * 50)
+    print("============= MERGE =============")
+    print("=" * 50)
+
+    merged_proto = DataProto.from_items(items)
+
+    print("Merged DataProto:")
+    print(f"batch_size: {merged_proto.batch.batch_size}")
+    print(f"tensor keys: {list(merged_proto.batch.keys())}")
+    print(f"non_tensor keys: {list(merged_proto.non_tensor_batch.keys())}")
+    print(f"Sample data_source: {merged_proto.non_tensor_batch['data_source'][:3]}")
+    print(f"Sample uid: {merged_proto.non_tensor_batch['uid'][:3]}")
+
+    # Verify they're identical
+    success = True
+    try:
+        for key in original_proto.batch.keys():
+            assert torch.equal(original_proto.batch[key], merged_proto.batch[key])
+        for key in original_proto.non_tensor_batch.keys():
+            assert np.array_equal(original_proto.non_tensor_batch[key], merged_proto.non_tensor_batch[key])
+        assert original_proto.meta_info == merged_proto.meta_info
+        print("\n✓ Original and merged DataProto are identical!")
+    except Exception as e:
+        print(f"\n✗ Verification failed: {e}")
+        success = False
+
+    return success
+
+
+if __name__ == "__main__":
+    print("Testing DataProto Split/Merge Functionality")
+    print("=" * 60)
+
+    try:
+        # Run all tests
+        test_basic_split_and_merge()
+        test_individual_item_access()
+        test_partial_merge()
+        test_item_processing()
+        test_error_conditions()
+        test_roundtrip_integrity()
+
+        # Run visual comparison
+        visual_success = run_visual_comparison()
+
+        if visual_success:
+            print("\n" + "=" * 60)
+            print("🎉 ALL TESTS PASSED!")
+            print("DataProto split/merge functionality is working correctly.")
+        else:
+            print("\n" + "=" * 60)
+            print("❌ SOME TESTS FAILED!")
+
+    except Exception as e:
+        print(f"\n❌ Test failed with exception: {e}")
+        import traceback
+
+        traceback.print_exc()
diff --git a/verl/protocol.py b/verl/protocol.py
index a4d394af97d..17b3b10c1f6 100644
--- a/verl/protocol.py
+++ b/verl/protocol.py
@@ -38,7 +38,7 @@
 from verl.utils.py_functional import union_two_dict
 from verl.utils.torch_functional import allgather_dict_tensors
 
-__all__ = ["DataProto", "union_tensor_dict"]
+__all__ = ["DataProto", "DataProtoItem", "union_tensor_dict"]
 
 with contextlib.suppress(Exception):
     tensordict.set_lazy_legacy(False).set()
@@ -198,11 +198,83 @@ def collate_fn(x: list["DataProtoItem"]):
 
 @dataclass
 class DataProtoItem:
-    # TODO(zhangchi.usc1992) add consistency check
+    """
+    A single item from a DataProto batch, representing one sample.
+    This is typically used when accessing individual elements from a DataProto.
+    """
+
     batch: TensorDict = None
     non_tensor_batch: dict = field(default_factory=dict)
     meta_info: dict = field(default_factory=dict)
 
+    def __post_init__(self):
+        """Perform consistency checking after initialization."""
+        self._check_consistency()
+
+    def _check_consistency(self):
+        """Check the consistency of the DataProtoItem."""
+        # For DataProtoItem, batch can have no batch dimension (batch_size=[]) or batch size 1
+        if self.batch is not None:
+            # Allow both cases: tensors without batch dim (batch_size=[]) and tensors with batch size 1
+            if hasattr(self.batch, "batch_size") and len(self.batch.batch_size) > 0:
+                if self.batch.batch_size[0] > 1:
+                    raise ValueError(
+                        f"DataProtoItem batch should have batch size 0 or 1, got {self.batch.batch_size[0]}"
+                    )
+
+        # Check non_tensor_batch consistency
+        if self.non_tensor_batch:
+            for key, val in self.non_tensor_batch.items():
+                # For DataProtoItem, non_tensor values should be individual items, not arrays
+                if isinstance(val, np.ndarray) and val.shape != ():
+                    # Allow only scalar numpy arrays (shape=()) for individual items
+                    if val.shape[0] > 1:
+                        raise ValueError(
+                            f"DataProtoItem non_tensor_batch['{key}']"
+                            "should be a single item, got array with shape {val.shape}"
+                        )
+
+    def to_proto(self) -> "DataProto":
+        """Convert this DataProtoItem to a DataProto with batch size 1.
+
+        Returns:
+            DataProto: A DataProto containing this single item
+        """
+        return DataProto.from_items([self])
+
+    @staticmethod
+    def from_items(items: list["DataProtoItem"]) -> "DataProto":
+        """Create a DataProto from a list of DataProtoItem objects.
+
+        This is a convenience method that calls DataProto.from_items().
+
+        Args:
+            items (List[DataProtoItem]): A list of DataProtoItem objects to merge
+
+        Returns:
+            DataProto: A new DataProto containing all the items as a batch
+        """
+        return DataProto.from_items(items)
+
+    def copy(self) -> "DataProtoItem":
+        """Create a deep copy of this DataProtoItem.
+
+        Returns:
+            DataProtoItem: A deep copy of this item
+        """
+        import copy
+
+        # Deep copy the batch TensorDict
+        batch_copy = copy.deepcopy(self.batch) if self.batch is not None else None
+
+        # Deep copy non_tensor_batch
+        non_tensor_copy = copy.deepcopy(self.non_tensor_batch)
+
+        # Deep copy meta_info
+        meta_info_copy = copy.deepcopy(self.meta_info)
+
+        return DataProtoItem(batch=batch_copy, non_tensor_batch=non_tensor_copy, meta_info=meta_info_copy)
+
 
 @dataclass
 class DataProto:
@@ -738,6 +810,96 @@ def split(self, split_size: int) -> list["DataProto"]:
         """
         return [self[i : i + split_size] for i in range(0, len(self), split_size)]
 
+    def to_items(self) -> list["DataProtoItem"]:
+        """Convert DataProto to a list of DataProtoItem objects.
+
+        Returns:
+            List[DataProtoItem]: A list containing individual DataProtoItem objects,
+                                 one for each sample in the batch
+        """
+        items = []
+        for i in range(len(self)):
+            # Use the existing __getitem__ implementation for single integer access
+            items.append(self[i])
+        return items
+
+    @staticmethod
+    def from_items(items: list["DataProtoItem"]) -> "DataProto":
+        """Create a DataProto from a list of DataProtoItem objects.
+
+        Args:
+            items (List[DataProtoItem]): A list of DataProtoItem objects to merge
+
+        Returns:
+            DataProto: A new DataProto containing all the items as a batch
+
+        Raises:
+            ValueError: If the input list is empty or items have inconsistent structure
+        """
+        if not items:
+            raise ValueError("Cannot create DataProto from empty list of items")
+
+        # Get the first item to determine structure and meta_info
+        first_item = items[0]
+        meta_info = first_item.meta_info
+
+        # Collect all tensor batches
+        batch_tensors = {}
+        non_tensor_batches = {}
+
+        # Process tensor data
+        if first_item.batch is not None:
+            # Get all keys from the first item's batch
+            tensor_keys = list(first_item.batch.keys())
+
+            for key in tensor_keys:
+                tensor_list = []
+                for i, item in enumerate(items):
+                    if item.batch is None or key not in item.batch:
+                        raise ValueError(f"Item {i} missing tensor key '{key}' in batch")
+
+                    tensor = item.batch[key]
+                    # Handle tensors from DataProtoItem which may not have batch dimension
+                    # (as shown in the user's example where batch_size=torch.Size([]))
+                    if tensor.dim() == 0:
+                        # Scalar tensor - add batch dimension
+                        tensor = tensor.unsqueeze(0)
+                    else:
+                        # Multi-dimensional tensor without batch dimension - add batch dimension
+                        tensor = tensor.unsqueeze(0)
+
+                    tensor_list.append(tensor)
+
+                # Concatenate tensors along batch dimension
+                if tensor_list:
+                    batch_tensors[key] = torch.cat(tensor_list, dim=0)
+
+        # Process non-tensor data
+        if first_item.non_tensor_batch:
+            non_tensor_keys = list(first_item.non_tensor_batch.keys())
+
+            for key in non_tensor_keys:
+                non_tensor_list = []
+                for i, item in enumerate(items):
+                    if key not in item.non_tensor_batch:
+                        raise ValueError(f"Item {i} missing non_tensor key '{key}'")
+
+                    non_tensor_data = item.non_tensor_batch[key]
+                    non_tensor_list.append(non_tensor_data)
+
+                # Stack non-tensor data
+                if non_tensor_list:
+                    non_tensor_batches[key] = np.array(non_tensor_list, dtype=object)
+
+        # Create TensorDict for batch
+        if batch_tensors:
+            batch_size = len(items)
+            batch = TensorDict(source=batch_tensors, batch_size=(batch_size,))
+        else:
+            batch = None
+
+        return DataProto(batch=batch, non_tensor_batch=non_tensor_batches, meta_info=meta_info)
+
     @staticmethod
     def concat(data: list["DataProto"]) -> "DataProto":
         """Concat a list of DataProto. The batch is concatenated among dim=0.

From bc6aedd8f0caf8a97b2e7a90547158fef47d8d77 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Wed, 6 Aug 2025 17:44:48 +0800
Subject: [PATCH 027/182] train one step

---
 recipe/fully_async_policy/fully_async_main.py |  12 +-
 .../fully_async_policy/fully_async_trainer.py |  97 ++++---
 recipe/fully_async_policy/message_queue.py    |   8 +-
 recipe/fully_async_policy/unittest/test_mq.py | 242 +++++++++++++++++-
 .../fully_async_policy/unittest/test_mq2.py   | 171 +++++++++++++
 tests/special_e2e/run_fully_async_policy.sh   |   6 +-
 verl/trainer/ppo/ray_trainer.py               |   9 +
 7 files changed, 496 insertions(+), 49 deletions(-)
 create mode 100644 recipe/fully_async_policy/unittest/test_mq2.py

diff --git a/recipe/fully_async_policy/fully_async_main.py b/recipe/fully_async_policy/fully_async_main.py
index 404ffba4874..888c6c73594 100644
--- a/recipe/fully_async_policy/fully_async_main.py
+++ b/recipe/fully_async_policy/fully_async_main.py
@@ -241,7 +241,7 @@ def _initialize_components(self, config) -> None:
 
         # 创建Trainer
         print("Creating FullyAsyncTrainer...")
-        # self._create_trainer(config)
+        self._create_trainer(config)
 
         # 设置参数同步
         # print("Setting up parameter synchronization...")
@@ -311,11 +311,15 @@ def _run_training_loop(self):
 
         print("Starting Rollouter in background...")
         rollouter_future = self.components["rollouter"].fit.remote()
-        # trainer_future = self.components["trainer"].fit.remote()
+        trainer_future = self.components["trainer"].fit.remote()
         # self._monitor_components()
-        ray.get(rollouter_future)
-        # ray.get(trainer_future)
 
+        print("Starting Trainer...")
+        time.sleep(10)
+        print("Starting Trainer...")
+
+        ray.get(rollouter_future)
+        ray.get(trainer_future)
         self.components["message_queue_client"].clear_queue()
 
         print("Training completed or interrupted")
diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py
index 5d69e9091ba..9830aef595e 100644
--- a/recipe/fully_async_policy/fully_async_trainer.py
+++ b/recipe/fully_async_policy/fully_async_trainer.py
@@ -152,22 +152,31 @@ def _get_samples_from_queue(self) -> tuple[None, None] | tuple[int, Any]:
         batch_size = self.config.data.train_batch_size
         required_samples = n_responses_per_prompt * batch_size
 
-        logger.info(
-            f"Requesting {required_samples} samples from queue (n={n_responses_per_prompt}, batch_size={batch_size})"
+        print(
+            f"Requesting {required_samples} samples from queue (n={n_responses_per_prompt}, batch_size={batch_size})",
+            flush=True,
         )
 
         # 从队列获取样本
+        consumer_start = time.time()
         queue_samples = self.message_queue_client.get_samples(min_batch_count=required_samples)
+        consumer_end = time.time()
 
         if not queue_samples or len(queue_samples) == 0:
             logger.warning("required_samples is empty")
             return None, None
 
-        logger.info(f"Retrieved {len(queue_samples)} samples from queue")
+        print(f"Retrieved {len(queue_samples)} samples from queue. wait time {consumer_end - consumer_start}")
+
+        queue_samples = [ray.cloudpickle.loads(x) for x in queue_samples]
+        print(queue_samples)
 
         # 组装 batch
         batch = self._assemble_gen_batch_output_from_queue_samples(queue_samples)
 
+        print("=" * 200)
+        print(batch)
+
         return 0, batch
 
     def _assemble_gen_batch_output_from_queue_samples(self, queue_samples: list[QueueSample]):
@@ -189,7 +198,7 @@ def _assemble_gen_batch_output_from_queue_samples(self, queue_samples: list[Queu
         if not queue_samples:
             raise ValueError("Empty queue_samples provided for batch assembly")
 
-        logger.debug(f"Assembling batch from {len(queue_samples)} queue samples")
+        print(f"Assembling batch from {len(queue_samples)} queue samples")
 
         # 提取所有样本的数据和元数据
         sample_data_list = []
@@ -271,6 +280,8 @@ def fit(self):
         The light-weight advantage computation is done on the driver process.
         """
 
+        print("FullyAsyncTrainer run")
+
         if self.message_queue_client is None:
             raise ValueError("MessageQueue client not set. Call set_message_queue_client() first.")
 
@@ -294,10 +305,13 @@ def fit(self):
             val_metrics = self._validate()
             assert val_metrics, f"{val_metrics=}"
             pprint(f"Initial validation metrics: {val_metrics}")
-            logger.log(data=val_metrics, step=self.global_steps)
+            print(data=val_metrics, step=self.global_steps)
             if self.config.trainer.get("val_only", False):
                 return
 
+        self.total_training_steps = self.config.trainer.total_training_steps
+
+        print(f"Total training steps: {self.total_training_steps}")
         # add tqdm
         progress_bar = tqdm(total=self.total_training_steps, initial=self.global_steps, desc="Training Progress")
 
@@ -309,6 +323,7 @@ def fit(self):
         # 使用队列模式，不需要传统的dataloader迭代器
         # 初始化获取第一批数据
         while True:
+            print("while True", flush=True)
             metrics = {}
             timing_raw = {}
 
@@ -327,47 +342,55 @@ def fit(self):
                     if batch is None:
                         break
 
-                # 更新统计信息
-                with self.lock:
-                    self.processed_samples += len(batch) if isinstance(batch, list) else 1
-
-                    # 从meta_info中获取参数版本信息
-                    if hasattr(batch, "meta_info") and batch.meta_info:
-                        rollout_param_versions = batch.meta_info.get("rollout_param_versions", [])
-                        if rollout_param_versions:
-                            # 统计陈旧样本
-                            stale_count = sum(1 for v in rollout_param_versions if self.current_param_version - v > 1)
-                            self.stale_samples_processed += stale_count
-
-                        # 添加新鲜度指标到metrics
-                        if rollout_param_versions:
-                            param_version_diversity = batch.meta_info.get("param_version_diversity", 0)
-                            avg_sample_age = batch.meta_info.get("avg_sample_age", 0)
-
-                            metrics.update(
-                                {
-                                    "freshness/param_version_diversity": param_version_diversity,
-                                    "freshness/avg_sample_age": avg_sample_age,
-                                    "freshness/stale_samples_ratio": stale_count / len(rollout_param_versions)
-                                    if rollout_param_versions
-                                    else 0,
-                                    "statistics/processed_samples": self.processed_samples,
-                                    "statistics/stale_samples_processed": self.stale_samples_processed,
-                                    "statistics/current_param_version": self.current_param_version,
-                                }
-                            )
-
+                print("_get_samples_from_queue end")
+
+                # # 更新统计信息
+                # with self.lock:
+                #     self.processed_samples += len(batch) if isinstance(batch, list) else 1
+                #
+                #     # 从meta_info中获取参数版本信息
+                #     if hasattr(batch, "meta_info") and batch.meta_info:
+                #         rollout_param_versions = batch.meta_info.get("rollout_param_versions", [])
+                #         if rollout_param_versions:
+                #             # 统计陈旧样本
+                #             stale_count = sum(1 for v in rollout_param_versions if self.current_param_version - v > 1)
+                #             self.stale_samples_processed += stale_count
+                #
+                #         # 添加新鲜度指标到metrics
+                #         if rollout_param_versions:
+                #             param_version_diversity = batch.meta_info.get("param_version_diversity", 0)
+                #             avg_sample_age = batch.meta_info.get("avg_sample_age", 0)
+                #
+                #             metrics.update(
+                #                 {
+                #                     "freshness/param_version_diversity": param_version_diversity,
+                #                     "freshness/avg_sample_age": avg_sample_age,
+                #                     "freshness/stale_samples_ratio": stale_count / len(rollout_param_versions)
+                #                     if rollout_param_versions
+                #                     else 0,
+                #                     "statistics/processed_samples": self.processed_samples,
+                #                     "statistics/stale_samples_processed": self.stale_samples_processed,
+                #                     "statistics/current_param_version": self.current_param_version,
+                #                 }
+                #             )
+                print("_process_batch_common")
                 batch, reward_extra_infos_dict = self._process_batch_common(batch, metrics, timing_raw)
+                print("_log_rollout")
                 self._log_rollout(batch, reward_extra_infos_dict, timing_raw)
+                print("_validate_metrics")
                 last_val_metrics = self._validate_metrics(is_last_step, last_val_metrics, metrics, timing_raw)
+                print("_check_save_checkpoint")
                 self._check_save_checkpoint(is_last_step, timing_raw)
 
+            print("_stop_profiling")
             self._stop_profiling(do_profile, timing_raw)
+            print("_collect_metrics")
             self._collect_metrics(batch, epoch, metrics, timing_raw)
+            print("_post_batch_processing")
             self._post_batch_processing(batch)
 
             # TODO: make a canonical logger that supports various backend
-            logger.log(data=metrics, step=self.global_steps)
+            print(data=metrics, step=self.global_steps)
 
             progress_bar.update(1)
             self.global_steps += 1
@@ -412,7 +435,7 @@ def update_param_version(self, param_version: int) -> bool:
                 if self.message_queue_client:
                     self.message_queue_client.update_param_version(param_version)
 
-                logger.info(f"Updated trainer param version from {old_version} to {param_version}")
+                print(f"Updated trainer param version from {old_version} to {param_version}")
                 return True
         except Exception as e:
             logger.error(f"Error updating param version: {e}")
diff --git a/recipe/fully_async_policy/message_queue.py b/recipe/fully_async_policy/message_queue.py
index ae4ba6c45ad..c6116f0c432 100644
--- a/recipe/fully_async_policy/message_queue.py
+++ b/recipe/fully_async_policy/message_queue.py
@@ -93,7 +93,7 @@ def put_sample(self, sample: Any, param_version: int) -> bool:
             if len(self.queue) >= self.max_queue_size:
                 removed = self.queue.popleft()
                 self.dropped_samples += 1
-                logger.warning(f"Queue full, dropped sample {removed.id}")
+                logger.warning(f"Queue full, dropped sample {removed}")
             self.queue.append(sample)
             self.total_produced += 1
 
@@ -105,7 +105,7 @@ def put_sample(self, sample: Any, param_version: int) -> bool:
 
             return True
 
-    def get_samples(self, min_batch_count: int = 1) -> list[QueueSample]:
+    def get_samples(self, min_batch_count: int = 1) -> list[Any]:
         """
         从队列获取batch样本，一直等待直到有足够样本
 
@@ -113,7 +113,7 @@ def get_samples(self, min_batch_count: int = 1) -> list[QueueSample]:
             min_batch_count: sample数量满足min_batch，一次性获取
 
         Returns:
-            List[QueueSample]: 获取的样本列表
+            List[Any]: 获取的样本列表
         """
         with self.lock:
             while len(self.queue) < min_batch_count and self.running:
@@ -212,7 +212,7 @@ def put_sample(self, sample: Any, param_version: int) -> bool:
         """放入batch到队列"""
         return ray.get(self.queue_actor.put_sample.remote(sample, param_version))
 
-    def get_samples(self, min_batch_count: int = 1) -> list[QueueSample]:
+    def get_samples(self, min_batch_count: int = 1) -> list[Any]:
         """从队列获取batch，一直等待直到有足够样本"""
         return ray.get(self.queue_actor.get_samples.remote(min_batch_count))
 
diff --git a/recipe/fully_async_policy/unittest/test_mq.py b/recipe/fully_async_policy/unittest/test_mq.py
index 2fff49d6576..b766c60f858 100644
--- a/recipe/fully_async_policy/unittest/test_mq.py
+++ b/recipe/fully_async_policy/unittest/test_mq.py
@@ -316,7 +316,247 @@ def consumer():
         finally:
             client.shutdown()
 
+    def test_consume_first_produce_later(self, message_queue_client, mock_data_proto):
+        """测试先消费、后生产的场景 - 验证阻塞和唤醒机制"""
+        consumer_result = []
+        producer_result = []
+        start_time = time.time()
+
+        def consumer_task():
+            """消费者任务：先启动，等待生产者生产数据"""
+            try:
+                # 记录开始消费的时间
+                consumer_start = time.time()
+                # 这里会阻塞等待，直到有至少2个样本可用
+                samples = message_queue_client.get_samples(min_batch_count=2)
+                consumer_end = time.time()
+
+                consumer_result.append(
+                    {
+                        "success": True,
+                        "samples_count": len(samples),
+                        "wait_time": consumer_end - consumer_start,
+                        "samples": samples,
+                    }
+                )
+            except Exception as e:
+                consumer_result.append({"success": False, "error": str(e), "wait_time": time.time() - consumer_start})
+
+        def producer_task():
+            """生产者任务：延迟1秒后开始生产"""
+            try:
+                # 延迟1秒，确保消费者先开始等待
+                time.sleep(1.0)
+                producer_start = time.time()
+
+                # 分两次放入，验证消费者会等到足够的样本数量
+                samples_1 = mock_data_proto
+                result1 = message_queue_client.put_sample(
+                    sample=samples_1, param_version=1, rollout_metadata=[{"batch": "first"}]
+                )
+
+                # 短暂延迟后放入第二批
+                time.sleep(0.1)
+                samples_2 = mock_data_proto
+                result2 = message_queue_client.put_sample(
+                    sample=samples_2, param_version=1, rollout_metadata=[{"batch": "second"}]
+                )
+
+                samples_2 = mock_data_proto
+                result3 = message_queue_client.put_sample(
+                    sample=samples_2, param_version=1, rollout_metadata=[{"batch": "second"}]
+                )
+
+                producer_end = time.time()
+                producer_result.append(
+                    {
+                        "success": result1 and result2,
+                        "put_count": 2,
+                        "produce_time": producer_end - producer_start,
+                        "result1": result1,
+                        "result2": result2,
+                    }
+                )
+
+                print("produce finish")
+
+            except Exception as e:
+                producer_result.append({"success": False, "error": str(e)})
+
+        # 启动消费者线程（先启动）
+        consumer_thread = threading.Thread(target=consumer_task, name="Consumer")
+        # 启动生产者线程（后启动）
+        producer_thread = threading.Thread(target=producer_task, name="Producer")
+
+        consumer_thread.start()
+        time.sleep(0.1)  # 确保消费者先开始等待
+        producer_thread.start()
+
+        print("=========")
+        #
+        # # 等待两个线程完成（设置超时避免死锁）
+        producer_thread.join()
+        # print("producer_result", producer_result)
+        # consumer_thread.join()
+        # print("consumer_thread", consumer_result)
+        #
+        # total_time = time.time() - start_time
+        #
+        # # 验证结果
+        # assert len(consumer_result) == 1, "消费者应该执行一次"
+        #
+        # consumer_data = consumer_result[0]
+        # producer_data = producer_result[0]
+        #
+        # # 验证生产者成功
+        # assert producer_data['success'], f"生产者失败: {producer_data.get('error', '')}"
+        # assert producer_data['put_count'] == 2, "应该生产2批数据"
+        #
+        # # 验证消费者成功
+        # assert consumer_data['success'], f"消费者失败: {consumer_data.get('error', '')}"
+        # assert consumer_data['samples_count'] == 2, "消费者应该获取到2个样本"
+        #
+        # # 验证时序：消费者等待时间应该大于1秒（生产者的延迟时间）
+        # assert consumer_data['wait_time'] >= 1.0, f"消费者等待时间应该≥1秒，实际: {consumer_data['wait_time']:.2f}秒"
+        #
+        # # 验证数据完整性
+        # assert all(isinstance(sample, QueueSample) for sample in consumer_data['samples']), "获取的样本应该是QueueSample类型"
+        #
+        # # 验证队列状态
+        # final_queue_size = message_queue_client.get_queue_size()
+        # assert final_queue_size == 0, "队列应该被清空"
+        #
+        # stats = message_queue_client.get_statistics()
+        # assert stats['total_produced'] == 2, "应该生产了2个样本"
+        # assert stats['total_consumed'] == 2, "应该消费了2个样本"
+        #
+        # print(f"测试成功完成，总耗时: {total_time:.2f}秒")
+        # print(f"消费者等待时间: {consumer_data['wait_time']:.2f}秒")
+        # print(f"生产者执行时间: {producer_data['produce_time']:.2f}秒")
+
+    def test_multiple_consumers_single_producer(self, message_queue_client, mock_data_proto):
+        """测试多个消费者等待单个生产者的场景"""
+        consumer_results = []
+        producer_result = []
+
+        def consumer_task(consumer_id):
+            """消费者任务"""
+            try:
+                start_time = time.time()
+                samples = message_queue_client.get_samples(min_batch_count=1)
+                end_time = time.time()
+
+                consumer_results.append(
+                    {
+                        "id": consumer_id,
+                        "success": True,
+                        "samples_count": len(samples),
+                        "wait_time": end_time - start_time,
+                    }
+                )
+            except Exception as e:
+                consumer_results.append({"id": consumer_id, "success": False, "error": str(e)})
+
+        def producer_task():
+            """生产者任务：延迟后批量生产"""
+            try:
+                time.sleep(1.5)  # 确保所有消费者都在等待
+
+                # 生产3批数据，每批1个样本，供3个消费者消费
+                for i in range(3):
+                    samples = [mock_data_proto]
+                    result = message_queue_client.put_sample(
+                        sample=samples, param_version=1, rollout_metadata=[{"batch_id": i}]
+                    )
+                    producer_result.append(result)
+                    time.sleep(0.1)  # 短暂间隔
+
+            except Exception as e:
+                producer_result.append(False)
+
+        print("# 启动3个消费者线程")
+        # consumer_threads = []
+        # for i in range(3):
+        #     thread = threading.Thread(target=consumer_task, args=(i,), name=f"Consumer-{i}")
+        #     consumer_threads.append(thread)
+        #     thread.start()
+        #     time.sleep(0.1)  # 错开启动时间
+        #
+        # # 启动生产者线程
+        # producer_thread = threading.Thread(target=producer_task, name="Producer")
+        # producer_thread.start()
+        #
+        # # 等待所有线程完成
+        # producer_thread.join(timeout=10)
+        # for thread in consumer_threads:
+        #     thread.join(timeout=10)
+        #
+        # # 验证结果
+        # assert len(consumer_results) == 3, "应该有3个消费者结果"
+        # assert len(producer_result) == 3, "应该生产3批数据"
+        #
+        # # 验证所有消费者都成功
+        # for result in consumer_results:
+        #     assert result['success'], f"消费者{result['id']}失败: {result.get('error', '')}"
+        #     assert result['samples_count'] == 1, f"消费者{result['id']}应该获取1个样本"
+        #     assert result['wait_time'] >= 1.5, f"消费者{result['id']}等待时间应该≥1.5秒"
+        #
+        # # 验证生产者都成功
+        # assert all(producer_result), "所有生产操作都应该成功"
+        #
+        # # 验证最终状态
+        # final_stats = message_queue_client.get_statistics()
+        # assert final_stats['total_produced'] == 3, "应该总共生产3个样本"
+        # assert final_stats['total_consumed'] == 3, "应该总共消费3个样本"
+        # assert final_stats['queue_size'] == 0, "队列应该被清空"
+
+    def test_consumer_timeout_scenario(self, message_queue_client, mock_data_proto):
+        """测试消费者超时场景（通过关闭队列来模拟）"""
+        consumer_result = []
+
+        def consumer_task():
+            """消费者任务：等待样本"""
+            try:
+                start_time = time.time()
+                # 尝试获取样本，但没有生产者会生产数据
+                samples = message_queue_client.get_samples(min_batch_count=2)
+                end_time = time.time()
+
+                consumer_result.append(
+                    {"success": True, "samples_count": len(samples), "wait_time": end_time - start_time}
+                )
+            except Exception as e:
+                consumer_result.append({"success": False, "error": str(e)})
+
+        def shutdown_task():
+            """延迟关闭队列，模拟超时场景"""
+            time.sleep(2.0)  # 让消费者等待2秒
+            message_queue_client.shutdown()
+
+        # 启动消费者和关闭任务
+        consumer_thread = threading.Thread(target=consumer_task, name="Consumer")
+        shutdown_thread = threading.Thread(target=shutdown_task, name="Shutdown")
+
+        consumer_thread.start()
+        time.sleep(0.1)
+        shutdown_thread.start()
+
+        # 等待线程完成
+        shutdown_thread.join(timeout=5)
+        consumer_thread.join(timeout=5)
+
+        # 验证结果
+        assert len(consumer_result) == 1, "应该有一个消费者结果"
+
+        result = consumer_result[0]
+        # 消费者应该在队列关闭后返回空列表
+        if result["success"]:
+            assert result["samples_count"] == 0, "关闭后应该返回空样本列表"
+
+        print(f"消费者等待了 {result.get('wait_time', 0):.2f} 秒后退出")
+
+    # 运行测试的示例配置
+
 
-# 运行测试的示例配置
 if __name__ == "__main__":
     pytest.main([__file__, "-v", "--tb=short"])
diff --git a/recipe/fully_async_policy/unittest/test_mq2.py b/recipe/fully_async_policy/unittest/test_mq2.py
new file mode 100644
index 00000000000..d846a16dcb7
--- /dev/null
+++ b/recipe/fully_async_policy/unittest/test_mq2.py
@@ -0,0 +1,171 @@
+# Copyright 2025 Meituan Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import threading
+import time
+from unittest.mock import Mock
+
+import pytest
+import ray
+from omegaconf import DictConfig
+
+from recipe.fully_async_policy.message_queue import MessageQueue, MessageQueueClient, QueueSample
+
+
+@pytest.fixture
+def mock_data_proto():
+    """Mock数据对象"""
+    return Mock()
+
+
+@pytest.fixture
+def basic_config():
+    """基础配置"""
+    return DictConfig({"async_training": {"staleness_threshold": 3}})
+
+
+@pytest.fixture
+def queue_config():
+    """队列配置"""
+    return DictConfig({"async_training": {"staleness_threshold": 2}})
+
+
+@pytest.fixture
+def ray_setup():
+    """设置Ray环境"""
+    if not ray.is_initialized():
+        ray.init(local_mode=True, ignore_reinit_error=True)
+    yield
+    ray.shutdown()
+
+
+@pytest.fixture
+def message_queue_client(ray_setup, basic_config):
+    """创建MessageQueue actor并返回其客户端"""
+    actor = MessageQueue.remote(basic_config, max_queue_size=10)
+    client = MessageQueueClient(actor)
+    yield client
+    client.shutdown()
+
+
+class TestConcurrency:
+    """测试并发场景"""
+
+    def setup_method(self):
+        """每个测试方法前的设置"""
+        if not ray.is_initialized():
+            ray.init()
+
+    def teardown_method(self):
+        """每个测试方法后的清理"""
+        if ray.is_initialized():
+            ray.shutdown()
+
+    def create_message_queue_client(self, config=None):
+        """创建MessageQueue client的辅助方法"""
+        if config is None:
+            config = DictConfig({"async_training": {"staleness_threshold": 3}})
+        actor = MessageQueue.remote(config, max_queue_size=10)
+        return MessageQueueClient(actor)
+
+    def test_consume_first_produce_later(self, message_queue_client, mock_data_proto):
+        """测试先消费、后生产的场景 - 验证阻塞和唤醒机制"""
+        consumer_result = []
+        producer_result = []
+        start_time = time.time()
+
+        def consumer_task():
+            """消费者任务：先启动，等待生产者生产数据"""
+            # 记录开始消费的时间
+            consumer_start = time.time()
+            # 这里会阻塞等待，直到有至少2个样本可用
+            samples = message_queue_client.get_samples(min_batch_count=3)
+            consumer_end = time.time()
+            consumer_result.append(
+                {
+                    "success": True,
+                    "samples_count": len(samples),
+                    "wait_time": consumer_end - consumer_start,
+                    "samples": samples,
+                }
+            )
+
+        def producer_task():
+            """生产者任务：延迟1秒后开始生产"""
+            time.sleep(4.0)
+            producer_start = time.time()
+            message_queue_client.put_sample(
+                sample=mock_data_proto,
+                param_version=1,
+            )
+            time.sleep(1)
+            message_queue_client.put_sample(
+                sample=mock_data_proto,
+                param_version=1,
+            )
+            time.sleep(1)
+            message_queue_client.put_sample(
+                sample=mock_data_proto,
+                param_version=1,
+            )
+            producer_end = time.time()
+            producer_result.append(
+                {
+                    "put_count": 3,
+                    "produce_time": producer_end - producer_start,
+                }
+            )
+
+            print("produce finish")
+
+        # 启动消费者线程（先启动）
+        consumer_thread = threading.Thread(target=consumer_task, name="Consumer")
+        time.sleep(3)
+        # 启动生产者线程（后启动）
+        producer_thread = threading.Thread(target=producer_task, name="Producer")
+
+        consumer_thread.start()
+        time.sleep(0.1)  # 确保消费者先开始等待
+        producer_thread.start()
+
+        print("=========", flush=True)
+        #
+        # # 等待两个线程完成（设置超时避免死锁）
+        producer_thread.join()
+        print("producer_result", producer_result, flush=True)
+        consumer_thread.join()
+        print("consumer_result", consumer_result, flush=True)
+
+        # 验证结果
+        assert len(consumer_result) == 1, "消费者应该执行一次"
+
+        consumer_data = consumer_result[0]
+        producer_data = producer_result[0]
+
+        # 验证生产者成功
+        assert producer_data["put_count"] == 3, "应该生产2批数据"
+        assert consumer_data["samples_count"] == 3, "消费者应该获取到2个样本"
+
+        # 验证队列状态
+        final_queue_size = message_queue_client.get_queue_size()
+        assert final_queue_size == 0, "队列应该被清空"
+
+        stats = message_queue_client.get_statistics()
+        assert stats["total_produced"] == 3, "应该生产了2个样本"
+        assert stats["total_consumed"] == 3, "应该消费了2个样本"
+        #
+
+
+# 运行测试的示例配置
+if __name__ == "__main__":
+    pytest.main([__file__, "-v", "--tb=short"])
diff --git a/tests/special_e2e/run_fully_async_policy.sh b/tests/special_e2e/run_fully_async_policy.sh
index 2949316228a..50eb9070314 100644
--- a/tests/special_e2e/run_fully_async_policy.sh
+++ b/tests/special_e2e/run_fully_async_policy.sh
@@ -33,10 +33,10 @@ overlong_penalty_factor=1.0
 
 # Training parameters
 loss_agg_mode="token-mean"
-train_prompt_bsz=32
-gen_prompt_bsz=4
+train_prompt_bsz=2
+gen_prompt_bsz=2
 n_resp_per_prompt=3
-train_prompt_mini_bsz=4
+train_prompt_mini_bsz=1
 
 # Temperature parameters
 temperature=1.0
diff --git a/verl/trainer/ppo/ray_trainer.py b/verl/trainer/ppo/ray_trainer.py
index 26150cc631d..9b87d5a3bd8 100644
--- a/verl/trainer/ppo/ray_trainer.py
+++ b/verl/trainer/ppo/ray_trainer.py
@@ -1237,6 +1237,7 @@ def _post_generate_batch(self, batch, gen_batch_output, metrics):
     def _process_batch_common(self, batch, metrics, timing_raw):
         with marked_timer("reward", timing_raw, color="yellow"):
             # compute reward model score
+            print("marked_timer reward")
             if self.use_rm:
                 reward_tensor = self.rm_wg.compute_rm_score(batch)
                 batch = batch.union(reward_tensor)
@@ -1247,6 +1248,8 @@ def _process_batch_common(self, batch, metrics, timing_raw):
                 reward_tensor, reward_extra_infos_dict = compute_reward(batch, self.reward_fn)
         # recompute old_log_probs
         with marked_timer("old_log_prob", timing_raw, color="blue"):
+            print("marked_timer rewold_log_prob")
+
             old_log_prob = self.actor_rollout_wg.compute_log_prob(batch)
             entropys = old_log_prob.batch["entropys"]
             response_masks = batch.batch["response_mask"]
@@ -1281,6 +1284,8 @@ def _process_batch_common(self, batch, metrics, timing_raw):
                     }
                 )
         if self.use_reference_policy:
+            print("marked_timer use_reference_policy")
+
             # compute reference log_prob
             with marked_timer("ref", timing_raw, color="olive"):
                 if not self.ref_in_actor:
@@ -1290,10 +1295,12 @@ def _process_batch_common(self, batch, metrics, timing_raw):
                 batch = batch.union(ref_log_prob)
         # compute values
         if self.use_critic:
+            print("marked_timer compute use_critic")
             with marked_timer("values", timing_raw, color="cyan"):
                 values = self.critic_wg.compute_values(batch)
                 batch = batch.union(values)
         with marked_timer("adv", timing_raw, color="brown"):
+            print("marked_timer adv")
             # we combine with rule-based rm
             reward_extra_infos_dict: dict[str, list]
             if self.config.reward_model.launch_reward_fn_async:
@@ -1329,6 +1336,7 @@ def _process_batch_common(self, batch, metrics, timing_raw):
             )
         # update critic
         if self.use_critic:
+            print("marked_timer update use_critic")
             with marked_timer("update_critic", timing_raw, color="pink"):
                 critic_output = self.critic_wg.update_critic(batch)
             critic_output_metrics = reduce_metrics(critic_output.meta_info["metrics"])
@@ -1336,6 +1344,7 @@ def _process_batch_common(self, batch, metrics, timing_raw):
         # implement critic warmup
         if self.config.trainer.critic_warmup <= self.global_steps:
             # update actor
+            print("marked_timer update_actor")
             with marked_timer("update_actor", timing_raw, color="red"):
                 batch.meta_info["multi_turn"] = self.config.actor_rollout_ref.rollout.multi_turn.enable
                 actor_output = self.actor_rollout_wg.update_actor(batch)

From a8691b0971f84db33bda89186c38ff0b7e981d63 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Wed, 6 Aug 2025 19:18:20 +0800
Subject: [PATCH 028/182] train mutil step

---
 .../fully_async_rollouter.py                  |  1 -
 .../fully_async_policy/fully_async_trainer.py | 55 ++++++++++++++-----
 recipe/fully_async_policy/message_queue.py    |  3 +
 3 files changed, 45 insertions(+), 14 deletions(-)

diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py
index 61b21b43fd5..b4ad9796294 100644
--- a/recipe/fully_async_policy/fully_async_rollouter.py
+++ b/recipe/fully_async_policy/fully_async_rollouter.py
@@ -370,7 +370,6 @@ def _should_pause_generation(self) -> bool:
                 return True
 
             # 如果队列太满，也暂停生成
-
             if queue_size >= self.max_queue_size:
                 print(f"Should pause due to full queue: size={queue_size}, max={self.max_queue_size}")
                 return True
diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py
index 9830aef595e..7d7a1130340 100644
--- a/recipe/fully_async_policy/fully_async_trainer.py
+++ b/recipe/fully_async_policy/fully_async_trainer.py
@@ -308,12 +308,12 @@ def fit(self):
             print(data=val_metrics, step=self.global_steps)
             if self.config.trainer.get("val_only", False):
                 return
-
+        # TODO 需要从
         self.total_training_steps = self.config.trainer.total_training_steps
 
         print(f"Total training steps: {self.total_training_steps}")
         # add tqdm
-        progress_bar = tqdm(total=self.total_training_steps, initial=self.global_steps, desc="Training Progress")
+        # progress_bar = tqdm(total=self.total_training_steps, initial=self.global_steps, desc="Training Progress")
 
         # we start from step 1
         self.global_steps += 1
@@ -324,6 +324,15 @@ def fit(self):
         # 初始化获取第一批数据
         while True:
             print("while True", flush=True)
+
+            # 检查队列状态
+            if self.message_queue_client:
+                queue_stats = self.message_queue_client.get_statistics()
+                print(f"Queue status before getting samples: {queue_stats}")
+
+                if queue_stats.get('queue_size', 0) == 0:
+                    print("WARNING: Queue is empty, will block waiting for samples")
+
             metrics = {}
             timing_raw = {}
 
@@ -383,22 +392,42 @@ def fit(self):
                 self._check_save_checkpoint(is_last_step, timing_raw)
 
             print("_stop_profiling")
-            self._stop_profiling(do_profile, timing_raw)
+            # self._stop_profiling(do_profile, timing_raw)
             print("_collect_metrics")
-            self._collect_metrics(batch, epoch, metrics, timing_raw)
+            # self._collect_metrics(batch, epoch, metrics, timing_raw)
             print("_post_batch_processing")
-            self._post_batch_processing(batch)
+            # self._post_batch_processing(batch)
+
+            print("step end")
+            #
+            # # TODO: make a canonical logger that supports various backend
+            # print(data=metrics, step=self.global_steps)
+            #
+            # # progress_bar.update(1)
+            # self.global_steps += 1
+            print("is_last_step")
+            # if is_last_step:
+            #     pprint(f"Final validation metrics: {last_val_metrics}")
+            #     print("is_last_step")
+            #     # progress_bar.close()
+            #     return
+            #
+            #
+            # # 检查队列状态
+            # if self.message_queue_client:
+            #     queue_stats = self.message_queue_client.get_statistics()
+            #     print(f"Queue status before getting samples: {queue_stats}")
+            #
+            #     if queue_stats.get('queue_size', 0) == 0:
+            #         print("WARNING: Queue is empty, will block waiting for samples")
+            #
+            # with marked_timer("gen", timing_raw, color="red"):
+            #     epoch, batch = self._get_samples_from_queue()
+            #     if batch is None:
+            #         break
 
-            # TODO: make a canonical logger that supports various backend
-            print(data=metrics, step=self.global_steps)
 
-            progress_bar.update(1)
-            self.global_steps += 1
 
-            if is_last_step:
-                pprint(f"Final validation metrics: {last_val_metrics}")
-                progress_bar.close()
-                return
 
     def get_statistics(self) -> dict:
         """获取训练统计信息"""
diff --git a/recipe/fully_async_policy/message_queue.py b/recipe/fully_async_policy/message_queue.py
index c6116f0c432..e5c382dec2a 100644
--- a/recipe/fully_async_policy/message_queue.py
+++ b/recipe/fully_async_policy/message_queue.py
@@ -115,8 +115,11 @@ def get_samples(self, min_batch_count: int = 1) -> list[Any]:
         Returns:
             List[Any]: 获取的样本列表
         """
+
+        print("get_samples")
         with self.lock:
             while len(self.queue) < min_batch_count and self.running:
+                print("consumer_condition")
                 self.consumer_condition.wait()
 
             # 如果队列已关闭且没有足够样本，返回空列表

From ee8914ccdd25362072798815d116b984ed6f5131 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Fri, 8 Aug 2025 11:00:24 +0800
Subject: [PATCH 029/182] param_sync

---
 .../fully_async_policy/UNIFIED_PARAM_SYNC.md  | 143 +++++++
 recipe/fully_async_policy/fully_async_main.py | 156 +------
 .../fully_async_rollouter.py                  |   5 +
 .../fully_async_policy/fully_async_trainer.py | 150 +++++--
 recipe/fully_async_policy/param_sync.py       | 392 ++----------------
 5 files changed, 330 insertions(+), 516 deletions(-)
 create mode 100644 recipe/fully_async_policy/UNIFIED_PARAM_SYNC.md

diff --git a/recipe/fully_async_policy/UNIFIED_PARAM_SYNC.md b/recipe/fully_async_policy/UNIFIED_PARAM_SYNC.md
new file mode 100644
index 00000000000..e816968f8fc
--- /dev/null
+++ b/recipe/fully_async_policy/UNIFIED_PARAM_SYNC.md
@@ -0,0 +1,143 @@
+# 统一参数同步器使用指南 (Unified Parameter Synchronizer Guide)
+
+本文档说明了新的统一参数同步器 `UnifiedParameterSynchronizer` 的使用方法。该类合并了原有的多个同步器类的功能，提供了更简洁和统一的接口。
+
+## 🏗️ 类合并说明
+
+### 原有类结构（已合并）
+- `ParameterSynchronizer` - 基础参数同步器
+- `ParameterSyncManager` - Ray Actor形式的参数同步管理器
+- `AsyncParameterSynchronizer` - 异步参数同步器
+
+### 新的统一类
+- `UnifiedParameterSynchronizer` - 统一参数同步器，包含所有功能
+
+## 🚀 使用方法
+
+### 1. 异步训练模式（推荐）
+```python
+from recipe.fully_async_policy.param_sync import UnifiedParameterSynchronizer
+
+# 创建异步模式的参数同步器
+param_synchronizer = UnifiedParameterSynchronizer(
+    config=config,
+    trainer_actor=trainer_actor,
+    rollouter_actor=rollouter_actor
+)
+
+# 同步参数到rollouter
+success = param_synchronizer.sync_to_rollouter(new_version=1)
+```
+
+### 2. Ray Actor模式
+```python
+from recipe.fully_async_policy.param_sync import ParameterSyncManager
+
+# 创建Ray remote参数同步管理器
+sync_manager = ParameterSyncManager.remote(config)
+
+# 注册workers
+success = ray.get(sync_manager.register_workers.remote(actor_workers, rollout_workers))
+
+# 执行同步
+success = ray.get(sync_manager.sync_parameters.remote())
+```
+
+### 3. 传统模式
+```python
+from recipe.fully_async_policy.param_sync import UnifiedParameterSynchronizer
+
+# 创建传统模式的参数同步器
+synchronizer = UnifiedParameterSynchronizer(config)
+
+# 初始化同步组
+success = synchronizer.initialize_sync_group(actor_workers, rollout_workers)
+
+# 同步权重
+success = synchronizer.sync_weights(actor_workers, rollout_workers)
+```
+
+## 🔄 向后兼容性
+
+为了确保现有代码的兼容性，提供了以下别名：
+
+```python
+# 这些别名指向 UnifiedParameterSynchronizer
+ParameterSynchronizer = UnifiedParameterSynchronizer
+AsyncParameterSynchronizer = UnifiedParameterSynchronizer
+
+# Ray remote版本
+ParameterSyncManager = ray.remote(UnifiedParameterSynchronizer)
+```
+
+现有代码无需修改即可使用新的统一同步器。
+
+## ⚙️ 初始化参数
+
+```python
+def __init__(self, config, trainer_actor=None, rollouter_actor=None, as_ray_actor=False):
+```
+
+- `config`: 配置对象（必需）
+- `trainer_actor`: trainer actor引用（用于async模式）
+- `rollouter_actor`: rollouter actor引用（用于async模式）
+- `as_ray_actor`: 是否作为Ray actor使用
+
+## 📊 主要方法
+
+### 异步模式
+- `sync_to_rollouter(new_version)`: 同步参数到rollouter
+- `get_current_version()`: 获取当前参数版本
+
+### Ray Actor模式
+- `register_workers(actor_workers, rollout_workers)`: 注册workers
+- `sync_parameters()`: 执行参数同步
+
+### 传统模式
+- `initialize_sync_group(actor_workers, rollout_workers)`: 初始化同步组
+- `sync_weights(actor_workers, rollout_workers)`: 同步权重
+
+### 通用方法
+- `get_statistics()`: 获取统计信息
+- `get_weights_info()`: 获取权重信息
+- `cleanup()`: 清理资源
+
+## 📈 统计信息
+
+```python
+stats = synchronizer.get_statistics()
+# 返回：
+{
+    "sync_count": 15,
+    "sync_failures": 0,
+    "last_sync_time": 1640995200.0,
+    "sync_group_initialized": True,
+    "current_param_version": 15,
+    "current_version": 15,
+    "is_ready": True  # 仅在Ray actor模式下
+}
+```
+
+## 🎯 优势
+
+1. **统一接口**: 一个类支持所有同步模式
+2. **向后兼容**: 现有代码无需修改
+3. **灵活配置**: 支持多种初始化方式
+4. **完整功能**: 包含所有原有类的功能
+5. **简化维护**: 减少代码重复，便于维护
+
+## 🔧 配置示例
+
+```yaml
+async_training:
+  max_sync_retries: 3
+  sync_timeout: 30.0
+  sync_retry_delay: 1.0
+  sync_monitor_interval: 60.0
+  staleness_threshold: 3
+```
+
+---
+
+*统一参数同步器简化了参数同步的使用，同时保持了所有原有功能的完整性。*
+
diff --git a/recipe/fully_async_policy/fully_async_main.py b/recipe/fully_async_policy/fully_async_main.py
index 888c6c73594..aa5ac81f48a 100644
--- a/recipe/fully_async_policy/fully_async_main.py
+++ b/recipe/fully_async_policy/fully_async_main.py
@@ -204,6 +204,7 @@ def _initialize_components(self, config) -> None:
 
         self.components["tokenizer"] = tokenizer
         self.components["processor"] = processor
+        self.components["config"] = config  # 保存config以供其他方法使用
 
         # 创建worker映射和资源池
         print("Creating worker mapping and resource pools...")
@@ -244,14 +245,22 @@ def _initialize_components(self, config) -> None:
         self._create_trainer(config)
 
         # 设置参数同步
-        # print("Setting up parameter synchronization...")
-        # param_synchronizer = AsyncParameterSynchronizer(
-        #     config=config,
-        #     actor_wg=self.components["trainer"].actor_wg,
-        #     rollouter=self.components["rollouter"],
-        # )
-        # self.components["param_synchronizer"] = param_synchronizer
-        # print("All components initialized successfully")
+        print("Setting up parameter synchronization...")
+        from recipe.fully_async_policy.param_sync import ParameterSynchronizer
+
+        param_synchronizer = ParameterSynchronizer(
+            config=config,
+            actor_wg=self.components["trainer"],
+            rollout_wg=self.components["rollouter"],
+        )
+
+        # 将参数同步器设置到trainer和rollouter
+        ray.get(self.components["trainer"].set_parameter_synchronizer.remote(param_synchronizer))
+        ray.get(self.components["rollouter"].set_parameter_synchronizer.remote(param_synchronizer))
+
+        self.components["param_synchronizer"] = param_synchronizer
+        print("Parameter synchronizer initialized successfully")
+        print("All components initialized successfully")
 
     def _create_rollouter(self, config) -> None:
         """创建Rollouter"""
@@ -312,7 +321,6 @@ def _run_training_loop(self):
         print("Starting Rollouter in background...")
         rollouter_future = self.components["rollouter"].fit.remote()
         trainer_future = self.components["trainer"].fit.remote()
-        # self._monitor_components()
 
         print("Starting Trainer...")
         time.sleep(10)
@@ -324,136 +332,6 @@ def _run_training_loop(self):
 
         print("Training completed or interrupted")
 
-    def _monitor_components(self):
-        """监控组件状态"""
-        print("Starting component monitoring...")
-
-        last_stats_time = time.time()
-        stats_interval = 60.0  # 60秒报告一次统计
-
-        while self.running and not self.shutdown_event.is_set():
-            try:
-                # 等待一段时间或直到收到停止信号
-                if self.shutdown_event.wait(timeout=10.0):
-                    break
-
-                # 定期报告统计信息
-                current_time = time.time()
-                if current_time - last_stats_time >= stats_interval:
-                    self._log_component_statistics()
-                    last_stats_time = current_time
-
-                # 检查组件健康状态
-                self._check_component_health()
-
-            except Exception as e:
-                print(f"Error in component monitoring: {e}")
-
-        print("Component monitoring stopped")
-
-    def _log_component_statistics(self):
-        """记录组件统计信息"""
-        try:
-            # 获取Trainer统计
-            trainer_stats = self.components["trainer"].get_statistics()
-
-            # 获取Rollouter统计
-            rollouter_stats = ray.get(self.components["rollouter"].get_statistics.remote(), timeout=5.0)
-
-            # 获取队列统计
-            queue_stats = self.components["message_queue_client"].get_statistics()
-
-            print("=== Component Statistics ===")
-            print(
-                f"Trainer - Steps: {trainer_stats['global_steps']}, "
-                f"Samples: {trainer_stats['processed_samples']}, "
-                f"Param version: {trainer_stats['current_param_version']}"
-            )
-
-            print(
-                f"Rollouter - Generated: {rollouter_stats['total_generated_samples']}, "
-                f"Dropped: {rollouter_stats['dropped_stale_samples']}, "
-                f"Errors: {rollouter_stats['generation_errors']}"
-            )
-
-            print(
-                f"Queue - Size: {queue_stats['queue_size']}, "
-                f"Produced: {queue_stats['total_produced']}, "
-                f"Consumed: {queue_stats['total_consumed']}"
-            )
-
-        except Exception as e:
-            print(f"Error getting component statistics: {e}")
-
-    def _check_component_health(self):
-        """检查组件健康状态"""
-        try:
-            # 检查trainer是否仍在运行
-            if hasattr(self.components["trainer"], "global_steps"):
-                current_steps = self.components["trainer"].global_steps
-                # 可以添加更多健康检查逻辑
-                print(current_steps)
-
-            # 检查rollouter是否仍在运行
-            rollouter_stats = ray.get(self.components["rollouter"].get_statistics.remote(), timeout=5.0)
-
-            if not rollouter_stats["is_running"]:
-                print("Rollouter is not running!")
-                # 可以尝试重启或报告错误
-
-        except Exception as e:
-            print(f"Health check failed: {e}")
-
-    def _cleanup_resources(self):
-        """清理资源"""
-        print("Cleaning up resources...")
-
-        try:
-            # 停止Rollouter
-            if "rollouter" in self.components:
-                print("Shutting down Rollouter...")
-                try:
-                    shutdown_future = self.components["rollouter"].shutdown.remote()
-                    ray.get(shutdown_future, timeout=10.0)
-                except Exception as e:
-                    print(f"Error shutting down Rollouter: {e}")
-
-            # 清理MessageQueue
-            if "message_queue_client" in self.components:
-                print("Cleaning up MessageQueue...")
-                try:
-                    self.components["message_queue_client"].shutdown()
-                except Exception as e:
-                    print(f"Error cleaning up MessageQueue: {e}")
-
-            # 清理参数同步器
-            if "param_synchronizer" in self.components:
-                print("Cleaning up parameter synchronizer...")
-                # TODO: 添加参数同步器的清理逻辑
-
-            print("Resource cleanup completed")
-
-        except Exception as e:
-            print(f"Error during cleanup: {e}")
-
-    def get_training_status(self) -> dict:
-        """获取训练状态"""
-        if not self.running or "trainer" not in self.components:
-            return {"status": "not_running"}
-
-        try:
-            trainer_stats = self.components["trainer"].get_statistics()
-            rollouter_stats = ray.get(self.components["rollouter"].get_statistics.remote(), timeout=5.0)
-
-            return {
-                "status": "running",
-                "trainer_stats": trainer_stats,
-                "rollouter_stats": rollouter_stats,
-            }
-        except Exception as e:
-            print(f"Error getting training status: {e}")
-            return {"status": "error", "error": str(e)}
-
 
 @hydra.main(config_path="config", config_name="fully_async_ppo_trainer", version_base=None)
 def main(config):
diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py
index b4ad9796294..c760215c580 100644
--- a/recipe/fully_async_policy/fully_async_rollouter.py
+++ b/recipe/fully_async_policy/fully_async_rollouter.py
@@ -138,6 +138,11 @@ def __init__(
         self.sync_in_progress = False
         self.sync_lock = threading.Lock()
 
+        # 参数同步状态 - 基于one_step_off_policy模式
+        self._weights_info = None
+        self._is_rollout = True  # rollouter是rollout角色
+        self._is_actor = False
+
         self.max_queue_size = max_queue_size
 
     def set_message_queue_client(self, message_queue_client: MessageQueueClient):
diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py
index 7d7a1130340..20bae1a6a64 100644
--- a/recipe/fully_async_policy/fully_async_trainer.py
+++ b/recipe/fully_async_policy/fully_async_trainer.py
@@ -22,7 +22,6 @@
 import numpy as np
 import ray
 from omegaconf import OmegaConf
-from tqdm import tqdm
 
 from recipe.fully_async_policy.message_queue import MessageQueueClient, QueueSample
 from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup
@@ -127,6 +126,11 @@ def __init__(
         self.current_param_version = 0
         self.param_sync_count = 0
 
+        # 参数同步相关状态
+        self._weights_info = None
+        self._is_actor = False  # 将在init_worker_group中设置
+        self._is_rollout = False
+
     def set_message_queue_client(self, message_queue_client: MessageQueueClient):
         """设置消息队列客户端"""
         with self.lock:
@@ -137,6 +141,60 @@ def set_parameter_synchronizer(self, param_synchronizer):
         with self.lock:
             self.param_synchronizer = param_synchronizer
 
+    def _get_actor_params(self):
+        """
+        获取actor参数 - 基于one_step_off_policy的实现
+        """
+        if not hasattr(self, "actor_wg") or self.actor_wg is None:
+            raise ValueError("Actor worker group not initialized")
+
+        # 从actor worker group获取参数
+        actor_workers = self.actor_wg.workers
+        if not actor_workers:
+            raise ValueError("No actor workers available")
+
+        # 获取第一个actor worker的参数信息
+        params_future = actor_workers[0]._get_actor_params.remote()
+        params = ray.get(params_future, timeout=10.0)
+        return params
+
+    def get_actor_weights_info(self):
+        """
+        获取actor权重信息 - 基于one_step_off_policy的模式
+        """
+        if hasattr(self, "_weights_info") and self._weights_info is not None:
+            return self._weights_info
+
+        if not hasattr(self, "actor_wg") or self.actor_wg is None:
+            raise ValueError("Actor worker group not initialized")
+
+        # 从actor worker group获取权重信息
+        weights_info_future = self.actor_wg.get_actor_weights_info.remote()
+        weights_info = ray.get(weights_info_future, timeout=10.0)
+
+        # 缓存权重信息
+        self._weights_info = weights_info[0] if isinstance(weights_info, list) else weights_info
+        return self._weights_info
+
+    def sync_rollout_weights(self):
+        """
+        同步rollout权重 - Actor端的同步操作
+        """
+        if not hasattr(self, "actor_wg") or self.actor_wg is None:
+            logger.warning("Actor worker group not initialized for sync")
+            return False
+
+        try:
+            # 触发actor worker group的参数同步
+            sync_future = self.actor_wg.sync_rollout_weights.remote()
+            ray.get(sync_future, timeout=30.0)
+            logger.debug("Actor weights sync completed")
+            return True
+
+        except Exception as e:
+            logger.error(f"Failed to sync actor weights: {e}")
+            return False
+
     def _get_samples_from_queue(self) -> tuple[None, None] | tuple[int, Any]:
         """
         从消息队列获取样本并组成gen_batch_output
@@ -287,7 +345,7 @@ def fit(self):
 
         from verl.utils.tracking import Tracking
 
-        logger = Tracking(
+        self.logger = Tracking(
             project_name=self.config.trainer.project_name,
             experiment_name=self.config.trainer.experiment_name,
             default_backend=self.config.trainer.logger,
@@ -330,7 +388,7 @@ def fit(self):
                 queue_stats = self.message_queue_client.get_statistics()
                 print(f"Queue status before getting samples: {queue_stats}")
 
-                if queue_stats.get('queue_size', 0) == 0:
+                if queue_stats.get("queue_size", 0) == 0:
                     print("WARNING: Queue is empty, will block waiting for samples")
 
             metrics = {}
@@ -399,35 +457,22 @@ def fit(self):
             # self._post_batch_processing(batch)
 
             print("step end")
-            #
-            # # TODO: make a canonical logger that supports various backend
-            # print(data=metrics, step=self.global_steps)
-            #
-            # # progress_bar.update(1)
-            # self.global_steps += 1
+
+            # 在训练步骤结束后触发参数同步
+            self._trigger_parameter_sync_after_step()
+
+            # TODO: make a canonical logger that supports various backend
+            print(data=metrics, step=self.global_steps)
+
+            # progress_bar.update(1)
+            self.global_steps += 1
             print("is_last_step")
             # if is_last_step:
             #     pprint(f"Final validation metrics: {last_val_metrics}")
             #     print("is_last_step")
             #     # progress_bar.close()
             #     return
-            #
-            #
-            # # 检查队列状态
-            # if self.message_queue_client:
-            #     queue_stats = self.message_queue_client.get_statistics()
-            #     print(f"Queue status before getting samples: {queue_stats}")
-            #
-            #     if queue_stats.get('queue_size', 0) == 0:
-            #         print("WARNING: Queue is empty, will block waiting for samples")
-            #
-            # with marked_timer("gen", timing_raw, color="red"):
-            #     epoch, batch = self._get_samples_from_queue()
-            #     if batch is None:
-            #         break
-
-
-
+            ray.get(self.param_synchronizer.sync_weights.remote(self.global_steps))
 
     def get_statistics(self) -> dict:
         """获取训练统计信息"""
@@ -444,6 +489,59 @@ def get_statistics(self) -> dict:
             "queue_dropped_samples": queue_stats.get("dropped_samples", 0),
         }
 
+    def _trigger_parameter_sync_after_step(self):
+        """
+        在训练步骤结束后触发参数同步
+        这确保rollouter总是使用最新训练的参数
+        """
+        if not self.param_synchronizer:
+            logger.debug("No parameter synchronizer available, skipping sync")
+            return
+
+        try:
+            # 更新参数版本号
+            new_version = self.current_param_version + 1
+
+            print(
+                f"[TRAINER] Triggering parameter sync after training step {self.global_steps}, version: {new_version}"
+            )
+            logger.info(f"Triggering parameter sync after training step {self.global_steps}, version: {new_version}")
+
+            # 异步触发参数同步，不阻塞训练流程
+            import threading
+
+            sync_thread = threading.Thread(target=self._async_parameter_sync, args=(new_version,), daemon=True)
+            sync_thread.start()
+
+        except Exception as e:
+            logger.error(f"Error triggering parameter sync: {e}")
+
+    def _async_parameter_sync(self, new_version: int):
+        """
+        异步执行参数同步，避免阻塞训练流程
+
+        Args:
+            new_version: 新的参数版本号
+        """
+        try:
+            # 执行参数同步
+            success = self.param_synchronizer.sync_to_rollouter(new_version)
+
+            if success:
+                # 更新本地参数版本
+                with self.lock:
+                    self.current_param_version = new_version
+                    self.param_sync_count += 1
+
+                print(f"[TRAINER] Parameter sync completed successfully for version {new_version}")
+                logger.info(f"Parameter sync completed successfully for version {new_version}")
+            else:
+                print(f"[TRAINER] Parameter sync failed for version {new_version}")
+                logger.warning(f"Parameter sync failed for version {new_version}")
+
+        except Exception as e:
+            logger.error(f"Error in async parameter sync: {e}")
+
     def update_param_version(self, param_version: int) -> bool:
         """
         更新trainer的参数版本，用于跟踪与rollouter的参数同步状态
diff --git a/recipe/fully_async_policy/param_sync.py b/recipe/fully_async_policy/param_sync.py
index 023475ef777..10843302786 100644
--- a/recipe/fully_async_policy/param_sync.py
+++ b/recipe/fully_async_policy/param_sync.py
@@ -13,7 +13,6 @@
 # limitations under the License.
 
 import logging
-import time
 
 import ray
 from ray.util.collective import collective
@@ -21,372 +20,63 @@
 logger = logging.getLogger(__name__)
 
 
-class ParameterSynchronizer:
-    """
-    参数同步器，负责在actor和rollout之间同步模型参数
-    改进版本，具有更好的错误处理和重试机制
-    """
-
-    def __init__(self, config):
-        self.config = config
-        self.weights_info = None
-        self.sync_group_initialized = False
-        self.sync_group_name = "actor_rollout"
-
-        # 同步配置
-        self.max_sync_retries = config.async_training.get("max_sync_retries", 3)
-        self.sync_timeout = config.async_training.get("sync_timeout", 30.0)
-        self.retry_delay = config.async_training.get("sync_retry_delay", 1.0)
-
-        # 统计信息
-        self.sync_count = 0
-        self.sync_failures = 0
-        self.last_sync_time = 0
-
-    def initialize_sync_group(self, actor_workers: list, rollout_workers: list) -> bool:
-        """
-        初始化参数同步组
-
-        Args:
-            actor_workers: actor worker列表
-            rollout_workers: rollout worker列表
-
-        Returns:
-            bool: 是否成功初始化
-        """
-        logger.info("Initializing parameter synchronization group...")
-
-        try:
-            # 验证workers
-            if not actor_workers:
-                raise ValueError("No actor workers provided")
-            if not rollout_workers:
-                raise ValueError("No rollout workers provided")
-
-            # 获取actor的权重信息
-            logger.debug("Getting actor weights info...")
-            weights_info_future = actor_workers[0].get_actor_weights_info.remote()
-            self.weights_info = ray.get(weights_info_future, timeout=10.0)[0]
-
-            if not self.weights_info:
-                raise ValueError("Failed to get actor weights info")
-
-            # 设置rollout的权重信息
-            logger.debug("Setting rollout weights info...")
-            set_weights_futures = []
-            for rollout_worker in rollout_workers:
-                future = rollout_worker.set_actor_weights_info.remote(self.weights_info)
-                set_weights_futures.append(future)
-
-            ray.get(set_weights_futures, timeout=10.0)
-
-            # 创建actor-rollout通信组
-            logger.debug("Creating collective communication group...")
-            all_workers = actor_workers + rollout_workers
-
-            # 清理可能存在的旧组
-            try:
-                collective.destroy_collective_group(self.sync_group_name)
-            except Exception:
-                pass  # 忽略清理错误
-
-            collective.create_collective_group(
-                all_workers,
-                len(all_workers),
-                list(range(0, len(all_workers))),
-                backend="nccl",
-                group_name=self.sync_group_name,
-            )
-
-            self.sync_group_initialized = True
-            logger.info("Parameter synchronization group initialized successfully")
-            return True
-
-        except Exception as e:
-            logger.error(f"Failed to initialize sync group: {e}")
-            self.sync_group_initialized = False
-            return False
-
-    def sync_weights(self, actor_workers: list, rollout_workers: list) -> bool:
-        """
-        同步权重从actor到rollout - 改进版本，具有重试和错误处理
-
-        Args:
-            actor_workers: actor worker列表
-            rollout_workers: rollout worker列表
-
-        Returns:
-            bool: 是否同步成功
-        """
-        if not self.sync_group_initialized:
-            logger.error("Sync group not initialized. Call initialize_sync_group() first.")
-            return False
-
-        logger.debug("Starting weight synchronization...")
-        start_time = time.time()
-
-        for attempt in range(self.max_sync_retries):
-            try:
-                # 执行同步
-                success = self._execute_sync(actor_workers, rollout_workers)
-
-                if success:
-                    self.sync_count += 1
-                    self.last_sync_time = time.time()
-                    sync_duration = self.last_sync_time - start_time
-                    logger.debug(f"Weight synchronization completed in {sync_duration:.2f}s")
-                    return True
-                else:
-                    logger.warning(f"Sync attempt {attempt + 1} failed")
-
-            except Exception as e:
-                logger.warning(f"Sync attempt {attempt + 1} failed with error: {e}")
-
-            # 如果不是最后一次尝试，等待后重试
-            if attempt < self.max_sync_retries - 1:
-                logger.info(f"Retrying sync in {self.retry_delay}s...")
-                time.sleep(self.retry_delay)
-
-        # 所有重试都失败
-        self.sync_failures += 1
-        logger.error(f"All sync attempts failed. Total failures: {self.sync_failures}")
-        return False
-
-    def _execute_sync(self, actor_workers: list, rollout_workers: list) -> bool:
-        """
-        执行实际的同步操作
-
-        Args:
-            actor_workers: actor worker列表
-            rollout_workers: rollout worker列表
-
-        Returns:
-            bool: 是否同步成功
-        """
-        try:
-            sync_futures = []
-
-            # Actor端同步
-            for actor_worker in actor_workers:
-                future = actor_worker.sync_rollout_weights.remote()
-                sync_futures.append(future)
-
-            # Rollout端同步
-            for rollout_worker in rollout_workers:
-                future = rollout_worker.sync_rollout_weights.remote()
-                sync_futures.append(future)
-
-            # 等待所有同步完成，带超时
-            ray.get(sync_futures, timeout=self.sync_timeout)
-            return True
-
-        except Exception as e:
-            logger.error(f"Sync execution failed: {e}")
-            return False
-
-    def cleanup(self):
-        """清理同步组"""
-        if self.sync_group_initialized:
-            try:
-                collective.destroy_collective_group(self.sync_group_name)
-                logger.info("Sync group cleaned up")
-            except Exception as e:
-                logger.warning(f"Error cleaning up sync group: {e}")
-            finally:
-                self.sync_group_initialized = False
-
-    def get_statistics(self) -> dict:
-        """获取同步统计信息"""
-        return {
-            "sync_count": self.sync_count,
-            "sync_failures": self.sync_failures,
-            "last_sync_time": self.last_sync_time,
-            "sync_group_initialized": self.sync_group_initialized,
-        }
-
-
 @ray.remote
-class ParameterSyncManager:
+class ParameterSynchronizer:
     """
-    Ray Actor形式的参数同步管理器 - 改进版本
+    统一的参数同步器，负责在actor和rollout之间同步模型参数
+    基于one_step_off_policy的成熟同步模式实现
+    合并了原有的多个同步器类的功能
     """
 
-    def __init__(self, config):
-        self.config = config
-        self.synchronizer = ParameterSynchronizer(config)
-        self.actor_workers = []
-        self.rollout_workers = []
-        self.is_ready = False
-
-    def register_workers(self, actor_workers: list, rollout_workers: list) -> bool:
-        """
-        注册worker
-
-        Args:
-            actor_workers: actor worker列表
-            rollout_workers: rollout worker列表
-
-        Returns:
-            bool: 是否成功注册
-        """
-        try:
-            self.actor_workers = actor_workers
-            self.rollout_workers = rollout_workers
-
-            # 初始化同步组
-            success = self.synchronizer.initialize_sync_group(actor_workers, rollout_workers)
-            self.is_ready = success
-
-            if success:
-                logger.info("ParameterSyncManager ready")
-            else:
-                logger.error("ParameterSyncManager initialization failed")
-
-            return success
-        except Exception as e:
-            logger.error(f"Failed to register workers: {e}")
-            return False
-
-    def sync_parameters(self) -> bool:
-        """
-        执行参数同步
-
-        Returns:
-            bool: 是否同步成功
+    def __init__(self, config, actor_wg, rollout_wg):
         """
-        if not self.is_ready:
-            logger.error("SyncManager not ready. Call register_workers() first.")
-            return False
-
-        return self.synchronizer.sync_weights(self.actor_workers, self.rollout_workers)
-
-    def get_weights_info(self):
-        """获取权重信息"""
-        return self.synchronizer.weights_info
-
-    def get_statistics(self) -> dict:
-        """获取统计信息"""
-        stats = self.synchronizer.get_statistics()
-        stats["is_ready"] = self.is_ready
-        return stats
-
-    def cleanup(self):
-        """清理资源"""
-        self.synchronizer.cleanup()
-        self.is_ready = False
+        初始化统一参数同步器
 
-
-class AsyncParameterSynchronizer:
-    """
-    异步参数同步器，用于完全异步训练工作流 - 改进版本
-    """
-
-    def __init__(self, config, actor_wg, rollouter_actor):
-        """
         Args:
-            config: 配置
-            actor_wg: actor worker group
-            rollouter_actor: rollouter actor引用
+            config: 配置对象
+            actor_wg: trainer actor引用（用于async模式）
+            rollout_wg: rollouter actor引用（用于async模式）
         """
         self.config = config
         self.actor_wg = actor_wg
-        self.rollouter_actor = rollouter_actor
-        self.current_version = 0
+        self.rollout_wg = rollout_wg
 
-        # 同步配置
-        self.sync_timeout = config.async_training.get("sync_timeout", 30.0)
-        self.max_sync_retries = config.async_training.get("max_sync_retries", 3)
-        self.retry_delay = config.async_training.get("sync_retry_delay", 1.0)
+        # 基础属性
+        self.weights_info = None
+        self.sync_group_initialized = False
+        self.sync_group_name = "actor_rollout"
 
         # 统计信息
-        self.sync_count = 0
-        self.sync_failures = 0
-        self.last_sync_time = 0
+        self.current_version = 0
 
-        # 初始化同步组
+        self._init_weights_info()
         self._init_sync_group()
 
-    def _init_sync_group(self):
-        """初始化同步组"""
-        try:
-            # 获取actor权重信息
-            weights_info = self.actor_wg.get_actor_weights_info()[0]
-
-            # 通知rollouter设置权重信息
-            ray.get(self.rollouter_actor.set_weights_info.remote(weights_info), timeout=10.0)
-
-            # 创建同步通信组
-            actor_workers = self.actor_wg.workers
-            rollout_workers = ray.get(self.rollouter_actor.get_rollout_workers.remote(), timeout=10.0)
-
-            all_workers = actor_workers + rollout_workers
-            collective.create_collective_group(
-                all_workers,
-                len(all_workers),
-                list(range(0, len(all_workers))),
-                backend="nccl",
-                group_name="async_actor_rollout",
-            )
-
-            logger.info("Async parameter synchronizer initialized")
-
-        except Exception as e:
-            logger.warning(f"Failed to initialize async sync group: {e}")
-
-    def sync_to_rollouter(self, new_version: int) -> bool:
-        """
-        将actor参数同步到rollouter - 改进版本，具有重试机制
-
-        Args:
-            new_version: 新的参数版本号
-
-        Returns:
-            bool: 是否同步成功
-        """
-        logger.info(f"Syncing parameters to rollouter, version: {new_version}")
-        start_time = time.time()
-
-        for attempt in range(self.max_sync_retries):
-            try:
-                # 首先同步actor到rollout worker group
-                self.actor_wg.sync_rollout_weights()
-
-                # 然后通知rollouter更新参数版本
-                sync_future = self.rollouter_actor.update_rollout_weights.remote(new_version)
-                sync_result = ray.get(sync_future, timeout=self.sync_timeout)
-
-                if sync_result:
-                    self.current_version = new_version
-                    self.sync_count += 1
-                    self.last_sync_time = time.time()
-                    sync_duration = self.last_sync_time - start_time
-                    logger.info(f"Parameter sync completed in {sync_duration:.2f}s, version: {new_version}")
-                    return True
-                else:
-                    logger.warning(f"Rollouter rejected sync for version {new_version}")
-
-            except Exception as e:
-                logger.warning(f"Sync attempt {attempt + 1} failed: {e}")
-
-            # 如果不是最后一次尝试，等待后重试
-            if attempt < self.max_sync_retries - 1:
-                logger.info(f"Retrying sync in {self.retry_delay}s...")
-                time.sleep(self.retry_delay)
+    def get_current_param_version(self) -> int:
+        """获取当前参数版本号"""
+        return self.current_version
 
-        # 所有重试都失败
-        self.sync_failures += 1
-        logger.error(f"Failed to sync parameters to rollouter after {self.max_sync_retries} attempts")
-        return False
+    def get_weights_info(self):
+        """获取权重信息"""
+        return self.weights_info
 
-    def get_current_version(self) -> int:
-        """获取当前参数版本"""
-        return self.current_version
+    def _init_weights_info(self):
+        self.weights_info = self.actor_wg.get_actor_weights_info()[0]
+        self.rollout_wg.set_actor_weights_info(self.weights_info)
 
-    def get_statistics(self) -> dict:
-        """获取统计信息"""
-        return {
-            "current_version": self.current_version,
-            "sync_count": self.sync_count,
-            "sync_failures": self.sync_failures,
-            "last_sync_time": self.last_sync_time,
-        }
+    def _init_sync_group(self):
+        print("Initializing parameter synchronization group...")
+        actor_rollout_workers = self.actor_wg.workers + self.rollout_wg.workers
+        collective.create_collective_group(
+            actor_rollout_workers,
+            len(actor_rollout_workers),
+            list(range(0, len(actor_rollout_workers))),
+            backend="nccl",
+            group_name=self.sync_group_name,
+        )
+
+    def sync_weights(self, version):
+        self.current_version = version
+        logger.debug(f"Starting weight synchronization (version {self.current_version})...")
+        self.actor_wg.sync_rollout_weights()
+        ray.get(self.rollout_wg.sync_rollout_weights())

From 75fe2af1a35313ccb2eaf5d2d9544d8b3c0e3c0b Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Fri, 8 Aug 2025 11:45:48 +0800
Subject: [PATCH 030/182] ParameterSynchronizer

---
 .../README_async_trainer.md                   |  92 -----
 recipe/fully_async_policy/TEST_GUIDE.md       | 313 ------------------
 .../fully_async_policy/UNIFIED_PARAM_SYNC.md  | 143 --------
 recipe/fully_async_policy/fully_async_main.py |  66 +++-
 .../fully_async_rollouter.py                  |   4 +
 .../fully_async_policy/fully_async_trainer.py | 190 ++---------
 recipe/fully_async_policy/param_sync.py       |  21 +-
 recipe/fully_async_policy/run_benchmark.sh    | 307 -----------------
 .../run_fully_async_example.sh                | 147 --------
 .../{ => unittest}/test_components_pytest.py  |   0
 10 files changed, 100 insertions(+), 1183 deletions(-)
 delete mode 100644 recipe/fully_async_policy/README_async_trainer.md
 delete mode 100644 recipe/fully_async_policy/TEST_GUIDE.md
 delete mode 100644 recipe/fully_async_policy/UNIFIED_PARAM_SYNC.md
 delete mode 100755 recipe/fully_async_policy/run_benchmark.sh
 delete mode 100644 recipe/fully_async_policy/run_fully_async_example.sh
 rename recipe/fully_async_policy/{ => unittest}/test_components_pytest.py (100%)

diff --git a/recipe/fully_async_policy/README_async_trainer.md b/recipe/fully_async_policy/README_async_trainer.md
deleted file mode 100644
index 9fbaa336be6..00000000000
--- a/recipe/fully_async_policy/README_async_trainer.md
+++ /dev/null
@@ -1,92 +0,0 @@
-# FullyAsyncTrainer 队列数据获取实现
-
-## 概述
-
-本实现为 `FullyAsyncTrainer` 类添加了从消息队列获取样本并组成 `gen_batch_output` 的功能，实现了完全异步的训练流程。
-
-## 核心功能
-
-### 1. 样本计算逻辑
-
-```python
-# 计算需要获取的样本数量
-n_responses_per_prompt = self.config.actor_rollout_ref.rollout.n
-batch_size = self.config.data.train_batch_size
-required_samples = n_responses_per_prompt * batch_size
-```
-
-训练器会根据配置自动计算需要从队列获取的样本数量：
-- `rollout.n`: 每个prompt生成的响应数量
-- `train_batch_size`: 训练批次大小
-- 总样本数 = n × batch_size
-
-### 2. 主要方法
-
-#### `_get_samples_from_queue()`
-- 从消息队列获取指定数量的样本
-- 组装成 `gen_batch_output` 格式
-- 提取原始batch信息构造 `batch_dict`
-
-#### `_assemble_gen_batch_output_from_queue_samples()`
-- 将队列中的多个样本重新组装成 `DataProto` 对象
-- 处理tensor和non-tensor数据
-- 合并timing信息和metadata
-
-#### `_extract_batch_dict_from_sample()`
-- 从样本数据中提取原始输入信息
-- 过滤掉生成的输出，保留prompt相关数据
-
-#### `_async_get_next_batch_from_queue()`
-- 异步获取下一批队列数据
-- 使用线程池实现非阻塞操作
-
-### 3. 数据流程
-
-1. **样本生成**: Rollouter生成样本并放入MessageQueue
-2. **样本获取**: Trainer从队列异步获取 `n × batch_size` 个样本
-3. **数据重组**: 将队列样本重新组装成标准的 `gen_batch_output` 格式
-4. **训练处理**: 样本进入标准的PPO训练流程
-
-### 4. 使用示例
-
-```python
-# 初始化trainer
-trainer = FullyAsyncTrainer(config, tokenizer, role_worker_mapping, resource_pool_manager)
-
-# 设置消息队列客户端
-trainer.set_message_queue_client(message_queue_client)
-
-# 开始训练（自动从队列获取数据）
-trainer.fit()
-```
-
-## 配置要求
-
-确保配置中包含以下参数：
-
-```yaml
-data:
-  train_batch_size: 128  # 训练批次大小
-
-actor_rollout_ref:
-  rollout:
-    n: 4  # 每个prompt的响应数量
-```
-
-## 特性
-
-- **异步处理**: 使用异步方式从队列获取数据，不阻塞训练流程
-- **数据完整性**: 保持原有的tensor和non-tensor数据结构
-- **元数据保留**: 保留timing、参数版本等重要信息
-- **兼容性**: 与现有的PPO训练流程完全兼容
-
-## 监控指标
-
-训练器提供以下统计指标：
-- `queue_sample_count`: 当前批次的样本数量
-- `rollout_param_versions`: 样本对应的参数版本
-- `sample_timestamps`: 样本生成时间戳
-- timing信息的平均值
-
-通过 `trainer.get_statistics()` 可以获取详细的训练统计信息。
-
diff --git a/recipe/fully_async_policy/TEST_GUIDE.md b/recipe/fully_async_policy/TEST_GUIDE.md
deleted file mode 100644
index 3933998cd84..00000000000
--- a/recipe/fully_async_policy/TEST_GUIDE.md
+++ /dev/null
@@ -1,313 +0,0 @@
-# Fully Async Policy 测试指南
-
-本文档介绍如何测试完全异步PPO训练系统的各种功能和性能。
-
-## 📋 测试概览
-
-我们提供了多种类型的测试，涵盖从单元测试到端到端测试的完整测试套件：
-
-### 测试类型
-1. **单元测试** - 测试各个组件的独立功能
-2. **集成测试** - 测试组件间的协作
-3. **端到端测试** - 测试完整的训练流程
-4. **性能基准测试** - 评估系统性能特征
-5. **压力测试** - 测试系统在极限条件下的表现
-
-## 🚀 快速开始
-
-### 1. 端到端测试
-最简单的方式是运行端到端测试，验证系统基本功能：
-
-```bash
-# 基本E2E测试
-./run_e2e_test.sh
-
-# 使用环境变量自定义配置
-NUM_GPUS=4 MODEL_ID=Qwen/Qwen2.5-0.5B-Instruct ./run_e2e_test.sh
-```
-
-### 2. 单元测试
-运行组件级别的单元测试：
-
-```bash
-# 运行所有单元测试
-cd unittest/
-python test_fully_async_components.py
-
-# 或者使用pytest（如果安装）
-pytest test_components_pytest.py -v
-```
-
-### 3. 性能基准测试
-评估系统性能特征：
-
-```bash
-# 运行完整的性能基准测试
-./run_benchmark.sh
-
-# 自定义GPU数量和策略
-NUM_GPUS=8 ACTOR_STRATEGY=fsdp2 ./run_benchmark.sh
-```
-
-## 📊 测试脚本详解
-
-### run_e2e_test.sh
-- **目的**: 端到端功能验证
-- **配置**: 最小化配置，快速验证基本功能
-- **时长**: 约5-10分钟
-- **用法**: `./run_e2e_test.sh`
-
-**环境变量**:
-- `NUM_GPUS`: GPU数量 (默认: 4)
-- `MODEL_ID`: 使用的模型ID (默认: Qwen/Qwen2.5-0.5B-Instruct)
-- `MODEL_PATH`: 模型存储路径
-
-### run_benchmark.sh
-- **目的**: 性能基准测试
-- **配置**: 多种配置组合，评估性能影响
-- **时长**: 约30-60分钟
-- **用法**: `./run_benchmark.sh`
-
-**测试覆盖**:
-1. 不同新鲜度阈值的影响
-2. 不同队列大小的性能表现
-3. 生成间隔对吞吐量的影响
-4. GPU资源分配的优化
-5. 暂停/恢复功能测试
-
-### test_fully_async_components.py
-- **目的**: 单元和集成测试
-- **配置**: 使用Mock对象的孤立测试
-- **时长**: 约2-5分钟
-- **用法**: `python unittest/test_fully_async_components.py`
-
-**测试覆盖**:
-- MessageQueue的基本功能
-- 参数同步器的重试机制
-- Rollouter的暂停/恢复
-- 新鲜度指标计算
-- 错误处理和超时机制
-
-## 🔧 测试配置
-
-### 最小化测试配置
-用于快速验证功能：
-
-```yaml
-# 基本配置
-data:
-  train_batch_size: 4
-  max_prompt_length: 512
-  max_response_length: 1024
-
-trainer:
-  total_training_steps: 2
-  n_gpus_per_node: 2
-
-rollout:
-  n_gpus_per_node: 2
-
-async_training:
-  staleness_threshold: 3
-  max_queue_size: 100
-```
-
-### 性能测试配置
-用于评估系统性能：
-
-```yaml
-# 性能配置
-data:
-  train_batch_size: 16
-  max_prompt_length: 512
-  max_response_length: 1024
-
-trainer:
-  total_training_steps: 10
-  n_gpus_per_node: 6
-
-rollout:
-  n_gpus_per_node: 2
-
-async_training:
-  staleness_threshold: 3
-  max_queue_size: 1000
-  generation_timeout: 30.0
-```
-
-## 📈 测试结果分析
-
-### 成功指标
-测试成功应满足以下条件：
-
-1. **功能正确性**:
-   - 样本成功生成和消费
-   - 参数同步正常工作
-   - 暂停/恢复功能响应
-
-2. **性能表现**:
-   - 样本生成速率 > 目标吞吐量
-   - 队列利用率在合理范围(50-80%)
-   - 新鲜度指标符合预期
-
-3. **稳定性**:
-   - 无内存泄漏
-   - 无死锁或竞争条件
-   - 优雅处理错误情况
-
-### 失败排查
-常见问题及解决方案：
-
-1. **Ray连接失败**:
-   ```bash
-   # 重新初始化Ray
-   ray stop
-   ray start --head
-   ```
-
-2. **GPU内存不足**:
-   ```bash
-   # 减少批大小或使用梯度检查点
-   data.train_batch_size=2
-   actor_rollout_ref.model.enable_gradient_checkpointing=True
-   ```
-
-3. **队列阻塞**:
-   ```bash
-   # 调整队列大小和新鲜度阈值
-   async_training.max_queue_size=500
-   async_training.staleness_threshold=5
-   ```
-
-## 🎯 特定功能测试
-
-### 测试暂停/恢复功能
-```python
-# 在Python脚本中测试
-import ray
-from fully_async_rollouter import FullyAsyncRollouter
-
-rollouter = FullyAsyncRollouter.remote(config, ...)
-
-# 测试暂停
-result = ray.get(rollouter.pause_rollout.remote())
-assert result == True
-
-# 测试恢复
-result = ray.get(rollouter.resume_rollout.remote())
-assert result == True
-```
-
-### 测试新鲜度控制
-
-```python
-# 测试样本过期机制
-queue = MessageQueueClient.remote(max_staleness=3)
-
-# 放入旧版本样本
-queue.put_sample.remote(sample, param_version=1)
-
-# 用新版本获取（应该被拒绝）
-result = ray.get(queue.get_samples.remote(current_param_version=5))
-assert result is None
-```
-
-### 测试参数同步
-```python
-# 测试同步重试机制
-sync = ParameterSynchronizer.remote(config, actor_wg, rollout_wg)
-
-# 测试成功同步
-result = ray.get(sync.sync_weights.remote())
-assert result == True
-```
-
-## 📝 测试报告
-
-### 基准测试报告
-运行`./run_benchmark.sh`后，会在`benchmark_results_*/`目录下生成：
-
-- `performance_report.md` - 详细的性能报告
-- `summary.txt` - 关键指标摘要
-- `*.log` - 各项测试的详细日志
-
-### 关键指标
-需要关注的性能指标：
-
-1. **吞吐量指标**:
-   - 样本生成速率 (samples/second)
-   - 训练步数完成速率 (steps/second)
-
-2. **延迟指标**:
-   - 样本平均年龄 (average sample age)
-   - 参数同步延迟 (sync latency)
-
-3. **资源利用率**:
-   - GPU利用率 (GPU utilization)
-   - 内存使用量 (memory usage)
-   - 队列利用率 (queue utilization)
-
-## 🔄 CI/CD 集成
-
-### GitHub Actions 示例
-```yaml
-name: Fully Async Policy Tests
-on: [push, pull_request]
-
-jobs:
-  test:
-    runs-on: ubuntu-latest
-    steps:
-    - uses: actions/checkout@v2
-    - name: Setup Python
-      uses: actions/setup-python@v2
-      with:
-        python-version: 3.9
-
-    - name: Install dependencies
-      run: |
-        pip install -r requirements.txt
-        pip install pytest
-
-    - name: Run unit tests
-      run: |
-        cd recipe/fully_async_policy/unittest/
-        python test_fully_async_components.py
-
-    - name: Run E2E test (if GPUs available)
-      run: |
-        if nvidia-smi; then
-          cd recipe/fully_async_policy/
-          ./run_e2e_test.sh
-        fi
-```
-
-## 🛠️ 开发者测试
-
-### 添加新测试
-1. **单元测试**: 在`unittest/test_fully_async_components.py`中添加新的测试类
-2. **集成测试**: 在相应的集成测试类中添加新方法
-3. **性能测试**: 在`run_benchmark.sh`中添加新的基准测试场景
-
-### 测试最佳实践
-1. **隔离性**: 每个测试应该独立，不依赖其他测试
-2. **可重现性**: 使用固定的随机种子和确定性配置
-3. **清理**: 测试结束后清理资源，避免影响后续测试
-4. **文档**: 为新测试添加清晰的文档说明
-
-## ❓ 常见问题
-
-**Q: 测试失败，提示Ray连接错误**
-A: 确保Ray集群正常运行，或重新启动Ray
-
-**Q: 内存不足错误**
-A: 减少批大小或在测试配置中启用参数卸载
-
-**Q: 测试运行时间过长**
-A: 使用更小的模型或减少训练步数进行快速测试
-
-**Q: 如何添加自定义测试？**
-A: 参考现有测试模式，在对应的测试文件中添加新的测试方法
-
-通过这套完整的测试系统，可以确保fully async policy系统的可靠性、性能和稳定性。
-
diff --git a/recipe/fully_async_policy/UNIFIED_PARAM_SYNC.md b/recipe/fully_async_policy/UNIFIED_PARAM_SYNC.md
deleted file mode 100644
index e816968f8fc..00000000000
--- a/recipe/fully_async_policy/UNIFIED_PARAM_SYNC.md
+++ /dev/null
@@ -1,143 +0,0 @@
-# 统一参数同步器使用指南 (Unified Parameter Synchronizer Guide)
-
-本文档说明了新的统一参数同步器 `UnifiedParameterSynchronizer` 的使用方法。该类合并了原有的多个同步器类的功能，提供了更简洁和统一的接口。
-
-## 🏗️ 类合并说明
-
-### 原有类结构（已合并）
-- `ParameterSynchronizer` - 基础参数同步器
-- `ParameterSyncManager` - Ray Actor形式的参数同步管理器
-- `AsyncParameterSynchronizer` - 异步参数同步器
-
-### 新的统一类
-- `UnifiedParameterSynchronizer` - 统一参数同步器，包含所有功能
-
-## 🚀 使用方法
-
-### 1. 异步训练模式（推荐）
-```python
-from recipe.fully_async_policy.param_sync import UnifiedParameterSynchronizer
-
-# 创建异步模式的参数同步器
-param_synchronizer = UnifiedParameterSynchronizer(
-    config=config,
-    trainer_actor=trainer_actor,
-    rollouter_actor=rollouter_actor
-)
-
-# 同步参数到rollouter
-success = param_synchronizer.sync_to_rollouter(new_version=1)
-```
-
-### 2. Ray Actor模式
-```python
-from recipe.fully_async_policy.param_sync import ParameterSyncManager
-
-# 创建Ray remote参数同步管理器
-sync_manager = ParameterSyncManager.remote(config)
-
-# 注册workers
-success = ray.get(sync_manager.register_workers.remote(actor_workers, rollout_workers))
-
-# 执行同步
-success = ray.get(sync_manager.sync_parameters.remote())
-```
-
-### 3. 传统模式
-```python
-from recipe.fully_async_policy.param_sync import UnifiedParameterSynchronizer
-
-# 创建传统模式的参数同步器
-synchronizer = UnifiedParameterSynchronizer(config)
-
-# 初始化同步组
-success = synchronizer.initialize_sync_group(actor_workers, rollout_workers)
-
-# 同步权重
-success = synchronizer.sync_weights(actor_workers, rollout_workers)
-```
-
-## 🔄 向后兼容性
-
-为了确保现有代码的兼容性，提供了以下别名：
-
-```python
-# 这些别名指向 UnifiedParameterSynchronizer
-ParameterSynchronizer = UnifiedParameterSynchronizer
-AsyncParameterSynchronizer = UnifiedParameterSynchronizer
-
-# Ray remote版本
-ParameterSyncManager = ray.remote(UnifiedParameterSynchronizer)
-```
-
-现有代码无需修改即可使用新的统一同步器。
-
-## ⚙️ 初始化参数
-
-```python
-def __init__(self, config, trainer_actor=None, rollouter_actor=None, as_ray_actor=False):
-```
-
-- `config`: 配置对象（必需）
-- `trainer_actor`: trainer actor引用（用于async模式）
-- `rollouter_actor`: rollouter actor引用（用于async模式）
-- `as_ray_actor`: 是否作为Ray actor使用
-
-## 📊 主要方法
-
-### 异步模式
-- `sync_to_rollouter(new_version)`: 同步参数到rollouter
-- `get_current_version()`: 获取当前参数版本
-
-### Ray Actor模式
-- `register_workers(actor_workers, rollout_workers)`: 注册workers
-- `sync_parameters()`: 执行参数同步
-
-### 传统模式
-- `initialize_sync_group(actor_workers, rollout_workers)`: 初始化同步组
-- `sync_weights(actor_workers, rollout_workers)`: 同步权重
-
-### 通用方法
-- `get_statistics()`: 获取统计信息
-- `get_weights_info()`: 获取权重信息
-- `cleanup()`: 清理资源
-
-## 📈 统计信息
-
-```python
-stats = synchronizer.get_statistics()
-# 返回：
-{
-    "sync_count": 15,
-    "sync_failures": 0,
-    "last_sync_time": 1640995200.0,
-    "sync_group_initialized": True,
-    "current_param_version": 15,
-    "current_version": 15,
-    "is_ready": True  # 仅在Ray actor模式下
-}
-```
-
-## 🎯 优势
-
-1. **统一接口**: 一个类支持所有同步模式
-2. **向后兼容**: 现有代码无需修改
-3. **灵活配置**: 支持多种初始化方式
-4. **完整功能**: 包含所有原有类的功能
-5. **简化维护**: 减少代码重复，便于维护
-
-## 🔧 配置示例
-
-```yaml
-async_training:
-  max_sync_retries: 3
-  sync_timeout: 30.0
-  sync_retry_delay: 1.0
-  sync_monitor_interval: 60.0
-  staleness_threshold: 3
-```
-
----
-
-*统一参数同步器简化了参数同步的使用，同时保持了所有原有功能的完整性。*
-
diff --git a/recipe/fully_async_policy/fully_async_main.py b/recipe/fully_async_policy/fully_async_main.py
index aa5ac81f48a..cf5c0e29d5c 100644
--- a/recipe/fully_async_policy/fully_async_main.py
+++ b/recipe/fully_async_policy/fully_async_main.py
@@ -248,16 +248,18 @@ def _initialize_components(self, config) -> None:
         print("Setting up parameter synchronization...")
         from recipe.fully_async_policy.param_sync import ParameterSynchronizer
 
-        param_synchronizer = ParameterSynchronizer(
+        param_synchronizer = ParameterSynchronizer.remote(
             config=config,
-            actor_wg=self.components["trainer"],
-            rollout_wg=self.components["rollouter"],
+            trainer=self.components["trainer"],
+            rollouter=self.components["rollouter"],
         )
 
         # 将参数同步器设置到trainer和rollouter
         ray.get(self.components["trainer"].set_parameter_synchronizer.remote(param_synchronizer))
         ray.get(self.components["rollouter"].set_parameter_synchronizer.remote(param_synchronizer))
 
+        ray.get(param_synchronizer.sync_weights.remote(0))
+
         self.components["param_synchronizer"] = param_synchronizer
         print("Parameter synchronizer initialized successfully")
         print("All components initialized successfully")
@@ -332,6 +334,64 @@ def _run_training_loop(self):
 
         print("Training completed or interrupted")
 
+    def _cleanup_resources(self):
+        """清理所有资源"""
+        try:
+            # 关闭线程池
+            if hasattr(self, 'thread_executor') and self.thread_executor:
+                print("Shutting down thread executor...")
+                self.thread_executor.shutdown(wait=True, timeout=10.0)
+
+            # 清理logger
+            if hasattr(self, 'logger') and self.logger:
+                try:
+                    if hasattr(self.logger, 'close'):
+                        self.logger.close()
+                    elif hasattr(self.logger, 'finish'):
+                        self.logger.finish()
+                except Exception as e:
+                    print(f"Error closing logger: {e}")
+
+            # 清理validation logger
+            if hasattr(self, 'validation_generations_logger') and self.validation_generations_logger:
+                try:
+                    if hasattr(self.validation_generations_logger, 'close'):
+                        self.validation_generations_logger.close()
+                except Exception as e:
+                    print(f"Error closing validation logger: {e}")
+
+            # 清理异步rollout管理器
+            if hasattr(self, "async_rollout_manager") and self.async_rollout_manager:
+                try:
+                    if hasattr(self.async_rollout_manager, 'shutdown'):
+                        self.async_rollout_manager.shutdown()
+                except Exception as e:
+                    print(f"Error cleaning up async rollout manager: {e}")
+
+            # 清理worker groups
+            if hasattr(self, 'rollout_wg') and self.rollout_wg:
+                try:
+                    if hasattr(self.rollout_wg, 'shutdown'):
+                        self.rollout_wg.shutdown()
+                except Exception as e:
+                    print(f"Error cleaning up rollout worker group: {e}")
+
+            # 强制垃圾回收
+            import gc
+            gc.collect()
+
+        except Exception as e:
+            print(f"Error during resource cleanup: {e}")
+
+    def __del__(self):
+        """析构函数 - 确保资源清理"""
+        try:
+            if hasattr(self, 'running') and self.running:
+                print("Warning: FullyAsyncRollouter being deleted while still running")
+                self.shutdown()
+        except Exception as e:
+            print(f"Error in destructor: {e}")
+
 
 @hydra.main(config_path="config", config_name="fully_async_ppo_trainer", version_base=None)
 def main(config):
diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py
index c760215c580..1ca9c7b0d2e 100644
--- a/recipe/fully_async_policy/fully_async_rollouter.py
+++ b/recipe/fully_async_policy/fully_async_rollouter.py
@@ -155,6 +155,10 @@ def set_parameter_synchronizer(self, param_synchronizer):
         with self.lock:
             self.param_synchronizer = param_synchronizer
 
+    def get_rollout_wg(self):
+        """获取 rollout worker group"""
+        return self.rollout_wg
+
     def _validate_config(self):
         # 验证异步训练配置
         if not hasattr(self.config, "async_training"):
diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py
index 20bae1a6a64..afef0968a04 100644
--- a/recipe/fully_async_policy/fully_async_trainer.py
+++ b/recipe/fully_async_policy/fully_async_trainer.py
@@ -46,36 +46,17 @@ class FullyAsyncTrainer(RayPPOTrainer):
     """
 
     def __init__(
-        self,
-        config,
-        tokenizer,
-        role_worker_mapping: dict[Role, WorkerType],
-        resource_pool_manager: ResourcePoolManager,
-        ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup,
-        processor=None,
-        reward_fn=None,
-        val_reward_fn=None,
-        device_name=None,
+            self,
+            config,
+            tokenizer,
+            role_worker_mapping: dict[Role, WorkerType],
+            resource_pool_manager: ResourcePoolManager,
+            ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup,
+            processor=None,
+            reward_fn=None,
+            val_reward_fn=None,
+            device_name=None,
     ):
-        """
-        Initialize distributed PPO trainer with Ray backend.
-        Note that this trainer runs on the driver process on a single CPU/GPU node.
-
-        Args:
-            config: Configuration object containing training parameters.
-            tokenizer: Tokenizer used for encoding and decoding text.
-            role_worker_mapping (dict[Role, WorkerType]): Mapping from roles to worker classes.
-            resource_pool_manager (ResourcePoolManager): Manager for Ray resource pools.
-            ray_worker_group_cls (RayWorkerGroup, optional): Class for Ray worker groups. Defaults to RayWorkerGroup.
-            processor: Optional data processor, used for multimodal data
-            reward_fn: Function for computing rewards during training.
-            val_reward_fn: Function for computing rewards during validation.
-            train_dataset (Optional[Dataset], optional): Training dataset. Defaults to None.
-            val_dataset (Optional[Dataset], optional): Validation dataset. Defaults to None.
-            collate_fn: Function to collate data samples into batches.
-            train_sampler (Optional[Sampler], optional): Sampler for the training dataset. Defaults to None.
-            device_name (str, optional): Device name for training (e.g., "cuda", "cpu"). Defaults to None.
-        """
 
         # Store the tokenizer for text processing
         self.tokenizer = tokenizer
@@ -141,59 +122,9 @@ def set_parameter_synchronizer(self, param_synchronizer):
         with self.lock:
             self.param_synchronizer = param_synchronizer
 
-    def _get_actor_params(self):
-        """
-        获取actor参数 - 基于one_step_off_policy的实现
-        """
-        if not hasattr(self, "actor_wg") or self.actor_wg is None:
-            raise ValueError("Actor worker group not initialized")
-
-        # 从actor worker group获取参数
-        actor_workers = self.actor_wg.workers
-        if not actor_workers:
-            raise ValueError("No actor workers available")
-
-        # 获取第一个actor worker的参数信息
-        params_future = actor_workers[0]._get_actor_params.remote()
-        params = ray.get(params_future, timeout=10.0)
-        return params
-
-    def get_actor_weights_info(self):
-        """
-        获取actor权重信息 - 基于one_step_off_policy的模式
-        """
-        if hasattr(self, "_weights_info") and self._weights_info is not None:
-            return self._weights_info
-
-        if not hasattr(self, "actor_wg") or self.actor_wg is None:
-            raise ValueError("Actor worker group not initialized")
-
-        # 从actor worker group获取权重信息
-        weights_info_future = self.actor_wg.get_actor_weights_info.remote()
-        weights_info = ray.get(weights_info_future, timeout=10.0)
-
-        # 缓存权重信息
-        self._weights_info = weights_info[0] if isinstance(weights_info, list) else weights_info
-        return self._weights_info
-
-    def sync_rollout_weights(self):
-        """
-        同步rollout权重 - Actor端的同步操作
-        """
-        if not hasattr(self, "actor_wg") or self.actor_wg is None:
-            logger.warning("Actor worker group not initialized for sync")
-            return False
-
-        try:
-            # 触发actor worker group的参数同步
-            sync_future = self.actor_wg.sync_rollout_weights.remote()
-            ray.get(sync_future, timeout=30.0)
-            logger.debug("Actor weights sync completed")
-            return True
-
-        except Exception as e:
-            logger.error(f"Failed to sync actor weights: {e}")
-            return False
+    def get_actor_wg(self):
+        """获取 actor worker group"""
+        return self.actor_wg
 
     def _get_samples_from_queue(self) -> tuple[None, None] | tuple[int, Any]:
         """
@@ -457,22 +388,14 @@ def fit(self):
             # self._post_batch_processing(batch)
 
             print("step end")
-
             # 在训练步骤结束后触发参数同步
             self._trigger_parameter_sync_after_step()
-
-            # TODO: make a canonical logger that supports various backend
-            print(data=metrics, step=self.global_steps)
-
             # progress_bar.update(1)
             self.global_steps += 1
-            print("is_last_step")
-            # if is_last_step:
-            #     pprint(f"Final validation metrics: {last_val_metrics}")
-            #     print("is_last_step")
-            #     # progress_bar.close()
-            #     return
-            ray.get(self.param_synchronizer.sync_weights.remote(self.global_steps))
+            print(f"is_last_step {is_last_step}")
+            if is_last_step:
+                print("is_last_step")
+                return
 
     def get_statistics(self) -> dict:
         """获取训练统计信息"""
@@ -494,79 +417,12 @@ def _trigger_parameter_sync_after_step(self):
         在训练步骤结束后触发参数同步
         这确保rollouter总是使用最新训练的参数
         """
-        if not self.param_synchronizer:
-            logger.debug("No parameter synchronizer available, skipping sync")
-            return
-
-        try:
-            # 更新参数版本号
-            new_version = self.current_param_version + 1
-
-            print(
-                f"[TRAINER] Triggering parameter sync after training step {self.global_steps}, version: {new_version}"
-            )
-            logger.info(f"Triggering parameter sync after training step {self.global_steps}, version: {new_version}")
-
-            # 异步触发参数同步，不阻塞训练流程
-            import threading
-
-            sync_thread = threading.Thread(target=self._async_parameter_sync, args=(new_version,), daemon=True)
-            sync_thread.start()
-
-        except Exception as e:
-            logger.error(f"Error triggering parameter sync: {e}")
-
-    def _async_parameter_sync(self, new_version: int):
-        """
-        异步执行参数同步，避免阻塞训练流程
-
-        Args:
-            new_version: 新的参数版本号
-        """
-        try:
-            # 执行参数同步
-            success = self.param_synchronizer.sync_to_rollouter(new_version)
-
-            if success:
-                # 更新本地参数版本
-                with self.lock:
-                    self.current_param_version = new_version
-                    self.param_sync_count += 1
-
-                print(f"[TRAINER] Parameter sync completed successfully for version {new_version}")
-                logger.info(f"Parameter sync completed successfully for version {new_version}")
-            else:
-                print(f"[TRAINER] Parameter sync failed for version {new_version}")
-                logger.warning(f"Parameter sync failed for version {new_version}")
-
-        except Exception as e:
-            logger.error(f"Error in async parameter sync: {e}")
-
-    def update_param_version(self, param_version: int) -> bool:
-        """
-        更新trainer的参数版本，用于跟踪与rollouter的参数同步状态
-
-        Args:
-            param_version: 新的参数版本号
-
-        Returns:
-            bool: 是否成功更新
-        """
-        try:
-            with self.lock:
-                old_version = self.current_param_version
-                self.current_param_version = param_version
-                self.param_sync_count += 1
-
-                # 更新消息队列的参数版本
-                if self.message_queue_client:
-                    self.message_queue_client.update_param_version(param_version)
-
-                print(f"Updated trainer param version from {old_version} to {param_version}")
-                return True
-        except Exception as e:
-            logger.error(f"Error updating param version: {e}")
-            return False
+        new_version = self.current_param_version + 1
+        print(
+            f"[TRAINER] Triggering parameter sync after training step {self.global_steps}, version: {new_version}"
+        )
+        logger.info(f"Triggering parameter sync after training step {self.global_steps}, version: {new_version}")
+        ray.get(self.param_synchronizer.sync_weights.remote(new_version))
 
     def _compute_sample_freshness_metrics(self, batch_samples: list[QueueSample]) -> dict:
         """
diff --git a/recipe/fully_async_policy/param_sync.py b/recipe/fully_async_policy/param_sync.py
index 10843302786..3657916dda0 100644
--- a/recipe/fully_async_policy/param_sync.py
+++ b/recipe/fully_async_policy/param_sync.py
@@ -28,18 +28,13 @@ class ParameterSynchronizer:
     合并了原有的多个同步器类的功能
     """
 
-    def __init__(self, config, actor_wg, rollout_wg):
-        """
-        初始化统一参数同步器
-
-        Args:
-            config: 配置对象
-            actor_wg: trainer actor引用（用于async模式）
-            rollout_wg: rollouter actor引用（用于async模式）
-        """
+    def __init__(self, config, trainer, rollouter):
+
         self.config = config
-        self.actor_wg = actor_wg
-        self.rollout_wg = rollout_wg
+        self.trainer = trainer
+        self.rollouter = rollouter
+        self.actor_wg = ray.get(trainer.get_actor_wg.remote())
+        self.rollout_wg = ray.get(rollouter.get_rollout_wg.remote())
 
         # 基础属性
         self.weights_info = None
@@ -78,5 +73,9 @@ def _init_sync_group(self):
     def sync_weights(self, version):
         self.current_version = version
         logger.debug(f"Starting weight synchronization (version {self.current_version})...")
+
+        # TODO 暂停及恢复rollout
+        print("TODO 暂停及恢复rollout")
         self.actor_wg.sync_rollout_weights()
         ray.get(self.rollout_wg.sync_rollout_weights())
+        print("sync_weights success")
diff --git a/recipe/fully_async_policy/run_benchmark.sh b/recipe/fully_async_policy/run_benchmark.sh
deleted file mode 100755
index f9bfaceaa32..00000000000
--- a/recipe/fully_async_policy/run_benchmark.sh
+++ /dev/null
@@ -1,307 +0,0 @@
-#!/usr/bin/env bash
-set -xeuo pipefail
-
-# Benchmark script for fully_async_policy performance testing
-# This script runs various performance tests to evaluate the async training system
-
-NUM_GPUS=${NUM_GPUS:-8}
-ACTOR_STRATEGY=${ACTOR_STRATEGY:-"fsdp2"}
-
-# Download model if not exists
-MODEL_ID=${MODEL_ID:-Qwen/Qwen2.5-0.5B-Instruct}
-MODEL_PATH=${MODEL_PATH:-${HOME}/models/${MODEL_ID}}
-huggingface-cli download "${MODEL_ID}" --local-dir "${MODEL_PATH}"
-
-# Create benchmark results directory
-BENCHMARK_DIR="benchmark_results_$(date +%Y%m%d_%H%M%S)"
-mkdir -p "${BENCHMARK_DIR}"
-
-echo "Starting fully_async_policy performance benchmark..."
-echo "Results will be saved to: ${BENCHMARK_DIR}"
-
-# Benchmark parameters
-n_gpus_rollout=2
-n_gpus_training=$((NUM_GPUS - n_gpus_rollout))
-
-# Common parameters
-train_prompt_bsz=16
-n_resp_per_prompt=4
-train_prompt_mini_bsz=4
-max_prompt_length=512
-max_response_length=1024
-
-# Benchmark Test 1: Different staleness thresholds
-echo "=== Benchmark Test 1: Staleness Threshold Impact ==="
-staleness_values=(1 3 5 10)
-
-for staleness in "${staleness_values[@]}"; do
-    echo "Testing staleness threshold: ${staleness}"
-
-    exp_name="benchmark-staleness-${staleness}"
-    log_file="${BENCHMARK_DIR}/staleness_${staleness}.log"
-
-    timeout 300 python3 -m recipe.fully_async_policy.fully_async_main \
-        data.train_files="${HOME}/data/gsm8k/train.parquet" \
-        data.val_files="${HOME}/data/gsm8k/test.parquet" \
-        data.prompt_key=prompt \
-        data.truncation='left' \
-        data.max_prompt_length=${max_prompt_length} \
-        data.max_response_length=${max_response_length} \
-        data.train_batch_size=${train_prompt_bsz} \
-        actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
-        actor_rollout_ref.model.path="${MODEL_PATH}" \
-        trainer.logger=['console'] \
-        trainer.project_name='verl-benchmark' \
-        trainer.experiment_name="${exp_name}" \
-        trainer.val_before_train=False \
-        trainer.test_freq=-1 \
-        trainer.save_freq=-1 \
-        trainer.total_epochs=1 \
-        trainer.total_training_steps=10 \
-        trainer.n_gpus_per_node=${n_gpus_training} \
-        rollout.n_gpus_per_node=${n_gpus_rollout} \
-        async_training.staleness_threshold=${staleness} \
-        async_training.max_staleness_allowed=$((staleness + 2)) \
-        > "${log_file}" 2>&1 || echo "Test with staleness ${staleness} timed out or failed"
-
-    # Extract key metrics from log
-    if [ -f "${log_file}" ]; then
-        echo "=== Metrics for staleness=${staleness} ===" >> "${BENCHMARK_DIR}/summary.txt"
-        grep -E "(Generated.*batches|Dropped.*samples|param_version|Queue size)" "${log_file}" | tail -5 >> "${BENCHMARK_DIR}/summary.txt" || true
-        echo "" >> "${BENCHMARK_DIR}/summary.txt"
-    fi
-done
-
-# Benchmark Test 2: Different queue sizes
-echo "=== Benchmark Test 2: Queue Size Impact ==="
-queue_sizes=(50 100 500 1000)
-
-for queue_size in "${queue_sizes[@]}"; do
-    echo "Testing queue size: ${queue_size}"
-
-    exp_name="benchmark-queue-${queue_size}"
-    log_file="${BENCHMARK_DIR}/queue_${queue_size}.log"
-
-    timeout 300 python3 -m recipe.fully_async_policy.fully_async_main \
-        data.train_files="${HOME}/data/gsm8k/train.parquet" \
-        data.val_files="${HOME}/data/gsm8k/test.parquet" \
-        data.prompt_key=prompt \
-        data.truncation='left' \
-        data.max_prompt_length=${max_prompt_length} \
-        data.max_response_length=${max_response_length} \
-        data.train_batch_size=${train_prompt_bsz} \
-        actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
-        actor_rollout_ref.model.path="${MODEL_PATH}" \
-        trainer.logger=['console'] \
-        trainer.project_name='verl-benchmark' \
-        trainer.experiment_name="${exp_name}" \
-        trainer.val_before_train=False \
-        trainer.test_freq=-1 \
-        trainer.save_freq=-1 \
-        trainer.total_epochs=1 \
-        trainer.total_training_steps=10 \
-        trainer.n_gpus_per_node=${n_gpus_training} \
-        rollout.n_gpus_per_node=${n_gpus_rollout} \
-        async_training.max_queue_size=${queue_size} \
-        > "${log_file}" 2>&1 || echo "Test with queue size ${queue_size} timed out or failed"
-
-    # Extract key metrics from log
-    if [ -f "${log_file}" ]; then
-        echo "=== Metrics for queue_size=${queue_size} ===" >> "${BENCHMARK_DIR}/summary.txt"
-        grep -E "(Generated.*batches|Queue size|memory)" "${log_file}" | tail -5 >> "${BENCHMARK_DIR}/summary.txt" || true
-        echo "" >> "${BENCHMARK_DIR}/summary.txt"
-    fi
-done
-
-# Benchmark Test 3: Different batch generation intervals
-echo "=== Benchmark Test 3: Generation Interval Impact ==="
-intervals=(0.0 0.1 0.5 1.0)
-
-for interval in "${intervals[@]}"; do
-    echo "Testing batch generation interval: ${interval}s"
-
-    exp_name="benchmark-interval-${interval}"
-    log_file="${BENCHMARK_DIR}/interval_${interval}.log"
-
-    timeout 300 python3 -m recipe.fully_async_policy.fully_async_main \
-        data.train_files="${HOME}/data/gsm8k/train.parquet" \
-        data.val_files="${HOME}/data/gsm8k/test.parquet" \
-        data.prompt_key=prompt \
-        data.truncation='left' \
-        data.max_prompt_length=${max_prompt_length} \
-        data.max_response_length=${max_response_length} \
-        data.train_batch_size=${train_prompt_bsz} \
-        actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
-        actor_rollout_ref.model.path="${MODEL_PATH}" \
-        trainer.logger=['console'] \
-        trainer.project_name='verl-benchmark' \
-        trainer.experiment_name="${exp_name}" \
-        trainer.val_before_train=False \
-        trainer.test_freq=-1 \
-        trainer.save_freq=-1 \
-        trainer.total_epochs=1 \
-        trainer.total_training_steps=10 \
-        trainer.n_gpus_per_node=${n_gpus_training} \
-        rollout.n_gpus_per_node=${n_gpus_rollout} \
-        async_training.batch_generation_interval=${interval} \
-        > "${log_file}" 2>&1 || echo "Test with interval ${interval} timed out or failed"
-
-    # Extract key metrics from log
-    if [ -f "${log_file}" ]; then
-        echo "=== Metrics for interval=${interval}s ===" >> "${BENCHMARK_DIR}/summary.txt"
-        grep -E "(Generated.*batches|generation_timestamp)" "${log_file}" | tail -5 >> "${BENCHMARK_DIR}/summary.txt" || true
-        echo "" >> "${BENCHMARK_DIR}/summary.txt"
-    fi
-done
-
-# Benchmark Test 4: Resource allocation comparison
-echo "=== Benchmark Test 4: Resource Allocation Comparison ==="
-
-# Test different rollout/training GPU distributions
-if [ "${NUM_GPUS}" -ge "6" ]; then
-    gpu_configs=(
-        "1,$((NUM_GPUS - 1))"  # 1 rollout, rest training
-        "2,$((NUM_GPUS - 2))"  # 2 rollout, rest training
-        "3,$((NUM_GPUS - 3))"  # 3 rollout, rest training
-    )
-
-    for config in "${gpu_configs[@]}"; do
-        IFS=',' read -r rollout_gpus training_gpus <<< "$config"
-
-        echo "Testing GPU allocation: ${rollout_gpus} rollout, ${training_gpus} training"
-
-        exp_name="benchmark-gpu-${rollout_gpus}r-${training_gpus}t"
-        log_file="${BENCHMARK_DIR}/gpu_${rollout_gpus}_${training_gpus}.log"
-
-        timeout 300 python3 -m recipe.fully_async_policy.fully_async_main \
-            data.train_files="${HOME}/data/gsm8k/train.parquet" \
-            data.val_files="${HOME}/data/gsm8k/test.parquet" \
-            data.prompt_key=prompt \
-            data.truncation='left' \
-            data.max_prompt_length=${max_prompt_length} \
-            data.max_response_length=${max_response_length} \
-            data.train_batch_size=${train_prompt_bsz} \
-            actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
-            actor_rollout_ref.model.path="${MODEL_PATH}" \
-            trainer.logger=['console'] \
-            trainer.project_name='verl-benchmark' \
-            trainer.experiment_name="${exp_name}" \
-            trainer.val_before_train=False \
-            trainer.test_freq=-1 \
-            trainer.save_freq=-1 \
-            trainer.total_epochs=1 \
-            trainer.total_training_steps=10 \
-            trainer.n_gpus_per_node=${training_gpus} \
-            rollout.n_gpus_per_node=${rollout_gpus} \
-            > "${log_file}" 2>&1 || echo "Test with GPU config ${config} timed out or failed"
-
-        # Extract key metrics from log
-        if [ -f "${log_file}" ]; then
-            echo "=== Metrics for ${rollout_gpus}r/${training_gpus}t GPUs ===" >> "${BENCHMARK_DIR}/summary.txt"
-            grep -E "(Generated.*batches|training.*steps|GPU)" "${log_file}" | tail -5 >> "${BENCHMARK_DIR}/summary.txt" || true
-            echo "" >> "${BENCHMARK_DIR}/summary.txt"
-        fi
-    done
-fi
-
-# Benchmark Test 5: Pause/Resume Performance
-echo "=== Benchmark Test 5: Pause/Resume Performance Test ==="
-log_file="${BENCHMARK_DIR}/pause_resume.log"
-
-# Start the training in background
-python3 -m recipe.fully_async_policy.fully_async_main \
-    data.train_files="${HOME}/data/gsm8k/train.parquet" \
-    data.val_files="${HOME}/data/gsm8k/test.parquet" \
-    data.prompt_key=prompt \
-    data.truncation='left' \
-    data.max_prompt_length=${max_prompt_length} \
-    data.max_response_length=${max_response_length} \
-    data.train_batch_size=${train_prompt_bsz} \
-    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
-    actor_rollout_ref.model.path="${MODEL_PATH}" \
-    trainer.logger=['console'] \
-    trainer.project_name='verl-benchmark-pause' \
-    trainer.experiment_name='pause-resume-test' \
-    trainer.val_before_train=False \
-    trainer.test_freq=-1 \
-    trainer.save_freq=-1 \
-    trainer.total_epochs=1 \
-    trainer.total_training_steps=20 \
-    trainer.n_gpus_per_node=${n_gpus_training} \
-    rollout.n_gpus_per_node=${n_gpus_rollout} \
-    > "${log_file}" 2>&1 &
-
-TRAINING_PID=$!
-
-# Note: In actual implementation, we would need a way to remotely control pause/resume
-# This is a placeholder for testing the pause/resume functionality
-echo "Training started with PID: ${TRAINING_PID}"
-echo "Pause/resume testing would require remote control interface" >> "${BENCHMARK_DIR}/summary.txt"
-
-# Wait a bit and then kill the training (simulating early termination)
-sleep 60
-if kill -0 $TRAINING_PID 2>/dev/null; then
-    echo "Stopping training process..."
-    kill $TRAINING_PID
-fi
-
-# Generate performance report
-echo "=== Generating Performance Report ==="
-report_file="${BENCHMARK_DIR}/performance_report.md"
-
-cat > "${report_file}" << EOF
-# Fully Async Policy Performance Benchmark Report
-
-**Date:** $(date)
-**Hardware:** ${NUM_GPUS} GPUs
-**Strategy:** ${ACTOR_STRATEGY}
-**Model:** ${MODEL_ID}
-
-## Test Configuration
-- Training Batch Size: ${train_prompt_bsz}
-- Responses per Prompt: ${n_resp_per_prompt}
-- Max Prompt Length: ${max_prompt_length}
-- Max Response Length: ${max_response_length}
-
-## Results Summary
-$(cat "${BENCHMARK_DIR}/summary.txt" 2>/dev/null || echo "No summary available")
-
-## Log Files
-EOF
-
-# List all log files
-for log_file in "${BENCHMARK_DIR}"/*.log; do
-    if [ -f "$log_file" ]; then
-        echo "- $(basename "${log_file}")" >> "${report_file}"
-    fi
-done
-
-cat >> "${report_file}" << EOF
-
-## Key Findings
-- **Staleness Impact:** Lower staleness thresholds may increase sample dropping but improve freshness
-- **Queue Size Impact:** Larger queues provide better buffering but use more memory
-- **Generation Interval:** Shorter intervals increase throughput but may stress the system
-- **GPU Allocation:** Balance between generation and training capacity is crucial
-- **Pause/Resume:** System should handle interruptions gracefully
-
-## Recommendations
-1. Start with staleness_threshold=3 for good balance
-2. Use queue_size=500-1000 for most workloads
-3. Set generation_interval=0.1s for good performance
-4. Allocate 2-3 GPUs for rollout in typical 8-GPU setups
-5. Monitor queue utilization and adjust based on workload
-
-EOF
-
-echo "Benchmark completed!"
-echo "Results saved to: ${BENCHMARK_DIR}/"
-echo "Performance report: ${report_file}"
-
-# Print summary to console
-if [ -f "${BENCHMARK_DIR}/summary.txt" ]; then
-    echo ""
-    echo "=== BENCHMARK SUMMARY ==="
-    cat "${BENCHMARK_DIR}/summary.txt"
-fi
-
diff --git a/recipe/fully_async_policy/run_fully_async_example.sh b/recipe/fully_async_policy/run_fully_async_example.sh
deleted file mode 100644
index cd2265cde0d..00000000000
--- a/recipe/fully_async_policy/run_fully_async_example.sh
+++ /dev/null
@@ -1,147 +0,0 @@
-#!/bin/bash
-# Copyright 2025 Meituan Ltd. and/or its affiliates
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-set -x
-
-# 实验配置
-project_name='FullyAsyncPPO'
-exp_name='async-qwen2.5-7b-test'
-
-# 模型和数据路径
-MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen2.5-7B-Instruct"}
-TRAIN_FILE=${TRAIN_FILE:-"~/data/train.parquet"}
-VAL_FILE=${VAL_FILE:-"~/data/val.parquet"}
-
-# 硬件配置
-NNODES=${NNODES:-1}
-NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
-
-# 异步训练资源分配
-n_gpus_rollout=3  # rollout专用GPU数量
-n_gpus_training=$((NGPUS_PER_NODE - n_gpus_rollout))  # 训练GPU数量
-
-echo "==================================="
-echo "完全异步PPO训练启动"
-echo "==================================="
-echo "模型路径: $MODEL_PATH"
-echo "训练数据: $TRAIN_FILE"
-echo "验证数据: $VAL_FILE"
-echo "节点数: $NNODES"
-echo "每节点GPU数: $NGPUS_PER_NODE"
-echo "Rollout GPU数: $n_gpus_rollout"
-echo "训练GPU数: $n_gpus_training"
-echo "==================================="
-
-# 算法参数
-temperature=1.0
-top_p=1.0
-top_k=-1
-
-# 序列长度
-max_prompt_length=1024
-max_response_length=1024
-
-# 异步训练参数
-staleness_threshold=3
-max_queue_size=1000
-min_batch_count=1
-batch_timeout=30.0
-
-# 训练参数
-train_batch_size=128
-total_training_steps=1000
-save_freq=100
-val_freq=50
-
-# 设置环境变量
-export NCCL_DEBUG=WARN
-export VLLM_USE_V1=1
-export VERL_LOGGING_LEVEL=INFO
-
-# 启动训练
-python -m recipe.one_step_off_policy.fully_async_main \
-    trainer.project_name="$project_name" \
-    trainer.experiment_name="$exp_name" \
-    trainer.device=cuda \
-    trainer.nnodes=$NNODES \
-    trainer.n_gpus_per_node=$NGPUS_PER_NODE \
-    data.train_files="$TRAIN_FILE" \
-    data.val_files="$VAL_FILE" \
-    data.train_batch_size=$train_batch_size \
-    data.max_prompt_length=$max_prompt_length \
-    data.max_response_length=$max_response_length \
-    data.train_files="$TRAIN_FILE" \
-    data.val_files="$VAL_FILE" \
-    data.train_batch_size=$train_batch_size \
-    data.max_prompt_length=$max_prompt_length \
-    data.max_response_length=$max_response_length \
-    \
-    # 模型配置
-    actor_rollout_ref.model.path="$MODEL_PATH" \
-    actor_rollout_ref.model.lora_rank=64 \
-    actor_rollout_ref.model.lora_alpha=128 \
-    \
-    # Rollout配置
-    actor_rollout_ref.rollout.mode=async \
-    actor_rollout_ref.rollout.n_gpus=$n_gpus_rollout \
-    actor_rollout_ref.rollout.name=vllm \
-    actor_rollout_ref.rollout.temperature=$temperature \
-    actor_rollout_ref.rollout.top_k=$top_k \
-    actor_rollout_ref.rollout.top_p=$top_p \
-    actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
-    actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
-    actor_rollout_ref.rollout.max_num_batched_tokens=8192 \
-    actor_rollout_ref.rollout.free_cache_engine=true \
-    actor_rollout_ref.rollout.enforce_eager=true \
-    \
-    # Actor配置
-    actor_rollout_ref.actor.optim.lr=1e-6 \
-    actor_rollout_ref.actor.ppo_mini_batch_size=32 \
-    actor_rollout_ref.actor.use_dynamic_bsz=true \
-    actor_rollout_ref.actor.fsdp_config.param_offload=false \
-    actor_rollout_ref.actor.fsdp_config.optimizer_offload=false \
-    \
-    # Critic配置
-    critic.model.path="$MODEL_PATH" \
-    critic.optim.lr=1e-5 \
-    critic.fsdp_config.param_offload=false \
-    \
-    # 异步训练配置
-    async_training.staleness_threshold=$staleness_threshold \
-    async_training.max_queue_size=$max_queue_size \
-    async_training.min_batch_count=$min_batch_count \
-    async_training.batch_timeout=$batch_timeout \
-    \
-    # 训练配置
-    trainer.total_training_steps=$total_training_steps \
-    trainer.save_freq=$save_freq \
-    trainer.val_freq=$val_freq \
-    trainer.critic_warmup=0 \
-    \
-    # 算法配置
-    algorithm.adv_estimator=gae \
-    algorithm.cliprange=0.2 \
-    algorithm.vf_coeff=0.1 \
-    algorithm.entropy_coeff=0.01 \
-    algorithm.kl_coeff=0.1 \
-    \
-    # 日志配置
-    trainer.logger='["console", "wandb"]' \
-    trainer.val_before_train=false
-
-echo "==================================="
-echo "完全异步PPO训练完成"
-echo "==================================="
-
diff --git a/recipe/fully_async_policy/test_components_pytest.py b/recipe/fully_async_policy/unittest/test_components_pytest.py
similarity index 100%
rename from recipe/fully_async_policy/test_components_pytest.py
rename to recipe/fully_async_policy/unittest/test_components_pytest.py

From c819fe1b6a9dc1f1ea92469ddb35bb79c975ca50 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Fri, 8 Aug 2025 17:31:08 +0800
Subject: [PATCH 031/182] ParameterSynchronizer

---
 .../fully_async_policy/README_fully_async.md  | 336 -------------
 .../config/fully_async_ppo_trainer.yaml       |   2 +
 recipe/fully_async_policy/fully_async_main.py |  99 +---
 .../fully_async_rollouter.py                  | 207 ++------
 .../fully_async_policy/fully_async_trainer.py |  45 +-
 recipe/fully_async_policy/message_queue.py    |   6 +-
 recipe/fully_async_policy/param_sync.py       |  18 +-
 .../unittest/test_components_pytest.py        | 315 -------------
 .../unittest/test_fully_async.py              | 194 --------
 .../unittest/test_fully_async_components.py   | 444 ------------------
 tests/special_e2e/run_fully_async_policy.sh   |   2 +
 verl/trainer/ppo/ray_trainer.py               |   2 +-
 12 files changed, 91 insertions(+), 1579 deletions(-)
 delete mode 100644 recipe/fully_async_policy/README_fully_async.md
 delete mode 100644 recipe/fully_async_policy/unittest/test_components_pytest.py
 delete mode 100644 recipe/fully_async_policy/unittest/test_fully_async.py
 delete mode 100644 recipe/fully_async_policy/unittest/test_fully_async_components.py

diff --git a/recipe/fully_async_policy/README_fully_async.md b/recipe/fully_async_policy/README_fully_async.md
deleted file mode 100644
index 916633a4a81..00000000000
--- a/recipe/fully_async_policy/README_fully_async.md
+++ /dev/null
@@ -1,336 +0,0 @@
-# 完全异步PPO训练系统 (Fully Async Policy)
-
-本文档介绍了基于 OneStepOffRayTrainer 成熟实现改进的完全异步PPO训练系统，该系统实现了 Trainer 和 Rollouter 的完全解耦，支持异步样本生成和训练。
-
-## 🚀 **系统特性**
-
-### 核心特性
-- **完全异步训练**: Trainer 和 Rollouter 在独立的Ray Actor中运行，实现真正的并行处理
-- **智能新鲜度控制**: 基于参数版本和时间戳的样本新鲜度管理，防止过期样本影响训练
-- **健壮的参数同步**: 改进的参数同步机制，支持错误重试和状态管理
-- **简化的消息队列**: 去除ZeroMQ依赖，使用Ray-based消息传递，更稳定可靠
-- **完善的监控**: 详细的性能指标和组件健康状态监控
-
-### 改进亮点
-- **参考OneStepOffRayTrainer**: 使用成熟的训练逻辑，确保训练稳定性
-- **错误处理和恢复**: 完善的异常处理和资源清理机制
-- **组件协调**: 统一的组件生命周期管理和状态监控
-- **配置验证**: 智能的配置验证和默认值设置
-
-## 🏗️ **系统架构**
-
-### 组件结构
-
-```
-┌─────────────────┐    ┌─────────────────┐    ┌─────────────────┐
-│  FullyAsyncMain │────│ MessageQueue    │────│ FullyAsyncTrainer│
-│  (Coordinator)  │    │  (Ray Actor)    │    │   (Ray Actor)   │
-└─────────────────┘    └─────────────────┘    └─────────────────┘
-         │                       │                       │
-         └───────────────────────┼───────────────────────┘
-                                 │
-                    ┌─────────────────┐
-                    │   Rollouter     │
-                    │  (Ray Actor)    │
-                    └─────────────────┘
-                             │
-                    ┌─────────────────┐
-                    │ ParameterSync   │
-                    │   Manager       │
-                    └─────────────────┘
-```
-
-### 数据流
-
-```
-1. 数据生成: Rollouter → MessageQueue
-2. 训练消费: MessageQueue → FullyAsyncTrainer
-3. 参数同步: FullyAsyncTrainer → Rollouter
-4. 状态监控: FullyAsyncMain → All Components
-```
-
-## 📋 **核心组件**
-
-### 1. FullyAsyncTrainer
-- **功能**: 从MessageQueue获取样本进行异步训练
-- **特性**:
-  - 基于OneStepOffRayTrainer的成熟训练逻辑
-  - 智能的样本新鲜度指标计算
-  - 完善的错误处理和重试机制
-  - 详细的训练性能监控
-
-### 2. Rollouter
-- **功能**: 持续生成训练样本并放入MessageQueue
-- **特性**:
-  - 智能的暂停/恢复控制机制
-  - 基于新鲜度的生成控制
-  - 改进的参数同步处理
-  - 异步/同步生成模式支持
-
-### 3. MessageQueue
-- **功能**: Ray-based消息队列，管理样本传递
-- **特性**:
-  - 去除ZeroMQ依赖，更稳定可靠
-  - 智能的样本过期检测
-  - 线程安全的队列操作
-  - 内存使用监控
-
-### 4. ParameterSynchronizer
-- **功能**: 管理Actor和Rollout间的参数同步
-- **特性**:
-  - 支持错误重试和超时处理
-  - 详细的同步状态跟踪
-  - 集群通信组管理
-
-### 5. FullyAsyncMain
-- **功能**: 系统协调器，管理所有组件的生命周期
-- **特性**:
-  - 统一的组件初始化和清理
-  - 实时的健康状态监控
-  - 优雅的关闭和错误恢复
-
-## ⚙️ **配置说明**
-
-### 异步训练配置 (async_training)
-
-```yaml
-async_training:
-  # 新鲜度控制
-  staleness_threshold: 3              # 样本新鲜度阈值
-  max_staleness_allowed: 5            # 最大允许的样本陈旧度
-
-  # 队列管理
-  max_queue_size: 1000               # 消息队列最大大小
-  min_batch_count: 1                 # 每次获取的最小batch数量
-  batch_timeout: 30.0                # 获取batch的超时时间
-
-  # 生成控制
-  generation_timeout: 30.0           # 单次生成的超时时间
-  batch_generation_interval: 0.1     # batch生成间隔
-
-  # 参数同步
-  max_sync_retries: 3                # 参数同步最大重试次数
-  sync_timeout: 30.0                 # 同步超时时间
-  sync_retry_delay: 1.0              # 重试延迟时间
-```
-
-### 资源配置
-
-```yaml
-trainer:
-  n_gpus_per_node: 4                 # 每个训练节点的GPU数量
-  nnodes: 2                          # 训练节点数量
-  device: cuda
-
-rollout:
-  n_gpus_per_node: 2                 # 每个rollout节点的GPU数量
-  nnodes: 1                          # rollout节点数量
-```
-
-## 🔧 **使用方法**
-
-### 1. 基本运行
-
-```bash
-# 使用默认配置运行
-python fully_async_main.py
-
-# 使用自定义配置
-python fully_async_main.py --config-path /path/to/config --config-name my_config
-```
-
-### 2. 配置自定义
-
-```python
-# 在配置文件中自定义异步训练参数
-async_training:
-  staleness_threshold: 5
-  max_queue_size: 2000
-  generation_timeout: 60.0
-```
-
-### 3. 监控和调试
-
-```python
-# 系统会自动输出详细的统计信息
-# 包括: Trainer状态、Rollouter状态、队列状态等
-
-# 日志文件: fully_async_training.log
-# 包含所有组件的详细日志信息
-```
-
-## 📊 **性能监控**
-
-### 关键指标
-
-#### Trainer指标
-- `global_steps`: 训练步数
-- `processed_samples`: 已处理样本数
-- `current_param_version`: 当前参数版本
-- `param_sync_count`: 参数同步次数
-
-#### Rollouter指标
-- `total_generated_samples`: 总生成样本数
-- `dropped_stale_samples`: 丢弃的过期样本数
-- `generation_errors`: 生成错误数
-- `param_sync_requests`: 参数同步请求数
-
-#### 新鲜度指标
-- `avg_sample_age`: 样本平均年龄
-- `max_sample_age`: 样本最大年龄
-- `stale_samples_ratio`: 过期样本比例
-
-#### 队列指标
-- `queue_size`: 当前队列大小
-- `total_produced`: 总生产样本数
-- `total_consumed`: 总消费样本数
-- `dropped_samples`: 总丢弃样本数
-
-## 🔍 **故障排查**
-
-### 常见问题
-
-1. **样本生成过慢**
-   - 检查 `generation_timeout` 设置
-   - 监控 `generation_errors` 指标
-   - 调整 `batch_generation_interval`
-
-2. **样本过期严重**
-   - 调整 `staleness_threshold`
-   - 检查参数同步频率
-   - 监控 `stale_samples_ratio`
-
-3. **队列溢出**
-   - 增加 `max_queue_size`
-   - 优化训练速度
-   - 调整 `min_batch_count`
-
-4. **参数同步失败**
-   - 检查 `sync_timeout` 设置
-   - 监控 `sync_failures` 指标
-   - 调整 `max_sync_retries`
-
-### 日志分析
-
-```bash
-# 查看主要错误
-grep "ERROR" fully_async_training.log
-
-# 查看组件统计
-grep "Component Statistics" fully_async_training.log
-
-# 查看参数同步状态
-grep "Parameter sync" fully_async_training.log
-```
-
-## 🚀 **性能优化建议**
-
-### 1. 资源配置优化
-- 根据模型大小合理配置GPU数量
-- 训练和rollout使用独立的资源池
-- 考虑内存和计算的平衡
-
-### 2. 新鲜度控制优化
-- 根据模型收敛速度调整新鲜度阈值
-- 监控样本年龄分布，避免过度丢弃
-- 动态调整队列大小
-
-### 3. 参数同步优化
-- 合理设置同步频率，平衡性能和一致性
-- 使用异步同步减少等待时间
-- 监控同步耗时，及时发现问题
-
-## 🔧 **扩展和定制**
-
-### 自定义组件
-
-```python
-# 自定义Trainer
-class CustomFullyAsyncTrainer(FullyAsyncTrainer):
-    def _compute_custom_metrics(self, batch):
-        # 添加自定义指标计算
-        pass
-
-# 自定义Rollouter
-class CustomRollouter(Rollouter):
-    def _custom_generation_logic(self, batch):
-        # 添加自定义生成逻辑
-        pass
-```
-
-### 自定义监控
-
-```python
-# 添加自定义监控指标
-def custom_monitor(trainer_stats, rollouter_stats):
-    # 实现自定义监控逻辑
-    custom_metric = calculate_custom_metric(trainer_stats)
-    logger.info(f"Custom metric: {custom_metric}")
-```
-
-## 📚 **与OneStepOffRayTrainer的对比**
-
-| 特性 | OneStepOffRayTrainer | FullyAsyncTrainer |
-|------|---------------------|------------------|
-| 训练模式 | 同步批处理 | 异步流处理 |
-| 参数更新 | 批次同步更新 | 实时异步更新 |
-| 资源利用 | 阶段性利用 | 持续高效利用 |
-| 新鲜度控制 | 无需考虑 | 智能控制 |
-| 复杂度 | 相对简单 | 更复杂但更灵活 |
-| 适用场景 | 标准训练 | 大规模持续训练 |
-
-## 📖 **最佳实践**
-
-1. **配置调优**: 从默认配置开始，根据监控指标逐步优化
-2. **资源规划**: 合理分配训练和生成资源，避免瓶颈
-3. **监控预警**: 设置关键指标的阈值报警
-4. **定期检查**: 定期检查日志和性能指标
-5. **版本管理**: 记录配置变更和性能影响
-
-## 🤝 **贡献和反馈**
-
-欢迎提交issue和PR来改进这个异步训练系统！
-
-## 📄 **更新日志**
-
-### v2.0 (改进版本)
-- ✅ 基于OneStepOffRayTrainer重构训练逻辑
-- ✅ 简化MessageQueue实现，去除ZeroMQ依赖
-- ✅ 改进参数同步机制，支持错误重试
-- ✅ 完善组件协调和监控系统
-- ✅ 优化错误处理和资源管理
-- ✅ 增加详细的性能指标和日志
-
-### v1.0 (原始版本)
-- 基础异步训练框架
-- 简单的消息队列实现
-- 基本的参数同步功能
-
-
-```python
-DataProtoItem(
-    batch=TensorDict(
-        fields={
-            attention_mask: Tensor(shape=torch.Size([3072]), device=cpu, dtype=torch.int64, is_shared=False),
-            input_ids: Tensor(shape=torch.Size([3072]), device=cpu, dtype=torch.int64, is_shared=False),
-            position_ids: Tensor(shape=torch.Size([3072]), device=cpu, dtype=torch.int64, is_shared=False),
-            prompts: Tensor(shape=torch.Size([1024]), device=cpu, dtype=torch.int64, is_shared=False),
-            response_mask: Tensor(shape=torch.Size([2048]), device=cpu, dtype=torch.int64, is_shared=False),
-            responses: Tensor(shape=torch.Size([2048]), device=cpu, dtype=torch.int64, is_shared=False)},
-        batch_size=torch.Size([]),
-        device=None,
-        is_shared=False), 
-    non_tensor_batch={'data_source': 'openai/gsm8k',
-                      'ability': 'math', 
-                      'reward_model': {'ground_truth': '35', 'style': 'rule'},
-                      'extra_info': {
-                          'answer': 'The total number of green and red plates is 28 + 21 = <<28+21=49>>49.\nXavier should buy 84 − 49 = 35 more plates.\n#### 35',
-                          'index': 1421, 
-                          'question': 'Xavier needs 84 paper plates for a housewarming party. He already has 21 green plates and 28 red plates. How many more plates should Xavier buy?', 'split': 'train'},
-                      'uid': 'fab3e910-67b3-4653-bc69-377250049267', 
-                      'tools_kwargs': {}, 
-                      'interaction_kwargs': {}, 
-                      'index': 1421},
-    meta_info={'global_token_num': [2141, 2141, 2161, 2151, 2151, 2130, 2141, 2161, 2161, 2151, 2130, 2130]})
-```
-
diff --git a/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml b/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml
index f9aa06cd4b6..a5f58fadc2f 100644
--- a/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml
+++ b/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml
@@ -24,6 +24,8 @@ rollout:
   mode: async                        # rollout模式: sync, async
   name: vllm                         # rollout引擎: vllm, sglang
   n: 4                               # 每个prompt生成的响应数量
+  total_rollout_steps: 100
+  total_epochs: 10
 
 data:
   gen_batch_size: 32
diff --git a/recipe/fully_async_policy/fully_async_main.py b/recipe/fully_async_policy/fully_async_main.py
index cf5c0e29d5c..31541982dd7 100644
--- a/recipe/fully_async_policy/fully_async_main.py
+++ b/recipe/fully_async_policy/fully_async_main.py
@@ -153,27 +153,11 @@ def __init__(self):
     def run(self, config):
         """运行完全异步的PPO训练"""
         print("Starting fully async PPO training...")
-        # 设置信号处理
-        self._setup_signal_handlers()
         # 初始化基础组件
         self._initialize_components(config)
-        # time.sleep(60)
         # 启动训练流程
         self._run_training_loop()
 
-        # self._cleanup_resources()
-
-    def _setup_signal_handlers(self):
-        """设置信号处理器"""
-
-        def signal_handler(signum, frame):
-            print(f"Received signal {signum}, initiating shutdown...")
-            self.running = False
-            self.shutdown_event.set()
-
-        signal.signal(signal.SIGINT, signal_handler)
-        signal.signal(signal.SIGTERM, signal_handler)
-
     def _initialize_components(self, config) -> None:
         """
         初始化所有组件
@@ -225,10 +209,10 @@ def _initialize_components(self, config) -> None:
 
         # 创建MessageQueue
         self.max_queue_size = (
-            config.async_training.staleness_threshold
-            * config.data.train_batch_size
-            * config.actor_rollout_ref.rollout.n
-        )
+                config.async_training.staleness_threshold
+                * config.data.train_batch_size
+                * config.actor_rollout_ref.rollout.n
+        ) * 10 # x 10 避免死锁
         print("Creating MessageQueue...")
         message_queue = MessageQueue.remote(config, self.max_queue_size)
         message_queue_client = MessageQueueClient(message_queue)
@@ -237,7 +221,7 @@ def _initialize_components(self, config) -> None:
         self.components["message_queue_client"] = message_queue_client
 
         # 创建Rollouter
-        print("Creating Rollouter...")
+        print("Creating FullyAsyncRollouter...")
         self._create_rollouter(config)
 
         # 创建Trainer
@@ -252,16 +236,17 @@ def _initialize_components(self, config) -> None:
             config=config,
             trainer=self.components["trainer"],
             rollouter=self.components["rollouter"],
+            mq=self.components["message_queue_client"],
         )
 
         # 将参数同步器设置到trainer和rollouter
         ray.get(self.components["trainer"].set_parameter_synchronizer.remote(param_synchronizer))
         ray.get(self.components["rollouter"].set_parameter_synchronizer.remote(param_synchronizer))
 
+        # 首先同步一次参数
         ray.get(param_synchronizer.sync_weights.remote(0))
 
         self.components["param_synchronizer"] = param_synchronizer
-        print("Parameter synchronizer initialized successfully")
         print("All components initialized successfully")
 
     def _create_rollouter(self, config) -> None:
@@ -277,21 +262,14 @@ def _create_rollouter(self, config) -> None:
             device_name=config.trainer.device,
             max_queue_size=self.max_queue_size,
         )
-        print(rollouter)
-
-        print("========== rollouter init workers ======")
 
-        # 初始化Rollouter
         ray.get(rollouter.init_workers.remote())
-
         ray.get(rollouter.set_message_queue_client.remote(self.components["message_queue_client"]))
-
         self.components["rollouter"] = rollouter
         print("Rollouter created and initialized successfully")
 
     def _create_trainer(self, config) -> None:
         """创建Trainer"""
-        # 创建trainer角色映射（排除Rollout）
         trainer_role_mapping = {
             role: worker_cls
             for role, worker_cls in self.components["role_worker_mapping"].items()
@@ -324,74 +302,13 @@ def _run_training_loop(self):
         rollouter_future = self.components["rollouter"].fit.remote()
         trainer_future = self.components["trainer"].fit.remote()
 
-        print("Starting Trainer...")
-        time.sleep(10)
-        print("Starting Trainer...")
-
         ray.get(rollouter_future)
         ray.get(trainer_future)
+
         self.components["message_queue_client"].clear_queue()
 
         print("Training completed or interrupted")
 
-    def _cleanup_resources(self):
-        """清理所有资源"""
-        try:
-            # 关闭线程池
-            if hasattr(self, 'thread_executor') and self.thread_executor:
-                print("Shutting down thread executor...")
-                self.thread_executor.shutdown(wait=True, timeout=10.0)
-
-            # 清理logger
-            if hasattr(self, 'logger') and self.logger:
-                try:
-                    if hasattr(self.logger, 'close'):
-                        self.logger.close()
-                    elif hasattr(self.logger, 'finish'):
-                        self.logger.finish()
-                except Exception as e:
-                    print(f"Error closing logger: {e}")
-
-            # 清理validation logger
-            if hasattr(self, 'validation_generations_logger') and self.validation_generations_logger:
-                try:
-                    if hasattr(self.validation_generations_logger, 'close'):
-                        self.validation_generations_logger.close()
-                except Exception as e:
-                    print(f"Error closing validation logger: {e}")
-
-            # 清理异步rollout管理器
-            if hasattr(self, "async_rollout_manager") and self.async_rollout_manager:
-                try:
-                    if hasattr(self.async_rollout_manager, 'shutdown'):
-                        self.async_rollout_manager.shutdown()
-                except Exception as e:
-                    print(f"Error cleaning up async rollout manager: {e}")
-
-            # 清理worker groups
-            if hasattr(self, 'rollout_wg') and self.rollout_wg:
-                try:
-                    if hasattr(self.rollout_wg, 'shutdown'):
-                        self.rollout_wg.shutdown()
-                except Exception as e:
-                    print(f"Error cleaning up rollout worker group: {e}")
-
-            # 强制垃圾回收
-            import gc
-            gc.collect()
-
-        except Exception as e:
-            print(f"Error during resource cleanup: {e}")
-
-    def __del__(self):
-        """析构函数 - 确保资源清理"""
-        try:
-            if hasattr(self, 'running') and self.running:
-                print("Warning: FullyAsyncRollouter being deleted while still running")
-                self.shutdown()
-        except Exception as e:
-            print(f"Error in destructor: {e}")
-
 
 @hydra.main(config_path="config", config_name="fully_async_ppo_trainer", version_base=None)
 def main(config):
diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py
index 1ca9c7b0d2e..d392e4a1630 100644
--- a/recipe/fully_async_policy/fully_async_rollouter.py
+++ b/recipe/fully_async_policy/fully_async_rollouter.py
@@ -34,17 +34,17 @@ class FullyAsyncRollouter(RayPPOTrainer):
     """
 
     def __init__(
-        self,
-        config,
-        tokenizer,
-        role_worker_mapping: dict[Role, WorkerType],
-        resource_pool_manager: ResourcePoolManager,
-        ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup,
-        processor=None,
-        reward_fn=None,
-        val_reward_fn=None,
-        device_name=None,
-        max_queue_size=1000,
+            self,
+            config,
+            tokenizer,
+            role_worker_mapping: dict[Role, WorkerType],
+            resource_pool_manager: ResourcePoolManager,
+            ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup,
+            processor=None,
+            reward_fn=None,
+            val_reward_fn=None,
+            device_name=None,
+            max_queue_size=1000,
     ):
         """
         Initialize distributed PPO trainer with Ray backend.
@@ -99,6 +99,14 @@ def __init__(
         pprint(f"Rollouter _create_dataloader...\n{train_dataset}\n{val_dataset}")
         self._create_dataloader(train_dataset, val_dataset, collate_fn, train_sampler)
 
+        total_rollout_steps = len(self.train_dataloader) * self.config.trainer.total_epochs
+
+        if self.config.rollout.total_rollout_steps is not None:
+            total_rollout_steps = self.config.rollout.total_rollout_steps
+
+        self.total_rollout_steps = total_rollout_steps
+        print(f"Total rollout steps: {self.total_rollout_steps}")
+
         # rollouter 参数配置
         self.message_queue_client = None
 
@@ -159,6 +167,13 @@ def get_rollout_wg(self):
         """获取 rollout worker group"""
         return self.rollout_wg
 
+    def update_param_version(self, version: int):
+        """更新当前参数版本"""
+        with self.lock:
+            old_version = self.current_param_version
+            self.current_param_version = version
+            print(f"Parameter version updated from {old_version} to {version}")
+
     def _validate_config(self):
         # 验证异步训练配置
         if not hasattr(self.config, "async_training"):
@@ -184,18 +199,19 @@ def _create_continuous_iterator(self):
         """
         Create a continuous data iterator across epoch
         """
-        for epoch in range(self.config.trainer.total_epochs):
+        for epoch in range(self.config.rollout.total_epochs):
             iterator = iter(self.train_dataloader)
             for batch_dict in iterator:
                 yield epoch, batch_dict
 
     def fit(self):
         """开始异步生成样本 - 改进的主运行逻辑"""
-        print("Starting Rollouter...")
+        print("Starting FullyAsyncRollouter...")
+
         if self.message_queue_client is None:
             raise ValueError("MessageQueue client not set. Call set_message_queue_client() first.")
-        # if self.param_synchronizer is None:
-        #     raise ValueError("param_synchronizer client not set. Call set_parameter_synchronizer() first.")
+        if self.param_synchronizer is None:
+            raise ValueError("param_synchronizer client not set. Call set_parameter_synchronizer() first.")
 
         # 设置运行状态
         with self.lock:
@@ -279,8 +295,11 @@ def _generation_loop(self):
 
             metrics = {}
             timing_raw = {}
-            batch, gen_batch = self._prepare_generate_batch(batch_dict)
-            is_last_step = self.global_steps >= self.total_training_steps
+
+            with self.lock:
+                batch, gen_batch = self._prepare_generate_batch(batch_dict)
+
+            is_last_step = self.global_steps >= self.total_rollout_steps
 
             # generate a batch
             with marked_timer("gen", timing_raw, color="red"):
@@ -334,6 +353,12 @@ def _generation_loop(self):
         with self.lock:
             self.running = False
 
+        # 发送终止信号
+        self.message_queue_client.put_sample(
+            sample=None,
+            param_version=self.current_param_version,
+        )
+
     def _monitor_loop(self):
         """监控线程 - 监控状态并处理控制信号"""
         # 主线程保持运行，处理控制信号和状态监控
@@ -390,7 +415,10 @@ def _should_pause_generation(self) -> bool:
             return True  # 出错时暂停生成
 
     def pause(self) -> bool:
-        """暂停生成 - 供外部调用"""
+        """暂停生成
+        TODO 集成 Partial Rollout
+        """
+        print("[rollouter] pause")
         with self.lock:
             if not self.running:
                 return False
@@ -402,7 +430,10 @@ def pause(self) -> bool:
             return True
 
     def resume(self) -> bool:
-        """恢复生成 - 供外部调用"""
+        """恢复生成
+        TODO 集成 Partial Rollout
+        """
+        print("[rollouter] resume")
         with self.lock:
             if not self.running:
                 return False
@@ -415,45 +446,6 @@ def resume(self) -> bool:
             print("Generation resumed")
             return True
 
-    def shutdown(self):
-        """关闭Rollouter - 改进的关闭逻辑"""
-        print("Shutting down Rollouter...")
-
-        with self.lock:
-            self.running = False
-            self.paused = False
-            self.condition.notify_all()
-
-        # 等待生成线程结束
-        if self.generation_thread and self.generation_thread.is_alive():
-            print("Waiting for generation thread to finish...")
-            self.generation_thread.join(timeout=10.0)
-
-            if self.generation_thread.is_alive():
-                print("Generation thread did not finish within timeout")
-
-        # 等待监控线程结束
-        if self.monitor_thread and self.monitor_thread.is_alive():
-            print("Waiting for monitor thread to finish...")
-            self.monitor_thread.join(timeout=5.0)
-
-            if self.monitor_thread.is_alive():
-                print("Monitor thread did not finish within timeout")
-
-        # 关闭线程池
-        if self.thread_executor:
-            self.thread_executor.shutdown(wait=True)
-
-        # 清理异步rollout管理器
-        if hasattr(self, "async_rollout_manager"):
-            try:
-                # TODO: 添加异步rollout管理器的清理逻辑
-                pass
-            except Exception as e:
-                print(f"Error cleaning up async rollout manager: {e}")
-
-        print("Rollouter shutdown complete")
-
     def get_statistics(self) -> dict:
         with self.lock:
             queue_stats = self.message_queue_client.get_statistics()
@@ -468,102 +460,3 @@ def get_statistics(self) -> dict:
                 "queue_size": f"{queue_stats['queue_size']}",
             }
             return stats
-
-    def update_rollout_weights(self, param_version: int) -> bool:
-        """
-        更新rollout模型参数 - 改进的参数同步实现
-        这个方法由外部Trainer调用
-
-        Args:
-            param_version: 新的参数版本号
-
-        Returns:
-            bool: 是否成功更新参数
-        """
-        print(f"Updating rollout weights to version {param_version}")
-
-        with self.sync_lock:
-            if self.sync_in_progress:
-                print(f"Sync already in progress, skipping version {param_version}")
-                return False
-
-            self.sync_in_progress = True
-
-        try:
-            # 暂停rollout - 带超时机制
-            if not self.rollout_controller.pause(timeout=10.0):
-                print("Failed to pause rollout within timeout")
-                return False
-
-            # 等待当前generation完成（如果有的话）
-            time.sleep(0.1)
-
-            # 执行参数同步
-            sync_success = self._execute_parameter_sync(param_version)
-
-            if sync_success:
-                self.current_param_version = param_version
-                self.param_sync_requests += 1
-                self.last_sync_time = time.time()
-                print(f"Successfully updated rollout weights to version {param_version}")
-            else:
-                print(f"Failed to sync parameters to version {param_version}")
-
-        except Exception as e:
-            print(f"Error during parameter sync: {e}")
-            sync_success = False
-        finally:
-            # 恢复rollout
-            self.rollout_controller.resume()
-            self.sync_in_progress = False
-
-        return sync_success
-
-    def _execute_parameter_sync(self, param_version: int) -> bool:
-        """
-        执行实际的参数同步 - 改进的同步逻辑
-
-        Args:
-            param_version: 目标参数版本
-
-        Returns:
-            bool: 是否同步成功
-        """
-        try:
-            # 暂停推理引擎
-            if self.async_rollout_mode and hasattr(self, "async_rollout_manager"):
-                # 对于异步模式，暂停服务器
-                pass  # 异步服务器的暂停在 pause() 中已经处理
-            else:
-                # 对于同步模式，使用sleep/wake_up机制
-                sleep_futures = self.rollout_wg.sleep()
-                ray.get(sleep_futures)
-
-            # 执行参数同步
-            if self.param_synchronizer:
-                self.param_synchronizer.sync_weights()
-                print("Parameter synchronization completed via synchronizer")
-            else:
-                # 直接使用rollout worker group的同步机制
-                if hasattr(self.rollout_wg, "sync_rollout_weights"):
-                    sync_futures = self.rollout_wg.sync_rollout_weights()
-                    ray.get(sync_futures)
-                    print("Parameter synchronization completed via rollout worker group")
-                else:
-                    print("No parameter synchronization mechanism available")
-                    return False
-
-            # 恢复推理引擎
-            if self.async_rollout_mode and hasattr(self, "async_rollout_manager"):
-                # 对于异步模式，恢复服务器
-                pass  # 异步服务器的恢复在 resume() 中已经处理
-            else:
-                # 对于同步模式，唤醒workers
-                wake_futures = self.rollout_wg.wake_up()
-                ray.get(wake_futures)
-
-            return True
-
-        except Exception as e:
-            print(f"Parameter sync execution failed: {e}")
-            return False
diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py
index afef0968a04..29a7a5c830b 100644
--- a/recipe/fully_async_policy/fully_async_trainer.py
+++ b/recipe/fully_async_policy/fully_async_trainer.py
@@ -105,7 +105,6 @@ def __init__(
         self.processed_samples = 0
         self.stale_samples_processed = 0
         self.current_param_version = 0
-        self.param_sync_count = 0
 
         # 参数同步相关状态
         self._weights_info = None
@@ -133,8 +132,6 @@ def _get_samples_from_queue(self) -> tuple[None, None] | tuple[int, Any]:
         Returns:
             tuple: (epoch, batch_dict, gen_batch_output)
         """
-        if self.message_queue_client is None:
-            raise ValueError("MessageQueue client not set. Call set_message_queue_client() first.")
 
         # 计算需要获取的样本数量
         n_responses_per_prompt = self.config.actor_rollout_ref.rollout.n
@@ -268,11 +265,11 @@ def fit(self):
         to construct the PPO dataflow.
         The light-weight advantage computation is done on the driver process.
         """
-
-        print("FullyAsyncTrainer run")
-
+        print("Starting FullyAsyncTrainer...")
         if self.message_queue_client is None:
             raise ValueError("MessageQueue client not set. Call set_message_queue_client() first.")
+        if self.param_synchronizer is None:
+            raise ValueError("param_synchronizer client not set. Call set_parameter_synchronizer() first.")
 
         from verl.utils.tracking import Tracking
 
@@ -288,22 +285,9 @@ def fit(self):
         # load checkpoint before doing anything
         self._load_checkpoint()
 
-        # perform validation before training
-        # currently, we only support validation using the reward_function.
-        if self.val_reward_fn is not None and self.config.trainer.get("val_before_train", True):
-            val_metrics = self._validate()
-            assert val_metrics, f"{val_metrics=}"
-            pprint(f"Initial validation metrics: {val_metrics}")
-            print(data=val_metrics, step=self.global_steps)
-            if self.config.trainer.get("val_only", False):
-                return
-        # TODO 需要从
         self.total_training_steps = self.config.trainer.total_training_steps
 
         print(f"Total training steps: {self.total_training_steps}")
-        # add tqdm
-        # progress_bar = tqdm(total=self.total_training_steps, initial=self.global_steps, desc="Training Progress")
-
         # we start from step 1
         self.global_steps += 1
         last_val_metrics = None
@@ -325,13 +309,6 @@ def fit(self):
             metrics = {}
             timing_raw = {}
 
-            do_profile = (
-                self.global_steps in self.config.trainer.profile_steps
-                if self.config.trainer.profile_steps is not None
-                else False
-            )
-            self._start_profiling(do_profile, timing_raw)
-
             is_last_step = self.global_steps >= self.total_training_steps
 
             with marked_timer("step", timing_raw):
@@ -384,13 +361,12 @@ def fit(self):
             # self._stop_profiling(do_profile, timing_raw)
             print("_collect_metrics")
             # self._collect_metrics(batch, epoch, metrics, timing_raw)
-            print("_post_batch_processing")
-            # self._post_batch_processing(batch)
 
-            print("step end")
             # 在训练步骤结束后触发参数同步
+            print("_trigger_parameter_sync_after_step")
+
             self._trigger_parameter_sync_after_step()
-            # progress_bar.update(1)
+            print("global_steps")
             self.global_steps += 1
             print(f"is_last_step {is_last_step}")
             if is_last_step:
@@ -405,7 +381,6 @@ def get_statistics(self) -> dict:
             "processed_samples": self.processed_samples,
             "stale_samples_processed": self.stale_samples_processed,
             "current_param_version": self.current_param_version,
-            "param_sync_count": self.param_sync_count,
             "queue_size": queue_stats.get("queue_size", 0),
             "queue_total_produced": queue_stats.get("total_produced", 0),
             "queue_total_consumed": queue_stats.get("total_consumed", 0),
@@ -417,12 +392,12 @@ def _trigger_parameter_sync_after_step(self):
         在训练步骤结束后触发参数同步
         这确保rollouter总是使用最新训练的参数
         """
-        new_version = self.current_param_version + 1
+        self.current_param_version = self.current_param_version + 1
         print(
-            f"[TRAINER] Triggering parameter sync after training step {self.global_steps}, version: {new_version}"
+            f"[TRAINER] Triggering parameter sync after training step {self.global_steps}, version: {self.current_param_version}"
         )
-        logger.info(f"Triggering parameter sync after training step {self.global_steps}, version: {new_version}")
-        ray.get(self.param_synchronizer.sync_weights.remote(new_version))
+        logger.info(f"Triggering parameter sync after training step {self.global_steps}, version: {self.current_param_version}")
+        ray.get(self.param_synchronizer.sync_weights.remote(self.current_param_version))
 
     def _compute_sample_freshness_metrics(self, batch_samples: list[QueueSample]) -> dict:
         """
diff --git a/recipe/fully_async_policy/message_queue.py b/recipe/fully_async_policy/message_queue.py
index e5c382dec2a..47dbd34ecff 100644
--- a/recipe/fully_async_policy/message_queue.py
+++ b/recipe/fully_async_policy/message_queue.py
@@ -131,7 +131,11 @@ def get_samples(self, min_batch_count: int = 1) -> list[Any]:
             samples = []
             for _ in range(batch_count):
                 if self.queue:
-                    samples.append(self.queue.popleft())
+                    data = self.queue.popleft()
+                    if data is None:
+                        return []
+                    else:
+                        samples.append(self.queue.popleft())
 
             self.total_consumed += len(samples)
             return samples
diff --git a/recipe/fully_async_policy/param_sync.py b/recipe/fully_async_policy/param_sync.py
index 3657916dda0..cb9baa5ff8a 100644
--- a/recipe/fully_async_policy/param_sync.py
+++ b/recipe/fully_async_policy/param_sync.py
@@ -28,11 +28,11 @@ class ParameterSynchronizer:
     合并了原有的多个同步器类的功能
     """
 
-    def __init__(self, config, trainer, rollouter):
-
+    def __init__(self, config, trainer, rollouter, mq):
         self.config = config
         self.trainer = trainer
         self.rollouter = rollouter
+        self.mq_client = mq
         self.actor_wg = ray.get(trainer.get_actor_wg.remote())
         self.rollout_wg = ray.get(rollouter.get_rollout_wg.remote())
 
@@ -72,10 +72,18 @@ def _init_sync_group(self):
 
     def sync_weights(self, version):
         self.current_version = version
-        logger.debug(f"Starting weight synchronization (version {self.current_version})...")
+        print(f"Starting weight synchronization (version {self.current_version})...")
+
+        print("pause rollout")
+        ray.get(self.rollouter.pause.remote())
+
+        # 更新MQ 版本
+        self.mq_client.update_param_version(version)
 
-        # TODO 暂停及恢复rollout
-        print("TODO 暂停及恢复rollout")
         self.actor_wg.sync_rollout_weights()
         ray.get(self.rollout_wg.sync_rollout_weights())
+
+        # 更新 rollout 版本
+        ray.get(self.rollouter.update_param_version.remote(version))
+        ray.get(self.rollouter.resume.remote())
         print("sync_weights success")
diff --git a/recipe/fully_async_policy/unittest/test_components_pytest.py b/recipe/fully_async_policy/unittest/test_components_pytest.py
deleted file mode 100644
index fd2e207cbe4..00000000000
--- a/recipe/fully_async_policy/unittest/test_components_pytest.py
+++ /dev/null
@@ -1,315 +0,0 @@
-# Copyright 2025 Meituan Ltd. and/or its affiliates
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Pytest测试文件，用于测试完全异步PPO训练系统的各个组件
-"""
-
-import time
-from unittest.mock import Mock
-
-import pytest
-import ray
-from omegaconf import OmegaConf
-
-
-@pytest.fixture
-def ray_setup():
-    """Ray初始化fixture"""
-    if not ray.is_initialized():
-        ray.init(ignore_reinit_error=True, num_cpus=2)
-    yield
-    # 测试后不关闭Ray，因为其他测试可能还需要
-
-
-@pytest.fixture
-def basic_config():
-    """基本配置fixture"""
-    return OmegaConf.create(
-        {
-            "actor_rollout_ref": {"hybrid_engine": False, "model": {"lora_rank": 0}, "rollout": {"n": 2}},
-            "algorithm": {"use_kl_in_reward": False},
-            "critic": {"enable": False},
-            "trainer": {
-                "device": "cpu",
-                "project_name": "test",
-                "experiment_name": "test",
-                "total_epochs": 1,
-                "total_training_steps": 2,
-            },
-            "async_training": {
-                "staleness_threshold": 3,
-                "max_staleness_allowed": 5,
-                "generation_timeout": 10.0,
-                "batch_timeout": 5.0,
-            },
-            "data": {"train_batch_size": 4},
-        }
-    )
-
-
-class TestMessageQueue:
-    """测试MessageQueue功能"""
-
-    def test_message_queue_creation(self, ray_setup):
-        """测试MessageQueue创建"""
-        try:
-            from message_queue import MessageQueueClient
-
-            queue = MessageQueueClient.remote(max_queue_size=10, max_staleness=3)
-
-            # 测试基本功能
-            stats = ray.get(queue.get_statistics.remote())
-            assert "queue_size" in stats
-            assert stats["queue_size"] == 0
-
-            ray.kill(queue)
-
-        except ImportError:
-            pytest.skip("MessageQueue not available")
-
-    def test_queue_put_get(self, ray_setup):
-        """测试队列的put/get操作"""
-        try:
-            from message_queue import MessageQueueClient
-
-            queue = MessageQueueClient.remote(max_queue_size=10, max_staleness=3)
-
-            # 创建模拟样本
-            mock_sample = Mock()
-            mock_sample.batch_size = 4
-
-            # 测试放入样本
-            success = ray.get(
-                queue.put_sample.remote(
-                    epoch=1, sample=mock_sample, param_version=1, rollout_metadata={"timestamp": time.time()}
-                )
-            )
-            assert success
-
-            # 测试获取样本
-            result = ray.get(queue.get_samples.remote(min_batch_count=1, timeout=2.0, current_param_version=1))
-            assert result is not None
-
-            ray.kill(queue)
-
-        except ImportError:
-            pytest.skip("MessageQueue not available")
-
-
-class TestRollouter:
-    """测试Rollouter功能"""
-
-    def test_rollouter_pause_resume(self, ray_setup, basic_config):
-        """测试Rollouter的暂停恢复功能"""
-        try:
-            from fully_async_rollouter import FullyAsyncRollouter
-
-            # 创建模拟依赖
-            mock_tokenizer = Mock()
-            mock_role_worker_mapping = {}
-            mock_resource_pool_manager = Mock()
-
-            # 创建Rollouter
-            rollouter = FullyAsyncRollouter.remote(
-                config=basic_config,
-                tokenizer=mock_tokenizer,
-                role_worker_mapping=mock_role_worker_mapping,
-                resource_pool_manager=mock_resource_pool_manager,
-            )
-
-            # 测试暂停
-            result = ray.get(rollouter.pause_rollout.remote())
-            assert result is True
-
-            # 检查状态
-            is_paused = ray.get(rollouter.is_rollout_paused.remote())
-            assert is_paused is True
-
-            # 测试恢复
-            result = ray.get(rollouter.resume_rollout.remote())
-            assert result is True
-
-            # 检查状态
-            is_paused = ray.get(rollouter.is_rollout_paused.remote())
-            assert is_paused is False
-
-            ray.kill(rollouter)
-
-        except ImportError:
-            pytest.skip("FullyAsyncRollouter not available")
-
-    def test_rollouter_statistics(self, ray_setup, basic_config):
-        """测试Rollouter统计功能"""
-        try:
-            from fully_async_rollouter import FullyAsyncRollouter
-
-            mock_tokenizer = Mock()
-            mock_role_worker_mapping = {}
-            mock_resource_pool_manager = Mock()
-
-            rollouter = FullyAsyncRollouter.remote(
-                config=basic_config,
-                tokenizer=mock_tokenizer,
-                role_worker_mapping=mock_role_worker_mapping,
-                resource_pool_manager=mock_resource_pool_manager,
-            )
-
-            # 获取统计信息
-            stats = ray.get(rollouter.get_statistics.remote())
-
-            # 验证必要字段存在
-            required_fields = [
-                "total_generated_samples",
-                "dropped_stale_samples",
-                "generation_errors",
-                "current_param_version",
-                "is_paused",
-                "pause_count",
-            ]
-
-            for field in required_fields:
-                assert field in stats
-
-            ray.kill(rollouter)
-
-        except ImportError:
-            pytest.skip("FullyAsyncRollouter not available")
-
-
-class TestTrainer:
-    """测试Trainer功能"""
-
-    def test_trainer_creation(self, ray_setup, basic_config):
-        """测试Trainer创建"""
-        try:
-            from fully_async_trainer import FullyAsyncTrainer
-
-            mock_tokenizer = Mock()
-            mock_role_worker_mapping = {}
-            mock_resource_pool_manager = Mock()
-
-            trainer = FullyAsyncTrainer.remote(
-                config=basic_config,
-                tokenizer=mock_tokenizer,
-                role_worker_mapping=mock_role_worker_mapping,
-                resource_pool_manager=mock_resource_pool_manager,
-            )
-
-            # 基本验证
-            assert trainer is not None
-
-            ray.kill(trainer)
-
-        except ImportError:
-            pytest.skip("FullyAsyncTrainer not available")
-
-
-class TestParameterSync:
-    """测试参数同步功能"""
-
-    def test_param_sync_creation(self, ray_setup):
-        """测试参数同步器创建"""
-        try:
-            from param_sync import ParameterSynchronizer
-
-            config = OmegaConf.create(
-                {"async_training": {"max_sync_retries": 3, "sync_timeout": 10.0, "sync_retry_delay": 0.1}}
-            )
-
-            mock_actor_wg = Mock()
-            mock_rollout_wg = Mock()
-
-            synchronizer = ParameterSynchronizer.remote(
-                config=config, actor_wg=mock_actor_wg, rollout_wg=mock_rollout_wg
-            )
-
-            assert synchronizer is not None
-
-            ray.kill(synchronizer)
-
-        except ImportError:
-            pytest.skip("ParameterSynchronizer not available")
-
-
-class TestIntegration:
-    """集成测试"""
-
-    def test_basic_workflow_simulation(self, ray_setup):
-        """测试基本工作流模拟"""
-        # 这是一个简化的集成测试，模拟基本的工作流
-        try:
-            from message_queue import MessageQueueClient
-
-            # 创建消息队列
-            queue = MessageQueueClient.remote(max_queue_size=5, max_staleness=2)
-
-            # 模拟生产者（Rollouter）
-            mock_sample = Mock()
-            mock_sample.batch_size = 2
-
-            # 放入样本
-            success = ray.get(
-                queue.put_sample.remote(
-                    epoch=1, sample=mock_sample, param_version=1, rollout_metadata={"timestamp": time.time()}
-                )
-            )
-            assert success
-
-            # 模拟消费者（Trainer）
-            result = ray.get(queue.get_samples.remote(min_batch_count=1, timeout=2.0, current_param_version=1))
-            assert result is not None
-
-            samples, metadata_list = result
-            assert len(samples) == 1
-            assert len(metadata_list) == 1
-
-            ray.kill(queue)
-
-        except ImportError:
-            pytest.skip("Integration test components not available")
-
-
-class TestErrorHandling:
-    """错误处理测试"""
-
-    def test_timeout_handling(self, ray_setup):
-        """测试超时处理"""
-        try:
-            from message_queue import MessageQueueClient
-
-            queue = MessageQueueClient.remote(max_queue_size=5, max_staleness=2)
-
-            # 测试从空队列超时获取
-            start_time = time.time()
-            result = ray.get(
-                queue.get_samples.remote(
-                    min_batch_count=1,
-                    timeout=1.0,  # 1秒超时
-                    current_param_version=1,
-                )
-            )
-            elapsed = time.time() - start_time
-
-            assert result is None
-            assert 0.9 <= elapsed <= 2.0  # 允许一些误差
-
-            ray.kill(queue)
-
-        except ImportError:
-            pytest.skip("MessageQueue not available")
-
-
-if __name__ == "__main__":
-    # 如果直接运行此文件，执行所有测试
-    pytest.main([__file__, "-v"])
diff --git a/recipe/fully_async_policy/unittest/test_fully_async.py b/recipe/fully_async_policy/unittest/test_fully_async.py
deleted file mode 100644
index 126ff489bf2..00000000000
--- a/recipe/fully_async_policy/unittest/test_fully_async.py
+++ /dev/null
@@ -1,194 +0,0 @@
-# Copyright 2025 Meituan Ltd. and/or its affiliates
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""
-测试完全异步训练工作流的组件
-"""
-
-import logging
-import unittest
-from unittest.mock import Mock
-
-import ray
-from omegaconf import OmegaConf
-
-from recipe.fully_async_policy.message_queue import DataProto, MessageQueue, MessageQueueClient
-
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-
-
-class TestMessageQueue(unittest.TestCase):
-    """测试MessageQueue组件"""
-
-    def setUp(self):
-        """设置测试环境"""
-        if not ray.is_initialized():
-            ray.init(local_mode=True)
-
-        config = OmegaConf.create(
-            {
-                "async_training": {
-                    "staleness_threshold": 3,
-                    "max_staleness_allowed": 5,
-                }
-            }
-        )
-
-        self.message_queue = MessageQueue.remote(config, max_queue_size=100)
-        self.client = MessageQueueClient(self.message_queue)
-
-    def tearDown(self):
-        """清理测试环境"""
-        ray.get(self.message_queue.shutdown.remote())
-        if ray.is_initialized():
-            ray.shutdown()
-
-    def test_basic_put_get(self):
-        """测试基本的put和get操作"""
-        # 创建mock数据
-        mock_batch = Mock(spec=DataProto)
-
-        # 放入样本
-        success = self.client.put_sample(sample=mock_batch, param_version=1, rollout_metadata={"test": "data"})
-        self.assertTrue(success)
-
-        # 获取样本
-        samples = self.client.get_samples(min_batch_count=1, timeout=5.0)
-        self.assertIsNotNone(samples)
-        self.assertEqual(len(samples), 1)
-        self.assertEqual(samples[0].param_version, 1)
-
-    def test_freshness_control(self):
-        """测试新鲜度控制"""
-        mock_batch = Mock(spec=DataProto)
-
-        # 更新参数版本
-        self.client.update_param_version(10)
-
-        # 尝试放入过期样本
-        success = self.client.put_sample(
-            sample=mock_batch,
-            param_version=5,  # 版本差异为5，超过阈值3
-            rollout_metadata={},
-        )
-        self.assertFalse(success)  # 应该被拒绝
-
-    def test_queue_statistics(self):
-        """测试队列统计信息"""
-        stats = self.client.get_statistics()
-        self.assertIn("queue_size", stats)
-        self.assertIn("total_produced", stats)
-        self.assertIn("total_consumed", stats)
-        self.assertIn("dropped_samples", stats)
-
-
-class TestRollouterComponents(unittest.TestCase):
-    """测试Rollouter相关组件"""
-
-    def setUp(self):
-        """设置测试环境"""
-        from .fully_async_rollouter import RolloutController
-
-        self.controller = RolloutController()
-
-    def test_rollout_controller(self):
-        """测试rollout控制器"""
-        # 初始状态应该是运行的
-        self.assertFalse(self.controller.is_paused)
-
-        # 测试暂停
-        self.controller.pause()
-        self.assertTrue(self.controller.is_paused)
-
-        # 测试恢复
-        self.controller.resume()
-        self.assertFalse(self.controller.is_paused)
-
-
-class TestParameterSync(unittest.TestCase):
-    """测试参数同步组件"""
-
-    def test_async_parameter_synchronizer(self):
-        """测试异步参数同步器"""
-        from recipe.fully_async_policy.param_sync import AsyncParameterSynchronizer
-
-        config = OmegaConf.create({})
-        mock_actor_wg = Mock()
-        mock_rollouter_actor = Mock()
-
-        sync = AsyncParameterSynchronizer(config, mock_actor_wg, mock_rollouter_actor)
-
-        self.assertEqual(sync.get_current_version(), 0)
-
-
-def test_integration():
-    """集成测试"""
-    logger.info("Starting integration test...")
-
-    if not ray.is_initialized():
-        ray.init(local_mode=True)
-
-    try:
-        # 测试MessageQueue和客户端的集成
-        config = OmegaConf.create(
-            {
-                "async_training": {
-                    "staleness_threshold": 3,
-                    "max_staleness_allowed": 5,
-                }
-            }
-        )
-
-        message_queue = MessageQueue.remote(config, max_queue_size=10)
-        client = MessageQueueClient(message_queue)
-
-        # 模拟生产者-消费者场景
-        mock_batch = Mock(spec=DataProto)
-
-        # 生产样本
-        for i in range(5):
-            success = client.put_sample(sample=mock_batch, param_version=i, rollout_metadata={"batch_id": i})
-            assert success, f"Failed to put batch {i}"
-
-        # 消费样本
-        samples = client.get_samples(min_batch_count=3, timeout=10.0)
-        assert samples is not None, "Failed to get samples"
-        assert len(samples) == 3, f"Expected 3 samples, got {len(samples)}"
-
-        # 检查统计信息
-        stats = client.get_statistics()
-        assert stats["total_produced"] == 5
-        assert stats["total_consumed"] == 3
-
-        logger.info("Integration test passed!")
-
-        # 清理
-        ray.get(message_queue.shutdown.remote())
-
-    finally:
-        if ray.is_initialized():
-            ray.shutdown()
-
-
-if __name__ == "__main__":
-    # 运行单元测试
-    unittest.main(argv=[""], exit=False, verbosity=2)
-
-    # 运行集成测试
-    test_integration()
-
-    print("\n" + "=" * 50)
-    print("所有测试完成!")
-    print("=" * 50)
diff --git a/recipe/fully_async_policy/unittest/test_fully_async_components.py b/recipe/fully_async_policy/unittest/test_fully_async_components.py
deleted file mode 100644
index 8a5bc85d562..00000000000
--- a/recipe/fully_async_policy/unittest/test_fully_async_components.py
+++ /dev/null
@@ -1,444 +0,0 @@
-# Copyright 2025 Meituan Ltd. and/or its affiliates
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-单元测试文件，用于测试完全异步PPO训练系统的各个组件
-"""
-
-import os
-
-# Import components to test
-import sys
-import time
-import unittest
-from unittest.mock import Mock
-
-import ray
-from omegaconf import OmegaConf
-
-sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-
-from fully_async_rollouter import FullyAsyncRollouter
-from fully_async_trainer import FullyAsyncTrainer
-from message_queue import MessageQueueClient
-from param_sync import ParameterSynchronizer
-
-
-class TestMessageQueue(unittest.TestCase):
-    """测试MessageQueue的功能"""
-
-    def setUp(self):
-        """设置测试环境"""
-        if not ray.is_initialized():
-            ray.init(ignore_reinit_error=True)
-
-        # 创建MessageQueue客户端
-        self.message_queue = MessageQueueClient.remote(max_queue_size=100, max_staleness=3)
-
-    def tearDown(self):
-        """清理测试环境"""
-        if hasattr(self, "message_queue"):
-            ray.kill(self.message_queue)
-
-    def test_put_and_get_samples(self):
-        """测试放入和获取样本的基本功能"""
-        # 创建模拟样本数据
-        mock_sample = Mock()
-        mock_sample.batch_size = 4
-
-        # 测试放入样本
-        success = ray.get(
-            self.message_queue.put_sample.remote(
-                epoch=1, sample=mock_sample, param_version=1, rollout_metadata={"timestamp": time.time()}
-            )
-        )
-        self.assertTrue(success)
-
-        # 测试获取样本
-        result = ray.get(self.message_queue.get_samples.remote(min_batch_count=1, timeout=5.0, current_param_version=1))
-
-        self.assertIsNotNone(result)
-        samples, metadata_list = result
-        self.assertEqual(len(samples), 1)
-        self.assertEqual(len(metadata_list), 1)
-
-    def test_staleness_control(self):
-        """测试新鲜度控制功能"""
-        mock_sample = Mock()
-        mock_sample.batch_size = 4
-
-        # 放入一个参数版本较老的样本
-        success = ray.get(
-            self.message_queue.put_sample.remote(
-                epoch=1, sample=mock_sample, param_version=1, rollout_metadata={"timestamp": time.time()}
-            )
-        )
-        self.assertTrue(success)
-
-        # 尝试用较新的参数版本获取样本（应该被拒绝）
-        result = ray.get(
-            self.message_queue.get_samples.remote(
-                min_batch_count=1,
-                timeout=5.0,
-                current_param_version=5,  # 版本差距为4 > max_staleness(3)
-            )
-        )
-
-        # 应该返回空结果，因为样本过期
-        self.assertIsNone(result)
-
-    def test_queue_statistics(self):
-        """测试队列统计功能"""
-        # 获取初始统计
-        stats = ray.get(self.message_queue.get_statistics.remote())
-        initial_queue_size = stats["queue_size"]
-
-        # 添加一些样本
-        mock_sample = Mock()
-        mock_sample.batch_size = 4
-
-        for i in range(3):
-            ray.get(
-                self.message_queue.put_sample.remote(
-                    epoch=1, sample=mock_sample, param_version=1, rollout_metadata={"timestamp": time.time()}
-                )
-            )
-
-        # 检查统计是否更新
-        stats = ray.get(self.message_queue.get_statistics.remote())
-        self.assertEqual(stats["queue_size"], initial_queue_size + 3)
-        self.assertEqual(stats["total_produced"], 3)
-
-
-class TestParameterSynchronizer(unittest.TestCase):
-    """测试参数同步器的功能"""
-
-    def setUp(self):
-        """设置测试环境"""
-        if not ray.is_initialized():
-            ray.init(ignore_reinit_error=True)
-
-        self.config = OmegaConf.create(
-            {"async_training": {"max_sync_retries": 3, "sync_timeout": 10.0, "sync_retry_delay": 0.1}}
-        )
-
-    def test_sync_with_retry(self):
-        """测试带重试机制的参数同步"""
-        # 创建模拟的worker groups
-        mock_actor_wg = Mock()
-        mock_rollout_wg = Mock()
-
-        # 模拟同步操作
-        mock_actor_wg.get_weights.return_value = ray.put({"param1": "value1"})
-        mock_rollout_wg.set_weights.return_value = []
-
-        synchronizer = ParameterSynchronizer.remote(
-            config=self.config, actor_wg=mock_actor_wg, rollout_wg=mock_rollout_wg
-        )
-
-        # 测试成功同步
-        result = ray.get(synchronizer.sync_weights.remote())
-        self.assertTrue(result)
-
-    def test_sync_failure_and_retry(self):
-        """测试同步失败和重试机制"""
-        mock_actor_wg = Mock()
-        mock_rollout_wg = Mock()
-
-        # 模拟同步失败
-        mock_actor_wg.get_weights.side_effect = Exception("Sync failed")
-
-        synchronizer = ParameterSynchronizer.remote(
-            config=self.config, actor_wg=mock_actor_wg, rollout_wg=mock_rollout_wg
-        )
-
-        # 测试失败时的重试
-        result = ray.get(synchronizer.sync_weights.remote())
-        self.assertFalse(result)
-
-
-class TestFullyAsyncRollouter(unittest.TestCase):
-    """测试异步Rollouter的功能"""
-
-    def setUp(self):
-        """设置测试环境"""
-        if not ray.is_initialized():
-            ray.init(ignore_reinit_error=True)
-
-    def test_pause_resume_functionality(self):
-        """测试暂停和恢复功能"""
-        # 创建配置
-        config = OmegaConf.create(
-            {
-                "actor_rollout_ref": {"hybrid_engine": False, "model": {"lora_rank": 0}, "rollout": {"n": 2}},
-                "algorithm": {"use_kl_in_reward": False},
-                "critic": {"enable": False},
-                "trainer": {"device": "cpu", "project_name": "test", "experiment_name": "test"},
-                "async_training": {
-                    "staleness_threshold": 3,
-                    "max_staleness_allowed": 5,
-                    "generation_timeout": 10.0,
-                    "batch_generation_interval": 0.1,
-                },
-            }
-        )
-
-        # 创建模拟的依赖
-        mock_tokenizer = Mock()
-        mock_role_worker_mapping = Mock()
-        mock_resource_pool_manager = Mock()
-
-        # 创建Rollouter实例
-        rollouter = FullyAsyncRollouter.remote(
-            config=config,
-            tokenizer=mock_tokenizer,
-            role_worker_mapping=mock_role_worker_mapping,
-            resource_pool_manager=mock_resource_pool_manager,
-        )
-
-        # 测试暂停功能
-        result = ray.get(rollouter.pause_rollout.remote())
-        self.assertTrue(result)
-
-        # 检查暂停状态
-        is_paused = ray.get(rollouter.is_rollout_paused.remote())
-        self.assertTrue(is_paused)
-
-        # 测试恢复功能
-        result = ray.get(rollouter.resume_rollout.remote())
-        self.assertTrue(result)
-
-        # 检查恢复状态
-        is_paused = ray.get(rollouter.is_rollout_paused.remote())
-        self.assertFalse(is_paused)
-
-    def test_statistics_collection(self):
-        """测试统计信息收集功能"""
-        config = OmegaConf.create(
-            {
-                "actor_rollout_ref": {"hybrid_engine": False, "model": {"lora_rank": 0}, "rollout": {"n": 2}},
-                "algorithm": {"use_kl_in_reward": False},
-                "critic": {"enable": False},
-                "trainer": {"device": "cpu", "project_name": "test", "experiment_name": "test"},
-                "async_training": {"staleness_threshold": 3, "max_staleness_allowed": 5, "generation_timeout": 10.0},
-            }
-        )
-
-        mock_tokenizer = Mock()
-        mock_role_worker_mapping = Mock()
-        mock_resource_pool_manager = Mock()
-
-        rollouter = FullyAsyncRollouter.remote(
-            config=config,
-            tokenizer=mock_tokenizer,
-            role_worker_mapping=mock_role_worker_mapping,
-            resource_pool_manager=mock_resource_pool_manager,
-        )
-
-        # 获取统计信息
-        stats = ray.get(rollouter.get_statistics.remote())
-
-        # 验证统计信息包含必要的字段
-        expected_keys = [
-            "total_generated_samples",
-            "dropped_stale_samples",
-            "generation_errors",
-            "current_param_version",
-            "is_paused",
-            "pause_count",
-            "resume_count",
-        ]
-
-        for key in expected_keys:
-            self.assertIn(key, stats)
-
-
-class TestFullyAsyncTrainer(unittest.TestCase):
-    """测试异步Trainer的功能"""
-
-    def setUp(self):
-        """设置测试环境"""
-        if not ray.is_initialized():
-            ray.init(ignore_reinit_error=True)
-
-    def test_freshness_metrics_calculation(self):
-        """测试新鲜度指标计算"""
-        # 创建基本配置
-        config = OmegaConf.create(
-            {
-                "trainer": {
-                    "device": "cpu",
-                    "project_name": "test",
-                    "experiment_name": "test",
-                    "total_epochs": 1,
-                    "total_training_steps": 2,
-                },
-                "async_training": {"staleness_threshold": 3, "max_staleness_allowed": 5, "batch_timeout": 10.0},
-                "data": {"train_batch_size": 4},
-                "actor_rollout_ref": {"hybrid_engine": False, "model": {"lora_rank": 0}},
-                "algorithm": {"use_kl_in_reward": False},
-                "critic": {"enable": False},
-            }
-        )
-
-        # 创建模拟的依赖
-        mock_tokenizer = Mock()
-        mock_role_worker_mapping = Mock()
-        mock_resource_pool_manager = Mock()
-
-        trainer = FullyAsyncTrainer.remote(
-            config=config,
-            tokenizer=mock_tokenizer,
-            role_worker_mapping=mock_role_worker_mapping,
-            resource_pool_manager=mock_resource_pool_manager,
-        )
-
-        # 测试新鲜度指标计算
-        current_time = time.time()
-        metadata_list = [
-            {"generation_timestamp": current_time - 5, "rollout_param_version": 1},
-            {"generation_timestamp": current_time - 10, "rollout_param_version": 2},
-            {"generation_timestamp": current_time - 15, "rollout_param_version": 1},
-        ]
-
-        freshness_metrics = ray.get(trainer._calculate_freshness_metrics.remote(metadata_list, current_param_version=3))
-
-        # 验证新鲜度指标
-        self.assertIn("avg_sample_age", freshness_metrics)
-        self.assertIn("max_sample_age", freshness_metrics)
-        self.assertIn("min_sample_age", freshness_metrics)
-        self.assertIn("version_diversity", freshness_metrics)
-        self.assertIn("staleness_ratio", freshness_metrics)
-
-
-class TestIntegrationScenarios(unittest.TestCase):
-    """测试组件集成场景"""
-
-    def setUp(self):
-        """设置测试环境"""
-        if not ray.is_initialized():
-            ray.init(ignore_reinit_error=True)
-
-    def test_message_queue_trainer_integration(self):
-        """测试MessageQueue与Trainer的集成"""
-        # 创建MessageQueue
-        message_queue = MessageQueueClient.remote(max_queue_size=10, max_staleness=3)
-
-        # 放入一些测试样本
-        mock_sample = Mock()
-        mock_sample.batch_size = 4
-
-        ray.get(
-            message_queue.put_sample.remote(
-                epoch=1, sample=mock_sample, param_version=1, rollout_metadata={"timestamp": time.time()}
-            )
-        )
-
-        # 验证Trainer能够获取样本
-        result = ray.get(message_queue.get_samples.remote(min_batch_count=1, timeout=5.0, current_param_version=1))
-
-        self.assertIsNotNone(result)
-        samples, metadata_list = result
-        self.assertEqual(len(samples), 1)
-
-    def test_rollouter_message_queue_integration(self):
-        """测试Rollouter与MessageQueue的集成"""
-        # 这个测试需要更多的模拟设置，因为涉及到实际的模型生成
-        # 在实际实现中，可以使用更多的Mock对象来模拟这种集成
-        pass
-
-
-class TestErrorHandling(unittest.TestCase):
-    """测试错误处理和边界情况"""
-
-    def setUp(self):
-        """设置测试环境"""
-        if not ray.is_initialized():
-            ray.init(ignore_reinit_error=True)
-
-    def test_message_queue_overflow(self):
-        """测试消息队列溢出处理"""
-        # 创建小容量的队列
-        message_queue = MessageQueueClient.remote(max_queue_size=2, max_staleness=3)
-
-        mock_sample = Mock()
-        mock_sample.batch_size = 4
-
-        # 填满队列
-        for i in range(2):
-            result = ray.get(
-                message_queue.put_sample.remote(
-                    epoch=1, sample=mock_sample, param_version=1, rollout_metadata={"timestamp": time.time()}
-                )
-            )
-            self.assertTrue(result)
-
-        # 尝试再放入一个样本（应该失败或者覆盖旧样本）
-        result = ray.get(
-            message_queue.put_sample.remote(
-                epoch=1, sample=mock_sample, param_version=1, rollout_metadata={"timestamp": time.time()}
-            )
-        )
-
-        # 根据实现，这里可能是False（拒绝）或True（覆盖）
-        self.assertIsInstance(result, bool)
-
-    def test_timeout_handling(self):
-        """测试超时处理"""
-        message_queue = MessageQueueClient.remote(max_queue_size=10, max_staleness=3)
-
-        # 尝试从空队列获取样本，应该超时
-        start_time = time.time()
-        result = ray.get(
-            message_queue.get_samples.remote(
-                min_batch_count=1,
-                timeout=1.0,  # 1秒超时
-                current_param_version=1,
-            )
-        )
-        elapsed = time.time() - start_time
-
-        # 应该返回None并且大约在1秒后返回
-        self.assertIsNone(result)
-        self.assertGreater(elapsed, 0.9)  # 允许一些误差
-        self.assertLess(elapsed, 2.0)
-
-
-if __name__ == "__main__":
-    # 设置测试套件
-    test_suite = unittest.TestSuite()
-
-    # 添加测试用例
-    test_classes = [
-        TestMessageQueue,
-        TestParameterSynchronizer,
-        TestFullyAsyncRollouter,
-        TestFullyAsyncTrainer,
-        TestIntegrationScenarios,
-        TestErrorHandling,
-    ]
-
-    for test_class in test_classes:
-        tests = unittest.TestLoader().loadTestsFromTestCase(test_class)
-        test_suite.addTests(tests)
-
-    # 运行测试
-    runner = unittest.TextTestRunner(verbosity=2)
-    result = runner.run(test_suite)
-
-    # 清理Ray
-    if ray.is_initialized():
-        ray.shutdown()
-
-    # 退出
-    exit(0 if result.wasSuccessful() else 1)
diff --git a/tests/special_e2e/run_fully_async_policy.sh b/tests/special_e2e/run_fully_async_policy.sh
index 50eb9070314..52a4d2bc8fd 100644
--- a/tests/special_e2e/run_fully_async_policy.sh
+++ b/tests/special_e2e/run_fully_async_policy.sh
@@ -121,6 +121,8 @@ common_params=(
     trainer.n_gpus_per_node=${n_gpus_training}
     rollout.nnodes=1
     rollout.n_gpus_per_node=${n_gpus_rollout}
+    rollout.total_rollout_steps=100
+    rollout.total_epochs=10
     # Fully async specific configurations
     async_training.staleness_threshold=${staleness_threshold}
     async_training.sync_timeout=${sync_timeout}
diff --git a/verl/trainer/ppo/ray_trainer.py b/verl/trainer/ppo/ray_trainer.py
index 9b87d5a3bd8..89acaebfe03 100644
--- a/verl/trainer/ppo/ray_trainer.py
+++ b/verl/trainer/ppo/ray_trainer.py
@@ -1248,7 +1248,7 @@ def _process_batch_common(self, batch, metrics, timing_raw):
                 reward_tensor, reward_extra_infos_dict = compute_reward(batch, self.reward_fn)
         # recompute old_log_probs
         with marked_timer("old_log_prob", timing_raw, color="blue"):
-            print("marked_timer rewold_log_prob")
+            print("marked_timer old_log_prob")
 
             old_log_prob = self.actor_rollout_wg.compute_log_prob(batch)
             entropys = old_log_prob.batch["entropys"]

From 50cb8dfd799b903af4d4f00dcabd77b7fe4830d9 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Fri, 8 Aug 2025 18:46:05 +0800
Subject: [PATCH 032/182] stop train

---
 recipe/fully_async_policy/fully_async_trainer.py | 13 ++-----------
 recipe/fully_async_policy/message_queue.py       |  9 ++++++---
 tests/special_e2e/run_fully_async_policy.sh      |  4 +---
 3 files changed, 9 insertions(+), 17 deletions(-)

diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py
index 29a7a5c830b..588f5998fe7 100644
--- a/recipe/fully_async_policy/fully_async_trainer.py
+++ b/recipe/fully_async_policy/fully_async_trainer.py
@@ -285,9 +285,6 @@ def fit(self):
         # load checkpoint before doing anything
         self._load_checkpoint()
 
-        self.total_training_steps = self.config.trainer.total_training_steps
-
-        print(f"Total training steps: {self.total_training_steps}")
         # we start from step 1
         self.global_steps += 1
         last_val_metrics = None
@@ -309,7 +306,7 @@ def fit(self):
             metrics = {}
             timing_raw = {}
 
-            is_last_step = self.global_steps >= self.total_training_steps
+            is_last_step = False
 
             with marked_timer("step", timing_raw):
                 with marked_timer("gen", timing_raw, color="red"):
@@ -352,8 +349,6 @@ def fit(self):
                 batch, reward_extra_infos_dict = self._process_batch_common(batch, metrics, timing_raw)
                 print("_log_rollout")
                 self._log_rollout(batch, reward_extra_infos_dict, timing_raw)
-                print("_validate_metrics")
-                last_val_metrics = self._validate_metrics(is_last_step, last_val_metrics, metrics, timing_raw)
                 print("_check_save_checkpoint")
                 self._check_save_checkpoint(is_last_step, timing_raw)
 
@@ -366,12 +361,8 @@ def fit(self):
             print("_trigger_parameter_sync_after_step")
 
             self._trigger_parameter_sync_after_step()
-            print("global_steps")
+            print(f"global_steps: {self.global_steps}")
             self.global_steps += 1
-            print(f"is_last_step {is_last_step}")
-            if is_last_step:
-                print("is_last_step")
-                return
 
     def get_statistics(self) -> dict:
         """获取训练统计信息"""
diff --git a/recipe/fully_async_policy/message_queue.py b/recipe/fully_async_policy/message_queue.py
index 47dbd34ecff..ad261b0072a 100644
--- a/recipe/fully_async_policy/message_queue.py
+++ b/recipe/fully_async_policy/message_queue.py
@@ -119,7 +119,10 @@ def get_samples(self, min_batch_count: int = 1) -> list[Any]:
         print("get_samples")
         with self.lock:
             while len(self.queue) < min_batch_count and self.running:
-                print("consumer_condition")
+                print(f"consumer_condition {len(self.queue)}")
+                for data in self.queue:
+                    if data is None:
+                        return []
                 self.consumer_condition.wait()
 
             # 如果队列已关闭且没有足够样本，返回空列表
@@ -135,7 +138,7 @@ def get_samples(self, min_batch_count: int = 1) -> list[Any]:
                     if data is None:
                         return []
                     else:
-                        samples.append(self.queue.popleft())
+                        samples.append(data)
 
             self.total_consumed += len(samples)
             return samples
@@ -174,7 +177,7 @@ def clear_queue(self):
 
     def shutdown(self):
         """关闭消息队列"""
-        with self.lock:  # 修正：需要加锁
+        with self.lock:
             self.running = False
             # 通知所有等待的线程，让它们能够退出
             self.consumer_condition.notify_all()
diff --git a/tests/special_e2e/run_fully_async_policy.sh b/tests/special_e2e/run_fully_async_policy.sh
index 52a4d2bc8fd..c95476e898a 100644
--- a/tests/special_e2e/run_fully_async_policy.sh
+++ b/tests/special_e2e/run_fully_async_policy.sh
@@ -114,14 +114,12 @@ common_params=(
     trainer.val_before_train=False
     trainer.test_freq=-1
     trainer.save_freq=-1
-    trainer.total_epochs=2
-    trainer.total_training_steps=10
     trainer.resume_mode=disable
     trainer.nnodes=1
     trainer.n_gpus_per_node=${n_gpus_training}
     rollout.nnodes=1
     rollout.n_gpus_per_node=${n_gpus_rollout}
-    rollout.total_rollout_steps=100
+    rollout.total_rollout_steps=10
     rollout.total_epochs=10
     # Fully async specific configurations
     async_training.staleness_threshold=${staleness_threshold}

From d59b734298d25fd80ed914363ae8cd322465c2b3 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Fri, 8 Aug 2025 18:50:12 +0800
Subject: [PATCH 033/182] readme docs

---
 recipe/fully_async_policy/README.md | 66 +++++++++++++++++++++++++++++
 1 file changed, 66 insertions(+)
 create mode 100644 recipe/fully_async_policy/README.md

diff --git a/recipe/fully_async_policy/README.md b/recipe/fully_async_policy/README.md
new file mode 100644
index 00000000000..0509969216b
--- /dev/null
+++ b/recipe/fully_async_policy/README.md
@@ -0,0 +1,66 @@
+# 基于verl的改造方案
+
+## 方案
+
+### 方案1 (StreamRL, AsyncFlow)
+
+![StreamRL](
+https://github.com/ArronHZG/verl-community/blob/recipe/async_policy/docs/StreamRL.png?raw=true)
+
+在分离架构的基础上，修改在Rollout和Train的样本传递过程中，将离线策略生成一批global样本修改为生成一批batch的方式，实现生成和训练两阶段的高度重叠。
+训练阶段一收到足够样本就开始处理，训练一定步数后，将参数同步到PS侧， Rollout在每次样本生成完成后，check是否有新的参数，如果有就进行一次同步。
+
+### 方案2 (Mistralai, Areal)
+
+![mistralai](
+https://github.com/ArronHZG/verl-community/blob/recipe/async_policy/docs/mistralai.png?raw=true)
+
+在分离架构的基础上，实现Rollout的partial rollout逻辑。样本仍然修改为batch的方式进行传递，实现生成和训练两阶段的高度重叠。
+在参数同步方面，训练阶段主动触发Rollout的暂停，参数同步以及恢复。 Rollout使用Rollout Server的方式，支持样本生成的中断与恢复，
+产生的的样本所使用的参数版本会有所不同。
+
+### 折中
+
+上述两种方案的核心都是将训练与生成进行overlap，核心区别主要集中在参数同步的处理方式不同，方案1需要实现PS完成参数的异步加载。
+方案2使用同步的方式进行参数同步，但需要完成PartialRollout的逻辑。综合已有代码，以及社区进行中的工作，我们希望先将异步的工作流搭建完成，先以方案1进行开发，后续再进一步开发方案2。
+
+## 设计
+
+### 架构图
+
+![full_async](
+https://github.com/ArronHZG/verl-community/blob/recipe/async_policy/docs/full_async.svg?raw=true)
+
+为实现纯异步训练工作流，基于已有的 one step off policy 代码，扩增实现 Rollouter 以及 Message Queue，以及对Trainer进行更新。
+
+整体的训练流程参考StreamRL，将原有流程中生成 train_batch_size 个样本后进行下一步训练的过程，修改为流式的样本传递，train
+拿到一次前向的样本后就进行样本分发（ppo_mini_batch_size*worker）。与one-step-off相比，我们将一次step的异步，继续细化到一次batch的异步。
+
+**MessageQueue** 作为Ray的Actor存在，支持zeromq消息队列保存生成的样本，并提供给Trainer使用。Trainer 和 Rollouter 都持有
+MessageQueue 的Handler，通过接口完成样本的插入与消费。
+
+**FullyAsyncRollouter** 类似于现有的 Trainer，实现fit()工作流，循环调用 Rollout 进行样本的生成。FullyAsyncRollouter 对于已有的
+vLLMAsyncRollout SGLangAsyncRollout 进行封装。
+
+* 方案1，使用异步更新策略，FullyAsyncRollouter 根据样本生成的进展，自动访问PS，判断是否进行新的参数加载。
+* 方案2，参考PR https://github.com/volcengine/verl/pull/2246 https://github.com/volcengine/verl/pull/2200 Rollout
+  组件需要支持暂停及恢复，从而进行参数的更新。暂停时，需要保存进行中的rollout样本，下次继续恢复生产。
+
+**FullyAsyncTrainer** 与当前实现类似，区别是样本的获取修改为从Queue中获取，Queue有最少batch样本就开始进行分发。rainer完成一次step的训练后，
+与FullyAsyncRollouter的使用策略对应：
+
+* 方案1，使用异步更新策略，参数产生后，主动同步到PS中。
+* 方案2，直接调用Rollouter进行同步，主动通知Rollouter暂停生成，进行参数的同步更新。
+
+## 总结
+
+当Rollouter生产快于Trainer消费时，queue中会存在多步过期的样本，我们需要在Rollouter中设置“陈旧度 staleness”阈值，
+由当前的参数版本以及生成的样本数量，决定是否要暂停生成。zeromq 的最大长度应为 staleness * total_size，并且实现基于陈旧度的拒绝策略，进行防御性编程。
+
+* 当使用方案1时，参数的同步由FullyAsyncRollouter主动控制，触发时机取决预先设置的固定数量样本完成以及参数已就绪，产生的样本所使用的参数版本一致，
+  但是避免不了长尾的问题，会有"rollout空洞"产生。
+
+* 当使用方案2时，参数的同步会更加及时，陈旧度低的样本数量较多，但是长尾样本由不同的参数产生，长尾样本的不同token所对应的参数版本会传递给训练引擎，
+  后续可以根据这一信息对loss进行加权处理。
+
+当Rollouter生产慢于Trainer消费时，队列长时间为空，基本等价于同步训练。
\ No newline at end of file

From 6e5da717c8cb67c2e6e49e70c7bdcedf6e702457 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Fri, 8 Aug 2025 19:40:28 +0800
Subject: [PATCH 034/182] refactor code

---
 recipe/fully_async_policy/fully_async_main.py |  55 +-
 .../fully_async_rollouter.py                  |  71 +--
 .../fully_async_policy/fully_async_trainer.py | 107 ++--
 recipe/fully_async_policy/param_sync.py       |  20 +-
 .../unittest/protocol_examples.py             | 202 --------
 recipe/fully_async_policy/unittest/test_mq.py | 473 ++++++------------
 .../fully_async_policy/unittest/test_mq2.py   | 171 -------
 .../unittest/test_protocol_split_merge.py     | 207 +++++++-
 8 files changed, 426 insertions(+), 880 deletions(-)
 delete mode 100644 recipe/fully_async_policy/unittest/protocol_examples.py
 delete mode 100644 recipe/fully_async_policy/unittest/test_mq2.py

diff --git a/recipe/fully_async_policy/fully_async_main.py b/recipe/fully_async_policy/fully_async_main.py
index 31541982dd7..6afb44abd9d 100644
--- a/recipe/fully_async_policy/fully_async_main.py
+++ b/recipe/fully_async_policy/fully_async_main.py
@@ -13,10 +13,8 @@
 # limitations under the License.
 
 import os
-import signal
 import socket
 import threading
-import time
 from pprint import pprint
 
 import hydra
@@ -33,14 +31,14 @@
 
 def create_resource_pool_manager(config, roles: list) -> ResourcePoolManager:
     """
-    创建资源池管理器
+    Create resource pool manager
 
     Args:
-        config: 配置对象
-        roles: 需要创建资源池的角色列表
+        config: Configuration object
+        roles: List of roles that need to create resource pools
 
     Returns:
-        ResourcePoolManager: 资源池管理器
+        ResourcePoolManager: Resource pool manager
     """
     # 构建资源池规格
     resource_pool_spec = {}
@@ -73,13 +71,13 @@ def create_resource_pool_manager(config, roles: list) -> ResourcePoolManager:
 
 def create_role_worker_mapping(config):
     """
-    创建角色到worker类的映射
+    Create mapping from roles to worker classes
 
     Args:
-        config: 配置对象
+        config: Configuration object
 
     Returns:
-        dict: 角色到worker类的映射
+        dict: Mapping from roles to worker classes
     """
     # 根据策略选择worker类
     if config.actor_rollout_ref.actor.strategy == "fsdp2":
@@ -121,7 +119,6 @@ def create_role_worker_mapping(config):
         Role.Critic: ray.remote(CriticWorker),
     }
 
-    # 添加reward model（如果启用）
     if config.reward_model.enable:
         if config.reward_model.strategy == "fsdp2":
             from verl.workers.fsdp_workers import RewardModelWorker
@@ -153,36 +150,23 @@ def __init__(self):
     def run(self, config):
         """运行完全异步的PPO训练"""
         print("Starting fully async PPO training...")
-        # 初始化基础组件
         self._initialize_components(config)
-        # 启动训练流程
         self._run_training_loop()
 
     def _initialize_components(self, config) -> None:
-        """
-        初始化所有组件
-
-        Args:
-            config: 配置对象
-
-        Returns:
-            bool: 是否初始化成功
-        """
-        # 打印配置信息
         print(f"TaskRunner hostname: {socket.gethostname()}, PID: {os.getpid()}")
         pprint(OmegaConf.to_container(config, resolve=True))
         OmegaConf.resolve(config)
 
-        # 初始化模型路径和tokenizer
         print("Initializing model and tokenizer...")
         local_path = copy_to_local(
             config.actor_rollout_ref.model.path, use_shm=config.actor_rollout_ref.model.get("use_shm", False)
         )
-        # Instantiate the tokenizer and processor.
         from verl.utils import hf_processor, hf_tokenizer
 
         trust_remote_code = config.data.get("trust_remote_code", False)
         tokenizer = hf_tokenizer(local_path, trust_remote_code=trust_remote_code)
+
         # Used for multimodal LLM, could be None
         processor = hf_processor(local_path, trust_remote_code=trust_remote_code, use_fast=True)
 
@@ -190,13 +174,11 @@ def _initialize_components(self, config) -> None:
         self.components["processor"] = processor
         self.components["config"] = config  # 保存config以供其他方法使用
 
-        # 创建worker映射和资源池
         print("Creating worker mapping and resource pools...")
         role_worker_mapping, ray_worker_group_cls = create_role_worker_mapping(config)
         self.components["role_worker_mapping"] = role_worker_mapping
         self.components["ray_worker_group_cls"] = ray_worker_group_cls
 
-        # 创建奖励函数
         print("Loading reward functions...")
         reward_fn = load_reward_manager(
             config, tokenizer, num_examine=0, **config.reward_model.get("reward_kwargs", {})
@@ -207,12 +189,11 @@ def _initialize_components(self, config) -> None:
         self.components["reward_fn"] = reward_fn
         self.components["val_reward_fn"] = val_reward_fn
 
-        # 创建MessageQueue
         self.max_queue_size = (
-                config.async_training.staleness_threshold
-                * config.data.train_batch_size
-                * config.actor_rollout_ref.rollout.n
-        ) * 10 # x 10 避免死锁
+            config.async_training.staleness_threshold
+            * config.data.train_batch_size
+            * config.actor_rollout_ref.rollout.n
+        ) * 10  # x 10 avoid deadlock
         print("Creating MessageQueue...")
         message_queue = MessageQueue.remote(config, self.max_queue_size)
         message_queue_client = MessageQueueClient(message_queue)
@@ -220,15 +201,12 @@ def _initialize_components(self, config) -> None:
         self.components["message_queue"] = message_queue
         self.components["message_queue_client"] = message_queue_client
 
-        # 创建Rollouter
         print("Creating FullyAsyncRollouter...")
         self._create_rollouter(config)
 
-        # 创建Trainer
         print("Creating FullyAsyncTrainer...")
         self._create_trainer(config)
 
-        # 设置参数同步
         print("Setting up parameter synchronization...")
         from recipe.fully_async_policy.param_sync import ParameterSynchronizer
 
@@ -239,18 +217,15 @@ def _initialize_components(self, config) -> None:
             mq=self.components["message_queue_client"],
         )
 
-        # 将参数同步器设置到trainer和rollouter
         ray.get(self.components["trainer"].set_parameter_synchronizer.remote(param_synchronizer))
         ray.get(self.components["rollouter"].set_parameter_synchronizer.remote(param_synchronizer))
 
-        # 首先同步一次参数
         ray.get(param_synchronizer.sync_weights.remote(0))
 
         self.components["param_synchronizer"] = param_synchronizer
         print("All components initialized successfully")
 
     def _create_rollouter(self, config) -> None:
-        """创建Rollouter"""
         pprint(self.components)
         rollouter = FullyAsyncRollouter.remote(
             config=config,
@@ -269,7 +244,6 @@ def _create_rollouter(self, config) -> None:
         print("Rollouter created and initialized successfully")
 
     def _create_trainer(self, config) -> None:
-        """创建Trainer"""
         trainer_role_mapping = {
             role: worker_cls
             for role, worker_cls in self.components["role_worker_mapping"].items()
@@ -288,14 +262,12 @@ def _create_trainer(self, config) -> None:
             device_name=config.trainer.device,
         )
 
-        # 初始化Trainer
         ray.get(trainer.init_workers.remote())
         ray.get(trainer.set_message_queue_client.remote(self.components["message_queue_client"]))
         self.components["trainer"] = trainer
         print("FullyAsyncTrainer created and initialized successfully")
 
     def _run_training_loop(self):
-        """运行训练循环"""
         self.running = True
 
         print("Starting Rollouter in background...")
@@ -312,10 +284,9 @@ def _run_training_loop(self):
 
 @hydra.main(config_path="config", config_name="fully_async_ppo_trainer", version_base=None)
 def main(config):
-    """主入口函数"""
     from verl.trainer.main_ppo import run_ppo
 
-    # 确保异步训练配置存在
+    # Ensure async training config exists
     if not hasattr(config, "async_training"):
         raise RuntimeError("must set async_training config")
     run_ppo(config, task_runner_class=FullyAsyncTaskRunner)
diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py
index d392e4a1630..01affa67586 100644
--- a/recipe/fully_async_policy/fully_async_rollouter.py
+++ b/recipe/fully_async_policy/fully_async_rollouter.py
@@ -29,38 +29,24 @@
 @ray.remote(num_cpus=10, max_concurrency=10)
 class FullyAsyncRollouter(RayPPOTrainer):
     """
-    异步样本生成器，负责持续生成训练样本并放入MessageQueue
-    基于OneStepOffRayTrainer的成熟实现改进
+    Asynchronous sample generator, responsible for continuously generating training samples
+    and putting them into MessageQueue
+    Based on the mature implementation improvements of OneStepOffRayTrainer
     """
 
     def __init__(
-            self,
-            config,
-            tokenizer,
-            role_worker_mapping: dict[Role, WorkerType],
-            resource_pool_manager: ResourcePoolManager,
-            ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup,
-            processor=None,
-            reward_fn=None,
-            val_reward_fn=None,
-            device_name=None,
-            max_queue_size=1000,
+        self,
+        config,
+        tokenizer,
+        role_worker_mapping: dict[Role, WorkerType],
+        resource_pool_manager: ResourcePoolManager,
+        ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup,
+        processor=None,
+        reward_fn=None,
+        val_reward_fn=None,
+        device_name=None,
+        max_queue_size=1000,
     ):
-        """
-        Initialize distributed PPO trainer with Ray backend.
-        Note that this trainer runs on the driver process on a single CPU/GPU node.
-
-        Args:
-            config: Configuration object containing training parameters.
-            tokenizer: Tokenizer used for encoding and decoding text.
-            role_worker_mapping (dict[Role, WorkerType]): Mapping from roles to worker classes.
-            resource_pool_manager (ResourcePoolManager): Manager for Ray resource pools.
-            ray_worker_group_cls (RayWorkerGroup, optional): Class for Ray worker groups. Defaults to RayWorkerGroup.
-            processor: Optional data processor, used for multimodal data
-            reward_fn: Function for computing rewards during training.
-            val_reward_fn: Function for computing rewards during validation.
-            device_name (str, optional): Device name for training (e.g., "cuda", "cpu"). Defaults to None.
-        """
         # Store the tokenizer for text processing
         self.tokenizer = tokenizer
         self.processor = processor
@@ -86,7 +72,7 @@ def __init__(
         self.use_reference_policy = False
         self.use_rm = False
 
-        # 创建数据集
+        # Create datasets
         print("Creating datasets...")
         from verl.trainer.main_ppo import create_rl_dataset, create_rl_sampler
         from verl.utils.dataset.rl_dataset import collate_fn
@@ -107,16 +93,16 @@ def __init__(
         self.total_rollout_steps = total_rollout_steps
         print(f"Total rollout steps: {self.total_rollout_steps}")
 
-        # rollouter 参数配置
+        # Rollouter parameter configuration
         self.message_queue_client = None
 
         self.current_param_version = 0
 
-        # 新鲜度控制 - 改进的配置管理
+        # Freshness control - improved configuration management
         async_config = config.async_training
         self.staleness_threshold = async_config.get("staleness_threshold", 3)
 
-        # 统计信息
+        # Statistics
         self.total_generated_samples = 0
         self.dropped_stale_samples = 0
         self.param_sync_requests = 0
@@ -125,7 +111,7 @@ def __init__(
         self.rollout_wg = None
         self.message_queue_client = None
 
-        # 并发控制
+        # Concurrency control
         self.running = False
         self.paused = False
         self.generation_thread = None
@@ -134,48 +120,43 @@ def __init__(
         self.lock = threading.RLock()
         self.condition = threading.Condition(self.lock)
 
-        # 暂停/恢复统计信息
+        # Pause/resume statistics
         self.pause_count = 0
         self.resume_count = 0
         self.total_pause_time = 0.0
         self.last_pause_time = None
 
-        # 参数同步相关
+        # Parameter synchronization related
         self.param_synchronizer = None
         self.last_sync_time = 0
         self.sync_in_progress = False
         self.sync_lock = threading.Lock()
 
-        # 参数同步状态 - 基于one_step_off_policy模式
-        self._weights_info = None
-        self._is_rollout = True  # rollouter是rollout角色
-        self._is_actor = False
-
         self.max_queue_size = max_queue_size
 
     def set_message_queue_client(self, message_queue_client: MessageQueueClient):
-        """设置消息队列客户端"""
+        """Set message queue client"""
         with self.lock:
             self.message_queue_client = message_queue_client
 
     def set_parameter_synchronizer(self, param_synchronizer):
-        """设置参数同步器"""
+        """Set parameter synchronizer"""
         with self.lock:
             self.param_synchronizer = param_synchronizer
 
     def get_rollout_wg(self):
-        """获取 rollout worker group"""
+        """Get rollout worker group"""
         return self.rollout_wg
 
     def update_param_version(self, version: int):
-        """更新当前参数版本"""
+        """Update current parameter version"""
         with self.lock:
             old_version = self.current_param_version
             self.current_param_version = version
             print(f"Parameter version updated from {old_version} to {version}")
 
     def _validate_config(self):
-        # 验证异步训练配置
+        # Validate asynchronous training configuration
         if not hasattr(self.config, "async_training"):
             raise ValueError("Missing async_training configuration")
 
diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py
index 588f5998fe7..39fedb022d5 100644
--- a/recipe/fully_async_policy/fully_async_trainer.py
+++ b/recipe/fully_async_policy/fully_async_trainer.py
@@ -13,10 +13,8 @@
 # limitations under the License.
 
 import logging
-import threading
 import time
 import warnings
-from pprint import pprint
 from typing import Any
 
 import numpy as np
@@ -46,18 +44,17 @@ class FullyAsyncTrainer(RayPPOTrainer):
     """
 
     def __init__(
-            self,
-            config,
-            tokenizer,
-            role_worker_mapping: dict[Role, WorkerType],
-            resource_pool_manager: ResourcePoolManager,
-            ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup,
-            processor=None,
-            reward_fn=None,
-            val_reward_fn=None,
-            device_name=None,
+        self,
+        config,
+        tokenizer,
+        role_worker_mapping: dict[Role, WorkerType],
+        resource_pool_manager: ResourcePoolManager,
+        ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup,
+        processor=None,
+        reward_fn=None,
+        val_reward_fn=None,
+        device_name=None,
     ):
-
         # Store the tokenizer for text processing
         self.tokenizer = tokenizer
         self.processor = processor
@@ -97,43 +94,35 @@ def __init__(
 
         self._validate_config()
 
-        self.lock = threading.RLock()
         self.message_queue_client = None
         self.param_synchronizer = None
 
-        # 统计信息
+        # Statistics
         self.processed_samples = 0
         self.stale_samples_processed = 0
         self.current_param_version = 0
 
-        # 参数同步相关状态
-        self._weights_info = None
-        self._is_actor = False  # 将在init_worker_group中设置
-        self._is_rollout = False
-
     def set_message_queue_client(self, message_queue_client: MessageQueueClient):
-        """设置消息队列客户端"""
-        with self.lock:
-            self.message_queue_client = message_queue_client
+        """Set message queue client"""
+        self.message_queue_client = message_queue_client
 
     def set_parameter_synchronizer(self, param_synchronizer):
-        """设置参数同步器"""
-        with self.lock:
-            self.param_synchronizer = param_synchronizer
+        """Set parameter synchronizer"""
+        self.param_synchronizer = param_synchronizer
 
     def get_actor_wg(self):
-        """获取 actor worker group"""
+        """Get actor worker group"""
         return self.actor_wg
 
     def _get_samples_from_queue(self) -> tuple[None, None] | tuple[int, Any]:
         """
-        从消息队列获取样本并组成gen_batch_output
+        Get samples from message queue and compose gen_batch_output
 
         Returns:
             tuple: (epoch, batch_dict, gen_batch_output)
         """
 
-        # 计算需要获取的样本数量
+        # Calculate the number of samples needed
         n_responses_per_prompt = self.config.actor_rollout_ref.rollout.n
         batch_size = self.config.data.train_batch_size
         required_samples = n_responses_per_prompt * batch_size
@@ -143,7 +132,7 @@ def _get_samples_from_queue(self) -> tuple[None, None] | tuple[int, Any]:
             flush=True,
         )
 
-        # 从队列获取样本
+        # Get samples from queue
         consumer_start = time.time()
         queue_samples = self.message_queue_client.get_samples(min_batch_count=required_samples)
         consumer_end = time.time()
@@ -157,7 +146,7 @@ def _get_samples_from_queue(self) -> tuple[None, None] | tuple[int, Any]:
         queue_samples = [ray.cloudpickle.loads(x) for x in queue_samples]
         print(queue_samples)
 
-        # 组装 batch
+        # Assemble batch
         batch = self._assemble_gen_batch_output_from_queue_samples(queue_samples)
 
         print("=" * 200)
@@ -167,15 +156,15 @@ def _get_samples_from_queue(self) -> tuple[None, None] | tuple[int, Any]:
 
     def _assemble_gen_batch_output_from_queue_samples(self, queue_samples: list[QueueSample]):
         """
-        从队列样本中组装gen_batch_output
+        Assemble gen_batch_output from queue samples
 
         Args:
-            queue_samples: 队列中的样本列表
-            n_responses_per_prompt: 每个prompt的响应数量
-            batch_size: 批次大小
+            queue_samples: List of samples from queue
+            n_responses_per_prompt: Number of responses per prompt
+            batch_size: Batch size
 
         Returns:
-            DataProto: 组装好的gen_batch_output
+            DataProto: Assembled gen_batch_output
         """
         import numpy as np
 
@@ -186,7 +175,7 @@ def _assemble_gen_batch_output_from_queue_samples(self, queue_samples: list[Queu
 
         print(f"Assembling batch from {len(queue_samples)} queue samples")
 
-        # 提取所有样本的数据和元数据
+        # Extract data and metadata from all samples
         sample_data_list = []
         rollout_metadata_list = []
         timing_info = {}
@@ -197,11 +186,11 @@ def _assemble_gen_batch_output_from_queue_samples(self, queue_samples: list[Queu
 
         batch = DataProto.from_items(sample_data_list)
 
-        # 收集timing信息和metadata
+        # Collect timing information and metadata
         param_versions = []
         sample_timestamps = []
         for metadata in rollout_metadata_list:
-            # 提取参数版本和时间戳
+            # Extract parameter version and timestamp
             param_versions.append(metadata.get("rollout_param_version", 0))
             sample_timestamps.append(metadata.get("generation_timestamp", time.time()))
             if "timing" in metadata:
@@ -210,13 +199,13 @@ def _assemble_gen_batch_output_from_queue_samples(self, queue_samples: list[Queu
                         timing_info[timing_key] = []
                     # if isinstance(timing_value, (int, float)):
                     #     timing_info[timing_key].append(timing_value)
-        # 计算平均timing
+        # Calculate average timing
         avg_timing = {}
         for key, values in timing_info.items():
             if values and len(values) > 0:
                 avg_timing[key] = sum(values) / len(values)
 
-        # 创建meta_info
+        # Create meta_info
         meta_info = {
             "timing": avg_timing,
             "queue_sample_count": len(queue_samples),
@@ -287,15 +276,14 @@ def fit(self):
 
         # we start from step 1
         self.global_steps += 1
-        last_val_metrics = None
         self.max_steps_duration = 0
 
-        # 使用队列模式，不需要传统的dataloader迭代器
-        # 初始化获取第一批数据
+        # Use queue mode, no need for traditional dataloader iterator
+        # Initialize to get the first batch of data
         while True:
             print("while True", flush=True)
 
-            # 检查队列状态
+            # Check queue status
             if self.message_queue_client:
                 queue_stats = self.message_queue_client.get_statistics()
                 print(f"Queue status before getting samples: {queue_stats}")
@@ -317,7 +305,6 @@ def fit(self):
                 print("_get_samples_from_queue end")
 
                 # # 更新统计信息
-                # with self.lock:
                 #     self.processed_samples += len(batch) if isinstance(batch, list) else 1
                 #
                 #     # 从meta_info中获取参数版本信息
@@ -352,20 +339,17 @@ def fit(self):
                 print("_check_save_checkpoint")
                 self._check_save_checkpoint(is_last_step, timing_raw)
 
-            print("_stop_profiling")
-            # self._stop_profiling(do_profile, timing_raw)
             print("_collect_metrics")
             # self._collect_metrics(batch, epoch, metrics, timing_raw)
 
-            # 在训练步骤结束后触发参数同步
+            # Trigger parameter synchronization after training step
             print("_trigger_parameter_sync_after_step")
-
             self._trigger_parameter_sync_after_step()
             print(f"global_steps: {self.global_steps}")
             self.global_steps += 1
 
     def get_statistics(self) -> dict:
-        """获取训练统计信息"""
+        """Get training statistics"""
         queue_stats = self.message_queue_client.get_statistics() if self.message_queue_client else {}
         return {
             "global_steps": self.global_steps,
@@ -380,37 +364,40 @@ def get_statistics(self) -> dict:
 
     def _trigger_parameter_sync_after_step(self):
         """
-        在训练步骤结束后触发参数同步
-        这确保rollouter总是使用最新训练的参数
+        Trigger parameter synchronization after training step
+        This ensures rollouter always uses the latest trained parameters
         """
         self.current_param_version = self.current_param_version + 1
         print(
-            f"[TRAINER] Triggering parameter sync after training step {self.global_steps}, version: {self.current_param_version}"
+            f"[TRAINER] Triggering parameter sync after "
+            f"training step {self.global_steps}, version: {self.current_param_version}"
+        )
+        logger.info(
+            f"Triggering parameter sync after training step {self.global_steps}, version: {self.current_param_version}"
         )
-        logger.info(f"Triggering parameter sync after training step {self.global_steps}, version: {self.current_param_version}")
         ray.get(self.param_synchronizer.sync_weights.remote(self.current_param_version))
 
     def _compute_sample_freshness_metrics(self, batch_samples: list[QueueSample]) -> dict:
         """
-        计算样本新鲜度指标
+        Compute sample freshness metrics
 
         Args:
-            batch_samples: 队列样本列表
+            batch_samples: List of queue samples
 
         Returns:
-            dict: 新鲜度指标字典
+            dict: Dictionary of freshness metrics
         """
         if not batch_samples:
             return {}
 
         try:
-            # 提取参数版本和时间戳
+            # Extract parameter versions and timestamps
             sample_ages = []
             sample_latencies = []
             current_time = time.time()
 
             for sample in batch_samples:
-                # 从rollout_metadata中获取信息
+                # Get information from rollout_metadata
                 if hasattr(sample, "rollout_metadata") and sample.rollout_metadata:
                     rollout_version = sample.rollout_metadata.get("rollout_param_version", 0)
                     generation_time = sample.rollout_metadata.get("generation_timestamp", current_time)
diff --git a/recipe/fully_async_policy/param_sync.py b/recipe/fully_async_policy/param_sync.py
index cb9baa5ff8a..11d94c79ae4 100644
--- a/recipe/fully_async_policy/param_sync.py
+++ b/recipe/fully_async_policy/param_sync.py
@@ -23,9 +23,9 @@
 @ray.remote
 class ParameterSynchronizer:
     """
-    统一的参数同步器，负责在actor和rollout之间同步模型参数
-    基于one_step_off_policy的成熟同步模式实现
-    合并了原有的多个同步器类的功能
+    Unified parameter synchronizer, responsible for synchronizing model parameters between actor and rollout
+    Based on the mature synchronization mode implementation of one_step_off_policy
+    Merges the functions of the original multiple synchronizer classes
     """
 
     def __init__(self, config, trainer, rollouter, mq):
@@ -36,23 +36,23 @@ def __init__(self, config, trainer, rollouter, mq):
         self.actor_wg = ray.get(trainer.get_actor_wg.remote())
         self.rollout_wg = ray.get(rollouter.get_rollout_wg.remote())
 
-        # 基础属性
+        # Basic attributes
         self.weights_info = None
         self.sync_group_initialized = False
         self.sync_group_name = "actor_rollout"
 
-        # 统计信息
+        # Statistics
         self.current_version = 0
 
         self._init_weights_info()
         self._init_sync_group()
 
     def get_current_param_version(self) -> int:
-        """获取当前参数版本号"""
+        """Get current parameter version number"""
         return self.current_version
 
     def get_weights_info(self):
-        """获取权重信息"""
+        """Get weights info"""
         return self.weights_info
 
     def _init_weights_info(self):
@@ -74,16 +74,16 @@ def sync_weights(self, version):
         self.current_version = version
         print(f"Starting weight synchronization (version {self.current_version})...")
 
-        print("pause rollout")
         ray.get(self.rollouter.pause.remote())
 
-        # 更新MQ 版本
+        # Update MQ version
         self.mq_client.update_param_version(version)
 
+        # sync weights
         self.actor_wg.sync_rollout_weights()
         ray.get(self.rollout_wg.sync_rollout_weights())
 
-        # 更新 rollout 版本
+        # Update rollout version
         ray.get(self.rollouter.update_param_version.remote(version))
         ray.get(self.rollouter.resume.remote())
         print("sync_weights success")
diff --git a/recipe/fully_async_policy/unittest/protocol_examples.py b/recipe/fully_async_policy/unittest/protocol_examples.py
deleted file mode 100644
index b695c163c23..00000000000
--- a/recipe/fully_async_policy/unittest/protocol_examples.py
+++ /dev/null
@@ -1,202 +0,0 @@
-# Copyright 2025 Meituan Ltd. and/or its affiliates
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import numpy as np
-import torch
-
-from verl.protocol import DataProto, DataProtoItem
-
-
-def example_basic_split_merge():
-    """Basic example of splitting DataProto into DataProtoItems and merging back."""
-    print("=== Basic Split and Merge Example ===")
-
-    # Create sample data
-    batch_size = 3
-    seq_len = 5
-
-    # Create tensors
-    input_ids = torch.randint(0, 1000, (batch_size, seq_len))
-    attention_mask = torch.ones(batch_size, seq_len)
-
-    # Create non-tensor data
-    prompts = np.array(["Hello world", "How are you?", "Good morning"], dtype=object)
-    scores = np.array([0.8, 0.9, 0.7], dtype=object)
-
-    # Create DataProto
-    data_proto = DataProto.from_dict(
-        tensors={"input_ids": input_ids, "attention_mask": attention_mask},
-        non_tensors={"prompts": prompts, "scores": scores},
-        meta_info={"model_name": "test_model", "version": "1.0"},
-    )
-
-    print(f"Original DataProto length: {len(data_proto)}")
-    print(f"Input IDs shape: {data_proto.batch['input_ids'].shape}")
-    print(f"Prompts: {data_proto.non_tensor_batch['prompts']}")
-
-    # Split into DataProtoItems
-    items = data_proto.to_items()
-    print(f"\nSplit into {len(items)} items")
-
-    for i, item in enumerate(items):
-        print(f"Item {i}:")
-        print(f"  Input IDs shape: {item.batch['input_ids'].shape}")
-        print(f"  Prompt: {item.non_tensor_batch['prompts']}")
-        print(f"  Score: {item.non_tensor_batch['scores']}")
-
-    # Merge back to DataProto
-    merged_proto = DataProto.from_items(items)
-    print(f"\nMerged DataProto length: {len(merged_proto)}")
-    print(f"Merged Input IDs shape: {merged_proto.batch['input_ids'].shape}")
-    print(f"Merged prompts: {merged_proto.non_tensor_batch['prompts']}")
-
-    # Verify they're identical
-    assert torch.equal(data_proto.batch["input_ids"], merged_proto.batch["input_ids"])
-    assert torch.equal(data_proto.batch["attention_mask"], merged_proto.batch["attention_mask"])
-    assert np.array_equal(data_proto.non_tensor_batch["prompts"], merged_proto.non_tensor_batch["prompts"])
-    assert np.array_equal(data_proto.non_tensor_batch["scores"], merged_proto.non_tensor_batch["scores"])
-
-    print("\n✓ Original and merged DataProto are identical!")
-
-
-def example_item_processing():
-    """Example showing individual item processing before merging."""
-    print("\n=== Individual Item Processing Example ===")
-
-    # Create initial data
-    #    batch_size = 4
-
-    values = torch.tensor([1.0, 2.0, 3.0, 4.0]).unsqueeze(1)  # Shape: (4, 1)
-    labels = np.array(["A", "B", "C", "D"], dtype=object)
-
-    original_proto = DataProto.from_dict(
-        tensors={"values": values}, non_tensors={"labels": labels}, meta_info={"processing_step": 0}
-    )
-
-    print(f"Original values: {original_proto.batch['values'].flatten()}")
-    print(f"Original labels: {original_proto.non_tensor_batch['labels']}")
-
-    # Split and process each item individually
-    items = original_proto.to_items()
-    processed_items = []
-
-    for i, item in enumerate(items):
-        # Process the tensor data (multiply by 2)
-        processed_value = item.batch["values"] * 2
-
-        # Process the non-tensor data (add suffix)
-        processed_label = item.non_tensor_batch["labels"] + f"_processed_{i}"
-
-        # Create new processed item
-        processed_item = DataProtoItem(
-            batch=item.batch.clone(),  # Clone the TensorDict
-            non_tensor_batch=item.non_tensor_batch.copy(),
-            meta_info=item.meta_info.copy(),
-        )
-
-        # Update with processed data
-        processed_item.batch["values"] = processed_value
-        processed_item.non_tensor_batch["labels"] = processed_label
-        processed_item.meta_info["processing_step"] = 1
-
-        processed_items.append(processed_item)
-
-        print(f"Processed item {i}: value={processed_value.item()}, label='{processed_label}'")
-
-    # Merge processed items back
-    processed_proto = DataProto.from_items(processed_items)
-
-    print(f"\nProcessed values: {processed_proto.batch['values'].flatten()}")
-    print(f"Processed labels: {processed_proto.non_tensor_batch['labels']}")
-    print(f"Processing step: {processed_proto.meta_info['processing_step']}")
-
-
-def example_convenience_methods():
-    """Example showing convenience methods."""
-    print("\n=== Convenience Methods Example ===")
-
-    # Create a single DataProtoItem
-    single_tensor = torch.tensor([42]).unsqueeze(0)  # Shape: (1,)
-    single_item = DataProtoItem(
-        batch=None,  # We'll create TensorDict manually
-        non_tensor_batch={"text": "Hello"},
-        meta_info={"source": "manual"},
-    )
-
-    # Create TensorDict manually for the single item
-    from tensordict import TensorDict
-
-    single_item.batch = TensorDict({"data": single_tensor}, batch_size=(1,))
-
-    print(f"Single item data: {single_item.batch['data']}")
-    print(f"Single item text: {single_item.non_tensor_batch['text']}")
-
-    # Convert single item to DataProto using convenience method
-    single_proto = single_item.to_proto()
-    print(f"Converted to DataProto length: {len(single_proto)}")
-
-    # Create multiple items and use static convenience method
-    items = [single_item]
-    for i in range(2):
-        new_item = single_item.copy()  # Use the copy method
-        new_item.batch["data"] = torch.tensor([100 + i]).unsqueeze(0)
-        new_item.non_tensor_batch["text"] = f"Item {i + 1}"
-        items.append(new_item)
-
-    # Use DataProtoItem.from_items() convenience method
-    merged_proto = DataProtoItem.from_items(items)
-    print(f"Merged using convenience method - length: {len(merged_proto)}")
-    print(f"Data: {merged_proto.batch['data'].flatten()}")
-    print(f"Texts: {merged_proto.non_tensor_batch['text']}")
-
-
-def example_error_handling():
-    """Example showing error handling."""
-    print("\n=== Error Handling Example ===")
-
-    # Try to create DataProto from empty list
-    try:
-        DataProto.from_items([])
-        print("ERROR: Should have raised exception for empty list")
-    except ValueError as e:
-        print(f"✓ Correctly caught error for empty list: {e}")
-
-    # Try to merge items with inconsistent structure
-    try:
-        item1 = DataProtoItem(
-            batch=TensorDict({"data": torch.tensor([1]).unsqueeze(0)}, batch_size=(1,)),
-            non_tensor_batch={"text": "Hello"},
-        )
-        item2 = DataProtoItem(
-            batch=TensorDict({"different_key": torch.tensor([2]).unsqueeze(0)}, batch_size=(1,)),
-            non_tensor_batch={"text": "World"},
-        )
-
-        DataProto.from_items([item1, item2])
-        print("ERROR: Should have raised exception for inconsistent structure")
-    except ValueError as e:
-        print(f"✓ Correctly caught error for inconsistent structure: {e}")
-
-
-if __name__ == "__main__":
-    # Import tensordict for the examples
-    from tensordict import TensorDict
-
-    # Run all examples
-    example_basic_split_merge()
-    example_item_processing()
-    example_convenience_methods()
-    example_error_handling()
-
-    print("\n🎉 All examples completed successfully!")
diff --git a/recipe/fully_async_policy/unittest/test_mq.py b/recipe/fully_async_policy/unittest/test_mq.py
index b766c60f858..7af4945f311 100644
--- a/recipe/fully_async_policy/unittest/test_mq.py
+++ b/recipe/fully_async_policy/unittest/test_mq.py
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
 import threading
 import time
 from unittest.mock import Mock
@@ -19,30 +20,30 @@
 import ray
 from omegaconf import DictConfig
 
-from recipe.fully_async_policy.message_queue import MessageQueue, MessageQueueClient, QueueSample
+from recipe.fully_async_policy.message_queue import MessageQueue, MessageQueueClient
 
 
 @pytest.fixture
-def mock_data_proto():
-    """Mock数据对象"""
+def mock_sample():
+    """Mock sample data object"""
     return Mock()
 
 
 @pytest.fixture
 def basic_config():
-    """基础配置"""
+    """Basic configuration"""
     return DictConfig({"async_training": {"staleness_threshold": 3}})
 
 
 @pytest.fixture
 def queue_config():
-    """队列配置"""
+    """Queue configuration with different staleness threshold"""
     return DictConfig({"async_training": {"staleness_threshold": 2}})
 
 
 @pytest.fixture
 def ray_setup():
-    """设置Ray环境"""
+    """Setup Ray environment"""
     if not ray.is_initialized():
         ray.init(local_mode=True, ignore_reinit_error=True)
     yield
@@ -51,7 +52,7 @@ def ray_setup():
 
 @pytest.fixture
 def message_queue_client(ray_setup, basic_config):
-    """创建MessageQueue actor并返回其客户端"""
+    """Create MessageQueue actor and return its client"""
     actor = MessageQueue.remote(basic_config, max_queue_size=10)
     client = MessageQueueClient(actor)
     yield client
@@ -59,125 +60,110 @@ def message_queue_client(ray_setup, basic_config):
 
 
 class TestMessageQueue:
-    """测试MessageQueue（通过MessageQueueClient）"""
-
-    def test_put_samples_success(self, message_queue_client, mock_data_proto):
-        """测试成功放入samples"""
-        samples = [mock_data_proto, mock_data_proto]
-        metadata_list = [{"test": "data1"}, {"test": "data2"}]
-
-        result = message_queue_client.put_sample(sample=samples, param_version=1, rollout_metadata=metadata_list)
+    """Test MessageQueue (through MessageQueueClient)"""
 
+    def test_put_sample_success(self, message_queue_client, mock_sample):
+        """Test successfully putting a sample"""
+        result = message_queue_client.put_sample(sample=mock_sample, param_version=1)
         assert result is True
 
-        # 检查队列大小
+        # Check queue size
         queue_size = message_queue_client.get_queue_size()
-        assert queue_size == 2
+        assert queue_size == 1
 
-        # 检查统计信息
+        # Check statistics
         stats = message_queue_client.get_statistics()
-        assert stats["total_produced"] == 2
-        assert stats["queue_size"] == 2
+        assert stats["total_produced"] == 1
+        assert stats["queue_size"] == 1
 
-    def test_put_samples_without_metadata(self, message_queue_client, mock_data_proto):
-        """测试不提供metadata时的处理"""
-        samples = [mock_data_proto, mock_data_proto]
+    def test_put_multiple_samples(self, message_queue_client, mock_sample):
+        """Test putting multiple samples"""
+        for i in range(3):
+            result = message_queue_client.put_sample(sample=mock_sample, param_version=1)
+            assert result is True
 
-        result = message_queue_client.put_sample(sample=samples, param_version=1, rollout_metadata=None)
-
-        assert result is True
+        # Check queue size
         queue_size = message_queue_client.get_queue_size()
-        assert queue_size == 2
-
-    def test_put_samples_metadata_mismatch(self, message_queue_client, mock_data_proto):
-        """测试metadata长度不匹配的处理"""
-        samples = [mock_data_proto, mock_data_proto]
-        metadata_list = [{"test": "data1"}]  # 长度不匹配
+        assert queue_size == 3
 
-        result = message_queue_client.put_sample(sample=samples, param_version=1, rollout_metadata=metadata_list)
-
-        assert result is False  # 应该失败
-        queue_size = message_queue_client.get_queue_size()
-        assert queue_size == 0
+        # Check statistics
+        stats = message_queue_client.get_statistics()
+        assert stats["total_produced"] == 3
+        assert stats["queue_size"] == 3
 
-    def test_put_samples_staleness_check(self, message_queue_client, mock_data_proto):
-        """测试新鲜度检查"""
-        # 更新参数版本为5
+    def test_put_sample_staleness_check(self, message_queue_client, mock_sample):
+        """Test freshness check when putting samples"""
+        # Update parameter version to 5
         message_queue_client.update_param_version(5)
 
-        # 尝试放入版本过旧的batch（版本差异>=3会被拒绝）
-        samples = [mock_data_proto]
+        # Try to put a stale sample (version difference >= 3 will be rejected)
         result = message_queue_client.put_sample(
-            sample=samples,
-            param_version=2,  # 5-2=3, 达到阈值
-            rollout_metadata=None,
+            sample=mock_sample,
+            param_version=2,  # 5-2=3, reaches threshold
         )
 
         assert result is False
 
-        # 检查统计信息中的丢弃样本数
+        # Check dropped samples count in statistics
         stats = message_queue_client.get_statistics()
         assert stats["dropped_samples"] == 1
 
-    def test_put_samples_queue_overflow(self, message_queue_client, mock_data_proto):
-        """测试队列溢出处理"""
-        # 填满队列（最大容量10）
-        for i in range(6):  # 每次放入2个，总共12个，超过最大容量10
-            samples = [mock_data_proto, mock_data_proto]
-            message_queue_client.put_sample(sample=samples, param_version=1, rollout_metadata=None)
+    def test_put_sample_queue_overflow(self, message_queue_client, mock_sample):
+        """Test queue overflow handling"""
+        # Fill the queue (max capacity 10)
+        for i in range(12):  # Put 12 samples, exceeding max capacity 10
+            message_queue_client.put_sample(sample=mock_sample, param_version=1)
 
-        # 队列大小应该保持在最大值
+        # Queue size should stay at maximum value
         queue_size = message_queue_client.get_queue_size()
         assert queue_size == 10
 
-        # 检查统计信息
+        # Check statistics
         stats = message_queue_client.get_statistics()
-        assert stats["dropped_samples"] == 2  # 超出的2个被丢弃
+        assert stats["dropped_samples"] == 2  # 2 samples should be dropped
 
-    def test_get_samples_success(self, message_queue_client, mock_data_proto):
-        """测试成功获取samples"""
-        # 先放入一些samples
-        samples = [mock_data_proto, mock_data_proto, mock_data_proto]
-        metadata_list = [{"index": 0}, {"index": 1}, {"index": 2}]
-        message_queue_client.put_sample(sample=samples, param_version=1, rollout_metadata=metadata_list)
+    def test_get_samples_success(self, message_queue_client, mock_sample):
+        """Test successfully getting samples"""
+        # First put some samples
+        for i in range(3):
+            message_queue_client.put_sample(sample=mock_sample, param_version=1)
 
-        # 获取2个samples
+        # Get 2 samples
         retrieved_samples = message_queue_client.get_samples(min_batch_count=2)
 
         assert retrieved_samples is not None
         assert len(retrieved_samples) == 2
-        assert all(isinstance(sample, QueueSample) for sample in retrieved_samples)
 
-        # 检查队列大小减少
+        # Check queue size decreased
         queue_size = message_queue_client.get_queue_size()
         assert queue_size == 1
 
-        # 检查统计信息
+        # Check statistics
         stats = message_queue_client.get_statistics()
         assert stats["total_consumed"] == 2
 
-    def test_get_samples_blocking_behavior(self, message_queue_client, mock_data_proto):
-        """测试阻塞行为"""
+    def test_get_samples_blocking_behavior(self, message_queue_client, mock_sample):
+        """Test blocking behavior"""
         result = []
 
         def get_samples():
-            # 这会阻塞直到有足够样本
+            # This will block until enough samples are available
             samples = message_queue_client.get_samples(min_batch_count=2)
             result.append(samples)
 
         def put_samples_later():
-            time.sleep(0.5)  # 延迟放入
-            samples = [mock_data_proto, mock_data_proto]
-            message_queue_client.put_sample(sample=samples, param_version=1, rollout_metadata=None)
+            time.sleep(0.5)  # Delay putting samples
+            message_queue_client.put_sample(sample=mock_sample, param_version=1)
+            message_queue_client.put_sample(sample=mock_sample, param_version=1)
 
-        # 启动消费者线程
+        # Start consumer thread
         consumer_thread = threading.Thread(target=get_samples)
         producer_thread = threading.Thread(target=put_samples_later)
 
         consumer_thread.start()
         producer_thread.start()
 
-        # 等待两个线程完成
+        # Wait for both threads to complete
         producer_thread.join(timeout=2)
         consumer_thread.join(timeout=2)
 
@@ -185,34 +171,33 @@ def put_samples_later():
         assert len(result[0]) == 2
 
     def test_update_param_version(self, message_queue_client):
-        """测试更新参数版本"""
+        """Test updating parameter version"""
         message_queue_client.update_param_version(10)
         stats = message_queue_client.get_statistics()
         assert stats["current_param_version"] == 10
 
-    def test_clear_queue(self, message_queue_client, mock_data_proto):
-        """测试清空队列"""
-        # 先添加一些样本
-        samples = [mock_data_proto, mock_data_proto, mock_data_proto]
-        message_queue_client.put_sample(sample=samples, param_version=1, rollout_metadata=None)
+    def test_clear_queue(self, message_queue_client, mock_sample):
+        """Test clearing the queue"""
+        # First add some samples
+        for i in range(3):
+            message_queue_client.put_sample(sample=mock_sample, param_version=1)
 
-        # 清空队列
+        # Clear the queue
         message_queue_client.clear_queue()
 
-        # 检查队列大小
+        # Check queue size
         queue_size = message_queue_client.get_queue_size()
         assert queue_size == 0
 
-    def test_get_queue_size(self, message_queue_client, mock_data_proto):
-        """测试获取队列大小"""
+    def test_get_queue_size(self, message_queue_client, mock_sample):
+        """Test getting queue size"""
         assert message_queue_client.get_queue_size() == 0
 
-        samples = [mock_data_proto]
-        message_queue_client.put_sample(sample=samples, param_version=1, rollout_metadata=None)
+        message_queue_client.put_sample(sample=mock_sample, param_version=1)
         assert message_queue_client.get_queue_size() == 1
 
     def test_get_statistics(self, message_queue_client):
-        """测试获取统计信息"""
+        """Test getting statistics"""
         stats = message_queue_client.get_statistics()
 
         expected_keys = {
@@ -229,11 +214,11 @@ def test_get_statistics(self, message_queue_client):
         assert isinstance(stats["total_produced"], int)
         assert isinstance(stats["total_consumed"], int)
 
-    def test_get_memory_usage(self, message_queue_client, mock_data_proto):
-        """测试获取内存使用统计"""
-        # 添加一些样本
-        samples = [mock_data_proto, mock_data_proto]
-        message_queue_client.put_sample(sample=samples, param_version=1, rollout_metadata=None)
+    def test_get_memory_usage(self, message_queue_client, mock_sample):
+        """Test getting memory usage statistics"""
+        # Add some samples
+        for i in range(2):
+            message_queue_client.put_sample(sample=mock_sample, param_version=1)
 
         memory_stats = message_queue_client.get_memory_usage()
 
@@ -244,44 +229,44 @@ def test_get_memory_usage(self, message_queue_client, mock_data_proto):
         assert memory_stats["estimated_memory_mb"] > 0
 
     def test_shutdown(self, ray_setup, basic_config):
-        """测试关闭功能"""
-        # 创建新的actor用于测试关闭
+        """Test shutdown functionality"""
+        # Create new actor for testing shutdown
         actor = MessageQueue.remote(basic_config, max_queue_size=10)
         client = MessageQueueClient(actor)
 
-        # 关闭应该不抛出异常
+        # Shutdown should not throw exceptions
         client.shutdown()
 
 
 class TestConcurrency:
-    """测试并发场景"""
+    """Test concurrent scenarios"""
 
     def setup_method(self):
-        """每个测试方法前的设置"""
+        """Setup before each test method"""
         if not ray.is_initialized():
             ray.init(local_mode=True, ignore_reinit_error=True)
 
     def teardown_method(self):
-        """每个测试方法后的清理"""
+        """Cleanup after each test method"""
         if ray.is_initialized():
             ray.shutdown()
 
     def create_message_queue_client(self, config=None):
-        """创建MessageQueue client的辅助方法"""
+        """Helper method to create MessageQueue client"""
         if config is None:
             config = DictConfig({"async_training": {"staleness_threshold": 3}})
         actor = MessageQueue.remote(config, max_queue_size=10)
         return MessageQueueClient(actor)
 
-    def test_concurrent_put_get(self, mock_data_proto):
-        """测试并发放入和获取"""
+    def test_concurrent_put_get(self, mock_sample):
+        """Test concurrent put and get"""
         client = self.create_message_queue_client()
         try:
             results = []
 
             def producer():
                 for i in range(50):
-                    samples = [mock_data_proto, mock_data_proto]
+                    samples = [mock_sample, mock_sample]
                     result = client.put_sample(sample=samples, param_version=1, rollout_metadata=None)
                     results.append(("put", result))
                     time.sleep(0.1)
@@ -296,7 +281,7 @@ def consumer():
                         results.append(("get", False))
                     time.sleep(0.1)
 
-            # 启动生产者和消费者线程
+            # Start producer and consumer threads
             producer_thread = threading.Thread(target=producer)
             consumer_thread = threading.Thread(target=consumer)
 
@@ -307,7 +292,7 @@ def consumer():
             producer_thread.join(timeout=5)
             consumer_thread.join(timeout=5)
 
-            # 检查结果
+            # Check results
             put_results = [r[1] for r in results if r[0] == "put"]
             get_results = [r[1] for r in results if r[0] == "get"]
 
@@ -317,245 +302,85 @@ def consumer():
             client.shutdown()
 
     def test_consume_first_produce_later(self, message_queue_client, mock_data_proto):
-        """测试先消费、后生产的场景 - 验证阻塞和唤醒机制"""
+        """Test consume first, produce later scenario - verify blocking and wake-up mechanism"""
         consumer_result = []
         producer_result = []
-        start_time = time.time()
 
         def consumer_task():
-            """消费者任务：先启动，等待生产者生产数据"""
-            try:
-                # 记录开始消费的时间
-                consumer_start = time.time()
-                # 这里会阻塞等待，直到有至少2个样本可用
-                samples = message_queue_client.get_samples(min_batch_count=2)
-                consumer_end = time.time()
-
-                consumer_result.append(
-                    {
-                        "success": True,
-                        "samples_count": len(samples),
-                        "wait_time": consumer_end - consumer_start,
-                        "samples": samples,
-                    }
-                )
-            except Exception as e:
-                consumer_result.append({"success": False, "error": str(e), "wait_time": time.time() - consumer_start})
+            """Consumer task: start first, wait for producer to generate data"""
+            # Record the start time of consumption
+            consumer_start = time.time()
+            # This will block until at least 3 samples are available
+            samples = message_queue_client.get_samples(min_batch_count=3)
+            consumer_end = time.time()
+            consumer_result.append(
+                {
+                    "success": True,
+                    "samples_count": len(samples),
+                    "wait_time": consumer_end - consumer_start,
+                    "samples": samples,
+                }
+            )
 
         def producer_task():
-            """生产者任务：延迟1秒后开始生产"""
-            try:
-                # 延迟1秒，确保消费者先开始等待
-                time.sleep(1.0)
-                producer_start = time.time()
-
-                # 分两次放入，验证消费者会等到足够的样本数量
-                samples_1 = mock_data_proto
-                result1 = message_queue_client.put_sample(
-                    sample=samples_1, param_version=1, rollout_metadata=[{"batch": "first"}]
-                )
-
-                # 短暂延迟后放入第二批
-                time.sleep(0.1)
-                samples_2 = mock_data_proto
-                result2 = message_queue_client.put_sample(
-                    sample=samples_2, param_version=1, rollout_metadata=[{"batch": "second"}]
-                )
-
-                samples_2 = mock_data_proto
-                result3 = message_queue_client.put_sample(
-                    sample=samples_2, param_version=1, rollout_metadata=[{"batch": "second"}]
-                )
-
-                producer_end = time.time()
-                producer_result.append(
-                    {
-                        "success": result1 and result2,
-                        "put_count": 2,
-                        "produce_time": producer_end - producer_start,
-                        "result1": result1,
-                        "result2": result2,
-                    }
-                )
-
-                print("produce finish")
-
-            except Exception as e:
-                producer_result.append({"success": False, "error": str(e)})
-
-        # 启动消费者线程（先启动）
+            """Producer task: start producing after a delay"""
+            time.sleep(4.0)
+            producer_start = time.time()
+            message_queue_client.put_sample(
+                sample=mock_data_proto,
+                param_version=1,
+            )
+            time.sleep(1)
+            message_queue_client.put_sample(
+                sample=mock_data_proto,
+                param_version=1,
+            )
+            time.sleep(1)
+            message_queue_client.put_sample(
+                sample=mock_data_proto,
+                param_version=1,
+            )
+            producer_end = time.time()
+            producer_result.append(
+                {
+                    "put_count": 3,
+                    "produce_time": producer_end - producer_start,
+                }
+            )
+
+            print("produce finish")
+
+        # Start consumer thread (first)
         consumer_thread = threading.Thread(target=consumer_task, name="Consumer")
-        # 启动生产者线程（后启动）
+        time.sleep(3)
+        # Start producer thread (later)
         producer_thread = threading.Thread(target=producer_task, name="Producer")
 
         consumer_thread.start()
-        time.sleep(0.1)  # 确保消费者先开始等待
+        time.sleep(0.1)
         producer_thread.start()
 
-        print("=========")
-        #
-        # # 等待两个线程完成（设置超时避免死锁）
-        producer_thread.join()
-        # print("producer_result", producer_result)
-        # consumer_thread.join()
-        # print("consumer_thread", consumer_result)
-        #
-        # total_time = time.time() - start_time
-        #
-        # # 验证结果
-        # assert len(consumer_result) == 1, "消费者应该执行一次"
-        #
-        # consumer_data = consumer_result[0]
-        # producer_data = producer_result[0]
-        #
-        # # 验证生产者成功
-        # assert producer_data['success'], f"生产者失败: {producer_data.get('error', '')}"
-        # assert producer_data['put_count'] == 2, "应该生产2批数据"
-        #
-        # # 验证消费者成功
-        # assert consumer_data['success'], f"消费者失败: {consumer_data.get('error', '')}"
-        # assert consumer_data['samples_count'] == 2, "消费者应该获取到2个样本"
-        #
-        # # 验证时序：消费者等待时间应该大于1秒（生产者的延迟时间）
-        # assert consumer_data['wait_time'] >= 1.0, f"消费者等待时间应该≥1秒，实际: {consumer_data['wait_time']:.2f}秒"
-        #
-        # # 验证数据完整性
-        # assert all(isinstance(sample, QueueSample) for sample in consumer_data['samples']), "获取的样本应该是QueueSample类型"
-        #
-        # # 验证队列状态
-        # final_queue_size = message_queue_client.get_queue_size()
-        # assert final_queue_size == 0, "队列应该被清空"
-        #
-        # stats = message_queue_client.get_statistics()
-        # assert stats['total_produced'] == 2, "应该生产了2个样本"
-        # assert stats['total_consumed'] == 2, "应该消费了2个样本"
-        #
-        # print(f"测试成功完成，总耗时: {total_time:.2f}秒")
-        # print(f"消费者等待时间: {consumer_data['wait_time']:.2f}秒")
-        # print(f"生产者执行时间: {producer_data['produce_time']:.2f}秒")
-
-    def test_multiple_consumers_single_producer(self, message_queue_client, mock_data_proto):
-        """测试多个消费者等待单个生产者的场景"""
-        consumer_results = []
-        producer_result = []
-
-        def consumer_task(consumer_id):
-            """消费者任务"""
-            try:
-                start_time = time.time()
-                samples = message_queue_client.get_samples(min_batch_count=1)
-                end_time = time.time()
-
-                consumer_results.append(
-                    {
-                        "id": consumer_id,
-                        "success": True,
-                        "samples_count": len(samples),
-                        "wait_time": end_time - start_time,
-                    }
-                )
-            except Exception as e:
-                consumer_results.append({"id": consumer_id, "success": False, "error": str(e)})
-
-        def producer_task():
-            """生产者任务：延迟后批量生产"""
-            try:
-                time.sleep(1.5)  # 确保所有消费者都在等待
-
-                # 生产3批数据，每批1个样本，供3个消费者消费
-                for i in range(3):
-                    samples = [mock_data_proto]
-                    result = message_queue_client.put_sample(
-                        sample=samples, param_version=1, rollout_metadata=[{"batch_id": i}]
-                    )
-                    producer_result.append(result)
-                    time.sleep(0.1)  # 短暂间隔
-
-            except Exception as e:
-                producer_result.append(False)
-
-        print("# 启动3个消费者线程")
-        # consumer_threads = []
-        # for i in range(3):
-        #     thread = threading.Thread(target=consumer_task, args=(i,), name=f"Consumer-{i}")
-        #     consumer_threads.append(thread)
-        #     thread.start()
-        #     time.sleep(0.1)  # 错开启动时间
-        #
-        # # 启动生产者线程
-        # producer_thread = threading.Thread(target=producer_task, name="Producer")
-        # producer_thread.start()
-        #
-        # # 等待所有线程完成
-        # producer_thread.join(timeout=10)
-        # for thread in consumer_threads:
-        #     thread.join(timeout=10)
-        #
-        # # 验证结果
-        # assert len(consumer_results) == 3, "应该有3个消费者结果"
-        # assert len(producer_result) == 3, "应该生产3批数据"
-        #
-        # # 验证所有消费者都成功
-        # for result in consumer_results:
-        #     assert result['success'], f"消费者{result['id']}失败: {result.get('error', '')}"
-        #     assert result['samples_count'] == 1, f"消费者{result['id']}应该获取1个样本"
-        #     assert result['wait_time'] >= 1.5, f"消费者{result['id']}等待时间应该≥1.5秒"
-        #
-        # # 验证生产者都成功
-        # assert all(producer_result), "所有生产操作都应该成功"
-        #
-        # # 验证最终状态
-        # final_stats = message_queue_client.get_statistics()
-        # assert final_stats['total_produced'] == 3, "应该总共生产3个样本"
-        # assert final_stats['total_consumed'] == 3, "应该总共消费3个样本"
-        # assert final_stats['queue_size'] == 0, "队列应该被清空"
-
-    def test_consumer_timeout_scenario(self, message_queue_client, mock_data_proto):
-        """测试消费者超时场景（通过关闭队列来模拟）"""
-        consumer_result = []
-
-        def consumer_task():
-            """消费者任务：等待样本"""
-            try:
-                start_time = time.time()
-                # 尝试获取样本，但没有生产者会生产数据
-                samples = message_queue_client.get_samples(min_batch_count=2)
-                end_time = time.time()
-
-                consumer_result.append(
-                    {"success": True, "samples_count": len(samples), "wait_time": end_time - start_time}
-                )
-            except Exception as e:
-                consumer_result.append({"success": False, "error": str(e)})
-
-        def shutdown_task():
-            """延迟关闭队列，模拟超时场景"""
-            time.sleep(2.0)  # 让消费者等待2秒
-            message_queue_client.shutdown()
-
-        # 启动消费者和关闭任务
-        consumer_thread = threading.Thread(target=consumer_task, name="Consumer")
-        shutdown_thread = threading.Thread(target=shutdown_task, name="Shutdown")
+        print("=========", flush=True)
 
-        consumer_thread.start()
-        time.sleep(0.1)
-        shutdown_thread.start()
+        producer_thread.join()
+        print("producer_result", producer_result, flush=True)
+        consumer_thread.join()
+        print("consumer_result", consumer_result, flush=True)
 
-        # 等待线程完成
-        shutdown_thread.join(timeout=5)
-        consumer_thread.join(timeout=5)
+        assert len(consumer_result) == 1, "消费者应该执行一次"
 
-        # 验证结果
-        assert len(consumer_result) == 1, "应该有一个消费者结果"
+        consumer_data = consumer_result[0]
+        producer_data = producer_result[0]
 
-        result = consumer_result[0]
-        # 消费者应该在队列关闭后返回空列表
-        if result["success"]:
-            assert result["samples_count"] == 0, "关闭后应该返回空样本列表"
+        assert producer_data["put_count"] == 3
+        assert consumer_data["samples_count"] == 3
 
-        print(f"消费者等待了 {result.get('wait_time', 0):.2f} 秒后退出")
+        final_queue_size = message_queue_client.get_queue_size()
+        assert final_queue_size == 0
 
-    # 运行测试的示例配置
+        stats = message_queue_client.get_statistics()
+        assert stats["total_produced"] == 3
+        assert stats["total_consumed"] == 3
 
 
 if __name__ == "__main__":
diff --git a/recipe/fully_async_policy/unittest/test_mq2.py b/recipe/fully_async_policy/unittest/test_mq2.py
deleted file mode 100644
index d846a16dcb7..00000000000
--- a/recipe/fully_async_policy/unittest/test_mq2.py
+++ /dev/null
@@ -1,171 +0,0 @@
-# Copyright 2025 Meituan Ltd. and/or its affiliates
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import threading
-import time
-from unittest.mock import Mock
-
-import pytest
-import ray
-from omegaconf import DictConfig
-
-from recipe.fully_async_policy.message_queue import MessageQueue, MessageQueueClient, QueueSample
-
-
-@pytest.fixture
-def mock_data_proto():
-    """Mock数据对象"""
-    return Mock()
-
-
-@pytest.fixture
-def basic_config():
-    """基础配置"""
-    return DictConfig({"async_training": {"staleness_threshold": 3}})
-
-
-@pytest.fixture
-def queue_config():
-    """队列配置"""
-    return DictConfig({"async_training": {"staleness_threshold": 2}})
-
-
-@pytest.fixture
-def ray_setup():
-    """设置Ray环境"""
-    if not ray.is_initialized():
-        ray.init(local_mode=True, ignore_reinit_error=True)
-    yield
-    ray.shutdown()
-
-
-@pytest.fixture
-def message_queue_client(ray_setup, basic_config):
-    """创建MessageQueue actor并返回其客户端"""
-    actor = MessageQueue.remote(basic_config, max_queue_size=10)
-    client = MessageQueueClient(actor)
-    yield client
-    client.shutdown()
-
-
-class TestConcurrency:
-    """测试并发场景"""
-
-    def setup_method(self):
-        """每个测试方法前的设置"""
-        if not ray.is_initialized():
-            ray.init()
-
-    def teardown_method(self):
-        """每个测试方法后的清理"""
-        if ray.is_initialized():
-            ray.shutdown()
-
-    def create_message_queue_client(self, config=None):
-        """创建MessageQueue client的辅助方法"""
-        if config is None:
-            config = DictConfig({"async_training": {"staleness_threshold": 3}})
-        actor = MessageQueue.remote(config, max_queue_size=10)
-        return MessageQueueClient(actor)
-
-    def test_consume_first_produce_later(self, message_queue_client, mock_data_proto):
-        """测试先消费、后生产的场景 - 验证阻塞和唤醒机制"""
-        consumer_result = []
-        producer_result = []
-        start_time = time.time()
-
-        def consumer_task():
-            """消费者任务：先启动，等待生产者生产数据"""
-            # 记录开始消费的时间
-            consumer_start = time.time()
-            # 这里会阻塞等待，直到有至少2个样本可用
-            samples = message_queue_client.get_samples(min_batch_count=3)
-            consumer_end = time.time()
-            consumer_result.append(
-                {
-                    "success": True,
-                    "samples_count": len(samples),
-                    "wait_time": consumer_end - consumer_start,
-                    "samples": samples,
-                }
-            )
-
-        def producer_task():
-            """生产者任务：延迟1秒后开始生产"""
-            time.sleep(4.0)
-            producer_start = time.time()
-            message_queue_client.put_sample(
-                sample=mock_data_proto,
-                param_version=1,
-            )
-            time.sleep(1)
-            message_queue_client.put_sample(
-                sample=mock_data_proto,
-                param_version=1,
-            )
-            time.sleep(1)
-            message_queue_client.put_sample(
-                sample=mock_data_proto,
-                param_version=1,
-            )
-            producer_end = time.time()
-            producer_result.append(
-                {
-                    "put_count": 3,
-                    "produce_time": producer_end - producer_start,
-                }
-            )
-
-            print("produce finish")
-
-        # 启动消费者线程（先启动）
-        consumer_thread = threading.Thread(target=consumer_task, name="Consumer")
-        time.sleep(3)
-        # 启动生产者线程（后启动）
-        producer_thread = threading.Thread(target=producer_task, name="Producer")
-
-        consumer_thread.start()
-        time.sleep(0.1)  # 确保消费者先开始等待
-        producer_thread.start()
-
-        print("=========", flush=True)
-        #
-        # # 等待两个线程完成（设置超时避免死锁）
-        producer_thread.join()
-        print("producer_result", producer_result, flush=True)
-        consumer_thread.join()
-        print("consumer_result", consumer_result, flush=True)
-
-        # 验证结果
-        assert len(consumer_result) == 1, "消费者应该执行一次"
-
-        consumer_data = consumer_result[0]
-        producer_data = producer_result[0]
-
-        # 验证生产者成功
-        assert producer_data["put_count"] == 3, "应该生产2批数据"
-        assert consumer_data["samples_count"] == 3, "消费者应该获取到2个样本"
-
-        # 验证队列状态
-        final_queue_size = message_queue_client.get_queue_size()
-        assert final_queue_size == 0, "队列应该被清空"
-
-        stats = message_queue_client.get_statistics()
-        assert stats["total_produced"] == 3, "应该生产了2个样本"
-        assert stats["total_consumed"] == 3, "应该消费了2个样本"
-        #
-
-
-# 运行测试的示例配置
-if __name__ == "__main__":
-    pytest.main([__file__, "-v", "--tb=short"])
diff --git a/recipe/fully_async_policy/unittest/test_protocol_split_merge.py b/recipe/fully_async_policy/unittest/test_protocol_split_merge.py
index 7c959a791bb..a5c61f11ba6 100644
--- a/recipe/fully_async_policy/unittest/test_protocol_split_merge.py
+++ b/recipe/fully_async_policy/unittest/test_protocol_split_merge.py
@@ -16,7 +16,7 @@
 import torch
 from tensordict import TensorDict
 
-from verl.protocol import DataProto
+from verl.protocol import DataProto, DataProtoItem
 
 
 def create_sample_dataproto():
@@ -435,32 +435,187 @@ def run_visual_comparison():
     return success
 
 
-if __name__ == "__main__":
-    print("Testing DataProto Split/Merge Functionality")
-    print("=" * 60)
+def example_basic_split_merge():
+    """Basic example of splitting DataProto into DataProtoItems and merging back."""
+    print("=== Basic Split and Merge Example ===")
+
+    # Create sample data
+    batch_size = 3
+    seq_len = 5
+
+    # Create tensors
+    input_ids = torch.randint(0, 1000, (batch_size, seq_len))
+    attention_mask = torch.ones(batch_size, seq_len)
+
+    # Create non-tensor data
+    prompts = np.array(["Hello world", "How are you?", "Good morning"], dtype=object)
+    scores = np.array([0.8, 0.9, 0.7], dtype=object)
+
+    # Create DataProto
+    data_proto = DataProto.from_dict(
+        tensors={"input_ids": input_ids, "attention_mask": attention_mask},
+        non_tensors={"prompts": prompts, "scores": scores},
+        meta_info={"model_name": "test_model", "version": "1.0"},
+    )
+
+    print(f"Original DataProto length: {len(data_proto)}")
+    print(f"Input IDs shape: {data_proto.batch['input_ids'].shape}")
+    print(f"Prompts: {data_proto.non_tensor_batch['prompts']}")
+
+    # Split into DataProtoItems
+    items = data_proto.to_items()
+    print(f"\nSplit into {len(items)} items")
+
+    for i, item in enumerate(items):
+        print(f"Item {i}:")
+        print(f"  Input IDs shape: {item.batch['input_ids'].shape}")
+        print(f"  Prompt: {item.non_tensor_batch['prompts']}")
+        print(f"  Score: {item.non_tensor_batch['scores']}")
+
+    # Merge back to DataProto
+    merged_proto = DataProto.from_items(items)
+    print(f"\nMerged DataProto length: {len(merged_proto)}")
+    print(f"Merged Input IDs shape: {merged_proto.batch['input_ids'].shape}")
+    print(f"Merged prompts: {merged_proto.non_tensor_batch['prompts']}")
+
+    # Verify they're identical
+    assert torch.equal(data_proto.batch["input_ids"], merged_proto.batch["input_ids"])
+    assert torch.equal(data_proto.batch["attention_mask"], merged_proto.batch["attention_mask"])
+    assert np.array_equal(data_proto.non_tensor_batch["prompts"], merged_proto.non_tensor_batch["prompts"])
+    assert np.array_equal(data_proto.non_tensor_batch["scores"], merged_proto.non_tensor_batch["scores"])
+
+    print("\n✓ Original and merged DataProto are identical!")
+
+
+def example_item_processing():
+    """Example showing individual item processing before merging."""
+    print("\n=== Individual Item Processing Example ===")
+
+    # Create initial data
+    #    batch_size = 4
+
+    values = torch.tensor([1.0, 2.0, 3.0, 4.0]).unsqueeze(1)  # Shape: (4, 1)
+    labels = np.array(["A", "B", "C", "D"], dtype=object)
+
+    original_proto = DataProto.from_dict(
+        tensors={"values": values}, non_tensors={"labels": labels}, meta_info={"processing_step": 0}
+    )
+
+    print(f"Original values: {original_proto.batch['values'].flatten()}")
+    print(f"Original labels: {original_proto.non_tensor_batch['labels']}")
+
+    # Split and process each item individually
+    items = original_proto.to_items()
+    processed_items = []
+
+    for i, item in enumerate(items):
+        # Process the tensor data (multiply by 2)
+        processed_value = item.batch["values"] * 2
+
+        # Process the non-tensor data (add suffix)
+        processed_label = item.non_tensor_batch["labels"] + f"_processed_{i}"
 
+        # Create new processed item
+        processed_item = DataProtoItem(
+            batch=item.batch.clone(),  # Clone the TensorDict
+            non_tensor_batch=item.non_tensor_batch.copy(),
+            meta_info=item.meta_info.copy(),
+        )
+
+        # Update with processed data
+        processed_item.batch["values"] = processed_value
+        processed_item.non_tensor_batch["labels"] = processed_label
+        processed_item.meta_info["processing_step"] = 1
+
+        processed_items.append(processed_item)
+
+        print(f"Processed item {i}: value={processed_value.item()}, label='{processed_label}'")
+
+    # Merge processed items back
+    processed_proto = DataProto.from_items(processed_items)
+
+    print(f"\nProcessed values: {processed_proto.batch['values'].flatten()}")
+    print(f"Processed labels: {processed_proto.non_tensor_batch['labels']}")
+    print(f"Processing step: {processed_proto.meta_info['processing_step']}")
+
+
+def example_convenience_methods():
+    """Example showing convenience methods."""
+    print("\n=== Convenience Methods Example ===")
+
+    # Create a single DataProtoItem
+    single_tensor = torch.tensor([42]).unsqueeze(0)  # Shape: (1,)
+    single_item = DataProtoItem(
+        batch=None,  # We'll create TensorDict manually
+        non_tensor_batch={"text": "Hello"},
+        meta_info={"source": "manual"},
+    )
+
+    # Create TensorDict manually for the single item
+    from tensordict import TensorDict
+
+    single_item.batch = TensorDict({"data": single_tensor}, batch_size=(1,))
+
+    print(f"Single item data: {single_item.batch['data']}")
+    print(f"Single item text: {single_item.non_tensor_batch['text']}")
+
+    # Convert single item to DataProto using convenience method
+    single_proto = single_item.to_proto()
+    print(f"Converted to DataProto length: {len(single_proto)}")
+
+    # Create multiple items and use static convenience method
+    items = [single_item]
+    for i in range(2):
+        new_item = single_item.copy()  # Use the copy method
+        new_item.batch["data"] = torch.tensor([100 + i]).unsqueeze(0)
+        new_item.non_tensor_batch["text"] = f"Item {i + 1}"
+        items.append(new_item)
+
+    # Use DataProtoItem.from_items() convenience method
+    merged_proto = DataProtoItem.from_items(items)
+    print(f"Merged using convenience method - length: {len(merged_proto)}")
+    print(f"Data: {merged_proto.batch['data'].flatten()}")
+    print(f"Texts: {merged_proto.non_tensor_batch['text']}")
+
+
+def example_error_handling():
+    """Example showing error handling."""
+    print("\n=== Error Handling Example ===")
+
+    # Try to create DataProto from empty list
     try:
-        # Run all tests
-        test_basic_split_and_merge()
-        test_individual_item_access()
-        test_partial_merge()
-        test_item_processing()
-        test_error_conditions()
-        test_roundtrip_integrity()
-
-        # Run visual comparison
-        visual_success = run_visual_comparison()
-
-        if visual_success:
-            print("\n" + "=" * 60)
-            print("🎉 ALL TESTS PASSED!")
-            print("DataProto split/merge functionality is working correctly.")
-        else:
-            print("\n" + "=" * 60)
-            print("❌ SOME TESTS FAILED!")
+        DataProto.from_items([])
+        print("ERROR: Should have raised exception for empty list")
+    except ValueError as e:
+        print(f"✓ Correctly caught error for empty list: {e}")
 
-    except Exception as e:
-        print(f"\n❌ Test failed with exception: {e}")
-        import traceback
+    # Try to merge items with inconsistent structure
+    try:
+        item1 = DataProtoItem(
+            batch=TensorDict({"data": torch.tensor([1]).unsqueeze(0)}, batch_size=(1,)),
+            non_tensor_batch={"text": "Hello"},
+        )
+        item2 = DataProtoItem(
+            batch=TensorDict({"different_key": torch.tensor([2]).unsqueeze(0)}, batch_size=(1,)),
+            non_tensor_batch={"text": "World"},
+        )
 
-        traceback.print_exc()
+        DataProto.from_items([item1, item2])
+        print("ERROR: Should have raised exception for inconsistent structure")
+    except ValueError as e:
+        print(f"✓ Correctly caught error for inconsistent structure: {e}")
+
+
+if __name__ == "__main__":
+    # Run all tests
+    test_basic_split_and_merge()
+    test_individual_item_access()
+    test_partial_merge()
+    test_item_processing()
+    test_error_conditions()
+    test_roundtrip_integrity()
+    example_basic_split_merge()
+    example_item_processing()
+    example_convenience_methods()
+    example_error_handling()
+    run_visual_comparison()

From 1cfebfe1bb4c2a170fd46be83e27cf275aaa2566 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Tue, 12 Aug 2025 14:14:52 +0800
Subject: [PATCH 035/182] english notes

---
 .../config/fully_async_ppo_trainer.yaml       |  7 --
 recipe/fully_async_policy/fully_async_main.py | 13 ++-
 .../fully_async_rollouter.py                  | 51 +++++-------
 .../fully_async_policy/fully_async_trainer.py |  4 +-
 recipe/fully_async_policy/message_queue.py    | 79 +++++++++----------
 5 files changed, 63 insertions(+), 91 deletions(-)

diff --git a/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml b/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml
index a5f58fadc2f..665f7a8be89 100644
--- a/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml
+++ b/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml
@@ -12,17 +12,10 @@ async_training:
   # 新鲜度控制 (Freshness Control)
   staleness_threshold: 3              # 样本新鲜度阈值
 
-  # 参数同步 (Parameter Synchronization)
-  max_sync_retries: 3                # 参数同步最大重试次数
-  sync_timeout: 30.0                 # 同步超时时间(秒)
-  sync_retry_delay: 1.0              # 重试延迟时间(秒)
-
 # Rollout配置
 rollout:
   nnodes: 1                          # Number of nodes used in the rollout
   n_gpus_per_node: 8                 # Number of GPUs per node
-  mode: async                        # rollout模式: sync, async
-  name: vllm                         # rollout引擎: vllm, sglang
   n: 4                               # 每个prompt生成的响应数量
   total_rollout_steps: 100
   total_epochs: 10
diff --git a/recipe/fully_async_policy/fully_async_main.py b/recipe/fully_async_policy/fully_async_main.py
index 6afb44abd9d..39eacb86314 100644
--- a/recipe/fully_async_policy/fully_async_main.py
+++ b/recipe/fully_async_policy/fully_async_main.py
@@ -40,11 +40,10 @@ def create_resource_pool_manager(config, roles: list) -> ResourcePoolManager:
     Returns:
         ResourcePoolManager: Resource pool manager
     """
-    # 构建资源池规格
     resource_pool_spec = {}
     mapping = {}
 
-    # Actor/Critic资源池（训练相关）
+    # Actor/Critic resource pool
     if any(role in roles for role in [Role.Actor, Role.Critic, Role.RefPolicy, Role.RewardModel]):
         assert config.trainer.n_gpus_per_node > 0, "config.trainer.n_gpus_per_node must be greater than 0"
         assert config.trainer.nnodes > 0, "config.trainer.nnodes must be greater than 0"
@@ -52,12 +51,12 @@ def create_resource_pool_manager(config, roles: list) -> ResourcePoolManager:
         trainer_pool = [config.trainer.n_gpus_per_node] * config.trainer.nnodes
         resource_pool_spec["trainer_pool"] = trainer_pool
 
-        # 训练相关角色映射到同一个资源池
+        # Map training-related roles to the same resource pool
         for role in [Role.Actor, Role.Critic, Role.RefPolicy, Role.RewardModel]:
             if role in roles:
                 mapping[role] = "trainer_pool"
 
-    # Rollout资源池
+    # Rollout resource pool
     if Role.Rollout in roles:
         assert config.rollout.n_gpus_per_node > 0, "config.rollout.n_gpus_per_node must be greater than 0"
         assert config.rollout.nnodes > 0, "config.rollout.nnodes must be greater than 0"
@@ -79,7 +78,7 @@ def create_role_worker_mapping(config):
     Returns:
         dict: Mapping from roles to worker classes
     """
-    # 根据策略选择worker类
+    # Select worker class based on strategy
     if config.actor_rollout_ref.actor.strategy == "fsdp2":
         assert config.actor_rollout_ref.actor.strategy == config.critic.strategy
         from recipe.one_step_off_policy.fsdp_workers import (
@@ -148,7 +147,6 @@ def __init__(self):
         self.shutdown_event = threading.Event()
 
     def run(self, config):
-        """运行完全异步的PPO训练"""
         print("Starting fully async PPO training...")
         self._initialize_components(config)
         self._run_training_loop()
@@ -172,7 +170,7 @@ def _initialize_components(self, config) -> None:
 
         self.components["tokenizer"] = tokenizer
         self.components["processor"] = processor
-        self.components["config"] = config  # 保存config以供其他方法使用
+        self.components["config"] = config
 
         print("Creating worker mapping and resource pools...")
         role_worker_mapping, ray_worker_group_cls = create_role_worker_mapping(config)
@@ -278,7 +276,6 @@ def _run_training_loop(self):
         ray.get(trainer_future)
 
         self.components["message_queue_client"].clear_queue()
-
         print("Training completed or interrupted")
 
 
diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py
index 01affa67586..d2abf3dab2f 100644
--- a/recipe/fully_async_policy/fully_async_rollouter.py
+++ b/recipe/fully_async_policy/fully_async_rollouter.py
@@ -72,7 +72,6 @@ def __init__(
         self.use_reference_policy = False
         self.use_rm = False
 
-        # Create datasets
         print("Creating datasets...")
         from verl.trainer.main_ppo import create_rl_dataset, create_rl_sampler
         from verl.utils.dataset.rl_dataset import collate_fn
@@ -186,7 +185,6 @@ def _create_continuous_iterator(self):
                 yield epoch, batch_dict
 
     def fit(self):
-        """开始异步生成样本 - 改进的主运行逻辑"""
         print("Starting FullyAsyncRollouter...")
 
         if self.message_queue_client is None:
@@ -199,15 +197,12 @@ def fit(self):
             self.running = True
             self.paused = False
 
-        # 创建并启动生成线程
         self.generation_thread = threading.Thread(target=self._generation_loop, daemon=True)
         self.generation_thread.start()
 
-        # 创建并启动监控线程
         self.monitor_thread = threading.Thread(target=self._monitor_loop, daemon=True)
         self.monitor_thread.start()
 
-        # 等待线程完成
         self.generation_thread.join()
         self.monitor_thread.join()
 
@@ -215,16 +210,17 @@ def fit(self):
 
     def _generation_loop(self):
         """
-        主要的生成循环
 
-        循环入口，需要
-        1. running 判断
-        4. 中断判断
-        3. 新鲜度判断
+        Main Generation Loop
 
-        生成样本过程中，需要
-        1. running 判断
-        2. 中断判断
+        Loop Entry Requirements:
+        1. Running status validation
+        2. Interruption detection
+        3. Freshness validation
+
+        During Sample Generation Process:
+        1. Running status validation
+        2. Interruption detection
         """
 
         from verl.utils.tracking import Tracking
@@ -265,12 +261,10 @@ def _generation_loop(self):
                 if self._should_pause_generation():
                     self.pause()
 
-                # 如果被暂停，等待恢复
                 while self.paused and self.running:
                     print("Generation thread paused, waiting...")
                     self.condition.wait()
 
-                # 再次检查运行状态
                 if not self.running:
                     break
 
@@ -292,7 +286,7 @@ def _generation_loop(self):
                 gen_batch_output.meta_info.pop("timing", None)
 
             if gen_batch_output is not None:
-                # 准备rollout metadata
+                # prepare rollout metadata
                 rollout_metadata = {
                     "timing": timing_raw,
                     "generation_timestamp": time.time(),
@@ -306,7 +300,6 @@ def _generation_loop(self):
                         data=sample,
                         rollout_metadata=rollout_metadata,
                     )
-                    # 放入队列
                     success = self.message_queue_client.put_sample(
                         sample=ray.cloudpickle.dumps(queue_sample),
                         param_version=self.current_param_version,
@@ -341,11 +334,9 @@ def _generation_loop(self):
         )
 
     def _monitor_loop(self):
-        """监控线程 - 监控状态并处理控制信号"""
-        # 主线程保持运行，处理控制信号和状态监控
         last_stats_time = time.time()
-        stats_interval = 30.0  # 30秒报告一次统计
-        check_interval = 5.0  # 5秒检查一次状态
+        stats_interval = 30.0
+        check_interval = 5.0
         while True:
             with self.lock:
                 if not self.running:
@@ -356,7 +347,6 @@ def _monitor_loop(self):
             if current_time - last_stats_time >= stats_interval:
                 print(self.get_statistics())
                 last_stats_time = current_time
-            # 检查是否应该恢复生成
             if not self._should_pause_generation():
                 with self.lock:
                     if self.paused:
@@ -365,18 +355,14 @@ def _monitor_loop(self):
                         print("Generation resumed")
 
     def _should_pause_generation(self) -> bool:
-        """
-        判断是否应该暂停生成，基于新鲜度控制 - 改进的判断逻辑
-        """
+        """Determine whether the build should be paused"""
         try:
             queue_stats = self.message_queue_client.get_statistics()
             queue_size = queue_stats["queue_size"]
             current_trainer_version = queue_stats["current_param_version"]
 
-            # 计算参数版本差异
             version_diff = self.current_param_version - current_trainer_version
 
-            # 如果版本差异过大，暂停生成
             if version_diff >= self.staleness_threshold:
                 print(
                     f"Should pause due to staleness: rollout_version={self.current_param_version}, "
@@ -384,7 +370,6 @@ def _should_pause_generation(self) -> bool:
                 )
                 return True
 
-            # 如果队列太满，也暂停生成
             if queue_size >= self.max_queue_size:
                 print(f"Should pause due to full queue: size={queue_size}, max={self.max_queue_size}")
                 return True
@@ -393,11 +378,11 @@ def _should_pause_generation(self) -> bool:
 
         except Exception as e:
             print(f"Error checking pause conditions: {e}")
-            return True  # 出错时暂停生成
+            return True
 
     def pause(self) -> bool:
-        """暂停生成
-        TODO 集成 Partial Rollout
+        """ pause rollout
+        TODO integrated Partial Rollout
         """
         print("[rollouter] pause")
         with self.lock:
@@ -411,8 +396,8 @@ def pause(self) -> bool:
             return True
 
     def resume(self) -> bool:
-        """恢复生成
-        TODO 集成 Partial Rollout
+        """ resume rollout
+        TODO integrated Partial Rollout
         """
         print("[rollouter] resume")
         with self.lock:
diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py
index 39fedb022d5..bbc5cfa75d0 100644
--- a/recipe/fully_async_policy/fully_async_trainer.py
+++ b/recipe/fully_async_policy/fully_async_trainer.py
@@ -39,8 +39,8 @@
 @ray.remote
 class FullyAsyncTrainer(RayPPOTrainer):
     """
-    完全异步的PPO训练器，从MessageQueue获取样本进行训练
-    基于OneStepOffRayTrainer的成熟实现改进
+    A fully asynchronous PPO trainer that obtains samples from a MessageQueue for training.
+    Based on an improved implementation of OneStepOffRayTrainer
     """
 
     def __init__(
diff --git a/recipe/fully_async_policy/message_queue.py b/recipe/fully_async_policy/message_queue.py
index ad261b0072a..c4c7d85f5a7 100644
--- a/recipe/fully_async_policy/message_queue.py
+++ b/recipe/fully_async_policy/message_queue.py
@@ -26,8 +26,6 @@
 
 @dataclass
 class QueueSample:
-    """单个batch样本，包含参数版本和新鲜度信息"""
-
     data: Any
     rollout_metadata: dict[str, Any]
 
@@ -35,7 +33,7 @@ class QueueSample:
 @ray.remote(num_cpus=10, max_concurrency=10)
 class MessageQueue:
     """
-    简化的Ray-based异步消息队列，用于Rollouter和Trainer之间的通信
+    Simplified Ray-based asynchronous message queue for communication between Rollouter and Trainer
     """
 
     def __init__(self, config: DictConfig, max_queue_size: int = 1000):
@@ -44,7 +42,6 @@ def __init__(self, config: DictConfig, max_queue_size: int = 1000):
         self.queue = deque(maxlen=max_queue_size)
         self.current_param_version = 0
 
-        # 安全地获取配置值
         try:
             if hasattr(config, "async_training") and config.async_training is not None:
                 self.staleness_threshold = getattr(config.async_training, "staleness_threshold", 3)
@@ -56,40 +53,40 @@ def __init__(self, config: DictConfig, max_queue_size: int = 1000):
         # Threading for message handling
         self.running = True
 
-        # 线程安全
+        # thread safe
         self.lock = threading.RLock()
         self.consumer_condition = threading.Condition(self.lock)
 
-        # 统计信息
+        # statistic message
         self.total_produced = 0
         self.total_consumed = 0
         self.dropped_samples = 0
 
         logger.info(
             f"MessageQueue initialized with max_queue_size={max_queue_size},"
-            "staleness_threshold={self.staleness_threshold}"
+            f"staleness_threshold={self.staleness_threshold}"
         )
 
     def put_sample(self, sample: Any, param_version: int) -> bool:
         """
-        放入一个batch样本到队列
+        Put a batch sample into the queue
 
         Args:
-            sample: 样本数据
-            param_version: 参数版本号
+            sample: Sample data
+            param_version: Parameter version number
 
         Returns:
-            bool: 是否成功放入队列
+            bool: Whether the sample was successfully put into the queue
         """
         with self.lock:
-            # 检查新鲜度
+            # Check freshness
             staleness = self.current_param_version - param_version
             if staleness >= self.staleness_threshold:
                 self.dropped_samples += 1
                 logger.debug(f"Dropped stale sample: staleness={staleness}, threshold={self.staleness_threshold}")
                 return False
 
-            # 如果队列满了，移除最旧的样本，一般不会发生
+            # If queue is full, remove the oldest sample (rarely happens)
             if len(self.queue) >= self.max_queue_size:
                 removed = self.queue.popleft()
                 self.dropped_samples += 1
@@ -97,7 +94,7 @@ def put_sample(self, sample: Any, param_version: int) -> bool:
             self.queue.append(sample)
             self.total_produced += 1
 
-            # 通知等待的消费者
+            # Notify waiting consumers
             self.consumer_condition.notify()
 
             if self.total_produced % 100 == 0:
@@ -107,13 +104,13 @@ def put_sample(self, sample: Any, param_version: int) -> bool:
 
     def get_samples(self, min_batch_count: int = 1) -> list[Any]:
         """
-        从队列获取batch样本，一直等待直到有足够样本
+        Get batch samples from the queue, wait until enough samples are available
 
         Args:
-            min_batch_count: sample数量满足min_batch，一次性获取
+            min_batch_count: Get samples at once when sample count meets min_batch
 
         Returns:
-            List[Any]: 获取的样本列表
+            List[Any]: List of retrieved samples
         """
 
         print("get_samples")
@@ -125,11 +122,11 @@ def get_samples(self, min_batch_count: int = 1) -> list[Any]:
                         return []
                 self.consumer_condition.wait()
 
-            # 如果队列已关闭且没有足够样本，返回空列表
+            # If queue is closed and doesn't have enough samples, return empty list
             if not self.running and len(self.queue) < min_batch_count:
                 return []
 
-            # 获取指定数量的样本
+            # Get specified number of samples
             batch_count = min(min_batch_count, len(self.queue))
             samples = []
             for _ in range(batch_count):
@@ -144,19 +141,19 @@ def get_samples(self, min_batch_count: int = 1) -> list[Any]:
             return samples
 
     def update_param_version(self, version: int):
-        """更新当前参数版本"""
+        """Update current parameter version"""
         with self.lock:
             old_version = self.current_param_version
             self.current_param_version = version
             logger.debug(f"Parameter version updated from {old_version} to {version}")
 
     def get_queue_size(self) -> int:
-        """获取当前队列长度"""
+        """Get current queue length"""
         with self.lock:
             return len(self.queue)
 
     def get_statistics(self) -> dict[str, Any]:
-        """获取队列统计信息"""
+        """Get queue statistics"""
         with self.lock:
             return {
                 "queue_size": len(self.queue),
@@ -169,41 +166,41 @@ def get_statistics(self) -> dict[str, Any]:
             }
 
     def clear_queue(self):
-        """清空队列"""
+        """Clear the queue"""
         with self.lock:
             cleared_count = len(self.queue)
             self.queue.clear()
             logger.info(f"Cleared {cleared_count} samples from queue")
 
     def shutdown(self):
-        """关闭消息队列"""
+        """Shutdown the message queue"""
         with self.lock:
             self.running = False
-            # 通知所有等待的线程，让它们能够退出
+            # Notify all waiting threads so they can exit
             self.consumer_condition.notify_all()
         logger.info("MessageQueue shutdown")
 
     def get_memory_usage(self) -> dict:
-        """获取内存使用统计"""
+        """Get memory usage statistics"""
         with self.lock:
-            # 估算队列中样本的内存使用
+            # Estimate memory usage of samples in queue
             import sys
 
             total_size = 0
             sample_count = len(self.queue)
 
             if sample_count > 0:
-                # 估算单个样本的大小（简化估算）
+                # Estimate size of a single sample (simplified estimation)
                 sample = list(self.queue)[0]
                 try:
                     sample_size = sys.getsizeof(sample)
                     if hasattr(sample.data, "batch") and hasattr(sample.data.batch, "__len__"):
-                        # 如果有batch信息，估算数据大小
+                        # If batch info is available, estimate data size
                         batch_size = len(sample.data.batch)
-                        sample_size += batch_size * 1000  # 粗略估算每个batch条目1KB
+                        sample_size += batch_size * 1000  # Roughly estimate 1KB per batch entry
                     total_size = sample_size * sample_count
                 except Exception:
-                    total_size = sample_count * 10000  # 粗略估算每个样本10KB
+                    total_size = sample_count * 10000  # Roughly estimate 10KB per sample
 
             return {
                 "queue_samples": sample_count,
@@ -213,39 +210,39 @@ def get_memory_usage(self) -> dict:
 
 
 class MessageQueueClient:
-    """MessageQueue的客户端，用于与MessageQueue Actor通信"""
+    """MessageQueue client for communicating with MessageQueue Actor"""
 
     def __init__(self, queue_actor: Any):
         self.queue_actor = queue_actor
 
     def put_sample(self, sample: Any, param_version: int) -> bool:
-        """放入batch到队列"""
+        """Put batch into queue"""
         return ray.get(self.queue_actor.put_sample.remote(sample, param_version))
 
     def get_samples(self, min_batch_count: int = 1) -> list[Any]:
-        """从队列获取batch，一直等待直到有足够样本"""
+        """Get batch from queue, wait until enough samples are available"""
         return ray.get(self.queue_actor.get_samples.remote(min_batch_count))
 
     def update_param_version(self, version: int):
-        """更新参数版本"""
+        """Update parameter version"""
         ray.get(self.queue_actor.update_param_version.remote(version))
 
     def get_queue_size(self) -> int:
-        """获取队列大小"""
+        """Get queue size"""
         return ray.get(self.queue_actor.get_queue_size.remote())
 
     def get_statistics(self) -> dict[str, Any]:
-        """获取统计信息"""
+        """Get statistics"""
         return ray.get(self.queue_actor.get_statistics.remote())
 
     def clear_queue(self):
-        """清空队列"""
+        """Clear queue"""
         ray.get(self.queue_actor.clear_queue.remote())
 
     def shutdown(self):
-        """关闭队列"""
+        """Shutdown queue"""
         ray.get(self.queue_actor.shutdown.remote())
 
     def get_memory_usage(self) -> dict:
-        """获取内存使用统计"""
-        return ray.get(self.queue_actor.get_memory_usage.remote())
+        """Get memory usage statistics"""
+        return ray.get(self.queue_actor.get_memory_usage.remote())
\ No newline at end of file

From 5d108bfe48b0083f3966b3f450b8cc655a1e3fb5 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Tue, 12 Aug 2025 14:15:12 +0800
Subject: [PATCH 036/182] english notes

---
 recipe/fully_async_policy/fully_async_rollouter.py | 4 ++--
 recipe/fully_async_policy/message_queue.py         | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py
index d2abf3dab2f..b0b270ba685 100644
--- a/recipe/fully_async_policy/fully_async_rollouter.py
+++ b/recipe/fully_async_policy/fully_async_rollouter.py
@@ -381,7 +381,7 @@ def _should_pause_generation(self) -> bool:
             return True
 
     def pause(self) -> bool:
-        """ pause rollout
+        """pause rollout
         TODO integrated Partial Rollout
         """
         print("[rollouter] pause")
@@ -396,7 +396,7 @@ def pause(self) -> bool:
             return True
 
     def resume(self) -> bool:
-        """ resume rollout
+        """resume rollout
         TODO integrated Partial Rollout
         """
         print("[rollouter] resume")
diff --git a/recipe/fully_async_policy/message_queue.py b/recipe/fully_async_policy/message_queue.py
index c4c7d85f5a7..3efe982752d 100644
--- a/recipe/fully_async_policy/message_queue.py
+++ b/recipe/fully_async_policy/message_queue.py
@@ -245,4 +245,4 @@ def shutdown(self):
 
     def get_memory_usage(self) -> dict:
         """Get memory usage statistics"""
-        return ray.get(self.queue_actor.get_memory_usage.remote())
\ No newline at end of file
+        return ray.get(self.queue_actor.get_memory_usage.remote())

From 796880ea3e21364cdc1622200a9ca6164f786013 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Tue, 12 Aug 2025 15:36:46 +0800
Subject: [PATCH 037/182] update print

---
 recipe/fully_async_policy/fully_async_main.py |  38 ++++---
 .../fully_async_rollouter.py                  | 101 +++++++++---------
 .../fully_async_policy/fully_async_trainer.py |  61 ++++-------
 recipe/fully_async_policy/message_queue.py    |  10 +-
 recipe/fully_async_policy/param_sync.py       |   6 +-
 verl/trainer/ppo/ray_trainer.py               |   8 --
 6 files changed, 98 insertions(+), 126 deletions(-)

diff --git a/recipe/fully_async_policy/fully_async_main.py b/recipe/fully_async_policy/fully_async_main.py
index 39eacb86314..163b2420381 100644
--- a/recipe/fully_async_policy/fully_async_main.py
+++ b/recipe/fully_async_policy/fully_async_main.py
@@ -147,16 +147,16 @@ def __init__(self):
         self.shutdown_event = threading.Event()
 
     def run(self, config):
-        print("Starting fully async PPO training...")
+        print("[ASYNC MAIN] Starting fully async PPO training...")
         self._initialize_components(config)
         self._run_training_loop()
 
     def _initialize_components(self, config) -> None:
-        print(f"TaskRunner hostname: {socket.gethostname()}, PID: {os.getpid()}")
+        print(f"[ASYNC MAIN] TaskRunner hostname: {socket.gethostname()}, PID: {os.getpid()}")
         pprint(OmegaConf.to_container(config, resolve=True))
         OmegaConf.resolve(config)
 
-        print("Initializing model and tokenizer...")
+        print("[ASYNC MAIN] Initializing model and tokenizer...")
         local_path = copy_to_local(
             config.actor_rollout_ref.model.path, use_shm=config.actor_rollout_ref.model.get("use_shm", False)
         )
@@ -172,12 +172,12 @@ def _initialize_components(self, config) -> None:
         self.components["processor"] = processor
         self.components["config"] = config
 
-        print("Creating worker mapping and resource pools...")
+        print("[ASYNC MAIN] Creating worker mapping and resource pools...")
         role_worker_mapping, ray_worker_group_cls = create_role_worker_mapping(config)
         self.components["role_worker_mapping"] = role_worker_mapping
         self.components["ray_worker_group_cls"] = ray_worker_group_cls
 
-        print("Loading reward functions...")
+        print("[ASYNC MAIN] Loading reward functions...")
         reward_fn = load_reward_manager(
             config, tokenizer, num_examine=0, **config.reward_model.get("reward_kwargs", {})
         )
@@ -187,25 +187,24 @@ def _initialize_components(self, config) -> None:
         self.components["reward_fn"] = reward_fn
         self.components["val_reward_fn"] = val_reward_fn
 
-        self.max_queue_size = (
-            config.async_training.staleness_threshold
-            * config.data.train_batch_size
-            * config.actor_rollout_ref.rollout.n
-        ) * 10  # x 10 avoid deadlock
-        print("Creating MessageQueue...")
+        self.max_queue_size = ((config.async_training.staleness_threshold + 1)
+                               * config.data.train_batch_size
+                               * config.actor_rollout_ref.rollout.n
+                               ) * 10  # x 10 avoid deadlock
+        print("[ASYNC MAIN] Creating MessageQueue...")
         message_queue = MessageQueue.remote(config, self.max_queue_size)
         message_queue_client = MessageQueueClient(message_queue)
 
         self.components["message_queue"] = message_queue
         self.components["message_queue_client"] = message_queue_client
 
-        print("Creating FullyAsyncRollouter...")
+        print("[ASYNC MAIN] Creating FullyAsyncRollouter...")
         self._create_rollouter(config)
 
-        print("Creating FullyAsyncTrainer...")
+        print("[ASYNC MAIN] Creating FullyAsyncTrainer...")
         self._create_trainer(config)
 
-        print("Setting up parameter synchronization...")
+        print("[ASYNC MAIN] Setting up parameter synchronization...")
         from recipe.fully_async_policy.param_sync import ParameterSynchronizer
 
         param_synchronizer = ParameterSynchronizer.remote(
@@ -221,10 +220,9 @@ def _initialize_components(self, config) -> None:
         ray.get(param_synchronizer.sync_weights.remote(0))
 
         self.components["param_synchronizer"] = param_synchronizer
-        print("All components initialized successfully")
+        print("[ASYNC MAIN] All components initialized successfully")
 
     def _create_rollouter(self, config) -> None:
-        pprint(self.components)
         rollouter = FullyAsyncRollouter.remote(
             config=config,
             tokenizer=self.components["tokenizer"],
@@ -239,7 +237,7 @@ def _create_rollouter(self, config) -> None:
         ray.get(rollouter.init_workers.remote())
         ray.get(rollouter.set_message_queue_client.remote(self.components["message_queue_client"]))
         self.components["rollouter"] = rollouter
-        print("Rollouter created and initialized successfully")
+        print("[ASYNC MAIN] Rollouter created and initialized successfully")
 
     def _create_trainer(self, config) -> None:
         trainer_role_mapping = {
@@ -263,12 +261,12 @@ def _create_trainer(self, config) -> None:
         ray.get(trainer.init_workers.remote())
         ray.get(trainer.set_message_queue_client.remote(self.components["message_queue_client"]))
         self.components["trainer"] = trainer
-        print("FullyAsyncTrainer created and initialized successfully")
+        print("[ASYNC MAIN] FullyAsyncTrainer created and initialized successfully")
 
     def _run_training_loop(self):
         self.running = True
 
-        print("Starting Rollouter in background...")
+        print("[ASYNC MAIN] Starting Rollouter in background...")
         rollouter_future = self.components["rollouter"].fit.remote()
         trainer_future = self.components["trainer"].fit.remote()
 
@@ -276,7 +274,7 @@ def _run_training_loop(self):
         ray.get(trainer_future)
 
         self.components["message_queue_client"].clear_queue()
-        print("Training completed or interrupted")
+        print("[ASYNC MAIN] Training completed or interrupted")
 
 
 @hydra.main(config_path="config", config_name="fully_async_ppo_trainer", version_base=None)
diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py
index b0b270ba685..2d9d839feca 100644
--- a/recipe/fully_async_policy/fully_async_rollouter.py
+++ b/recipe/fully_async_policy/fully_async_rollouter.py
@@ -35,17 +35,17 @@ class FullyAsyncRollouter(RayPPOTrainer):
     """
 
     def __init__(
-        self,
-        config,
-        tokenizer,
-        role_worker_mapping: dict[Role, WorkerType],
-        resource_pool_manager: ResourcePoolManager,
-        ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup,
-        processor=None,
-        reward_fn=None,
-        val_reward_fn=None,
-        device_name=None,
-        max_queue_size=1000,
+            self,
+            config,
+            tokenizer,
+            role_worker_mapping: dict[Role, WorkerType],
+            resource_pool_manager: ResourcePoolManager,
+            ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup,
+            processor=None,
+            reward_fn=None,
+            val_reward_fn=None,
+            device_name=None,
+            max_queue_size=1000,
     ):
         # Store the tokenizer for text processing
         self.tokenizer = tokenizer
@@ -72,7 +72,7 @@ def __init__(
         self.use_reference_policy = False
         self.use_rm = False
 
-        print("Creating datasets...")
+        print(f"[ROLLOUTER] Creating datasets...")
         from verl.trainer.main_ppo import create_rl_dataset, create_rl_sampler
         from verl.utils.dataset.rl_dataset import collate_fn
 
@@ -81,7 +81,7 @@ def __init__(
         train_sampler = create_rl_sampler(config.data, train_dataset)
 
         self._validate_config()
-        pprint(f"Rollouter _create_dataloader...\n{train_dataset}\n{val_dataset}")
+        print(f"[ROLLOUTER] Rollouter _create_dataloader...\n{train_dataset}\n{val_dataset}")
         self._create_dataloader(train_dataset, val_dataset, collate_fn, train_sampler)
 
         total_rollout_steps = len(self.train_dataloader) * self.config.trainer.total_epochs
@@ -90,7 +90,7 @@ def __init__(
             total_rollout_steps = self.config.rollout.total_rollout_steps
 
         self.total_rollout_steps = total_rollout_steps
-        print(f"Total rollout steps: {self.total_rollout_steps}")
+        print(f"[ROLLOUTER] Total rollout steps: {self.total_rollout_steps}")
 
         # Rollouter parameter configuration
         self.message_queue_client = None
@@ -103,8 +103,14 @@ def __init__(
 
         # Statistics
         self.total_generated_samples = 0
+        self.train_step_samples = 0
         self.dropped_stale_samples = 0
-        self.param_sync_requests = 0
+
+        # Calculate the samples needed for a train, used to calculate staleness and interrupt rollout
+        n_responses_per_prompt = self.config.actor_rollout_ref.rollout.n
+        batch_size = self.config.data.train_batch_size
+        required_samples = n_responses_per_prompt * batch_size
+        self.max_required_samples = required_samples * (self.staleness_threshold + 1)
 
         # Worker groups
         self.rollout_wg = None
@@ -120,17 +126,13 @@ def __init__(
         self.condition = threading.Condition(self.lock)
 
         # Pause/resume statistics
-        self.pause_count = 0
-        self.resume_count = 0
         self.total_pause_time = 0.0
         self.last_pause_time = None
 
         # Parameter synchronization related
         self.param_synchronizer = None
-        self.last_sync_time = 0
-        self.sync_in_progress = False
-        self.sync_lock = threading.Lock()
 
+        # queue size
         self.max_queue_size = max_queue_size
 
     def set_message_queue_client(self, message_queue_client: MessageQueueClient):
@@ -152,12 +154,14 @@ def update_param_version(self, version: int):
         with self.lock:
             old_version = self.current_param_version
             self.current_param_version = version
-            print(f"Parameter version updated from {old_version} to {version}")
+            # every time param change, reset train_step_samples
+            self.train_step_samples = 0
+            print(f"[ROLLOUTER] Parameter version updated from {old_version} to {version}")
 
     def _validate_config(self):
         # Validate asynchronous training configuration
         if not hasattr(self.config, "async_training"):
-            raise ValueError("Missing async_training configuration")
+            raise ValueError("[ROLLOUTER] Missing async_training configuration")
 
     def _create_actor_rollout_classes(self):
         # only create rollout
@@ -185,7 +189,7 @@ def _create_continuous_iterator(self):
                 yield epoch, batch_dict
 
     def fit(self):
-        print("Starting FullyAsyncRollouter...")
+        print(f"[ROLLOUTER] Starting FullyAsyncRollouter...")
 
         if self.message_queue_client is None:
             raise ValueError("MessageQueue client not set. Call set_message_queue_client() first.")
@@ -206,7 +210,7 @@ def fit(self):
         self.generation_thread.join()
         self.monitor_thread.join()
 
-        print("Rollouter fit completed")
+        print(f"[ROLLOUTER] Rollouter fit completed")
 
     def _generation_loop(self):
         """
@@ -217,6 +221,7 @@ def _generation_loop(self):
         1. Running status validation
         2. Interruption detection
         3. Freshness validation
+        4. train_step_samples validation
 
         During Sample Generation Process:
         1. Running status validation
@@ -242,7 +247,7 @@ def _generation_loop(self):
         if self.val_reward_fn is not None and self.config.trainer.get("val_before_train", True):
             val_metrics = self._validate()
             assert val_metrics, f"{val_metrics=}"
-            pprint(f"Initial validation metrics: {val_metrics}")
+            pprint(f"[ROLLOUTER] Initial validation metrics: {val_metrics}")
             self.logger.log(data=val_metrics, step=self.global_steps)
             if self.config.trainer.get("val_only", False):
                 return
@@ -262,7 +267,7 @@ def _generation_loop(self):
                     self.pause()
 
                 while self.paused and self.running:
-                    print("Generation thread paused, waiting...")
+                    print(f"[ROLLOUTER] Generation thread paused, waiting...")
                     self.condition.wait()
 
                 if not self.running:
@@ -304,24 +309,17 @@ def _generation_loop(self):
                         sample=ray.cloudpickle.dumps(queue_sample),
                         param_version=self.current_param_version,
                     )
-                    print(f"put samples {success}")
                     with self.lock:
                         if success:
                             self.total_generated_samples += 1
+                            self.train_step_samples += 1
                         else:
                             self.dropped_stale_samples += 1
 
-                    if self.global_steps % 1 == 0:
-                        print(
-                            f"Generated {self.total_generated_samples} batches, \n"
-                            f"param_version={self.current_param_version}, \n"
-                            f"Dropped stale samples: {self.dropped_stale_samples}\n"
-                        )
-
             self.global_steps += 1
 
             if is_last_step:
-                pprint(f"Final validation metrics: {last_val_metrics}")
+                pprint(f"[ROLLOUTER] Final validation metrics: {last_val_metrics}")
                 break
 
         with self.lock:
@@ -345,14 +343,14 @@ def _monitor_loop(self):
             # 定期打印统计信息
             current_time = time.time()
             if current_time - last_stats_time >= stats_interval:
-                print(self.get_statistics())
+                print(f"[ROLLOUTER] {self.get_statistics()}")
                 last_stats_time = current_time
             if not self._should_pause_generation():
                 with self.lock:
                     if self.paused:
                         self.paused = False
                         self.condition.notify_all()
-                        print("Generation resumed")
+                        print(f"[ROLLOUTER] Generation resumed")
 
     def _should_pause_generation(self) -> bool:
         """Determine whether the build should be paused"""
@@ -363,28 +361,35 @@ def _should_pause_generation(self) -> bool:
 
             version_diff = self.current_param_version - current_trainer_version
 
-            if version_diff >= self.staleness_threshold:
+            if version_diff > self.staleness_threshold:
                 print(
-                    f"Should pause due to staleness: rollout_version={self.current_param_version}, "
+                    "[ROLLOUTER] "
+                    f"Should pause due to version_diff > self.staleness_threshold: "
+                    f"rollout_version={self.current_param_version}, "
                     f"trainer_version={current_trainer_version}, diff={version_diff}"
                 )
                 return True
 
             if queue_size >= self.max_queue_size:
-                print(f"Should pause due to full queue: size={queue_size}, max={self.max_queue_size}")
+                print(f"[ROLLOUTER] Should pause due to full queue: size={queue_size}, max={self.max_queue_size}")
+                return True
+
+            if self.train_step_samples >= self.max_required_samples:
+                print(f"[ROLLOUTER] Should pause due to step_generated_samples >= max_required_samples: "
+                      f"self.step_generated_samples={self.train_step_samples}, max={self.max_required_samples}")
                 return True
 
             return False
 
         except Exception as e:
-            print(f"Error checking pause conditions: {e}")
+            print(f"[ROLLOUTER] Error checking pause conditions: {e}")
             return True
 
     def pause(self) -> bool:
         """pause rollout
         TODO integrated Partial Rollout
         """
-        print("[rollouter] pause")
+        print(f"[ROLLOUTER] pause")
         with self.lock:
             if not self.running:
                 return False
@@ -399,7 +404,7 @@ def resume(self) -> bool:
         """resume rollout
         TODO integrated Partial Rollout
         """
-        print("[rollouter] resume")
+        print(f"[ROLLOUTER] resume")
         with self.lock:
             if not self.running:
                 return False
@@ -409,20 +414,18 @@ def resume(self) -> bool:
 
             self.paused = False
             self.condition.notify_all()
-            print("Generation resumed")
             return True
 
     def get_statistics(self) -> dict:
         with self.lock:
             queue_stats = self.message_queue_client.get_statistics()
             stats = {
+                "is_running": self.running,
                 "total_generated_samples": self.total_generated_samples,
+                "train_step_samples": self.train_step_samples,
                 "dropped_stale_samples": self.dropped_stale_samples,
                 "current_param_version": self.current_param_version,
-                "param_sync_requests": self.param_sync_requests,
-                "last_sync_time": self.last_sync_time,
-                "is_running": self.running,
-                "sync_in_progress": self.sync_in_progress,
-                "queue_size": f"{queue_stats['queue_size']}",
+                "queue_size": queue_stats['queue_size'],
+                "queue_max_size": self.max_queue_size,
             }
             return stats
diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py
index bbc5cfa75d0..418ab024d0a 100644
--- a/recipe/fully_async_policy/fully_async_trainer.py
+++ b/recipe/fully_async_policy/fully_async_trainer.py
@@ -44,16 +44,16 @@ class FullyAsyncTrainer(RayPPOTrainer):
     """
 
     def __init__(
-        self,
-        config,
-        tokenizer,
-        role_worker_mapping: dict[Role, WorkerType],
-        resource_pool_manager: ResourcePoolManager,
-        ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup,
-        processor=None,
-        reward_fn=None,
-        val_reward_fn=None,
-        device_name=None,
+            self,
+            config,
+            tokenizer,
+            role_worker_mapping: dict[Role, WorkerType],
+            resource_pool_manager: ResourcePoolManager,
+            ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup,
+            processor=None,
+            reward_fn=None,
+            val_reward_fn=None,
+            device_name=None,
     ):
         # Store the tokenizer for text processing
         self.tokenizer = tokenizer
@@ -128,6 +128,7 @@ def _get_samples_from_queue(self) -> tuple[None, None] | tuple[int, Any]:
         required_samples = n_responses_per_prompt * batch_size
 
         print(
+            "[FullyAsyncTrainer] "
             f"Requesting {required_samples} samples from queue (n={n_responses_per_prompt}, batch_size={batch_size})",
             flush=True,
         )
@@ -141,17 +142,13 @@ def _get_samples_from_queue(self) -> tuple[None, None] | tuple[int, Any]:
             logger.warning("required_samples is empty")
             return None, None
 
-        print(f"Retrieved {len(queue_samples)} samples from queue. wait time {consumer_end - consumer_start}")
+        print(f"[FullyAsyncTrainer] Retrieved {len(queue_samples)} samples from queue."
+              f"wait time {consumer_end - consumer_start:.2f} seconds.")
 
         queue_samples = [ray.cloudpickle.loads(x) for x in queue_samples]
-        print(queue_samples)
-
         # Assemble batch
         batch = self._assemble_gen_batch_output_from_queue_samples(queue_samples)
 
-        print("=" * 200)
-        print(batch)
-
         return 0, batch
 
     def _assemble_gen_batch_output_from_queue_samples(self, queue_samples: list[QueueSample]):
@@ -173,7 +170,7 @@ def _assemble_gen_batch_output_from_queue_samples(self, queue_samples: list[Queu
         if not queue_samples:
             raise ValueError("Empty queue_samples provided for batch assembly")
 
-        print(f"Assembling batch from {len(queue_samples)} queue samples")
+        print(f"[FullyAsyncTrainer] Assembling batch from {len(queue_samples)} queue samples")
 
         # Extract data and metadata from all samples
         sample_data_list = []
@@ -215,7 +212,7 @@ def _assemble_gen_batch_output_from_queue_samples(self, queue_samples: list[Queu
             "avg_sample_age": np.mean([time.time() - ts for ts in sample_timestamps]),
         }
 
-        print(meta_info)
+        print(f"[FullyAsyncTrainer] {meta_info}")
 
         return batch
 
@@ -254,7 +251,7 @@ def fit(self):
         to construct the PPO dataflow.
         The light-weight advantage computation is done on the driver process.
         """
-        print("Starting FullyAsyncTrainer...")
+        print("[FullyAsyncTrainer] Starting FullyAsyncTrainer...")
         if self.message_queue_client is None:
             raise ValueError("MessageQueue client not set. Call set_message_queue_client() first.")
         if self.param_synchronizer is None:
@@ -281,16 +278,6 @@ def fit(self):
         # Use queue mode, no need for traditional dataloader iterator
         # Initialize to get the first batch of data
         while True:
-            print("while True", flush=True)
-
-            # Check queue status
-            if self.message_queue_client:
-                queue_stats = self.message_queue_client.get_statistics()
-                print(f"Queue status before getting samples: {queue_stats}")
-
-                if queue_stats.get("queue_size", 0) == 0:
-                    print("WARNING: Queue is empty, will block waiting for samples")
-
             metrics = {}
             timing_raw = {}
 
@@ -302,8 +289,6 @@ def fit(self):
                     if batch is None:
                         break
 
-                print("_get_samples_from_queue end")
-
                 # # 更新统计信息
                 #     self.processed_samples += len(batch) if isinstance(batch, list) else 1
                 #
@@ -332,20 +317,15 @@ def fit(self):
                 #                     "statistics/current_param_version": self.current_param_version,
                 #                 }
                 #             )
-                print("_process_batch_common")
                 batch, reward_extra_infos_dict = self._process_batch_common(batch, metrics, timing_raw)
-                print("_log_rollout")
                 self._log_rollout(batch, reward_extra_infos_dict, timing_raw)
-                print("_check_save_checkpoint")
                 self._check_save_checkpoint(is_last_step, timing_raw)
 
-            print("_collect_metrics")
             # self._collect_metrics(batch, epoch, metrics, timing_raw)
 
             # Trigger parameter synchronization after training step
-            print("_trigger_parameter_sync_after_step")
             self._trigger_parameter_sync_after_step()
-            print(f"global_steps: {self.global_steps}")
+            print(f"[FullyAsyncTrainer] global_steps: {self.global_steps}")
             self.global_steps += 1
 
     def get_statistics(self) -> dict:
@@ -369,11 +349,12 @@ def _trigger_parameter_sync_after_step(self):
         """
         self.current_param_version = self.current_param_version + 1
         print(
-            f"[TRAINER] Triggering parameter sync after "
+            f"[FullyAsyncTrainer] Triggering parameter sync after "
             f"training step {self.global_steps}, version: {self.current_param_version}"
         )
-        logger.info(
-            f"Triggering parameter sync after training step {self.global_steps}, version: {self.current_param_version}"
+        print(
+            f"[FullyAsyncTrainer] Triggering parameter sync"
+            f" after training step {self.global_steps}, version: {self.current_param_version}"
         )
         ray.get(self.param_synchronizer.sync_weights.remote(self.current_param_version))
 
diff --git a/recipe/fully_async_policy/message_queue.py b/recipe/fully_async_policy/message_queue.py
index 3efe982752d..089a703f924 100644
--- a/recipe/fully_async_policy/message_queue.py
+++ b/recipe/fully_async_policy/message_queue.py
@@ -81,7 +81,7 @@ def put_sample(self, sample: Any, param_version: int) -> bool:
         with self.lock:
             # Check freshness
             staleness = self.current_param_version - param_version
-            if staleness >= self.staleness_threshold:
+            if staleness > self.staleness_threshold:
                 self.dropped_samples += 1
                 logger.debug(f"Dropped stale sample: staleness={staleness}, threshold={self.staleness_threshold}")
                 return False
@@ -113,13 +113,11 @@ def get_samples(self, min_batch_count: int = 1) -> list[Any]:
             List[Any]: List of retrieved samples
         """
 
-        print("get_samples")
         with self.lock:
             while len(self.queue) < min_batch_count and self.running:
-                print(f"consumer_condition {len(self.queue)}")
-                for data in self.queue:
-                    if data is None:
-                        return []
+                print(f"[MessageQueue] consumer_condition {len(self.queue)}")
+                if len(self.queue) > 0 and self.queue[-1] is None:
+                    return []
                 self.consumer_condition.wait()
 
             # If queue is closed and doesn't have enough samples, return empty list
diff --git a/recipe/fully_async_policy/param_sync.py b/recipe/fully_async_policy/param_sync.py
index 11d94c79ae4..3de781959ab 100644
--- a/recipe/fully_async_policy/param_sync.py
+++ b/recipe/fully_async_policy/param_sync.py
@@ -60,7 +60,7 @@ def _init_weights_info(self):
         self.rollout_wg.set_actor_weights_info(self.weights_info)
 
     def _init_sync_group(self):
-        print("Initializing parameter synchronization group...")
+        print("[ParameterSynchronizer] Initializing parameter synchronization group...")
         actor_rollout_workers = self.actor_wg.workers + self.rollout_wg.workers
         collective.create_collective_group(
             actor_rollout_workers,
@@ -72,7 +72,7 @@ def _init_sync_group(self):
 
     def sync_weights(self, version):
         self.current_version = version
-        print(f"Starting weight synchronization (version {self.current_version})...")
+        print(f"[ParameterSynchronizer] Starting weight synchronization (version {self.current_version})...")
 
         ray.get(self.rollouter.pause.remote())
 
@@ -86,4 +86,4 @@ def sync_weights(self, version):
         # Update rollout version
         ray.get(self.rollouter.update_param_version.remote(version))
         ray.get(self.rollouter.resume.remote())
-        print("sync_weights success")
+        print("[ParameterSynchronizer] sync_weights success")
diff --git a/verl/trainer/ppo/ray_trainer.py b/verl/trainer/ppo/ray_trainer.py
index 89acaebfe03..e8398fd0865 100644
--- a/verl/trainer/ppo/ray_trainer.py
+++ b/verl/trainer/ppo/ray_trainer.py
@@ -1237,7 +1237,6 @@ def _post_generate_batch(self, batch, gen_batch_output, metrics):
     def _process_batch_common(self, batch, metrics, timing_raw):
         with marked_timer("reward", timing_raw, color="yellow"):
             # compute reward model score
-            print("marked_timer reward")
             if self.use_rm:
                 reward_tensor = self.rm_wg.compute_rm_score(batch)
                 batch = batch.union(reward_tensor)
@@ -1248,7 +1247,6 @@ def _process_batch_common(self, batch, metrics, timing_raw):
                 reward_tensor, reward_extra_infos_dict = compute_reward(batch, self.reward_fn)
         # recompute old_log_probs
         with marked_timer("old_log_prob", timing_raw, color="blue"):
-            print("marked_timer old_log_prob")
 
             old_log_prob = self.actor_rollout_wg.compute_log_prob(batch)
             entropys = old_log_prob.batch["entropys"]
@@ -1284,8 +1282,6 @@ def _process_batch_common(self, batch, metrics, timing_raw):
                     }
                 )
         if self.use_reference_policy:
-            print("marked_timer use_reference_policy")
-
             # compute reference log_prob
             with marked_timer("ref", timing_raw, color="olive"):
                 if not self.ref_in_actor:
@@ -1295,12 +1291,10 @@ def _process_batch_common(self, batch, metrics, timing_raw):
                 batch = batch.union(ref_log_prob)
         # compute values
         if self.use_critic:
-            print("marked_timer compute use_critic")
             with marked_timer("values", timing_raw, color="cyan"):
                 values = self.critic_wg.compute_values(batch)
                 batch = batch.union(values)
         with marked_timer("adv", timing_raw, color="brown"):
-            print("marked_timer adv")
             # we combine with rule-based rm
             reward_extra_infos_dict: dict[str, list]
             if self.config.reward_model.launch_reward_fn_async:
@@ -1336,7 +1330,6 @@ def _process_batch_common(self, batch, metrics, timing_raw):
             )
         # update critic
         if self.use_critic:
-            print("marked_timer update use_critic")
             with marked_timer("update_critic", timing_raw, color="pink"):
                 critic_output = self.critic_wg.update_critic(batch)
             critic_output_metrics = reduce_metrics(critic_output.meta_info["metrics"])
@@ -1344,7 +1337,6 @@ def _process_batch_common(self, batch, metrics, timing_raw):
         # implement critic warmup
         if self.config.trainer.critic_warmup <= self.global_steps:
             # update actor
-            print("marked_timer update_actor")
             with marked_timer("update_actor", timing_raw, color="red"):
                 batch.meta_info["multi_turn"] = self.config.actor_rollout_ref.rollout.multi_turn.enable
                 actor_output = self.actor_rollout_wg.update_actor(batch)

From 444c3d1af644ce710b420bf4f981a40294ce8496 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Tue, 12 Aug 2025 15:58:56 +0800
Subject: [PATCH 038/182] update message

---
 .../fully_async_rollouter.py                  | 36 +++++-----
 .../fully_async_policy/fully_async_trainer.py | 68 +++++++++----------
 recipe/fully_async_policy/message_queue.py    |  6 +-
 tests/special_e2e/run_fully_async_policy.sh   | 11 +--
 4 files changed, 55 insertions(+), 66 deletions(-)

diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py
index 2d9d839feca..97d0f627eb8 100644
--- a/recipe/fully_async_policy/fully_async_rollouter.py
+++ b/recipe/fully_async_policy/fully_async_rollouter.py
@@ -72,7 +72,7 @@ def __init__(
         self.use_reference_policy = False
         self.use_rm = False
 
-        print(f"[ROLLOUTER] Creating datasets...")
+        print(f"[FullyAsyncRollouter] Creating datasets...")
         from verl.trainer.main_ppo import create_rl_dataset, create_rl_sampler
         from verl.utils.dataset.rl_dataset import collate_fn
 
@@ -81,7 +81,7 @@ def __init__(
         train_sampler = create_rl_sampler(config.data, train_dataset)
 
         self._validate_config()
-        print(f"[ROLLOUTER] Rollouter _create_dataloader...\n{train_dataset}\n{val_dataset}")
+        print(f"[FullyAsyncRollouter] Rollouter _create_dataloader...\n{train_dataset}\n{val_dataset}")
         self._create_dataloader(train_dataset, val_dataset, collate_fn, train_sampler)
 
         total_rollout_steps = len(self.train_dataloader) * self.config.trainer.total_epochs
@@ -90,7 +90,7 @@ def __init__(
             total_rollout_steps = self.config.rollout.total_rollout_steps
 
         self.total_rollout_steps = total_rollout_steps
-        print(f"[ROLLOUTER] Total rollout steps: {self.total_rollout_steps}")
+        print(f"[FullyAsyncRollouter] Total rollout steps: {self.total_rollout_steps}")
 
         # Rollouter parameter configuration
         self.message_queue_client = None
@@ -156,12 +156,12 @@ def update_param_version(self, version: int):
             self.current_param_version = version
             # every time param change, reset train_step_samples
             self.train_step_samples = 0
-            print(f"[ROLLOUTER] Parameter version updated from {old_version} to {version}")
+            print(f"[FullyAsyncRollouter] Parameter version updated from {old_version} to {version}")
 
     def _validate_config(self):
         # Validate asynchronous training configuration
         if not hasattr(self.config, "async_training"):
-            raise ValueError("[ROLLOUTER] Missing async_training configuration")
+            raise ValueError("[FullyAsyncRollouter] Missing async_training configuration")
 
     def _create_actor_rollout_classes(self):
         # only create rollout
@@ -189,7 +189,7 @@ def _create_continuous_iterator(self):
                 yield epoch, batch_dict
 
     def fit(self):
-        print(f"[ROLLOUTER] Starting FullyAsyncRollouter...")
+        print(f"[FullyAsyncRollouter] Starting FullyAsyncRollouter...")
 
         if self.message_queue_client is None:
             raise ValueError("MessageQueue client not set. Call set_message_queue_client() first.")
@@ -210,7 +210,7 @@ def fit(self):
         self.generation_thread.join()
         self.monitor_thread.join()
 
-        print(f"[ROLLOUTER] Rollouter fit completed")
+        print(f"[FullyAsyncRollouter] Rollouter fit completed")
 
     def _generation_loop(self):
         """
@@ -247,7 +247,7 @@ def _generation_loop(self):
         if self.val_reward_fn is not None and self.config.trainer.get("val_before_train", True):
             val_metrics = self._validate()
             assert val_metrics, f"{val_metrics=}"
-            pprint(f"[ROLLOUTER] Initial validation metrics: {val_metrics}")
+            pprint(f"[FullyAsyncRollouter] Initial validation metrics: {val_metrics}")
             self.logger.log(data=val_metrics, step=self.global_steps)
             if self.config.trainer.get("val_only", False):
                 return
@@ -267,7 +267,7 @@ def _generation_loop(self):
                     self.pause()
 
                 while self.paused and self.running:
-                    print(f"[ROLLOUTER] Generation thread paused, waiting...")
+                    print(f"[FullyAsyncRollouter] Generation thread paused, waiting...")
                     self.condition.wait()
 
                 if not self.running:
@@ -319,7 +319,7 @@ def _generation_loop(self):
             self.global_steps += 1
 
             if is_last_step:
-                pprint(f"[ROLLOUTER] Final validation metrics: {last_val_metrics}")
+                pprint(f"[FullyAsyncRollouter] Final validation metrics: {last_val_metrics}")
                 break
 
         with self.lock:
@@ -343,14 +343,14 @@ def _monitor_loop(self):
             # 定期打印统计信息
             current_time = time.time()
             if current_time - last_stats_time >= stats_interval:
-                print(f"[ROLLOUTER] {self.get_statistics()}")
+                print(f"[FullyAsyncRollouter] {self.get_statistics()}")
                 last_stats_time = current_time
             if not self._should_pause_generation():
                 with self.lock:
                     if self.paused:
                         self.paused = False
                         self.condition.notify_all()
-                        print(f"[ROLLOUTER] Generation resumed")
+                        print(f"[FullyAsyncRollouter] Generation resumed")
 
     def _should_pause_generation(self) -> bool:
         """Determine whether the build should be paused"""
@@ -363,7 +363,7 @@ def _should_pause_generation(self) -> bool:
 
             if version_diff > self.staleness_threshold:
                 print(
-                    "[ROLLOUTER] "
+                    "[FullyAsyncRollouter] "
                     f"Should pause due to version_diff > self.staleness_threshold: "
                     f"rollout_version={self.current_param_version}, "
                     f"trainer_version={current_trainer_version}, diff={version_diff}"
@@ -371,25 +371,25 @@ def _should_pause_generation(self) -> bool:
                 return True
 
             if queue_size >= self.max_queue_size:
-                print(f"[ROLLOUTER] Should pause due to full queue: size={queue_size}, max={self.max_queue_size}")
+                print(f"[FullyAsyncRollouter] Should pause due to full queue: size={queue_size}, max={self.max_queue_size}")
                 return True
 
             if self.train_step_samples >= self.max_required_samples:
-                print(f"[ROLLOUTER] Should pause due to step_generated_samples >= max_required_samples: "
+                print(f"[FullyAsyncRollouter] Should pause due to step_generated_samples >= max_required_samples: "
                       f"self.step_generated_samples={self.train_step_samples}, max={self.max_required_samples}")
                 return True
 
             return False
 
         except Exception as e:
-            print(f"[ROLLOUTER] Error checking pause conditions: {e}")
+            print(f"[FullyAsyncRollouter] Error checking pause conditions: {e}")
             return True
 
     def pause(self) -> bool:
         """pause rollout
         TODO integrated Partial Rollout
         """
-        print(f"[ROLLOUTER] pause")
+        print(f"[FullyAsyncRollouter] pause")
         with self.lock:
             if not self.running:
                 return False
@@ -404,7 +404,7 @@ def resume(self) -> bool:
         """resume rollout
         TODO integrated Partial Rollout
         """
-        print(f"[ROLLOUTER] resume")
+        print(f"[FullyAsyncRollouter] resume")
         with self.lock:
             if not self.running:
                 return False
diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py
index 418ab024d0a..70f30a180f8 100644
--- a/recipe/fully_async_policy/fully_async_trainer.py
+++ b/recipe/fully_async_policy/fully_async_trainer.py
@@ -135,15 +135,17 @@ def _get_samples_from_queue(self) -> tuple[None, None] | tuple[int, Any]:
 
         # Get samples from queue
         consumer_start = time.time()
-        queue_samples = self.message_queue_client.get_samples(min_batch_count=required_samples)
+        queue_samples, queue_len = self.message_queue_client.get_samples(min_batch_count=required_samples)
         consumer_end = time.time()
 
         if not queue_samples or len(queue_samples) == 0:
             logger.warning("required_samples is empty")
             return None, None
 
-        print(f"[FullyAsyncTrainer] Retrieved {len(queue_samples)} samples from queue."
-              f"wait time {consumer_end - consumer_start:.2f} seconds.")
+        print(f"[FullyAsyncTrainer] Retrieved {len(queue_samples)} samples from queue. "
+              f"wait time {consumer_end - consumer_start:.2f} seconds. "
+              f"queue len {queue_len}. "
+              )
 
         queue_samples = [ray.cloudpickle.loads(x) for x in queue_samples]
         # Assemble batch
@@ -289,34 +291,34 @@ def fit(self):
                     if batch is None:
                         break
 
-                # # 更新统计信息
-                #     self.processed_samples += len(batch) if isinstance(batch, list) else 1
-                #
-                #     # 从meta_info中获取参数版本信息
-                #     if hasattr(batch, "meta_info") and batch.meta_info:
-                #         rollout_param_versions = batch.meta_info.get("rollout_param_versions", [])
-                #         if rollout_param_versions:
-                #             # 统计陈旧样本
-                #             stale_count = sum(1 for v in rollout_param_versions if self.current_param_version - v > 1)
-                #             self.stale_samples_processed += stale_count
-                #
-                #         # 添加新鲜度指标到metrics
-                #         if rollout_param_versions:
-                #             param_version_diversity = batch.meta_info.get("param_version_diversity", 0)
-                #             avg_sample_age = batch.meta_info.get("avg_sample_age", 0)
-                #
-                #             metrics.update(
-                #                 {
-                #                     "freshness/param_version_diversity": param_version_diversity,
-                #                     "freshness/avg_sample_age": avg_sample_age,
-                #                     "freshness/stale_samples_ratio": stale_count / len(rollout_param_versions)
-                #                     if rollout_param_versions
-                #                     else 0,
-                #                     "statistics/processed_samples": self.processed_samples,
-                #                     "statistics/stale_samples_processed": self.stale_samples_processed,
-                #                     "statistics/current_param_version": self.current_param_version,
-                #                 }
-                #             )
+                # 更新统计信息
+                    self.processed_samples += len(batch) if isinstance(batch, list) else 1
+
+                    # 从meta_info中获取参数版本信息
+                    if hasattr(batch, "meta_info") and batch.meta_info:
+                        rollout_param_versions = batch.meta_info.get("rollout_param_versions", [])
+                        if rollout_param_versions:
+                            # 统计陈旧样本
+                            stale_count = sum(1 for v in rollout_param_versions if self.current_param_version - v > 1)
+                            self.stale_samples_processed += stale_count
+
+                        # 添加新鲜度指标到metrics
+                        if rollout_param_versions:
+                            param_version_diversity = batch.meta_info.get("param_version_diversity", 0)
+                            avg_sample_age = batch.meta_info.get("avg_sample_age", 0)
+
+                            metrics.update(
+                                {
+                                    "freshness/param_version_diversity": param_version_diversity,
+                                    "freshness/avg_sample_age": avg_sample_age,
+                                    "freshness/stale_samples_ratio": stale_count / len(rollout_param_versions)
+                                    if rollout_param_versions
+                                    else 0,
+                                    "statistics/processed_samples": self.processed_samples,
+                                    "statistics/stale_samples_processed": self.stale_samples_processed,
+                                    "statistics/current_param_version": self.current_param_version,
+                                }
+                            )
                 batch, reward_extra_infos_dict = self._process_batch_common(batch, metrics, timing_raw)
                 self._log_rollout(batch, reward_extra_infos_dict, timing_raw)
                 self._check_save_checkpoint(is_last_step, timing_raw)
@@ -352,10 +354,6 @@ def _trigger_parameter_sync_after_step(self):
             f"[FullyAsyncTrainer] Triggering parameter sync after "
             f"training step {self.global_steps}, version: {self.current_param_version}"
         )
-        print(
-            f"[FullyAsyncTrainer] Triggering parameter sync"
-            f" after training step {self.global_steps}, version: {self.current_param_version}"
-        )
         ray.get(self.param_synchronizer.sync_weights.remote(self.current_param_version))
 
     def _compute_sample_freshness_metrics(self, batch_samples: list[QueueSample]) -> dict:
diff --git a/recipe/fully_async_policy/message_queue.py b/recipe/fully_async_policy/message_queue.py
index 089a703f924..bae34c84e47 100644
--- a/recipe/fully_async_policy/message_queue.py
+++ b/recipe/fully_async_policy/message_queue.py
@@ -102,7 +102,7 @@ def put_sample(self, sample: Any, param_version: int) -> bool:
 
             return True
 
-    def get_samples(self, min_batch_count: int = 1) -> list[Any]:
+    def get_samples(self, min_batch_count: int = 1) -> tuple[list[Any], int]:
         """
         Get batch samples from the queue, wait until enough samples are available
 
@@ -136,7 +136,7 @@ def get_samples(self, min_batch_count: int = 1) -> list[Any]:
                         samples.append(data)
 
             self.total_consumed += len(samples)
-            return samples
+            return samples, len(self.queue)
 
     def update_param_version(self, version: int):
         """Update current parameter version"""
@@ -217,7 +217,7 @@ def put_sample(self, sample: Any, param_version: int) -> bool:
         """Put batch into queue"""
         return ray.get(self.queue_actor.put_sample.remote(sample, param_version))
 
-    def get_samples(self, min_batch_count: int = 1) -> list[Any]:
+    def get_samples(self, min_batch_count: int = 1) -> tuple[list[Any], int]:
         """Get batch from queue, wait until enough samples are available"""
         return ray.get(self.queue_actor.get_samples.remote(min_batch_count))
 
diff --git a/tests/special_e2e/run_fully_async_policy.sh b/tests/special_e2e/run_fully_async_policy.sh
index c95476e898a..27c033abc1d 100644
--- a/tests/special_e2e/run_fully_async_policy.sh
+++ b/tests/special_e2e/run_fully_async_policy.sh
@@ -51,13 +51,6 @@ n_gpus_training=$((NUM_GPUS - n_gpus_rollout))
 
 # Async training specific configurations
 staleness_threshold=3
-min_batch_count=1
-batch_timeout=30.0
-generation_timeout=30.0
-batch_generation_interval=0.1
-max_sync_retries=3
-sync_timeout=30.0
-sync_retry_delay=1.0
 
 exp_name="$(basename "${MODEL_ID,,}")-fully-async-policy-${ACTOR_STRATEGY}-minimal"
 
@@ -120,11 +113,9 @@ common_params=(
     rollout.nnodes=1
     rollout.n_gpus_per_node=${n_gpus_rollout}
     rollout.total_rollout_steps=10
-    rollout.total_epochs=10
+    rollout.total_epochs=2
     # Fully async specific configurations
     async_training.staleness_threshold=${staleness_threshold}
-    async_training.sync_timeout=${sync_timeout}
-    async_training.sync_retry_delay=${sync_retry_delay}
 )
 
 if [ "${ACTOR_STRATEGY}" == "fsdp2" ]; then

From bd7520703c411b36ca9697e5aa677ad028d79b38 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Tue, 12 Aug 2025 18:45:05 +0800
Subject: [PATCH 039/182] sync weight time

---
 .../fully_async_rollouter.py                  | 27 ++++++++++---------
 recipe/fully_async_policy/message_queue.py    |  6 ++---
 recipe/fully_async_policy/param_sync.py       |  7 ++++-
 3 files changed, 23 insertions(+), 17 deletions(-)

diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py
index 97d0f627eb8..81fee5c6074 100644
--- a/recipe/fully_async_policy/fully_async_rollouter.py
+++ b/recipe/fully_async_policy/fully_async_rollouter.py
@@ -282,13 +282,14 @@ def _generation_loop(self):
             is_last_step = self.global_steps >= self.total_rollout_steps
 
             # generate a batch
-            with marked_timer("gen", timing_raw, color="red"):
-                if not self.async_rollout_mode:
-                    gen_batch_output = self.actor_rollout_wg.generate_sequences(gen_batch)
-                else:
-                    gen_batch_output = self.async_rollout_manager.generate_sequences(gen_batch)
-                timing_raw.update(gen_batch_output.meta_info["timing"])
-                gen_batch_output.meta_info.pop("timing", None)
+            with self.lock:
+                with marked_timer("gen", timing_raw, color="red"):
+                    if not self.async_rollout_mode:
+                        gen_batch_output = self.actor_rollout_wg.generate_sequences(gen_batch)
+                    else:
+                        gen_batch_output = self.async_rollout_manager.generate_sequences(gen_batch)
+                    timing_raw.update(gen_batch_output.meta_info["timing"])
+                    gen_batch_output.meta_info.pop("timing", None)
 
             if gen_batch_output is not None:
                 # prepare rollout metadata
@@ -332,6 +333,10 @@ def _generation_loop(self):
         )
 
     def _monitor_loop(self):
+        """
+        Function 1: Log information output
+        Function 2: Trigger rollout recovery
+        """
         last_stats_time = time.time()
         stats_interval = 30.0
         check_interval = 5.0
@@ -346,11 +351,7 @@ def _monitor_loop(self):
                 print(f"[FullyAsyncRollouter] {self.get_statistics()}")
                 last_stats_time = current_time
             if not self._should_pause_generation():
-                with self.lock:
-                    if self.paused:
-                        self.paused = False
-                        self.condition.notify_all()
-                        print(f"[FullyAsyncRollouter] Generation resumed")
+                self.resume()
 
     def _should_pause_generation(self) -> bool:
         """Determine whether the build should be paused"""
@@ -413,7 +414,7 @@ def resume(self) -> bool:
                 return True
 
             self.paused = False
-            self.condition.notify_all()
+            self.actor_rollout_wg.resume()
             return True
 
     def get_statistics(self) -> dict:
diff --git a/recipe/fully_async_policy/message_queue.py b/recipe/fully_async_policy/message_queue.py
index bae34c84e47..e86c006106e 100644
--- a/recipe/fully_async_policy/message_queue.py
+++ b/recipe/fully_async_policy/message_queue.py
@@ -117,12 +117,12 @@ def get_samples(self, min_batch_count: int = 1) -> tuple[list[Any], int]:
             while len(self.queue) < min_batch_count and self.running:
                 print(f"[MessageQueue] consumer_condition {len(self.queue)}")
                 if len(self.queue) > 0 and self.queue[-1] is None:
-                    return []
+                    return [], len(self.queue)
                 self.consumer_condition.wait()
 
             # If queue is closed and doesn't have enough samples, return empty list
             if not self.running and len(self.queue) < min_batch_count:
-                return []
+                return [], len(self.queue)
 
             # Get specified number of samples
             batch_count = min(min_batch_count, len(self.queue))
@@ -131,7 +131,7 @@ def get_samples(self, min_batch_count: int = 1) -> tuple[list[Any], int]:
                 if self.queue:
                     data = self.queue.popleft()
                     if data is None:
-                        return []
+                        return [], len(self.queue)
                     else:
                         samples.append(data)
 
diff --git a/recipe/fully_async_policy/param_sync.py b/recipe/fully_async_policy/param_sync.py
index 3de781959ab..ccf62462264 100644
--- a/recipe/fully_async_policy/param_sync.py
+++ b/recipe/fully_async_policy/param_sync.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import logging
+import time
 
 import ray
 from ray.util.collective import collective
@@ -71,6 +72,8 @@ def _init_sync_group(self):
         )
 
     def sync_weights(self, version):
+        start_time = time.time()
+
         self.current_version = version
         print(f"[ParameterSynchronizer] Starting weight synchronization (version {self.current_version})...")
 
@@ -86,4 +89,6 @@ def sync_weights(self, version):
         # Update rollout version
         ray.get(self.rollouter.update_param_version.remote(version))
         ray.get(self.rollouter.resume.remote())
-        print("[ParameterSynchronizer] sync_weights success")
+        end_time = time.time()
+
+        print(f"[ParameterSynchronizer] sync_weights success. cost {end_time - start_time} seconds")

From 57b93b7be195d4c7c9d8e100706f64db2075cb58 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Wed, 13 Aug 2025 10:11:21 +0800
Subject: [PATCH 040/182] total batch to mini batch

---
 .../dapo_7b_math_fsdp2_4_12.sh                | 148 ++++++++++++++++++
 .../fully_async_rollouter.py                  |   2 +-
 recipe/fully_async_policy/param_sync.py       |   2 +-
 3 files changed, 150 insertions(+), 2 deletions(-)
 create mode 100644 recipe/fully_async_policy/dapo_7b_math_fsdp2_4_12.sh

diff --git a/recipe/fully_async_policy/dapo_7b_math_fsdp2_4_12.sh b/recipe/fully_async_policy/dapo_7b_math_fsdp2_4_12.sh
new file mode 100644
index 00000000000..d2f9fa2d6f0
--- /dev/null
+++ b/recipe/fully_async_policy/dapo_7b_math_fsdp2_4_12.sh
@@ -0,0 +1,148 @@
+#!/usr/bin/env bash
+set -xeuo pipefail
+
+project_name='DAPO'
+exp_name='DAPO-Qwen2.5-7b-MATH-0527a1-fsdp2-one-step-off-4-12'
+
+# Ray
+# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
+# WORKING_DIR=${WORKING_DIR:-"${PWD}"}
+# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
+# Paths
+RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
+# very important! please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface
+MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen2.5-Math-7B"}
+CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"}
+TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"}
+TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"}
+
+MODEL_PATH=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/modelscope/hub/models/Qwen/Qwen2___5-Math-7B
+CKPTS_DIR=./ckpts/${project_name}/${exp_name}
+TRAIN_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/dapo-math-17k.parquet
+TEST_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/aime-2024.parquet
+
+
+adv_estimator=grpo
+
+use_kl_in_reward=False
+kl_coef=0.0
+use_kl_loss=False
+kl_loss_coef=0.0
+
+clip_ratio_low=0.2
+clip_ratio_high=0.28
+
+max_prompt_length=$((1024 * 2))
+max_response_length=$((1024 * 8))
+enable_overlong_buffer=True
+overlong_buffer_len=$((1024 * 4))
+overlong_penalty_factor=1.0
+
+loss_agg_mode="token-mean"
+
+train_prompt_bsz=4
+gen_prompt_bsz=1
+n_resp_per_prompt=16
+train_prompt_mini_bsz=32
+train_sync_weight_steps=64
+
+# Algorithm
+temperature=1.0
+top_p=1.0
+top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
+val_top_p=0.7
+
+# Performance Related Parameter
+use_dynamic_bsz=True
+actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
+infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
+ref_offload=True
+actor_offload=False
+gen_tp=1
+sp_size=1
+fsdp_size=2
+
+staleness_threshold=3
+
+NNODES=${NNODES:-1}
+NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
+
+n_gpus_rollout=4
+n_gpus_training=$((NGPUS_PER_NODE - n_gpus_rollout))
+
+/home/hadoop-djst-algoplat/miniconda3/bin/python -m recipe.fully_async_policy.fully_async_main \
+    data.train_files="${TRAIN_FILE}" \
+    data.val_files="${TEST_FILE}" \
+    data.prompt_key=prompt \
+    data.truncation='left' \
+    data.max_prompt_length=${max_prompt_length} \
+    data.max_response_length=${max_response_length} \
+    data.train_batch_size=${train_prompt_bsz} \
+    data.gen_batch_size=${gen_prompt_bsz} \
+    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
+    algorithm.adv_estimator=${adv_estimator} \
+    algorithm.use_kl_in_reward=${use_kl_in_reward} \
+    algorithm.kl_ctrl.kl_coef=${kl_coef} \
+    actor_rollout_ref.actor.strategy=fsdp2 \
+    critic.strategy=fsdp2 \
+    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
+    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
+    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
+    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
+    actor_rollout_ref.actor.clip_ratio_c=10.0 \
+    actor_rollout_ref.model.use_remove_padding=True \
+    actor_rollout_ref.hybrid_engine=False \
+    +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \
+    actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
+    actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+    actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+    actor_rollout_ref.model.path="${MODEL_PATH}" \
+    actor_rollout_ref.actor.optim.lr=1e-6 \
+    actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
+    actor_rollout_ref.actor.optim.weight_decay=0.1 \
+    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
+    actor_rollout_ref.actor.fsdp_config.param_offload=${actor_offload} \
+    actor_rollout_ref.actor.fsdp_config.optimizer_offload=${actor_offload} \
+    actor_rollout_ref.actor.entropy_coeff=0 \
+    actor_rollout_ref.actor.grad_clip=1.0 \
+    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
+    actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \
+    actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
+    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
+    actor_rollout_ref.rollout.enable_chunked_prefill=True \
+    actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
+    actor_rollout_ref.rollout.temperature=${temperature} \
+    actor_rollout_ref.rollout.top_p=${top_p} \
+    actor_rollout_ref.rollout.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
+    actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
+    actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
+    actor_rollout_ref.rollout.val_kwargs.n=1 \
+    actor_rollout_ref.ref.fsdp_config.param_offload=${ref_offload} \
+    actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \
+    actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \
+    reward_model.reward_manager=dapo \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
+    +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
+    trainer.logger=['console','tensorboard'] \
+    trainer.project_name="${project_name}" \
+    trainer.experiment_name="${exp_name}" \
+    trainer.val_before_train=True \
+    trainer.test_freq=10 \
+    trainer.save_freq=-1 \
+    trainer.default_local_dir="${CKPTS_DIR}" \
+    trainer.resume_mode=auto \
+    trainer.nnodes="${NNODES}" \
+    trainer.n_gpus_per_node="${n_gpus_training}" \
+    rollout.nnodes="${NNODES}" \
+    rollout.n_gpus_per_node="${n_gpus_rollout}" \
+    rollout.total_rollout_steps=100 \
+    rollout.total_epochs=2 \
+    async_training.staleness_threshold=${staleness_threshold}
diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py
index 81fee5c6074..e3bb3e0652b 100644
--- a/recipe/fully_async_policy/fully_async_rollouter.py
+++ b/recipe/fully_async_policy/fully_async_rollouter.py
@@ -414,7 +414,7 @@ def resume(self) -> bool:
                 return True
 
             self.paused = False
-            self.actor_rollout_wg.resume()
+            self.condition.notify_all()
             return True
 
     def get_statistics(self) -> dict:
diff --git a/recipe/fully_async_policy/param_sync.py b/recipe/fully_async_policy/param_sync.py
index ccf62462264..7e40e755a12 100644
--- a/recipe/fully_async_policy/param_sync.py
+++ b/recipe/fully_async_policy/param_sync.py
@@ -91,4 +91,4 @@ def sync_weights(self, version):
         ray.get(self.rollouter.resume.remote())
         end_time = time.time()
 
-        print(f"[ParameterSynchronizer] sync_weights success. cost {end_time - start_time} seconds")
+        print(f"[ParameterSynchronizer] sync_weights success. cost {end_time - start_time:.2f} seconds")

From aeb4056f611398a72be46d782b4016c61e00d4a6 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Wed, 13 Aug 2025 15:35:21 +0800
Subject: [PATCH 041/182] StreamRL batch

---
 recipe/fully_async_policy/dapo_7b_math_fsdp2_4_12.sh | 4 ++--
 recipe/fully_async_policy/fully_async_rollouter.py   | 6 ++++++
 recipe/fully_async_policy/fully_async_trainer.py     | 6 ++++--
 3 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/recipe/fully_async_policy/dapo_7b_math_fsdp2_4_12.sh b/recipe/fully_async_policy/dapo_7b_math_fsdp2_4_12.sh
index d2f9fa2d6f0..5c2ac5e6017 100644
--- a/recipe/fully_async_policy/dapo_7b_math_fsdp2_4_12.sh
+++ b/recipe/fully_async_policy/dapo_7b_math_fsdp2_4_12.sh
@@ -40,8 +40,8 @@ overlong_penalty_factor=1.0
 
 loss_agg_mode="token-mean"
 
-train_prompt_bsz=4
-gen_prompt_bsz=1
+train_prompt_bsz=2
+gen_prompt_bsz=4
 n_resp_per_prompt=16
 train_prompt_mini_bsz=32
 train_sync_weight_steps=64
diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py
index e3bb3e0652b..0c2574e5f06 100644
--- a/recipe/fully_async_policy/fully_async_rollouter.py
+++ b/recipe/fully_async_policy/fully_async_rollouter.py
@@ -283,6 +283,7 @@ def _generation_loop(self):
 
             # generate a batch
             with self.lock:
+                start_time = time.time()
                 with marked_timer("gen", timing_raw, color="red"):
                     if not self.async_rollout_mode:
                         gen_batch_output = self.actor_rollout_wg.generate_sequences(gen_batch)
@@ -290,6 +291,8 @@ def _generation_loop(self):
                         gen_batch_output = self.async_rollout_manager.generate_sequences(gen_batch)
                     timing_raw.update(gen_batch_output.meta_info["timing"])
                     gen_batch_output.meta_info.pop("timing", None)
+                end_time = time.time()
+                print(f"[FullyAsyncRollouter] rollout time {end_time - start_time:.2f} seconds")
 
             if gen_batch_output is not None:
                 # prepare rollout metadata
@@ -300,6 +303,7 @@ def _generation_loop(self):
                 }
                 batch = self._post_generate_batch(batch, gen_batch_output, metrics)
 
+                start_time = time.time()
                 for sample in batch:
                     # for sample in samples:
                     queue_sample = QueueSample(
@@ -316,6 +320,8 @@ def _generation_loop(self):
                             self.train_step_samples += 1
                         else:
                             self.dropped_stale_samples += 1
+                end_time = time.time()
+                print(f"[FullyAsyncRollouter] mq push time {end_time - start_time:.2f} seconds")
 
             self.global_steps += 1
 
diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py
index 70f30a180f8..4ee8fa52332 100644
--- a/recipe/fully_async_policy/fully_async_trainer.py
+++ b/recipe/fully_async_policy/fully_async_trainer.py
@@ -121,7 +121,6 @@ def _get_samples_from_queue(self) -> tuple[None, None] | tuple[int, Any]:
         Returns:
             tuple: (epoch, batch_dict, gen_batch_output)
         """
-
         # Calculate the number of samples needed
         n_responses_per_prompt = self.config.actor_rollout_ref.rollout.n
         batch_size = self.config.data.train_batch_size
@@ -165,6 +164,8 @@ def _assemble_gen_batch_output_from_queue_samples(self, queue_samples: list[Queu
         Returns:
             DataProto: Assembled gen_batch_output
         """
+        start_time = time.time()
+
         import numpy as np
 
         from verl.protocol import DataProto
@@ -214,7 +215,8 @@ def _assemble_gen_batch_output_from_queue_samples(self, queue_samples: list[Queu
             "avg_sample_age": np.mean([time.time() - ts for ts in sample_timestamps]),
         }
 
-        print(f"[FullyAsyncTrainer] {meta_info}")
+        end_time = time.time()
+        print(f"[FullyAsyncTrainer] {meta_info} time elapsed: {end_time - start_time:.2f} seconds")
 
         return batch
 

From 6c9d615e3a8fc47828213bcee9747b00069f255f Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Thu, 14 Aug 2025 03:58:55 +0800
Subject: [PATCH 042/182] stream rollout

---
 recipe/fully_async_policy/fully_async_main.py |   9 +-
 .../fully_async_rollouter.py                  | 449 ++++++++++++------
 .../fully_async_policy/fully_async_trainer.py | 143 ++++--
 recipe/fully_async_policy/message_queue.py    |  24 +
 .../simple_streaming_demo.py                  | 176 +++++++
 recipe/one_step_off_policy/ray_trainer.py     |  28 +-
 verl/experimental/agent_loop/agent_loop.py    | 245 +++++++---
 verl/trainer/main_ppo.py                      |   2 +-
 verl/trainer/ppo/ray_trainer.py               |   1 -
 9 files changed, 797 insertions(+), 280 deletions(-)
 create mode 100644 recipe/fully_async_policy/simple_streaming_demo.py

diff --git a/recipe/fully_async_policy/fully_async_main.py b/recipe/fully_async_policy/fully_async_main.py
index 163b2420381..179929f242a 100644
--- a/recipe/fully_async_policy/fully_async_main.py
+++ b/recipe/fully_async_policy/fully_async_main.py
@@ -187,10 +187,11 @@ def _initialize_components(self, config) -> None:
         self.components["reward_fn"] = reward_fn
         self.components["val_reward_fn"] = val_reward_fn
 
-        self.max_queue_size = ((config.async_training.staleness_threshold + 1)
-                               * config.data.train_batch_size
-                               * config.actor_rollout_ref.rollout.n
-                               ) * 10  # x 10 avoid deadlock
+        self.max_queue_size = (
+            (config.async_training.staleness_threshold + 1)
+            * config.data.train_batch_size
+            * config.actor_rollout_ref.rollout.n
+        ) * 10  # x 10 avoid deadlock
         print("[ASYNC MAIN] Creating MessageQueue...")
         message_queue = MessageQueue.remote(config, self.max_queue_size)
         message_queue_client = MessageQueueClient(message_queue)
diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py
index 0c2574e5f06..ae74cc838b1 100644
--- a/recipe/fully_async_policy/fully_async_rollouter.py
+++ b/recipe/fully_async_policy/fully_async_rollouter.py
@@ -11,9 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import threading
+import asyncio
 import time
-from concurrent.futures import ThreadPoolExecutor
 from pprint import pprint
 
 import ray
@@ -22,7 +21,6 @@
 from recipe.fully_async_policy.message_queue import MessageQueueClient, QueueSample
 from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup
 from verl.trainer.ppo.ray_trainer import RayPPOTrainer, ResourcePoolManager, Role, WorkerType
-from verl.utils.debug import marked_timer
 from verl.utils.tracking import ValidationGenerationsLogger
 
 
@@ -35,17 +33,17 @@ class FullyAsyncRollouter(RayPPOTrainer):
     """
 
     def __init__(
-            self,
-            config,
-            tokenizer,
-            role_worker_mapping: dict[Role, WorkerType],
-            resource_pool_manager: ResourcePoolManager,
-            ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup,
-            processor=None,
-            reward_fn=None,
-            val_reward_fn=None,
-            device_name=None,
-            max_queue_size=1000,
+        self,
+        config,
+        tokenizer,
+        role_worker_mapping: dict[Role, WorkerType],
+        resource_pool_manager: ResourcePoolManager,
+        ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup,
+        processor=None,
+        reward_fn=None,
+        val_reward_fn=None,
+        device_name=None,
+        max_queue_size=1000,
     ):
         # Store the tokenizer for text processing
         self.tokenizer = tokenizer
@@ -72,7 +70,7 @@ def __init__(
         self.use_reference_policy = False
         self.use_rm = False
 
-        print(f"[FullyAsyncRollouter] Creating datasets...")
+        print("[FullyAsyncRollouter] Creating datasets...")
         from verl.trainer.main_ppo import create_rl_dataset, create_rl_sampler
         from verl.utils.dataset.rl_dataset import collate_fn
 
@@ -82,6 +80,9 @@ def __init__(
 
         self._validate_config()
         print(f"[FullyAsyncRollouter] Rollouter _create_dataloader...\n{train_dataset}\n{val_dataset}")
+
+        assert self.config.data.gen_batch_size == 1, "gen_batch_size must be one"
+
         self._create_dataloader(train_dataset, val_dataset, collate_fn, train_sampler)
 
         total_rollout_steps = len(self.train_dataloader) * self.config.trainer.total_epochs
@@ -119,11 +120,9 @@ def __init__(
         # Concurrency control
         self.running = False
         self.paused = False
-        self.generation_thread = None
-        self.monitor_thread = None
-        self.thread_executor = ThreadPoolExecutor(max_workers=2)
-        self.lock = threading.RLock()
-        self.condition = threading.Condition(self.lock)
+        # Initialize async locks directly - asyncio.Lock() creation is synchronous
+        self.lock = asyncio.Lock()
+        self.condition = asyncio.Condition(self.lock)
 
         # Pause/resume statistics
         self.total_pause_time = 0.0
@@ -135,23 +134,34 @@ def __init__(
         # queue size
         self.max_queue_size = max_queue_size
 
-    def set_message_queue_client(self, message_queue_client: MessageQueueClient):
+        self.async_rollout_manager = None
+
+        # 流式处理相关配置
+        self.max_concurrent_samples = async_config.get("max_concurrent_samples", 512)  # 最大并发处理样本数
+
+        # 流式处理统计
+        self.max_processing_time = 0.0  # 最长处理时间
+        self.processed_sample_count = 0  # 已处理的样本计数
+        self.active_sample_count = 0  # 当前正在处理的样本数
+        self.queue_full_pause_count = 0  # 队列满导致的暂停次数
+
+    async def set_message_queue_client(self, message_queue_client: MessageQueueClient):
         """Set message queue client"""
-        with self.lock:
+        async with self.lock:
             self.message_queue_client = message_queue_client
 
-    def set_parameter_synchronizer(self, param_synchronizer):
+    async def set_parameter_synchronizer(self, param_synchronizer):
         """Set parameter synchronizer"""
-        with self.lock:
+        async with self.lock:
             self.param_synchronizer = param_synchronizer
 
     def get_rollout_wg(self):
         """Get rollout worker group"""
         return self.rollout_wg
 
-    def update_param_version(self, version: int):
+    async def update_param_version(self, version: int):
         """Update current parameter version"""
-        with self.lock:
+        async with self.lock:
             old_version = self.current_param_version
             self.current_param_version = version
             # every time param change, reset train_step_samples
@@ -188,46 +198,163 @@ def _create_continuous_iterator(self):
             for batch_dict in iterator:
                 yield epoch, batch_dict
 
-    def fit(self):
-        print(f"[FullyAsyncRollouter] Starting FullyAsyncRollouter...")
+    def _init_async_rollout_manager(self):
+        # create async rollout manager and request scheduler
+        assert self.config.actor_rollout_ref.rollout.mode == "async"
+        from verl.experimental.agent_loop import AgentLoopManager
 
-        if self.message_queue_client is None:
-            raise ValueError("MessageQueue client not set. Call set_message_queue_client() first.")
-        if self.param_synchronizer is None:
-            raise ValueError("param_synchronizer client not set. Call set_parameter_synchronizer() first.")
+        self.async_rollout_mode = True
+        self.async_rollout_manager = AgentLoopManager(
+            config=self.config,
+            worker_group=self.rollout_wg,
+        )
 
-        # 设置运行状态
-        with self.lock:
-            self.running = True
-            self.paused = False
+    # 添加样本到待处理队列的协程
+    async def _feed_samples(self):
+        continuous_iterator = self._create_continuous_iterator()
+        sample_count = 0
+        for epoch, batch_dict in continuous_iterator:
+            # 准备样本数据
+            sample_id = f"sample_{epoch}_{sample_count}"
+            batch, gen_batch = self._prepare_generate_batch(batch_dict)
 
-        self.generation_thread = threading.Thread(target=self._generation_loop, daemon=True)
-        self.generation_thread.start()
+            sample_data = {"sample_id": sample_id, "gen_batch": gen_batch, "epoch": epoch, "timestamp": time.time()}
 
-        self.monitor_thread = threading.Thread(target=self._monitor_loop, daemon=True)
-        self.monitor_thread.start()
+            await self.pending_samples_queue.put(sample_data)
+            sample_count += 1
 
-        self.generation_thread.join()
-        self.monitor_thread.join()
+            # 检查是否到达最后一步
+            if self.global_steps >= self.total_rollout_steps:
+                print("[FullyAsyncRollouter] 达到最大步数，停止添加新样本")
+                break
 
-        print(f"[FullyAsyncRollouter] Rollouter fit completed")
+            self.global_steps += 1
 
-    def _generation_loop(self):
-        """
+        # 发送结束信号
+        await self.pending_samples_queue.put("DONE")
+
+    async def _submit_worker(self):
+        """流式处理工作协程 - 逐个样本立即提交处理，不等待批次"""
+        active_tasks = set()
 
-        Main Generation Loop
+        while True:
+            # 获取待处理样本
+            sample_data = await self.pending_samples_queue.get()
+
+            if sample_data == "DONE":
+                print("收到结束信号，等待剩余任务完成...")
+                # 等待所有活动任务完成
+                if active_tasks:
+                    await asyncio.gather(*active_tasks, return_exceptions=True)
+                break
+
+            # 检查并发数是否超限
+            while len(active_tasks) >= self.max_concurrent_samples:
+                print(f"达到最大并发数 {self.max_concurrent_samples}，等待任务完成...")
+                # 等待至少一个任务完成
+                done_tasks, active_tasks = await asyncio.wait(active_tasks, return_when=asyncio.FIRST_COMPLETED)
+                # 清理已完成的任务
+                for task in done_tasks:
+                    await task
+
+            # 立即提交单个样本处理
+            task = asyncio.create_task(
+                self._process_single_sample_streaming(sample_data), name=f"process_{sample_data['sample_id']}"
+            )
+            active_tasks.add(task)
+
+            # 标记队列任务完成
+            self.pending_samples_queue.task_done()
+
+    async def _process_single_sample_streaming(self, sample_data: dict):
+        """流式处理单个样本"""
+        # 检查是否需要暂停处理
+        if await self._should_pause_generation():
+            print(f"[FullyAsyncRollouter] 暂停处理样本 {sample_data['sample_id']}")
+            # 暂停时重新放回队列
+            await self.pending_samples_queue.put(sample_data)
+            return
+
+        start_time = time.time()
+        # 直接使用AgentLoopManager的单样本异步处理能力
+        agent_loop_output, processing_time = await self.async_rollout_manager.generate_single_sample_async(
+            sample_data["gen_batch"], sample_data["sample_id"]
+        )
+        end_time = time.time()
+
+        # 组装最终结果
+        final_result = {
+            "sample_id": sample_data["sample_id"],
+            "agent_loop_output": agent_loop_output,
+            "processing_time": processing_time,
+            "timestamp": time.time(),
+            "param_version": self.current_param_version,
+            "epoch": sample_data["epoch"],
+        }
+
+        # 立即放入结果队列
+        await self.result_queue.put(final_result)
+
+        async with self.lock:
+            self.processed_sample_count += 1
+            # 更新最大处理时间统计
+            if processing_time > self.max_processing_time:
+                self.max_processing_time = processing_time
+
+        print(
+            f"[FullyAsyncRollouter] 样本 {sample_data['sample_id']} 处理完成，"
+            f"耗时 {processing_time:.2f}s {end_time - start_time:.2f}s"
+        )
 
-        Loop Entry Requirements:
-        1. Running status validation
-        2. Interruption detection
-        3. Freshness validation
-        4. train_step_samples validation
+    async def _consumer_worker(self):
+        """消费者协程，负责从结果队列获取处理结果并放入消息队列"""
+        while True:
+            async with self.lock:
+                if not self.running:
+                    # 如果系统停止但还有结果待处理，继续处理
+                    if self.result_queue.empty():
+                        break
+
+            # 从结果队列获取处理结果
+            result = await self.result_queue.get()
+
+            # 准备rollout metadata
+            rollout_metadata = {
+                "generation_timestamp": result["timestamp"],
+                "rollout_param_version": result["param_version"],
+                "processing_time": result["processing_time"],
+                "epoch": result["epoch"],
+                "agent_loop_metrics": result["agent_loop_output"].metrics.model_dump(),
+            }
 
-        During Sample Generation Process:
-        1. Running status validation
-        2. Interruption detection
-        """
+            # 直接将 AgentLoopOutput 放入消息队列
+            queue_sample = QueueSample(
+                data=result["agent_loop_output"],  # 直接存储 AgentLoopOutput
+                rollout_metadata=rollout_metadata,
+            )
+            success = self.message_queue_client.put_sample(
+                sample=ray.cloudpickle.dumps(queue_sample),
+                param_version=result["param_version"],
+            )
+
+            async with self.lock:
+                if success:
+                    self.total_generated_samples += 1
+                    self.train_step_samples += 1
+                else:
+                    self.dropped_stale_samples += 1
+
+            print(
+                f"[FullyAsyncRollouter] 🔥 消费样本 {result['sample_id']}: "
+                f"{'成功' if success else '失败'}放入到消息队列, "
+                f"处理时间 {result['processing_time']:.2f}s"
+            )
+
+            # 标记结果队列任务完成
+            self.result_queue.task_done()
 
+    async def _streaming_generation_main(self):
+        """流式处理的主入口方法，包含初始化和验证逻辑"""
         from verl.utils.tracking import Tracking
 
         self.logger = Tracking(
@@ -254,82 +381,52 @@ def _generation_loop(self):
 
         # we start from step 1
         self.global_steps += 1
-        last_val_metrics = None
-        self.max_steps_duration = 0
 
-        continuous_iterator = self._create_continuous_iterator()
-        for epoch, batch_dict in continuous_iterator:
-            with self.lock:
-                if not self.running:
-                    break
+        # 确保async_rollout_manager已经初始化
+        if self.async_rollout_manager is None:
+            self._init_async_rollout_manager()
 
-                if self._should_pause_generation():
-                    self.pause()
+        # 启动流式处理循环
+        """流式样本生成主循环 - 优化版本，确保先完成的样本优先进入队列"""
+        print(f"[FullyAsyncRollouter] 启动流式处理模式，最大并发样本数: {self.max_concurrent_samples}")
 
-                while self.paused and self.running:
-                    print(f"[FullyAsyncRollouter] Generation thread paused, waiting...")
-                    self.condition.wait()
+        # 初始化异步队列
+        self.pending_samples_queue = asyncio.Queue(maxsize=self.max_concurrent_samples)
+        self.result_queue = asyncio.Queue()
 
-                if not self.running:
-                    break
+        # 启动流式处理协程和消费者协程
+        self.feed_task = asyncio.create_task(self._feed_samples())
+        self.stream_processor_task = asyncio.create_task(self._submit_worker())
+        self.consumer_task = asyncio.create_task(self._consumer_worker())
+        # 启动样本添加协程
 
-            metrics = {}
-            timing_raw = {}
-
-            with self.lock:
-                batch, gen_batch = self._prepare_generate_batch(batch_dict)
-
-            is_last_step = self.global_steps >= self.total_rollout_steps
-
-            # generate a batch
-            with self.lock:
-                start_time = time.time()
-                with marked_timer("gen", timing_raw, color="red"):
-                    if not self.async_rollout_mode:
-                        gen_batch_output = self.actor_rollout_wg.generate_sequences(gen_batch)
-                    else:
-                        gen_batch_output = self.async_rollout_manager.generate_sequences(gen_batch)
-                    timing_raw.update(gen_batch_output.meta_info["timing"])
-                    gen_batch_output.meta_info.pop("timing", None)
-                end_time = time.time()
-                print(f"[FullyAsyncRollouter] rollout time {end_time - start_time:.2f} seconds")
-
-            if gen_batch_output is not None:
-                # prepare rollout metadata
-                rollout_metadata = {
-                    "timing": timing_raw,
-                    "generation_timestamp": time.time(),
-                    "rollout_param_version": self.current_param_version,
-                }
-                batch = self._post_generate_batch(batch, gen_batch_output, metrics)
-
-                start_time = time.time()
-                for sample in batch:
-                    # for sample in samples:
-                    queue_sample = QueueSample(
-                        data=sample,
-                        rollout_metadata=rollout_metadata,
-                    )
-                    success = self.message_queue_client.put_sample(
-                        sample=ray.cloudpickle.dumps(queue_sample),
-                        param_version=self.current_param_version,
-                    )
-                    with self.lock:
-                        if success:
-                            self.total_generated_samples += 1
-                            self.train_step_samples += 1
-                        else:
-                            self.dropped_stale_samples += 1
-                end_time = time.time()
-                print(f"[FullyAsyncRollouter] mq push time {end_time - start_time:.2f} seconds")
+        try:
+            # 等待样本添加完成
+            await self.feed_task
+            print("[FullyAsyncRollouter] 样本添加完成")
 
-            self.global_steps += 1
+            # 等待流式处理完成
+            await self.stream_processor_task
+            print("[FullyAsyncRollouter] 流式处理完成")
 
-            if is_last_step:
-                pprint(f"[FullyAsyncRollouter] Final validation metrics: {last_val_metrics}")
-                break
+            # 等待结果队列清空
+            await self.result_queue.join()
+            print("[FullyAsyncRollouter] 所有结果处理完成")
+
+        except Exception as e:
+            print(f"[FullyAsyncRollouter] 流式处理异常: {e}")
+
+        finally:
+            # 取消所有任务
+            if self.stream_processor_task:
+                self.stream_processor_task.cancel()
+            if self.consumer_task:
+                self.consumer_task.cancel()
+
+            # 等待任务结束
+            await asyncio.gather(self.stream_processor_task, self.consumer_task, return_exceptions=True)
 
-        with self.lock:
+        async with self.lock:
             self.running = False
 
         # 发送终止信号
@@ -338,34 +435,80 @@ def _generation_loop(self):
             param_version=self.current_param_version,
         )
 
-    def _monitor_loop(self):
+    def fit(self):
+        """Start the async rollouter - entry point that sets up and runs async tasks"""
+        print("[FullyAsyncRollouter] Starting FullyAsyncRollouter...")
+
+        if self.message_queue_client is None:
+            raise ValueError("MessageQueue client not set. Call set_message_queue_client() first.")
+        if self.param_synchronizer is None:
+            raise ValueError("param_synchronizer client not set. Call set_parameter_synchronizer() first.")
+
+        # Run everything in a single async event loop
+        asyncio.run(self._async_fit())
+
+    async def _async_fit(self):
+        """Main async fit method that coordinates all coroutines"""
+        # 设置运行状态
+        async with self.lock:
+            self.running = True
+            self.paused = False
+
+        # 创建主要的异步任务
+        generation_task = asyncio.create_task(self._streaming_generation_main())
+        monitor_task = asyncio.create_task(self._async_monitor_loop())
+
+        try:
+            # 并发运行生成和监控任务
+            await asyncio.gather(generation_task, monitor_task, return_exceptions=True)
+        except Exception as e:
+            print(f"[FullyAsyncRollouter] 异步任务执行出错: {e}")
+        finally:
+            # 清理任务
+            if not generation_task.done():
+                generation_task.cancel()
+            if not monitor_task.done():
+                monitor_task.cancel()
+
+            # 等待任务完成
+            await asyncio.gather(generation_task, monitor_task, return_exceptions=True)
+
+        print("[FullyAsyncRollouter] Rollouter fit completed")
+
+    async def _async_monitor_loop(self):
         """
+        Async coroutine for monitoring:
         Function 1: Log information output
         Function 2: Trigger rollout recovery
         """
         last_stats_time = time.time()
         stats_interval = 30.0
         check_interval = 5.0
+
         while True:
-            with self.lock:
+            async with self.lock:
                 if not self.running:
                     break
-            time.sleep(check_interval)
+
+            await asyncio.sleep(check_interval)
+
             # 定期打印统计信息
             current_time = time.time()
             if current_time - last_stats_time >= stats_interval:
-                print(f"[FullyAsyncRollouter] {self.get_statistics()}")
+                stats = await self.get_statistics()
+                print(f"[FullyAsyncRollouter] {stats}")
                 last_stats_time = current_time
-            if not self._should_pause_generation():
-                self.resume()
 
-    def _should_pause_generation(self) -> bool:
+            if not await self._should_pause_generation():
+                await self.resume()
+
+    async def _should_pause_generation(self) -> bool:
         """Determine whether the build should be paused"""
-        try:
-            queue_stats = self.message_queue_client.get_statistics()
-            queue_size = queue_stats["queue_size"]
-            current_trainer_version = queue_stats["current_param_version"]
+        queue_stats = self.message_queue_client.get_statistics()
+        queue_size = queue_stats["queue_size"]
+        current_trainer_version = queue_stats["current_param_version"]
 
+        async with self.lock:
             version_diff = self.current_param_version - current_trainer_version
 
             if version_diff > self.staleness_threshold:
@@ -378,26 +521,27 @@ def _should_pause_generation(self) -> bool:
                 return True
 
             if queue_size >= self.max_queue_size:
-                print(f"[FullyAsyncRollouter] Should pause due to full queue: size={queue_size}, max={self.max_queue_size}")
+                print(
+                    f"[FullyAsyncRollouter] Should pause due to full queue: "
+                    f"size={queue_size}, max={self.max_queue_size}"
+                )
                 return True
 
             if self.train_step_samples >= self.max_required_samples:
-                print(f"[FullyAsyncRollouter] Should pause due to step_generated_samples >= max_required_samples: "
-                      f"self.step_generated_samples={self.train_step_samples}, max={self.max_required_samples}")
+                print(
+                    f"[FullyAsyncRollouter] Should pause due to step_generated_samples >= max_required_samples: "
+                    f"self.step_generated_samples={self.train_step_samples}, max={self.max_required_samples}"
+                )
                 return True
 
             return False
 
-        except Exception as e:
-            print(f"[FullyAsyncRollouter] Error checking pause conditions: {e}")
-            return True
-
-    def pause(self) -> bool:
+    async def pause(self) -> bool:
         """pause rollout
         TODO integrated Partial Rollout
         """
-        print(f"[FullyAsyncRollouter] pause")
-        with self.lock:
+        print("[FullyAsyncRollouter] pause")
+        async with self.lock:
             if not self.running:
                 return False
 
@@ -407,12 +551,12 @@ def pause(self) -> bool:
             self.paused = True
             return True
 
-    def resume(self) -> bool:
+    async def resume(self) -> bool:
         """resume rollout
         TODO integrated Partial Rollout
         """
-        print(f"[FullyAsyncRollouter] resume")
-        with self.lock:
+        print("[FullyAsyncRollouter] resume")
+        async with self.lock:
             if not self.running:
                 return False
 
@@ -423,8 +567,8 @@ def resume(self) -> bool:
             self.condition.notify_all()
             return True
 
-    def get_statistics(self) -> dict:
-        with self.lock:
+    async def get_statistics(self) -> dict:
+        async with self.lock:
             queue_stats = self.message_queue_client.get_statistics()
             stats = {
                 "is_running": self.running,
@@ -432,7 +576,12 @@ def get_statistics(self) -> dict:
                 "train_step_samples": self.train_step_samples,
                 "dropped_stale_samples": self.dropped_stale_samples,
                 "current_param_version": self.current_param_version,
-                "queue_size": queue_stats['queue_size'],
+                "queue_size": queue_stats["queue_size"],
                 "queue_max_size": self.max_queue_size,
+                "max_concurrent_samples": self.max_concurrent_samples,
+                "max_processing_time": self.max_processing_time,
+                "pending_samples_queue_size": self.pending_samples_queue.qsize(),
+                "result_queue_size": self.result_queue.qsize(),
             }
+
             return stats
diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py
index 4ee8fa52332..d9883aaf33f 100644
--- a/recipe/fully_async_policy/fully_async_trainer.py
+++ b/recipe/fully_async_policy/fully_async_trainer.py
@@ -44,16 +44,16 @@ class FullyAsyncTrainer(RayPPOTrainer):
     """
 
     def __init__(
-            self,
-            config,
-            tokenizer,
-            role_worker_mapping: dict[Role, WorkerType],
-            resource_pool_manager: ResourcePoolManager,
-            ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup,
-            processor=None,
-            reward_fn=None,
-            val_reward_fn=None,
-            device_name=None,
+        self,
+        config,
+        tokenizer,
+        role_worker_mapping: dict[Role, WorkerType],
+        resource_pool_manager: ResourcePoolManager,
+        ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup,
+        processor=None,
+        reward_fn=None,
+        val_reward_fn=None,
+        device_name=None,
     ):
         # Store the tokenizer for text processing
         self.tokenizer = tokenizer
@@ -117,6 +117,7 @@ def get_actor_wg(self):
     def _get_samples_from_queue(self) -> tuple[None, None] | tuple[int, Any]:
         """
         Get samples from message queue and compose gen_batch_output
+        Uses a loop to continuously collect samples until enough are gathered
 
         Returns:
             tuple: (epoch, batch_dict, gen_batch_output)
@@ -132,19 +133,39 @@ def _get_samples_from_queue(self) -> tuple[None, None] | tuple[int, Any]:
             flush=True,
         )
 
-        # Get samples from queue
+        # Collect samples using a simple loop calling get_sample
         consumer_start = time.time()
-        queue_samples, queue_len = self.message_queue_client.get_samples(min_batch_count=required_samples)
+        queue_samples = []
+
+        print(f"[FullyAsyncTrainer] Starting sample collection loop, required={required_samples}")
+
+        while len(queue_samples) < required_samples:
+            # 获取单个样本，会一直等待直到有样本或收到None
+            sample = self.message_queue_client.get_sample()
+
+            if sample is None:
+                # 检测到结束信号（None），立即退出
+                logger.info(
+                    f"Detected termination signal (None), stopping sample collection. "
+                    f"Collected {len(queue_samples)}/{required_samples} samples"
+                )
+                break
+
+            queue_samples.append(sample)
+
+            if len(queue_samples) % 10 == 0 or len(queue_samples) >= required_samples:
+                print(f"[FullyAsyncTrainer] Collected {len(queue_samples)}/{required_samples} samples")
+
         consumer_end = time.time()
 
-        if not queue_samples or len(queue_samples) == 0:
-            logger.warning("required_samples is empty")
+        if not queue_samples or len(queue_samples) < required_samples:
+            logger.warning("not enough samples collected after loop")
             return None, None
 
-        print(f"[FullyAsyncTrainer] Retrieved {len(queue_samples)} samples from queue. "
-              f"wait time {consumer_end - consumer_start:.2f} seconds. "
-              f"queue len {queue_len}. "
-              )
+        print(
+            f"[FullyAsyncTrainer] Loop collection completed: {len(queue_samples)}/{required_samples} samples, "
+            f"total wait time: {consumer_end - consumer_start:.2f} seconds"
+        )
 
         queue_samples = [ray.cloudpickle.loads(x) for x in queue_samples]
         # Assemble batch
@@ -154,12 +175,10 @@ def _get_samples_from_queue(self) -> tuple[None, None] | tuple[int, Any]:
 
     def _assemble_gen_batch_output_from_queue_samples(self, queue_samples: list[QueueSample]):
         """
-        Assemble gen_batch_output from queue samples
+        Assemble gen_batch_output from queue samples containing AgentLoopOutput
 
         Args:
-            queue_samples: List of samples from queue
-            n_responses_per_prompt: Number of responses per prompt
-            batch_size: Batch size
+            queue_samples: List of samples from queue, each containing AgentLoopOutput
 
         Returns:
             DataProto: Assembled gen_batch_output
@@ -168,23 +187,29 @@ def _assemble_gen_batch_output_from_queue_samples(self, queue_samples: list[Queu
 
         import numpy as np
 
-        from verl.protocol import DataProto
-
         if not queue_samples:
             raise ValueError("Empty queue_samples provided for batch assembly")
 
-        print(f"[FullyAsyncTrainer] Assembling batch from {len(queue_samples)} queue samples")
+        print(f"[FullyAsyncTrainer] Assembling batch from {len(queue_samples)} queue samples with AgentLoopOutput")
 
-        # Extract data and metadata from all samples
-        sample_data_list = []
+        # Extract AgentLoopOutput and metadata from all samples
+        agent_loop_outputs = []
         rollout_metadata_list = []
-        timing_info = {}
+        processing_times = []
 
-        for i, sample in enumerate(queue_samples):
-            sample_data_list.append(sample.data)
+        for sample in queue_samples:
+            # sample.data is now AgentLoopOutput
+            agent_loop_outputs.append(sample.data)
             rollout_metadata_list.append(sample.rollout_metadata)
+            processing_times.append(sample.rollout_metadata.get("processing_time", 0))
+
+        # Use the static method to postprocess AgentLoopOutput list into DataProto
+        from verl.experimental.agent_loop.agent_loop import AgentLoopWorker
 
-        batch = DataProto.from_items(sample_data_list)
+        batch = AgentLoopWorker.postprocess_agent_loop_outputs(agent_loop_outputs, self.tokenizer, self.config)
+
+        # Apply _post_generate_batch logic here
+        batch = self._post_generate_batch_for_agent_outputs(batch, agent_loop_outputs)
 
         # Collect timing information and metadata
         param_versions = []
@@ -193,21 +218,10 @@ def _assemble_gen_batch_output_from_queue_samples(self, queue_samples: list[Queu
             # Extract parameter version and timestamp
             param_versions.append(metadata.get("rollout_param_version", 0))
             sample_timestamps.append(metadata.get("generation_timestamp", time.time()))
-            if "timing" in metadata:
-                for timing_key, timing_value in metadata["timing"].items():
-                    if timing_key not in timing_info:
-                        timing_info[timing_key] = []
-                    # if isinstance(timing_value, (int, float)):
-                    #     timing_info[timing_key].append(timing_value)
-        # Calculate average timing
-        avg_timing = {}
-        for key, values in timing_info.items():
-            if values and len(values) > 0:
-                avg_timing[key] = sum(values) / len(values)
 
         # Create meta_info
         meta_info = {
-            "timing": avg_timing,
+            "timing": {"avg_processing_time": np.mean(processing_times) if processing_times else 0},
             "queue_sample_count": len(queue_samples),
             "rollout_param_versions": param_versions,
             "sample_timestamps": sample_timestamps,
@@ -215,8 +229,47 @@ def _assemble_gen_batch_output_from_queue_samples(self, queue_samples: list[Queu
             "avg_sample_age": np.mean([time.time() - ts for ts in sample_timestamps]),
         }
 
+        batch.meta_info.update(meta_info)
+
         end_time = time.time()
-        print(f"[FullyAsyncTrainer] {meta_info} time elapsed: {end_time - start_time:.2f} seconds")
+        print(
+            f"[FullyAsyncTrainer] Assembled batch with meta_info: "
+            f"{meta_info}, time elapsed: {end_time - start_time:.2f} seconds"
+        )
+
+        return batch
+
+    def _post_generate_batch_for_agent_outputs(self, batch, agent_loop_outputs):
+        """
+        Apply _post_generate_batch logic for AgentLoopOutput
+
+        Args:
+            batch: DataProto created from AgentLoopWorker.postprocess_agent_loop_outputs
+            agent_loop_outputs: List of AgentLoopOutput
+
+        Returns:
+            DataProto: Processed batch with additional metadata
+        """
+        import uuid
+
+        import numpy as np
+        import torch
+
+        from verl.trainer.ppo.ray_trainer import compute_response_mask
+
+        # Add UIDs
+        batch.non_tensor_batch["uid"] = np.array([str(uuid.uuid4()) for _ in range(len(batch.batch))], dtype=object)
+
+        # response_mask should already be in batch from AgentLoopWorker.postprocess_agent_loop_outputs
+        if "response_mask" not in batch.batch.keys():
+            batch.batch["response_mask"] = compute_response_mask(batch)
+
+        # Balance the number of valid tokens across DP ranks if needed
+        if self.config.trainer.balance_batch:
+            self._balance_batch(batch, metrics={})
+
+        # compute global_valid tokens
+        batch.meta_info["global_token_num"] = torch.sum(batch.batch["attention_mask"], dim=-1).tolist()
 
         return batch
 
@@ -293,7 +346,7 @@ def fit(self):
                     if batch is None:
                         break
 
-                # 更新统计信息
+                    # 更新统计信息
                     self.processed_samples += len(batch) if isinstance(batch, list) else 1
 
                     # 从meta_info中获取参数版本信息
diff --git a/recipe/fully_async_policy/message_queue.py b/recipe/fully_async_policy/message_queue.py
index e86c006106e..4d1eddee6ae 100644
--- a/recipe/fully_async_policy/message_queue.py
+++ b/recipe/fully_async_policy/message_queue.py
@@ -138,6 +138,26 @@ def get_samples(self, min_batch_count: int = 1) -> tuple[list[Any], int]:
             self.total_consumed += len(samples)
             return samples, len(self.queue)
 
+    def get_sample(self) -> Any | None:
+        """
+        Get a single sample from the queue, wait until one is available
+
+        Returns:
+            Any: Single sample data or None if queue is closed
+        """
+        with self.lock:
+            while len(self.queue) == 0 and self.running:
+                self.consumer_condition.wait()
+
+            # If queue is closed and empty, return None
+            if not self.running and len(self.queue) == 0:
+                return None
+
+            # Get one sample
+            data = self.queue.popleft()
+            self.total_consumed += 1
+            return data
+
     def update_param_version(self, version: int):
         """Update current parameter version"""
         with self.lock:
@@ -221,6 +241,10 @@ def get_samples(self, min_batch_count: int = 1) -> tuple[list[Any], int]:
         """Get batch from queue, wait until enough samples are available"""
         return ray.get(self.queue_actor.get_samples.remote(min_batch_count))
 
+    def get_sample(self) -> Any | None:
+        """Get single sample from queue, wait until one is available"""
+        return ray.get(self.queue_actor.get_sample.remote())
+
     def update_param_version(self, version: int):
         """Update parameter version"""
         ray.get(self.queue_actor.update_param_version.remote(version))
diff --git a/recipe/fully_async_policy/simple_streaming_demo.py b/recipe/fully_async_policy/simple_streaming_demo.py
new file mode 100644
index 00000000000..d3ae0702e3f
--- /dev/null
+++ b/recipe/fully_async_policy/simple_streaming_demo.py
@@ -0,0 +1,176 @@
+# Copyright 2025 Meituan Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+import random
+import time
+
+
+class SimpleStreamingSystem:
+    """简化的流式处理系统演示"""
+
+    def __init__(self, max_concurrent_tasks: int = 4):
+        self.max_concurrent_tasks = max_concurrent_tasks
+        self.data_queue = asyncio.Queue()
+        self.result_queue = asyncio.Queue()
+        self.consumer_count = 0
+
+    # 数据流协程
+    async def data_stream(self):
+        # 添加初始数据
+        # 准备测试数据
+        test_data = [{"id": f"task_{i}", "content": f"数据_{i}"} for i in range(8)]
+        await self.add_data_stream(test_data)
+
+        # 模拟后续数据流
+        await asyncio.sleep(3)
+        print("\n添加第二批数据...")
+        extra_data = [{"id": f"extra_{i}", "content": f"额外数据_{i}"} for i in range(5)]
+        await self.add_data_stream(extra_data)
+
+        # 发送结束信号
+        await asyncio.sleep(1)
+        await self.data_queue.put("DONE")
+        print("发送结束信号")
+
+    async def add_data_stream(self, data_list: list[dict]):
+        """模拟数据流"""
+        print("开始添加数据流...")
+
+        for i, data_item in enumerate(data_list):
+            await self.data_queue.put(data_item)
+            print(f"数据 {data_item['id']} 进入待处理队列")
+
+            # 模拟数据流的间隔
+            if i < len(data_list) - 1:  # 最后一个不等待
+                await asyncio.sleep(0.8)
+
+        print("初始数据流添加完成")
+
+    async def _process_data_async(self, data_item: dict):
+        """异步处理单个数据项"""
+        data_id = data_item["id"]
+        content = data_item["content"]
+
+        # 模拟不同的处理时间（1-3秒）
+        processing_time = random.uniform(1, 3)
+
+        print(f"    开始处理 {data_id}，预计耗时 {processing_time:.1f}s")
+
+        # 异步等待处理完成
+        await asyncio.sleep(processing_time)
+
+        result = {
+            "id": data_id,
+            "processed_content": f"处理后的{content}",
+            "processing_time": round(processing_time, 2),
+            "completed_at": time.time(),
+        }
+
+        # 立即放入结果队列
+        await self.result_queue.put(result)
+        print(f"    {data_id} 处理完成！(耗时 {processing_time:.1f}s) -> 进入结果队列")
+
+    async def _submit_worker(self):
+        """流式提交工作协程"""
+        active_tasks = set()
+
+        print("流式提交器启动...")
+
+        while True:
+            # 获取待处理数据
+            data_item = await self.data_queue.get()
+
+            if data_item == "DONE":
+                print("收到结束信号，等待剩余任务完成...")
+                if active_tasks:
+                    await asyncio.gather(*active_tasks, return_exceptions=True)
+                break
+
+            # 检查并发数限制
+            while len(active_tasks) >= self.max_concurrent_tasks:
+                print(f"达到最大并发数 {self.max_concurrent_tasks}，等待任务完成...")
+                done_tasks, active_tasks = await asyncio.wait(active_tasks, return_when=asyncio.FIRST_COMPLETED)
+
+                # 清理完成的任务
+                for task in done_tasks:
+                    try:
+                        await task
+                        print(f"task 完成 {task}")
+                    except Exception as e:
+                        print(f"任务执行失败: {e}")
+
+            # 立即提交新任务
+            task = asyncio.create_task(self._process_data_async(data_item), name=f"active {data_item}")
+            active_tasks.add(task)
+
+            print(f"提交任务 {data_item['id']}，当前并发数: {len(active_tasks)}")
+
+    async def _consumer_worker(self):
+        """结果消费协程"""
+        print("消费者启动...")
+
+        while True:
+            try:
+                # 从结果队列获取处理结果
+                result = await asyncio.wait_for(self.result_queue.get(), timeout=2.0)
+
+                self.consumer_count += 1
+
+                print(
+                    f"消费 #{self.consumer_count}: {result['id']} "
+                    f"(处理时间 {result['processing_time']}s) - {result['processed_content']}"
+                )
+
+            except asyncio.TimeoutError:
+                print("    消费者等待中...")
+                await asyncio.sleep(0.5)
+
+    async def run_demo(self):
+        """运行演示"""
+        print("=" * 60)
+        print(f"最大并发数: {self.max_concurrent_tasks}")
+        print("=" * 60)
+
+        # 启动核心协程
+        stream_task = asyncio.create_task(self.data_stream())
+        submit_task = asyncio.create_task(self._submit_worker())
+        consumer_task = asyncio.create_task(self._consumer_worker())
+
+        try:
+            # 等待数据流完成
+            await stream_task
+            print("数据流完成")
+
+            # 等待处理完成
+            await submit_task
+            print("所有任务处理完成")
+
+        finally:
+            # 清理
+            submit_task.cancel()
+            consumer_task.cancel()
+            await asyncio.gather(submit_task, consumer_task, return_exceptions=True)
+
+        print(f"\n最终统计: 消费了 {self.consumer_count} 个结果")
+
+
+async def main():
+    """主函数"""
+    system = SimpleStreamingSystem(max_concurrent_tasks=3)
+    await system.run_demo()
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/recipe/one_step_off_policy/ray_trainer.py b/recipe/one_step_off_policy/ray_trainer.py
index 893760965d0..ef8d6d8792e 100644
--- a/recipe/one_step_off_policy/ray_trainer.py
+++ b/recipe/one_step_off_policy/ray_trainer.py
@@ -76,20 +76,20 @@ class OneStepOffRayTrainer(RayPPOTrainer):
     # TODO: support each role have individual ray_worker_group_cls,
     # i.e., support different backend of different role
     def __init__(
-            self,
-            config,
-            tokenizer,
-            role_worker_mapping: dict[Role, WorkerType],
-            resource_pool_manager: ResourcePoolManager,
-            ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup,
-            processor=None,
-            reward_fn=None,
-            val_reward_fn=None,
-            train_dataset: Dataset | None = None,
-            val_dataset: Dataset | None = None,
-            collate_fn=None,
-            train_sampler: Sampler | None = None,
-            device_name=None,
+        self,
+        config,
+        tokenizer,
+        role_worker_mapping: dict[Role, WorkerType],
+        resource_pool_manager: ResourcePoolManager,
+        ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup,
+        processor=None,
+        reward_fn=None,
+        val_reward_fn=None,
+        train_dataset: Dataset | None = None,
+        val_dataset: Dataset | None = None,
+        collate_fn=None,
+        train_sampler: Sampler | None = None,
+        device_name=None,
     ):
         """
         Initialize distributed PPO trainer with Ray backend.
diff --git a/verl/experimental/agent_loop/agent_loop.py b/verl/experimental/agent_loop/agent_loop.py
index ef86381020b..34f4c78833c 100644
--- a/verl/experimental/agent_loop/agent_loop.py
+++ b/verl/experimental/agent_loop/agent_loop.py
@@ -16,6 +16,7 @@
 import logging
 import os
 import random
+import time
 from abc import ABC, abstractmethod
 from typing import Any
 
@@ -200,6 +201,81 @@ def decorator(subclass: type[AgentLoopBase]) -> type[AgentLoopBase]:
     return decorator
 
 
+def postprocess_agent_loop_outputs(inputs: list[AgentLoopOutput], tokenizer, config) -> DataProto:
+    """Static method to postprocess a list of AgentLoopOutput into DataProto
+
+    Args:
+        inputs: List of AgentLoopOutput
+        tokenizer: Tokenizer instance
+        config: Configuration object
+
+    Returns:
+        DataProto: Processed batch data
+    """
+    # NOTE: consistent with batch version of generate_sequences in vllm_rollout_spmd.py
+    # prompts: left pad
+    # responses: right pad
+    # input_ids: prompt + response
+    # attention_mask: [0,0,0,0,1,1,1,1, | 1,1,1,0,0,0,0,0]
+    # position_ids:   [0,0,0,0,0,1,2,3, | 4,5,6,7,8,9,10,11]
+
+    # prompts
+    tokenizer.padding_side = "left"
+    outputs = tokenizer.pad(
+        [{"input_ids": input.prompt_ids} for input in inputs],
+        padding="max_length",
+        max_length=config.actor_rollout_ref.rollout.prompt_length,
+        return_tensors="pt",
+        return_attention_mask=True,
+    )
+    prompt_ids, prompt_attention_mask = outputs["input_ids"], outputs["attention_mask"]
+
+    # responses
+    tokenizer.padding_side = "right"
+    outputs = tokenizer.pad(
+        [{"input_ids": input.response_ids} for input in inputs],
+        padding="max_length",
+        max_length=config.actor_rollout_ref.rollout.response_length,
+        return_tensors="pt",
+        return_attention_mask=True,
+    )
+    response_ids, response_attention_mask = outputs["input_ids"], outputs["attention_mask"]
+
+    # response_mask
+    outputs = tokenizer.pad(
+        [{"input_ids": input.response_mask} for input in inputs],
+        padding="max_length",
+        max_length=config.actor_rollout_ref.rollout.response_length,
+        return_tensors="pt",
+        return_attention_mask=False,
+    )
+    response_mask = outputs["input_ids"]
+    assert response_ids.shape == response_mask.shape, (
+        f"mismatch in response_ids and response_mask shape: {response_ids.shape} vs {response_mask.shape}"
+    )
+    response_mask = response_mask * response_attention_mask
+
+    input_ids = torch.cat([prompt_ids, response_ids], dim=1)
+    attention_mask = torch.cat([prompt_attention_mask, response_attention_mask], dim=1)
+    position_ids = (attention_mask.cumsum(dim=1) - 1) * attention_mask
+
+    batch = TensorDict(
+        {
+            "prompts": prompt_ids,  # [bsz, prompt_length]
+            "responses": response_ids,  # [bsz, response_length]
+            "response_mask": response_mask,  # [bsz, response_length]
+            "input_ids": input_ids,  # [bsz, prompt_length + response_length]
+            "attention_mask": attention_mask,  # [bsz, prompt_length + response_length]
+            "position_ids": position_ids,  # [bsz, prompt_length + response_length]
+        },
+        batch_size=len(input_ids),
+    )
+
+    num_turns = np.array([input.num_turns for input in inputs], dtype=np.int32)
+    metrics = [input.metrics.model_dump() for input in inputs]
+    return DataProto(batch=batch, non_tensor_batch={"__num_turns__": num_turns}, meta_info={"metrics": metrics})
+
+
 @ray.remote
 class AgentLoopWorker:
     """Agent loop worker takes a batch of messages and run each message in an agent loop."""
@@ -289,9 +365,60 @@ async def generate_sequences(self, batch: DataProto) -> DataProto:
             )
         outputs = await asyncio.gather(*tasks)
 
-        output = self._postprocess(outputs)
+        output = postprocess_agent_loop_outputs(outputs, self.tokenizer, self.config)
         return output
 
+    async def generate_sequences_no_post(self, batch: DataProto) -> list[AgentLoopOutput]:
+        """Generate sequences from agent loop.
+
+        Args:
+            batch (DataProto): Input batch.
+
+        Returns:
+            list[AgentLoopOutput]: List of agent loop outputs, one per sample in the batch.
+            Each AgentLoopOutput contains:
+            - prompt_ids: prompt token ids
+            - response_ids: response token ids including LLM generated and tool response tokens
+            - response_mask: 1 for LLM generated tokens, 0 for tool response tokens
+            - num_turns: number of chat turns
+            - metrics: performance metrics
+        """
+        config = self.config.actor_rollout_ref.rollout
+        sampling_params = dict(
+            temperature=config.temperature,
+            top_p=config.top_p,
+            repetition_penalty=1.0,
+        )
+
+        # override sampling params for validation
+        if batch.meta_info.get("validate", False):
+            sampling_params["top_p"] = config.val_kwargs.top_p
+            sampling_params["temperature"] = config.val_kwargs.temperature
+
+        # by default, we assume it's a single turn agent
+        if "agent_name" not in batch.non_tensor_batch:
+            batch.non_tensor_batch["agent_name"] = np.array(["single_turn_agent"] * len(batch), dtype=object)
+
+        tasks = []
+        agent_names = batch.non_tensor_batch["agent_name"]
+        raw_prompts = batch.non_tensor_batch["raw_prompt"]
+        if "index" in batch.non_tensor_batch:
+            index = batch.non_tensor_batch["index"]
+        else:
+            index = np.arange(len(raw_prompts))
+
+        trajectory_info = await get_trajectory_info(
+            batch.meta_info.get("global_steps", -1), index, batch.meta_info.get("validate", False)
+        )
+
+        for agent_name, messages, trajectory in zip(agent_names, raw_prompts, trajectory_info, strict=True):
+            tasks.append(
+                asyncio.create_task(self._run_agent_loop(agent_name, messages.tolist(), sampling_params, trajectory))
+            )
+        outputs = await asyncio.gather(*tasks)
+
+        return outputs
+
     async def _run_agent_loop(
         self,
         agent_name: str,
@@ -320,70 +447,6 @@ async def _run_agent_loop(
             output = await agent_loop.run(messages, sampling_params)
             return output
 
-    def _postprocess(self, inputs: list[AgentLoopOutput]) -> DataProto:
-        # NOTE: consistent with batch version of generate_sequences in vllm_rollout_spmd.py
-        # prompts: left pad
-        # responses: right pad
-        # input_ids: prompt + response
-        # attention_mask: [0,0,0,0,1,1,1,1, | 1,1,1,0,0,0,0,0]
-        # position_ids:   [0,0,0,0,0,1,2,3, | 4,5,6,7,8,9,10,11]
-
-        # prompts
-        self.tokenizer.padding_side = "left"
-        outputs = self.tokenizer.pad(
-            [{"input_ids": input.prompt_ids} for input in inputs],
-            padding="max_length",
-            max_length=self.config.actor_rollout_ref.rollout.prompt_length,
-            return_tensors="pt",
-            return_attention_mask=True,
-        )
-        prompt_ids, prompt_attention_mask = outputs["input_ids"], outputs["attention_mask"]
-
-        # responses
-        self.tokenizer.padding_side = "right"
-        outputs = self.tokenizer.pad(
-            [{"input_ids": input.response_ids} for input in inputs],
-            padding="max_length",
-            max_length=self.config.actor_rollout_ref.rollout.response_length,
-            return_tensors="pt",
-            return_attention_mask=True,
-        )
-        response_ids, response_attention_mask = outputs["input_ids"], outputs["attention_mask"]
-
-        # response_mask
-        outputs = self.tokenizer.pad(
-            [{"input_ids": input.response_mask} for input in inputs],
-            padding="max_length",
-            max_length=self.config.actor_rollout_ref.rollout.response_length,
-            return_tensors="pt",
-            return_attention_mask=False,
-        )
-        response_mask = outputs["input_ids"]
-        assert response_ids.shape == response_mask.shape, (
-            f"mismatch in response_ids and response_mask shape: {response_ids.shape} vs {response_mask.shape}"
-        )
-        response_mask = response_mask * response_attention_mask
-
-        input_ids = torch.cat([prompt_ids, response_ids], dim=1)
-        attention_mask = torch.cat([prompt_attention_mask, response_attention_mask], dim=1)
-        position_ids = (attention_mask.cumsum(dim=1) - 1) * attention_mask
-
-        batch = TensorDict(
-            {
-                "prompts": prompt_ids,  # [bsz, prompt_length]
-                "responses": response_ids,  # [bsz, response_length]
-                "response_mask": response_mask,  # [bsz, response_length]
-                "input_ids": input_ids,  # [bsz, prompt_length + response_length]
-                "attention_mask": attention_mask,  # [bsz, prompt_length + response_length]
-                "position_ids": position_ids,  # [bsz, prompt_length + response_length]
-            },
-            batch_size=len(input_ids),
-        )
-
-        num_turns = np.array([input.num_turns for input in inputs], dtype=np.int32)
-        metrics = [input.metrics.model_dump() for input in inputs]
-        return DataProto(batch=batch, non_tensor_batch={"__num_turns__": num_turns}, meta_info={"metrics": metrics})
-
 
 async def get_trajectory_info(step, index, validate):
     """Get trajectory info.
@@ -407,6 +470,18 @@ async def get_trajectory_info(step, index, validate):
     return trajectory_info
 
 
+async def _ray_future_to_asyncio(ray_future):
+    """将Ray future转换为asyncio可等待的对象"""
+    while True:
+        try:
+            # 非阻塞检查Ray future是否完成
+            result = ray.get(ray_future, timeout=0.001)  # 1ms timeout
+            return result
+        except ray.exceptions.GetTimeoutError:
+            # 未完成，让出控制权给其他协程
+            await asyncio.sleep(1)  # 1s sleep
+
+
 class AgentLoopManager:
     """Agent loop manager that manages a group of agent loop workers."""
 
@@ -512,6 +587,46 @@ def generate_sequences(self, prompts: DataProto) -> DataProto:
         output.meta_info = {"timing": timing}
         return output
 
+    async def generate_single_sample_async(self, sample: DataProto, sample_id: str) -> tuple[AgentLoopOutput, float]:
+        """
+        异步处理单个样本 - 用于流式推理的核心方法
+
+        Args:
+            sample: 单个样本数据
+            sample_id: 样本ID
+
+        Returns:
+            tuple[AgentLoopOutput, float]: 处理结果和处理时间
+        """
+        start_time = time.time()
+
+        # 使用负载均衡选择 worker
+        worker = self._select_best_worker()
+
+        # 异步处理单个样本
+        output_future = worker.generate_sequences.remote(sample)
+        outputs = await _ray_future_to_asyncio(output_future)
+
+        processing_time = time.time() - start_time
+
+        # outputs 是 AgentLoopOutput 列表，取第一个（因为是单样本）
+        assert len(outputs) == 1, f"Expected single output for single sample, got {len(outputs)}"
+        output = outputs[0]
+
+        # 添加处理时间到metrics
+        output.metrics.generate_sequences = processing_time
+
+        return output, processing_time
+
+    def _select_best_worker(self):
+        """选择最佳的 worker（简单的轮询负载均衡）"""
+        if not hasattr(self, "_worker_index"):
+            self._worker_index = 0
+
+        worker = self.agent_loop_workers[self._worker_index]
+        self._worker_index = (self._worker_index + 1) % len(self.agent_loop_workers)
+        return worker
+
     def _performance_metrics(self, metrics: list[list[dict[str, str]]], output: DataProto) -> dict[str, float]:
         timing = {}
         t_generate_sequences = np.array([metric["generate_sequences"] for chunk in metrics for metric in chunk])
diff --git a/verl/trainer/main_ppo.py b/verl/trainer/main_ppo.py
index e81d0b32c1d..fa12105f07f 100644
--- a/verl/trainer/main_ppo.py
+++ b/verl/trainer/main_ppo.py
@@ -41,7 +41,7 @@ def main(config):
 
 
 # Define a function to run the PPO-like training process
-def run_ppo(config, task_runner_class = None) -> None:
+def run_ppo(config, task_runner_class=None) -> None:
     """Initialize Ray cluster and run distributed PPO training process.
 
     Args:
diff --git a/verl/trainer/ppo/ray_trainer.py b/verl/trainer/ppo/ray_trainer.py
index e8398fd0865..26150cc631d 100644
--- a/verl/trainer/ppo/ray_trainer.py
+++ b/verl/trainer/ppo/ray_trainer.py
@@ -1247,7 +1247,6 @@ def _process_batch_common(self, batch, metrics, timing_raw):
                 reward_tensor, reward_extra_infos_dict = compute_reward(batch, self.reward_fn)
         # recompute old_log_probs
         with marked_timer("old_log_prob", timing_raw, color="blue"):
-
             old_log_prob = self.actor_rollout_wg.compute_log_prob(batch)
             entropys = old_log_prob.batch["entropys"]
             response_masks = batch.batch["response_mask"]

From 0d7233f648a592aa7c352d65fd1e471d00a887ff Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Thu, 14 Aug 2025 11:32:35 +0800
Subject: [PATCH 043/182] async mq

---
 .../fully_async_rollouter.py                  |   2 +-
 recipe/fully_async_policy/message_queue.py    | 166 ++++---
 .../unittest/ray_async_resource_config.py     | 366 ++++++++++++++++
 .../{ => unittest}/simple_streaming_demo.py   |   0
 .../unittest/test_asyncio_message_queue.py    | 407 ++++++++++++++++++
 verl/experimental/agent_loop/agent_loop.py    |  27 +-
 6 files changed, 891 insertions(+), 77 deletions(-)
 create mode 100644 recipe/fully_async_policy/unittest/ray_async_resource_config.py
 rename recipe/fully_async_policy/{ => unittest}/simple_streaming_demo.py (100%)
 create mode 100644 recipe/fully_async_policy/unittest/test_asyncio_message_queue.py

diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py
index ae74cc838b1..0060cfa1b02 100644
--- a/recipe/fully_async_policy/fully_async_rollouter.py
+++ b/recipe/fully_async_policy/fully_async_rollouter.py
@@ -24,7 +24,7 @@
 from verl.utils.tracking import ValidationGenerationsLogger
 
 
-@ray.remote(num_cpus=10, max_concurrency=10)
+@ray.remote(num_cpus=10, max_concurrency=100)
 class FullyAsyncRollouter(RayPPOTrainer):
     """
     Asynchronous sample generator, responsible for continuously generating training samples
diff --git a/recipe/fully_async_policy/message_queue.py b/recipe/fully_async_policy/message_queue.py
index 4d1eddee6ae..2e8ad6b0e79 100644
--- a/recipe/fully_async_policy/message_queue.py
+++ b/recipe/fully_async_policy/message_queue.py
@@ -12,8 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import asyncio
 import logging
-import threading
 from collections import deque
 from dataclasses import dataclass
 from typing import Any
@@ -30,10 +30,11 @@ class QueueSample:
     rollout_metadata: dict[str, Any]
 
 
-@ray.remote(num_cpus=10, max_concurrency=10)
+@ray.remote(num_cpus=2, max_concurrency=20)
 class MessageQueue:
     """
     Simplified Ray-based asynchronous message queue for communication between Rollouter and Trainer
+    使用 asyncio 实现异步消息队列
     """
 
     def __init__(self, config: DictConfig, max_queue_size: int = 1000):
@@ -50,12 +51,12 @@ def __init__(self, config: DictConfig, max_queue_size: int = 1000):
         except (AttributeError, RecursionError):
             self.staleness_threshold = 3
 
-        # Threading for message handling
+        # Asyncio for message handling
         self.running = True
 
-        # thread safe
-        self.lock = threading.RLock()
-        self.consumer_condition = threading.Condition(self.lock)
+        # async safe - 在第一次使用时初始化
+        self._lock = None
+        self._consumer_condition = None
 
         # statistic message
         self.total_produced = 0
@@ -67,7 +68,13 @@ def __init__(self, config: DictConfig, max_queue_size: int = 1000):
             f"staleness_threshold={self.staleness_threshold}"
         )
 
-    def put_sample(self, sample: Any, param_version: int) -> bool:
+    async def _ensure_async_primitives(self):
+        """确保异步原语已初始化"""
+        if self._lock is None:
+            self._lock = asyncio.Lock()
+            self._consumer_condition = asyncio.Condition(self._lock)
+
+    async def put_sample(self, sample: Any, param_version: int) -> bool:
         """
         Put a batch sample into the queue
 
@@ -78,7 +85,9 @@ def put_sample(self, sample: Any, param_version: int) -> bool:
         Returns:
             bool: Whether the sample was successfully put into the queue
         """
-        with self.lock:
+        await self._ensure_async_primitives()
+
+        async with self._lock:
             # Check freshness
             staleness = self.current_param_version - param_version
             if staleness > self.staleness_threshold:
@@ -95,14 +104,14 @@ def put_sample(self, sample: Any, param_version: int) -> bool:
             self.total_produced += 1
 
             # Notify waiting consumers
-            self.consumer_condition.notify()
+            self._consumer_condition.notify()
 
             if self.total_produced % 100 == 0:
                 logger.debug(f"MessageQueue stats: produced={self.total_produced}, queue_size={len(self.queue)}")
 
             return True
 
-    def get_samples(self, min_batch_count: int = 1) -> tuple[list[Any], int]:
+    async def get_samples(self, min_batch_count: int = 1) -> tuple[list[Any], int]:
         """
         Get batch samples from the queue, wait until enough samples are available
 
@@ -112,13 +121,14 @@ def get_samples(self, min_batch_count: int = 1) -> tuple[list[Any], int]:
         Returns:
             List[Any]: List of retrieved samples
         """
+        await self._ensure_async_primitives()
 
-        with self.lock:
+        async with self._lock:
             while len(self.queue) < min_batch_count and self.running:
                 print(f"[MessageQueue] consumer_condition {len(self.queue)}")
                 if len(self.queue) > 0 and self.queue[-1] is None:
                     return [], len(self.queue)
-                self.consumer_condition.wait()
+                await self._consumer_condition.wait()
 
             # If queue is closed and doesn't have enough samples, return empty list
             if not self.running and len(self.queue) < min_batch_count:
@@ -138,16 +148,18 @@ def get_samples(self, min_batch_count: int = 1) -> tuple[list[Any], int]:
             self.total_consumed += len(samples)
             return samples, len(self.queue)
 
-    def get_sample(self) -> Any | None:
+    async def get_sample(self) -> Any | None:
         """
         Get a single sample from the queue, wait until one is available
 
         Returns:
             Any: Single sample data or None if queue is closed
         """
-        with self.lock:
+        await self._ensure_async_primitives()
+
+        async with self._lock:
             while len(self.queue) == 0 and self.running:
-                self.consumer_condition.wait()
+                await self._consumer_condition.wait()
 
             # If queue is closed and empty, return None
             if not self.running and len(self.queue) == 0:
@@ -158,21 +170,27 @@ def get_sample(self) -> Any | None:
             self.total_consumed += 1
             return data
 
-    def update_param_version(self, version: int):
+    async def update_param_version(self, version: int):
         """Update current parameter version"""
-        with self.lock:
+        await self._ensure_async_primitives()
+
+        async with self._lock:
             old_version = self.current_param_version
             self.current_param_version = version
             logger.debug(f"Parameter version updated from {old_version} to {version}")
 
-    def get_queue_size(self) -> int:
+    async def get_queue_size(self) -> int:
         """Get current queue length"""
-        with self.lock:
+        await self._ensure_async_primitives()
+
+        async with self._lock:
             return len(self.queue)
 
-    def get_statistics(self) -> dict[str, Any]:
+    async def get_statistics(self) -> dict[str, Any]:
         """Get queue statistics"""
-        with self.lock:
+        await self._ensure_async_primitives()
+
+        async with self._lock:
             return {
                 "queue_size": len(self.queue),
                 "total_produced": self.total_produced,
@@ -183,24 +201,30 @@ def get_statistics(self) -> dict[str, Any]:
                 "max_queue_size": self.max_queue_size,
             }
 
-    def clear_queue(self):
+    async def clear_queue(self):
         """Clear the queue"""
-        with self.lock:
+        await self._ensure_async_primitives()
+
+        async with self._lock:
             cleared_count = len(self.queue)
             self.queue.clear()
             logger.info(f"Cleared {cleared_count} samples from queue")
 
-    def shutdown(self):
+    async def shutdown(self):
         """Shutdown the message queue"""
-        with self.lock:
+        await self._ensure_async_primitives()
+
+        async with self._lock:
             self.running = False
-            # Notify all waiting threads so they can exit
-            self.consumer_condition.notify_all()
+            # Notify all waiting coroutines so they can exit
+            self._consumer_condition.notify_all()
         logger.info("MessageQueue shutdown")
 
-    def get_memory_usage(self) -> dict:
+    async def get_memory_usage(self) -> dict:
         """Get memory usage statistics"""
-        with self.lock:
+        await self._ensure_async_primitives()
+
+        async with self._lock:
             # Estimate memory usage of samples in queue
             import sys
 
@@ -228,43 +252,65 @@ def get_memory_usage(self) -> dict:
 
 
 class MessageQueueClient:
-    """MessageQueue client for communicating with MessageQueue Actor"""
+    """Asyncio-compatible MessageQueue client for communicating with MessageQueue Actor"""
 
     def __init__(self, queue_actor: Any):
         self.queue_actor = queue_actor
 
-    def put_sample(self, sample: Any, param_version: int) -> bool:
-        """Put batch into queue"""
+    async def put_sample(self, sample: Any, param_version: int) -> bool:
+        """Put batch into queue (async)"""
+        future = self.queue_actor.put_sample.remote(sample, param_version)
+        return await asyncio.wrap_future(future.future())
+
+    async def get_samples(self, min_batch_count: int = 1) -> tuple[list[Any], int]:
+        """Get batch from queue, wait until enough samples are available (async)"""
+        future = self.queue_actor.get_samples.remote(min_batch_count)
+        return await asyncio.wrap_future(future.future())
+
+    async def get_sample(self) -> Any | None:
+        """Get single sample from queue, wait until one is available (async)"""
+        future = self.queue_actor.get_sample.remote()
+        return await asyncio.wrap_future(future.future())
+
+    async def update_param_version(self, version: int):
+        """Update parameter version (async)"""
+        future = self.queue_actor.update_param_version.remote(version)
+        await asyncio.wrap_future(future.future())
+
+    async def get_queue_size(self) -> int:
+        """Get queue size (async)"""
+        future = self.queue_actor.get_queue_size.remote()
+        return await asyncio.wrap_future(future.future())
+
+    async def get_statistics(self) -> dict[str, Any]:
+        """Get statistics (async)"""
+        future = self.queue_actor.get_statistics.remote()
+        return await asyncio.wrap_future(future.future())
+
+    async def clear_queue(self):
+        """Clear queue (async)"""
+        future = self.queue_actor.clear_queue.remote()
+        await asyncio.wrap_future(future.future())
+
+    async def shutdown(self):
+        """Shutdown queue (async)"""
+        future = self.queue_actor.shutdown.remote()
+        await asyncio.wrap_future(future.future())
+
+    async def get_memory_usage(self) -> dict:
+        """Get memory usage statistics (async)"""
+        future = self.queue_actor.get_memory_usage.remote()
+        return await asyncio.wrap_future(future.future())
+
+    # 为了兼容性，保留同步版本的方法（但标记为deprecated）
+    def put_sample_sync(self, sample: Any, param_version: int) -> bool:
+        """Put batch into queue (sync - deprecated, use put_sample instead)"""
         return ray.get(self.queue_actor.put_sample.remote(sample, param_version))
 
-    def get_samples(self, min_batch_count: int = 1) -> tuple[list[Any], int]:
-        """Get batch from queue, wait until enough samples are available"""
+    def get_samples_sync(self, min_batch_count: int = 1) -> tuple[list[Any], int]:
+        """Get batch from queue (sync - deprecated, use get_samples instead)"""
         return ray.get(self.queue_actor.get_samples.remote(min_batch_count))
 
-    def get_sample(self) -> Any | None:
-        """Get single sample from queue, wait until one is available"""
-        return ray.get(self.queue_actor.get_sample.remote())
-
-    def update_param_version(self, version: int):
-        """Update parameter version"""
-        ray.get(self.queue_actor.update_param_version.remote(version))
-
-    def get_queue_size(self) -> int:
-        """Get queue size"""
-        return ray.get(self.queue_actor.get_queue_size.remote())
-
-    def get_statistics(self) -> dict[str, Any]:
-        """Get statistics"""
+    def get_statistics_sync(self) -> dict[str, Any]:
+        """Get statistics (sync - deprecated, use get_statistics instead)"""
         return ray.get(self.queue_actor.get_statistics.remote())
-
-    def clear_queue(self):
-        """Clear queue"""
-        ray.get(self.queue_actor.clear_queue.remote())
-
-    def shutdown(self):
-        """Shutdown queue"""
-        ray.get(self.queue_actor.shutdown.remote())
-
-    def get_memory_usage(self) -> dict:
-        """Get memory usage statistics"""
-        return ray.get(self.queue_actor.get_memory_usage.remote())
diff --git a/recipe/fully_async_policy/unittest/ray_async_resource_config.py b/recipe/fully_async_policy/unittest/ray_async_resource_config.py
new file mode 100644
index 00000000000..40e85c9f1bd
--- /dev/null
+++ b/recipe/fully_async_policy/unittest/ray_async_resource_config.py
@@ -0,0 +1,366 @@
+# Copyright 2025 Meituan Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+import random
+import time
+
+import ray
+
+
+# 配置1: 默认配置
+class DefaultStreamingActor:
+    """默认配置的流式处理Actor"""
+
+    def __init__(self, actor_id: str):
+        self.actor_id = actor_id
+        self.processed_count = 0
+        self.start_time = time.time()
+        self.max_concurrent_tasks = 0
+        self.current_tasks = 0
+
+    async def process_data_async(self, data_item: dict) -> dict:
+        """异步处理数据"""
+        self.current_tasks += 1
+        self.max_concurrent_tasks = max(self.max_concurrent_tasks, self.current_tasks)
+
+        try:
+            task_id = data_item["id"]
+            processing_time = random.uniform(1, 3)
+
+            print(f"[{self.actor_id}] 开始处理 {task_id} (当前并发: {self.current_tasks})")
+
+            # CPU密集型任务模拟
+            await asyncio.sleep(processing_time * 0.5)  # I/O部分
+
+            # 模拟CPU计算
+            total = 0
+            for i in range(int(processing_time * 100000)):  # CPU密集计算
+                total += i * 0.001
+
+            await asyncio.sleep(processing_time * 0.5)  # 更多I/O
+
+            self.processed_count += 1
+
+            result = {
+                "id": task_id,
+                "actor_id": self.actor_id,
+                "processing_time": processing_time,
+                "processed_count": self.processed_count,
+                "max_concurrent": self.max_concurrent_tasks,
+                "compute_result": total,
+                "completed_at": time.time(),
+            }
+
+            print(f"[{self.actor_id}] 完成处理 {task_id} (耗时: {processing_time:.1f}s)")
+            return result
+
+        finally:
+            self.current_tasks -= 1
+
+    def get_stats(self) -> dict:
+        return {
+            "actor_id": self.actor_id,
+            "processed_count": self.processed_count,
+            "max_concurrent_tasks": self.max_concurrent_tasks,
+            "uptime": time.time() - self.start_time,
+        }
+
+
+# 配置2: 只设置 num_cpus
+@ray.remote(num_cpus=4)
+class HighCpuStreamingActor(DefaultStreamingActor):
+    """高CPU配置的Actor"""
+
+    pass
+
+
+# 配置3: 只设置 max_concurrency
+@ray.remote(max_concurrency=5)
+class HighConcurrencyStreamingActor(DefaultStreamingActor):
+    """高并发配置的Actor"""
+
+    pass
+
+
+# 配置4: 同时设置两者
+@ray.remote(num_cpus=4, max_concurrency=8)
+class OptimalStreamingActor(DefaultStreamingActor):
+    """最优配置的Actor"""
+
+    pass
+
+
+# 配置5: 极端低配置
+@ray.remote(num_cpus=1, max_concurrency=2)
+class LowResourceStreamingActor(DefaultStreamingActor):
+    """低资源配置的Actor"""
+
+    pass
+
+
+class RayStreamingSystemTest:
+    """Ray流式处理系统测试"""
+
+    def __init__(self):
+        self.test_data = []
+        self.results = {}
+
+    def generate_test_data(self, count: int = 20) -> list[dict]:
+        """生成测试数据"""
+        return [
+            {"id": f"task_{i:03d}", "content": f"测试数据_{i}", "priority": random.choice(["high", "normal", "low"])}
+            for i in range(count)
+        ]
+
+    async def test_actor_configuration(self, actor_class, config_name: str, test_data: list[dict]) -> dict:
+        """测试特定配置的Actor"""
+        print(f"\n{'=' * 60}")
+        print(f"测试配置: {config_name}")
+        print(f"{'=' * 60}")
+
+        # 创建Actor实例
+        actor = actor_class.remote(config_name)
+
+        start_time = time.time()
+
+        # 并发提交所有任务
+        print(f"提交 {len(test_data)} 个任务...")
+        task_futures = []
+
+        for i, data_item in enumerate(test_data):
+            future = actor.process_data_async.remote(data_item)
+            task_futures.append(future)
+
+            # 模拟流式数据到达
+            if i < len(test_data) - 1:
+                await asyncio.sleep(0.1)  # 100ms间隔
+
+        print("所有任务已提交，等待完成...")
+
+        # 等待所有任务完成
+        try:
+            results = await asyncio.gather(*[asyncio.wrap_future(future.future()) for future in task_futures])
+        except Exception as e:
+            print(f"任务执行出错: {e}")
+            results = []
+
+        end_time = time.time()
+        total_time = end_time - start_time
+
+        # 获取Actor统计信息
+        stats = ray.get(actor.get_stats.remote())
+
+        # 计算性能指标
+        performance_metrics = {
+            "config_name": config_name,
+            "total_tasks": len(test_data),
+            "completed_tasks": len(results),
+            "total_time": total_time,
+            "throughput": len(results) / total_time if total_time > 0 else 0,
+            "avg_processing_time": sum(r.get("processing_time", 0) for r in results) / len(results) if results else 0,
+            "max_concurrent_tasks": stats["max_concurrent_tasks"],
+            "actor_stats": stats,
+            "success_rate": len(results) / len(test_data) if test_data else 0,
+        }
+
+        print(f"✅ 完成测试 {config_name}:")
+        print(f"   总任务数: {performance_metrics['total_tasks']}")
+        print(f"   完成任务数: {performance_metrics['completed_tasks']}")
+        print(f"   总耗时: {performance_metrics['total_time']:.2f}s")
+        print(f"   吞吐量: {performance_metrics['throughput']:.2f} tasks/s")
+        print(f"   最大并发: {performance_metrics['max_concurrent_tasks']}")
+        print(f"   成功率: {performance_metrics['success_rate'] * 100:.1f}%")
+
+        return performance_metrics
+
+    async def run_comprehensive_test(self):
+        """运行综合测试"""
+        print("🚀 开始Ray异步资源配置测试")
+        print(f"Ray集群状态: {ray.cluster_resources()}")
+
+        # 生成测试数据
+        test_data = self.generate_test_data(15)  # 15个任务便于观察
+
+        # 测试配置列表
+        test_configs = [
+            (DefaultStreamingActor, "默认配置 (无特殊设置)"),
+            (HighCpuStreamingActor, "高CPU配置 (num_cpus=4)"),
+            (HighConcurrencyStreamingActor, "高并发配置 (max_concurrency=5)"),
+            (OptimalStreamingActor, "最优配置 (num_cpus=4, max_concurrency=8)"),
+            (LowResourceStreamingActor, "低资源配置 (num_cpus=1, max_concurrency=2)"),
+        ]
+
+        results = {}
+
+        # 逐个测试各种配置
+        for actor_class, config_name in test_configs:
+            try:
+                result = await self.test_actor_configuration(actor_class, config_name, test_data)
+                results[config_name] = result
+
+                # 测试间隔
+                await asyncio.sleep(2)
+
+            except Exception as e:
+                print(f"❌ 测试 {config_name} 失败: {e}")
+                results[config_name] = {"error": str(e)}
+
+        # 生成对比报告
+        self.generate_comparison_report(results)
+
+        return results
+
+    def generate_comparison_report(self, results: dict):
+        """生成对比报告"""
+        print(f"\n{'=' * 80}")
+        print("📊 配置对比报告")
+        print(f"{'=' * 80}")
+
+        # 表头
+        print(f"{'配置名称':<25} {'吞吐量':<12} {'最大并发':<10} {'平均处理时间':<15} {'成功率':<10}")
+        print("-" * 80)
+
+        # 数据行
+        best_throughput = 0
+        best_config = ""
+
+        for config_name, result in results.items():
+            if "error" in result:
+                print(f"{config_name:<25} {'错误':<12} {'':<10} {'':<15} {'':<10}")
+                continue
+
+            throughput = result.get("throughput", 0)
+            max_concurrent = result.get("max_concurrent_tasks", 0)
+            avg_time = result.get("avg_processing_time", 0)
+            success_rate = result.get("success_rate", 0)
+
+            print(
+                f"{config_name:<25} {throughput:<12.2f} {max_concurrent:<10} "
+                f"{avg_time:<15.2f} {success_rate * 100:<10.1f}%"
+            )
+
+            if throughput > best_throughput:
+                best_throughput = throughput
+                best_config = config_name
+
+        print(f"\n🏆 最佳配置: {best_config} (吞吐量: {best_throughput:.2f} tasks/s)")
+
+        # 详细分析
+        print("\n📋 配置分析:")
+        print("1. num_cpus 作用:")
+        print("   - 资源预留: 确保Actor有足够计算资源")
+        print("   - 节点选择: Ray选择有足够CPU的节点")
+        print("   - 避免资源竞争: 防止过度调度")
+
+        print("\n2. max_concurrency 作用:")
+        print("   - 并发控制: 限制Actor内同时执行的任务数")
+        print("   - 内存保护: 防止过多并发导致内存溢出")
+        print("   - 性能调优: 平衡并发度和资源利用率")
+
+        print("\n3. 建议配置:")
+        print("   - CPU密集型任务: 设置较高的num_cpus，适中的max_concurrency")
+        print("   - I/O密集型任务: 设置较低的num_cpus，较高的max_concurrency")
+        print("   - 混合型任务: 平衡两个参数，根据实际测试调优")
+
+
+async def run_resource_stress_test():
+    """运行资源压力测试"""
+    print(f"\n{'=' * 60}")
+    print("🔥 资源压力测试")
+    print(f"{'=' * 60}")
+
+    # 创建多个不同配置的Actor
+    actors = {
+        "高并发低CPU": OptimalStreamingActor.remote("stress_test_1"),
+        "低并发高CPU": ray.remote(num_cpus=8, max_concurrency=2)(DefaultStreamingActor).remote("stress_test_2"),
+        "平衡配置": ray.remote(num_cpus=2, max_concurrency=4)(DefaultStreamingActor).remote("stress_test_3"),
+    }
+
+    # 大量并发任务
+    heavy_workload = [{"id": f"heavy_{i}", "content": f"重载任务_{i}"} for i in range(50)]
+
+    print("提交大量并发任务，观察资源使用...")
+
+    all_futures = []
+    for actor_name, actor in actors.items():
+        print(f"向 {actor_name} 提交任务...")
+        for task in heavy_workload[:15]:  # 每个Actor处理15个任务
+            future = actor.process_data_async.remote(task)
+            all_futures.append((actor_name, future))
+
+    # 等待完成并记录时间
+    start_time = time.time()
+    results = []
+
+    for actor_name, future in all_futures:
+        try:
+            result = await asyncio.wrap_future(future.future())
+            results.append((actor_name, result))
+        except Exception as e:
+            print(f"{actor_name} 任务失败: {e}")
+
+    end_time = time.time()
+
+    print(f"压力测试完成，总耗时: {end_time - start_time:.2f}s")
+    print(f"完成任务数: {len(results)}")
+
+    # 按Actor分组统计
+    actor_stats = {}
+    for actor_name, result in results:
+        if actor_name not in actor_stats:
+            actor_stats[actor_name] = []
+        actor_stats[actor_name].append(result)
+
+    for actor_name, actor_results in actor_stats.items():
+        avg_time = sum(r["processing_time"] for r in actor_results) / len(actor_results)
+        print(f"{actor_name}: 完成 {len(actor_results)} 个任务, 平均耗时 {avg_time:.2f}s")
+
+
+async def main():
+    """主函数"""
+    # 初始化Ray
+    if not ray.is_initialized():
+        ray.init(
+            num_cpus=16,  # 设置足够的CPU资源
+            object_store_memory=2000000000,  # 2GB
+            ignore_reinit_error=True,
+        )
+
+    print("🎯 Ray异步资源配置测试")
+    print(f"可用资源: {ray.cluster_resources()}")
+
+    try:
+        # 基础配置测试
+        test_system = RayStreamingSystemTest()
+        await test_system.run_comprehensive_test()
+
+        # 压力测试
+        await run_resource_stress_test()
+
+        print("\n✅ 所有测试完成!")
+
+    except Exception as e:
+        print(f"❌ 测试执行失败: {e}")
+        import traceback
+
+        traceback.print_exc()
+
+    finally:
+        # 清理资源
+        ray.shutdown()
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/recipe/fully_async_policy/simple_streaming_demo.py b/recipe/fully_async_policy/unittest/simple_streaming_demo.py
similarity index 100%
rename from recipe/fully_async_policy/simple_streaming_demo.py
rename to recipe/fully_async_policy/unittest/simple_streaming_demo.py
diff --git a/recipe/fully_async_policy/unittest/test_asyncio_message_queue.py b/recipe/fully_async_policy/unittest/test_asyncio_message_queue.py
new file mode 100644
index 00000000000..33e0d9db04d
--- /dev/null
+++ b/recipe/fully_async_policy/unittest/test_asyncio_message_queue.py
@@ -0,0 +1,407 @@
+# Copyright 2025 Meituan Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# 测试使用 asyncio 的 MessageQueue
+# 对比 @ray.remote(num_cpus, max_concurrency) 参数的实际效果
+
+import asyncio
+import random
+
+# 导入修改后的 MessageQueue
+import time
+from dataclasses import dataclass
+
+import ray
+from omegaconf import DictConfig
+
+from recipe.fully_async_policy.message_queue import MessageQueue, MessageQueueClient, QueueSample
+
+
+@dataclass
+class TestConfig:
+    """测试配置"""
+
+    async_training: dict
+
+
+def create_test_config() -> DictConfig:
+    """创建测试配置"""
+    from omegaconf import OmegaConf
+
+    config_dict = {"async_training": {"staleness_threshold": 3}}
+    return OmegaConf.create(config_dict)
+
+
+class AsyncMessageQueueTester:
+    """异步消息队列测试器"""
+
+    def __init__(self):
+        self.config = create_test_config()
+
+    async def test_basic_async_operations(self):
+        """测试基本异步操作"""
+        print("\n🧪 测试基本异步操作")
+        print("=" * 50)
+
+        # 创建MessageQueue Actor
+        queue_actor = MessageQueue.remote(self.config, max_queue_size=100)
+        client = MessageQueueClient(queue_actor)
+
+        # 测试异步放入样本
+        test_samples = [
+            QueueSample(
+                data={"task_id": f"task_{i}", "content": f"测试数据_{i}"},
+                rollout_metadata={"timestamp": time.time(), "version": 1},
+            )
+            for i in range(10)
+        ]
+
+        # 异步并发放入样本
+        put_tasks = []
+        for i, sample in enumerate(test_samples):
+            task = asyncio.create_task(client.put_sample(sample, param_version=1), name=f"put_task_{i}")
+            put_tasks.append(task)
+
+        # 等待所有放入任务完成
+        put_results = await asyncio.gather(*put_tasks)
+        successful_puts = sum(put_results)
+
+        print(f"✅ 成功放入 {successful_puts}/{len(test_samples)} 个样本")
+
+        # 异步获取统计信息
+        stats = await client.get_statistics()
+        print(f"📊 队列统计: {stats}")
+
+        # 异步获取样本
+        samples_batch, queue_size = await client.get_samples(min_batch_count=5)
+        print(f"📦 获取了 {len(samples_batch)} 个样本，剩余队列大小: {queue_size}")
+
+        # 清理
+        await client.shutdown()
+
+        return successful_puts
+
+    async def test_concurrent_producers_consumers(self):
+        """测试并发生产者和消费者"""
+        print("\n🏭 测试并发生产者和消费者")
+        print("=" * 50)
+
+        # 创建 MessageQueue Actor
+        queue_actor = MessageQueue.remote(self.config, max_queue_size=200)
+        client = MessageQueueClient(queue_actor)
+
+        # 生产者协程
+        async def producer(producer_id: int, sample_count: int):
+            """生产者协程"""
+            produced = 0
+            for i in range(sample_count):
+                sample = QueueSample(
+                    data={
+                        "producer_id": producer_id,
+                        "task_id": f"producer_{producer_id}_task_{i}",
+                        "content": f"来自生产者{producer_id}的数据{i}",
+                    },
+                    rollout_metadata={"producer_timestamp": time.time(), "producer_id": producer_id},
+                )
+
+                success = await client.put_sample(sample, param_version=1)
+                if success:
+                    produced += 1
+
+                # 模拟生产间隔
+                await asyncio.sleep(random.uniform(0.01, 0.1))
+
+            print(f"🏭 生产者{producer_id} 完成，成功生产 {produced} 个样本")
+            return produced
+
+        # 消费者协程
+        async def consumer(consumer_id: int, target_count: int):
+            """消费者协程"""
+            consumed = 0
+            start_time = time.time()
+
+            while consumed < target_count:
+                try:
+                    # 尝试获取样本，设置超时
+                    sample = await asyncio.wait_for(client.get_sample(), timeout=2.0)
+
+                    if sample is not None:
+                        consumed += 1
+
+                        if consumed % 10 == 0:
+                            print(f"🍽️  消费者{consumer_id} 已消费 {consumed} 个样本")
+                    else:
+                        print(f"⚠️ 消费者{consumer_id} 收到空样本，队列可能已关闭")
+                        break
+
+                except asyncio.TimeoutError:
+                    print(f"⏰ 消费者{consumer_id} 超时，检查队列状态...")
+                    stats = await client.get_statistics()
+                    if stats["queue_size"] == 0:
+                        print(f"📭 队列为空，消费者{consumer_id} 等待...")
+                        await asyncio.sleep(0.5)
+                    continue
+
+                # 模拟处理时间
+                await asyncio.sleep(random.uniform(0.02, 0.05))
+
+            elapsed = time.time() - start_time
+            print(f"🍽️  消费者{consumer_id} 完成，消费了 {consumed} 个样本，耗时 {elapsed:.2f}s")
+            return consumed
+
+        # 启动并发生产者和消费者
+        num_producers = 3
+        num_consumers = 2
+        samples_per_producer = 20
+
+        # 创建生产者任务
+        producer_tasks = [
+            asyncio.create_task(producer(i, samples_per_producer), name=f"producer_{i}") for i in range(num_producers)
+        ]
+
+        # 创建消费者任务
+        total_expected_samples = num_producers * samples_per_producer
+        samples_per_consumer = total_expected_samples // num_consumers
+
+        consumer_tasks = [
+            asyncio.create_task(
+                consumer(i, samples_per_consumer + (5 if i == 0 else 0)),  # 第一个消费者多处理一些
+                name=f"consumer_{i}",
+            )
+            for i in range(num_consumers)
+        ]
+
+        # 等待所有任务完成
+        start_time = time.time()
+
+        producer_results = await asyncio.gather(*producer_tasks, return_exceptions=True)
+        consumer_results = await asyncio.gather(*consumer_tasks, return_exceptions=True)
+
+        end_time = time.time()
+
+        # 统计结果
+        total_produced = sum(r for r in producer_results if isinstance(r, int))
+        total_consumed = sum(r for r in consumer_results if isinstance(r, int))
+
+        print("\n📈 并发测试结果:")
+        print(f"   总生产样本: {total_produced}")
+        print(f"   总消费样本: {total_consumed}")
+        print(f"   总耗时: {end_time - start_time:.2f}s")
+        print(f"   生产效率: {total_produced / (end_time - start_time):.2f} samples/s")
+        print(f"   消费效率: {total_consumed / (end_time - start_time):.2f} samples/s")
+
+        # 最终统计
+        final_stats = await client.get_statistics()
+        print(f"📊 最终队列统计: {final_stats}")
+
+        # 清理
+        await client.shutdown()
+
+        return total_produced, total_consumed
+
+    async def compare_resource_configurations(self):
+        """对比不同资源配置的效果"""
+        print("\n⚡ 对比不同资源配置的效果")
+        print("=" * 50)
+
+        # 测试配置列表
+        configs = [
+            {"name": "默认配置", "num_cpus": None, "max_concurrency": None, "decorator": ray.remote},
+            {
+                "name": "高CPU低并发",
+                "num_cpus": 4,
+                "max_concurrency": 5,
+                "decorator": lambda: ray.remote(num_cpus=4, max_concurrency=5),
+            },
+            {
+                "name": "低CPU高并发",
+                "num_cpus": 1,
+                "max_concurrency": 20,
+                "decorator": lambda: ray.remote(num_cpus=1, max_concurrency=20),
+            },
+            {
+                "name": "平衡配置",
+                "num_cpus": 2,
+                "max_concurrency": 10,
+                "decorator": lambda: ray.remote(num_cpus=2, max_concurrency=10),
+            },
+        ]
+
+        results = {}
+
+        for config in configs:
+            print(f"\n🧪 测试配置: {config['name']}")
+            print(f"   num_cpus: {config['num_cpus']}")
+            print(f"   max_concurrency: {config['max_concurrency']}")
+
+            # 动态创建MessageQueue类
+            if config["num_cpus"] is None:
+                QueueClass = MessageQueue
+            else:
+                QueueClass = config["decorator"]()(MessageQueue)
+
+            # 创建queue实例
+            queue_actor = QueueClass.remote(self.config, max_queue_size=100)
+            client = MessageQueueClient(queue_actor)
+
+            # 执行性能测试
+            start_time = time.time()
+
+            # 并发放入大量样本
+            sample_count = 50
+            put_tasks = []
+
+            for i in range(sample_count):
+                sample = QueueSample(
+                    data={
+                        "task_id": f"perf_test_{i}",
+                        "config": config["name"],
+                        "data_size": random.randint(100, 1000),
+                    },
+                    rollout_metadata={"config_test": True},
+                )
+
+                task = asyncio.create_task(client.put_sample(sample, param_version=1))
+                put_tasks.append(task)
+
+                # 模拟流式到达
+                if i % 10 == 0:
+                    await asyncio.sleep(0.01)
+
+            # 等待所有put完成
+            put_results = await asyncio.gather(*put_tasks)
+            put_time = time.time() - start_time
+
+            # 获取所有样本
+            get_start_time = time.time()
+            all_samples = []
+
+            while True:
+                samples_batch, queue_size = await client.get_samples(min_batch_count=1)
+                if not samples_batch:
+                    break
+                all_samples.extend(samples_batch)
+
+                if queue_size == 0:
+                    break
+
+            get_time = time.time() - get_start_time
+            total_time = time.time() - start_time
+
+            successful_puts = sum(put_results)
+
+            # 记录结果
+            results[config["name"]] = {
+                "successful_puts": successful_puts,
+                "retrieved_samples": len(all_samples),
+                "put_time": put_time,
+                "get_time": get_time,
+                "total_time": total_time,
+                "put_throughput": successful_puts / put_time if put_time > 0 else 0,
+                "get_throughput": len(all_samples) / get_time if get_time > 0 else 0,
+                "total_throughput": (successful_puts + len(all_samples)) / total_time if total_time > 0 else 0,
+            }
+
+            print(f"   ✅ 放入: {successful_puts}/{sample_count}")
+            print(f"   📦 获取: {len(all_samples)}")
+            print(f"   ⏱️  放入耗时: {put_time:.3f}s")
+            print(f"   ⏱️  获取耗时: {get_time:.3f}s")
+            print(f"   🚀 放入吞吐量: {successful_puts / put_time:.2f} ops/s")
+
+            # 清理
+            await client.shutdown()
+
+            # 间隔
+            await asyncio.sleep(1)
+
+        # 生成对比报告
+        print("\n📊 资源配置对比报告")
+        print("=" * 80)
+        print(f"{'配置名称':<15} {'放入吞吐量':<12} {'获取吞吐量':<12} {'总吞吐量':<12} {'总耗时':<10}")
+        print("-" * 80)
+
+        best_config = ""
+        best_throughput = 0
+
+        for config_name, result in results.items():
+            put_throughput = result["put_throughput"]
+            get_throughput = result["get_throughput"]
+            total_throughput = result["total_throughput"]
+            total_time = result["total_time"]
+
+            print(
+                f"{config_name:<15} {put_throughput:<12.2f} {get_throughput:<12.2f} "
+                f"{total_throughput:<12.2f} {total_time:<10.3f}s"
+            )
+
+            if total_throughput > best_throughput:
+                best_throughput = total_throughput
+                best_config = config_name
+
+        print(f"\n🏆 最佳配置: {best_config} (总吞吐量: {best_throughput:.2f} ops/s)")
+
+        return results
+
+
+async def main():
+    """主函数"""
+    # 初始化Ray
+    if not ray.is_initialized():
+        ray.init(
+            num_cpus=8,
+            object_store_memory=1000000000,  # 1GB
+            ignore_reinit_error=True,
+        )
+
+    print("🎯 异步MessageQueue测试")
+    print(f"Ray集群资源: {ray.cluster_resources()}")
+
+    tester = AsyncMessageQueueTester()
+
+    try:
+        # 基本异步操作测试
+        await tester.test_basic_async_operations()
+
+        # 并发生产者消费者测试
+        await tester.test_concurrent_producers_consumers()
+
+        # 资源配置对比测试
+        await tester.compare_resource_configurations()
+
+        print("\n✅ 所有测试完成!")
+
+        # 总结
+        print("\n📋 总结:")
+        print("1. 使用 asyncio 后的优势:")
+        print("   - 真正的异步等待，不阻塞事件循环")
+        print("   - 更好的并发性能")
+        print("   - 与Ray的异步接口完美集成")
+
+        print("\n2. 资源配置建议:")
+        print("   - num_cpus: 控制CPU资源分配，影响计算密集型任务")
+        print("   - max_concurrency: 控制并发数，影响I/O密集型任务")
+        print("   - 对于MessageQueue: 推荐 num_cpus=2, max_concurrency=20")
+
+    except Exception as e:
+        print(f"❌ 测试失败: {e}")
+        import traceback
+
+        traceback.print_exc()
+
+    finally:
+        ray.shutdown()
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/verl/experimental/agent_loop/agent_loop.py b/verl/experimental/agent_loop/agent_loop.py
index 34f4c78833c..b658526b7d7 100644
--- a/verl/experimental/agent_loop/agent_loop.py
+++ b/verl/experimental/agent_loop/agent_loop.py
@@ -470,18 +470,6 @@ async def get_trajectory_info(step, index, validate):
     return trajectory_info
 
 
-async def _ray_future_to_asyncio(ray_future):
-    """将Ray future转换为asyncio可等待的对象"""
-    while True:
-        try:
-            # 非阻塞检查Ray future是否完成
-            result = ray.get(ray_future, timeout=0.001)  # 1ms timeout
-            return result
-        except ray.exceptions.GetTimeoutError:
-            # 未完成，让出控制权给其他协程
-            await asyncio.sleep(1)  # 1s sleep
-
-
 class AgentLoopManager:
     """Agent loop manager that manages a group of agent loop workers."""
 
@@ -551,10 +539,17 @@ def _initialize_llm_servers(self):
 
     def _init_agent_loop_workers(self):
         self.agent_loop_workers = []
-        for i in range(self.config.actor_rollout_ref.rollout.agent.num_workers):
+        # 获取建议的资源配置
+        agent_config = self.config.actor_rollout_ref.rollout.agent
+        max_concurrency = agent_config.get("max_concurrency", 10)
+        num_cpus = agent_config.get("num_cpus", 2)  # 默认2个CPU核心
+
+        for i in range(agent_config.num_workers):
             self.agent_loop_workers.append(
                 AgentLoopWorker.options(
                     name=f"agent_loop_worker_{i}",
+                    max_concurrency=max_concurrency,  # 设置最大并发数
+                    num_cpus=num_cpus,  # 设置CPU资源需求
                 ).remote(self.config, self.async_llm_servers)
             )
 
@@ -603,9 +598,9 @@ async def generate_single_sample_async(self, sample: DataProto, sample_id: str)
         # 使用负载均衡选择 worker
         worker = self._select_best_worker()
 
-        # 异步处理单个样本
-        output_future = worker.generate_sequences.remote(sample)
-        outputs = await _ray_future_to_asyncio(output_future)
+        # 异步处理单个样本 - 使用无后处理版本获取原始AgentLoopOutput
+        output_future = worker.generate_sequences_no_post.remote(sample)
+        outputs = await asyncio.wrap_future(output_future.future())
 
         processing_time = time.time() - start_time
 

From a59b84f5b2a74690f9aba964da7b98d373b563d3 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Thu, 14 Aug 2025 11:38:35 +0800
Subject: [PATCH 044/182] fix ray train bug

---
 verl/trainer/ppo/ray_trainer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/verl/trainer/ppo/ray_trainer.py b/verl/trainer/ppo/ray_trainer.py
index 26150cc631d..d16c2736bcc 100644
--- a/verl/trainer/ppo/ray_trainer.py
+++ b/verl/trainer/ppo/ray_trainer.py
@@ -913,7 +913,7 @@ def _init_models(self):
             self.rm_wg.init_model()
 
         # we should create rollout at the end so that vllm can have a better estimation of kv cache memory
-        self.actor_rollout_wg = self.all_wg[Role.ActorRollout]
+        self.actor_rollout_wg = self.all_wg[str(Role.ActorRollout)]
         self.actor_rollout_wg.init_model()
 
     def _init_async_rollout_manager(self):

From 191605b5edda82a82ad260ad429c518ed41ffe9d Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Thu, 14 Aug 2025 20:48:48 +0800
Subject: [PATCH 045/182] async server

---
 recipe/fully_async_policy/fully_async_main.py | 31 +++----
 .../fully_async_rollouter.py                  | 50 +++++------
 .../fully_async_policy/fully_async_trainer.py | 48 +++++------
 recipe/fully_async_policy/utils.py            | 19 +++++
 recipe/one_step_off_policy/fsdp_workers.py    | 46 ++++++----
 recipe/one_step_off_policy/main_ppo.py        | 33 +++-----
 .../one_step_off_policy/megatron_workers.py   | 83 +++++++++----------
 ...harding_manager.py => sharding_manager.py} |  8 +-
 verl/experimental/agent_loop/agent_loop.py    |  2 +-
 verl/trainer/ppo/ray_trainer.py               |  8 +-
 .../rollout/vllm_rollout/vllm_async_server.py | 19 ++++-
 .../rollout/vllm_rollout/vllm_rollout_spmd.py | 27 +++---
 12 files changed, 201 insertions(+), 173 deletions(-)
 create mode 100644 recipe/fully_async_policy/utils.py
 rename recipe/one_step_off_policy/{vllm_sharding_manager.py => sharding_manager.py} (94%)

diff --git a/recipe/fully_async_policy/fully_async_main.py b/recipe/fully_async_policy/fully_async_main.py
index 179929f242a..0e43bd6151b 100644
--- a/recipe/fully_async_policy/fully_async_main.py
+++ b/recipe/fully_async_policy/fully_async_main.py
@@ -82,39 +82,29 @@ def create_role_worker_mapping(config):
     if config.actor_rollout_ref.actor.strategy == "fsdp2":
         assert config.actor_rollout_ref.actor.strategy == config.critic.strategy
         from recipe.one_step_off_policy.fsdp_workers import (
-            ActorRolloutRefWorker,
-            AsyncActorRolloutRefWorker,
+            DetachActorWorker,
+            DetachAsyncRolloutWorker,
             CriticWorker,
-            RolloutWorker,
         )
         from verl.single_controller.ray import RayWorkerGroup
-
-        actor_rollout_cls = (
-            AsyncActorRolloutRefWorker if config.actor_rollout_ref.rollout.mode == "async" else ActorRolloutRefWorker
-        )
         ray_worker_group_cls = RayWorkerGroup
 
     elif config.actor_rollout_ref.actor.strategy == "megatron":
         assert config.actor_rollout_ref.actor.strategy == config.critic.strategy
         from recipe.one_step_off_policy.megatron_workers import (
-            ActorRolloutRefWorker,
-            AsyncActorRolloutRefWorker,
+            DetachActorWorker,
+            DetachAsyncRolloutWorker,
             CriticWorker,
-            RolloutWorker,
         )
         from verl.single_controller.ray.megatron import NVMegatronRayWorkerGroup
-
-        actor_rollout_cls = (
-            AsyncActorRolloutRefWorker if config.actor_rollout_ref.rollout.mode == "async" else ActorRolloutRefWorker
-        )
         ray_worker_group_cls = NVMegatronRayWorkerGroup
 
     else:
         raise NotImplementedError(f"Unsupported strategy: {config.actor_rollout_ref.actor.strategy}")
 
     role_worker_mapping = {
-        Role.Actor: ray.remote(actor_rollout_cls),
-        Role.Rollout: ray.remote(RolloutWorker),
+        Role.Actor: ray.remote(DetachActorWorker),
+        Role.Rollout: ray.remote(DetachAsyncRolloutWorker),
         Role.Critic: ray.remote(CriticWorker),
     }
 
@@ -187,11 +177,10 @@ def _initialize_components(self, config) -> None:
         self.components["reward_fn"] = reward_fn
         self.components["val_reward_fn"] = val_reward_fn
 
-        self.max_queue_size = (
-            (config.async_training.staleness_threshold + 1)
-            * config.data.train_batch_size
-            * config.actor_rollout_ref.rollout.n
-        ) * 10  # x 10 avoid deadlock
+        self.max_queue_size = ((config.async_training.staleness_threshold + 1)
+                               * config.data.train_batch_size
+                               * config.actor_rollout_ref.rollout.n
+                               ) * 10  # x 10 avoid deadlock
         print("[ASYNC MAIN] Creating MessageQueue...")
         message_queue = MessageQueue.remote(config, self.max_queue_size)
         message_queue_client = MessageQueueClient(message_queue)
diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py
index 0060cfa1b02..5f6f2b0d589 100644
--- a/recipe/fully_async_policy/fully_async_rollouter.py
+++ b/recipe/fully_async_policy/fully_async_rollouter.py
@@ -19,6 +19,7 @@
 from omegaconf import OmegaConf
 
 from recipe.fully_async_policy.message_queue import MessageQueueClient, QueueSample
+from recipe.fully_async_policy.utils import calculate_one_step_size
 from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup
 from verl.trainer.ppo.ray_trainer import RayPPOTrainer, ResourcePoolManager, Role, WorkerType
 from verl.utils.tracking import ValidationGenerationsLogger
@@ -33,17 +34,17 @@ class FullyAsyncRollouter(RayPPOTrainer):
     """
 
     def __init__(
-        self,
-        config,
-        tokenizer,
-        role_worker_mapping: dict[Role, WorkerType],
-        resource_pool_manager: ResourcePoolManager,
-        ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup,
-        processor=None,
-        reward_fn=None,
-        val_reward_fn=None,
-        device_name=None,
-        max_queue_size=1000,
+            self,
+            config,
+            tokenizer,
+            role_worker_mapping: dict[Role, WorkerType],
+            resource_pool_manager: ResourcePoolManager,
+            ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup,
+            processor=None,
+            reward_fn=None,
+            val_reward_fn=None,
+            device_name=None,
+            max_queue_size=1000,
     ):
         # Store the tokenizer for text processing
         self.tokenizer = tokenizer
@@ -53,7 +54,11 @@ def __init__(
         self.val_reward_fn = val_reward_fn
 
         self.hybrid_engine = config.actor_rollout_ref.hybrid_engine
+
         assert not self.hybrid_engine
+        assert self.config.data.train_batch_size == 0, "train_batch_size must be zero"
+        assert self.config.data.gen_batch_size == 1, "gen_batch_size must be one"
+
 
         self.role_worker_mapping = role_worker_mapping
         self.resource_pool_manager = resource_pool_manager
@@ -81,16 +86,11 @@ def __init__(
         self._validate_config()
         print(f"[FullyAsyncRollouter] Rollouter _create_dataloader...\n{train_dataset}\n{val_dataset}")
 
-        assert self.config.data.gen_batch_size == 1, "gen_batch_size must be one"
-
         self._create_dataloader(train_dataset, val_dataset, collate_fn, train_sampler)
 
-        total_rollout_steps = len(self.train_dataloader) * self.config.trainer.total_epochs
-
+        self.total_rollout_steps = len(self.train_dataloader) * self.config.trainer.total_epochs
         if self.config.rollout.total_rollout_steps is not None:
-            total_rollout_steps = self.config.rollout.total_rollout_steps
-
-        self.total_rollout_steps = total_rollout_steps
+            self.total_rollout_steps = min(self.config.rollout.total_rollout_steps, self.total_rollout_steps)
         print(f"[FullyAsyncRollouter] Total rollout steps: {self.total_rollout_steps}")
 
         # Rollouter parameter configuration
@@ -107,12 +107,6 @@ def __init__(
         self.train_step_samples = 0
         self.dropped_stale_samples = 0
 
-        # Calculate the samples needed for a train, used to calculate staleness and interrupt rollout
-        n_responses_per_prompt = self.config.actor_rollout_ref.rollout.n
-        batch_size = self.config.data.train_batch_size
-        required_samples = n_responses_per_prompt * batch_size
-        self.max_required_samples = required_samples * (self.staleness_threshold + 1)
-
         # Worker groups
         self.rollout_wg = None
         self.message_queue_client = None
@@ -145,6 +139,12 @@ def __init__(
         self.active_sample_count = 0  # 当前正在处理的样本数
         self.queue_full_pause_count = 0  # 队列满导致的暂停次数
 
+        # Calculate the samples needed for a train, used to calculate staleness and interrupt rollout
+        self.required_samples = calculate_one_step_size(self.minimal_bsz,
+                                                        config.actor_rollout_ref.actor.ppo_mini_batch_size)
+        self.max_required_samples = self.required_samples * (self.staleness_threshold + 1)
+
+
     async def set_message_queue_client(self, message_queue_client: MessageQueueClient):
         """Set message queue client"""
         async with self.lock:
@@ -173,6 +173,8 @@ def _validate_config(self):
         if not hasattr(self.config, "async_training"):
             raise ValueError("[FullyAsyncRollouter] Missing async_training configuration")
 
+        super()._validate_config()
+
     def _create_actor_rollout_classes(self):
         # only create rollout
         for role in [Role.Rollout]:
diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py
index d9883aaf33f..21b2eda259e 100644
--- a/recipe/fully_async_policy/fully_async_trainer.py
+++ b/recipe/fully_async_policy/fully_async_trainer.py
@@ -22,6 +22,8 @@
 from omegaconf import OmegaConf
 
 from recipe.fully_async_policy.message_queue import MessageQueueClient, QueueSample
+from recipe.fully_async_policy.utils import calculate_one_step_size
+from verl.experimental.agent_loop.agent_loop import postprocess_agent_loop_outputs
 from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup
 from verl.trainer.ppo import core_algos
 from verl.trainer.ppo.core_algos import AdvantageEstimator
@@ -44,16 +46,16 @@ class FullyAsyncTrainer(RayPPOTrainer):
     """
 
     def __init__(
-        self,
-        config,
-        tokenizer,
-        role_worker_mapping: dict[Role, WorkerType],
-        resource_pool_manager: ResourcePoolManager,
-        ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup,
-        processor=None,
-        reward_fn=None,
-        val_reward_fn=None,
-        device_name=None,
+            self,
+            config,
+            tokenizer,
+            role_worker_mapping: dict[Role, WorkerType],
+            resource_pool_manager: ResourcePoolManager,
+            ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup,
+            processor=None,
+            reward_fn=None,
+            val_reward_fn=None,
+            device_name=None,
     ):
         # Store the tokenizer for text processing
         self.tokenizer = tokenizer
@@ -102,6 +104,9 @@ def __init__(
         self.stale_samples_processed = 0
         self.current_param_version = 0
 
+        self.required_samples = calculate_one_step_size(self.minimal_bsz,
+                                                        config.actor_rollout_ref.actor.ppo_mini_batch_size)
+
     def set_message_queue_client(self, message_queue_client: MessageQueueClient):
         """Set message queue client"""
         self.message_queue_client = message_queue_client
@@ -122,14 +127,9 @@ def _get_samples_from_queue(self) -> tuple[None, None] | tuple[int, Any]:
         Returns:
             tuple: (epoch, batch_dict, gen_batch_output)
         """
-        # Calculate the number of samples needed
-        n_responses_per_prompt = self.config.actor_rollout_ref.rollout.n
-        batch_size = self.config.data.train_batch_size
-        required_samples = n_responses_per_prompt * batch_size
-
         print(
             "[FullyAsyncTrainer] "
-            f"Requesting {required_samples} samples from queue (n={n_responses_per_prompt}, batch_size={batch_size})",
+            f"Requesting {self.required_samples} samples from queue",
             flush=True,
         )
 
@@ -137,9 +137,7 @@ def _get_samples_from_queue(self) -> tuple[None, None] | tuple[int, Any]:
         consumer_start = time.time()
         queue_samples = []
 
-        print(f"[FullyAsyncTrainer] Starting sample collection loop, required={required_samples}")
-
-        while len(queue_samples) < required_samples:
+        while len(queue_samples) < self.required_samples:
             # 获取单个样本，会一直等待直到有样本或收到None
             sample = self.message_queue_client.get_sample()
 
@@ -147,23 +145,23 @@ def _get_samples_from_queue(self) -> tuple[None, None] | tuple[int, Any]:
                 # 检测到结束信号（None），立即退出
                 logger.info(
                     f"Detected termination signal (None), stopping sample collection. "
-                    f"Collected {len(queue_samples)}/{required_samples} samples"
+                    f"Collected {len(queue_samples)}/{self.required_samples} samples"
                 )
                 break
 
             queue_samples.append(sample)
 
-            if len(queue_samples) % 10 == 0 or len(queue_samples) >= required_samples:
-                print(f"[FullyAsyncTrainer] Collected {len(queue_samples)}/{required_samples} samples")
+            if len(queue_samples) % 10 == 0 or len(queue_samples) >= self.required_samples:
+                print(f"[FullyAsyncTrainer] Collected {len(queue_samples)}/{self.required_samples} samples")
 
         consumer_end = time.time()
 
-        if not queue_samples or len(queue_samples) < required_samples:
+        if not queue_samples or len(queue_samples) < self.required_samples:
             logger.warning("not enough samples collected after loop")
             return None, None
 
         print(
-            f"[FullyAsyncTrainer] Loop collection completed: {len(queue_samples)}/{required_samples} samples, "
+            f"[FullyAsyncTrainer] Loop collection completed: {len(queue_samples)}/{self.required_samples} samples, "
             f"total wait time: {consumer_end - consumer_start:.2f} seconds"
         )
 
@@ -206,7 +204,7 @@ def _assemble_gen_batch_output_from_queue_samples(self, queue_samples: list[Queu
         # Use the static method to postprocess AgentLoopOutput list into DataProto
         from verl.experimental.agent_loop.agent_loop import AgentLoopWorker
 
-        batch = AgentLoopWorker.postprocess_agent_loop_outputs(agent_loop_outputs, self.tokenizer, self.config)
+        batch = postprocess_agent_loop_outputs(agent_loop_outputs, self.tokenizer, self.config)
 
         # Apply _post_generate_batch logic here
         batch = self._post_generate_batch_for_agent_outputs(batch, agent_loop_outputs)
diff --git a/recipe/fully_async_policy/utils.py b/recipe/fully_async_policy/utils.py
new file mode 100644
index 00000000000..71ae7c7d16d
--- /dev/null
+++ b/recipe/fully_async_policy/utils.py
@@ -0,0 +1,19 @@
+# Copyright 2025 Meituan Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Calculate the number of samples needed
+
+def calculate_one_step_size(minimal_bsz, ppo_mini_batch_size):
+    return minimal_bsz * ppo_mini_batch_size
diff --git a/recipe/one_step_off_policy/fsdp_workers.py b/recipe/one_step_off_policy/fsdp_workers.py
index 0aa21991708..e6ab9d1c241 100644
--- a/recipe/one_step_off_policy/fsdp_workers.py
+++ b/recipe/one_step_off_policy/fsdp_workers.py
@@ -39,7 +39,7 @@
 from verl.utils.import_utils import import_external_libs
 from verl.utils.model import get_generation_config, update_model_config
 from verl.utils.vllm_utils import patch_vllm_moe_model_weight_loader
-from verl.workers.fsdp_workers import ActorRolloutRefWorker as ARRWorker
+from verl.workers.fsdp_workers import ActorRolloutRefWorker, AsyncActorRolloutRefWorker
 from verl.workers.fsdp_workers import CriticWorker
 
 logger = logging.getLogger(__file__)
@@ -47,19 +47,13 @@
 
 device_name = get_device_name()
 
-__all__ = ["ActorRolloutRefWorker", "AsyncActorRolloutRefWorker", "CriticWorker", "RolloutWorker"]
+__all__ = ["DetachActorWorker", "DetachRolloutWorker", "DetachAsyncRolloutWorker", "CriticWorker"]
 
 
-class ActorRolloutRefWorker(ARRWorker):
-    def _get_actor_params(self):
-        assert self._is_actor
-        params = self.actor_module_fsdp.state_dict()
-        from verl.utils.model import convert_weight_keys
+class DetachNcclSync(ActorRolloutRefWorker):
 
-        params = convert_weight_keys(
-            params, getattr(self.actor_module_fsdp, "_fsdp_wrapped_module", self.actor_module_fsdp)
-        )
-        return params
+    def _get_actor_params(self):
+        pass
 
     @register(dispatch_mode=Dispatch.ONE_TO_ALL, blocking=False)
     def sync_rollout_weights(self):
@@ -108,7 +102,19 @@ def get_actor_weights_info(self):
         return ret
 
 
-class RolloutWorker(ActorRolloutRefWorker):
+class DetachActorWorker(DetachNcclSync):
+    def _get_actor_params(self):
+        assert self._is_actor
+        params = self.actor_module_fsdp.state_dict()
+        from verl.utils.model import convert_weight_keys
+
+        params = convert_weight_keys(
+            params, getattr(self.actor_module_fsdp, "_fsdp_wrapped_module", self.actor_module_fsdp)
+        )
+        return params
+
+
+class DetachRolloutWorker(DetachNcclSync):
     def __init__(self, config: DictConfig, role: str):
         Worker.__init__(self)
         assert role == "rollout"
@@ -202,9 +208,9 @@ def init_model(self):
             trust_remote_code=trust_remote_code,
         )
         log_gpu_memory_usage(f"After building {rollout_name} rollout", logger=logger)
-        from .vllm_sharding_manager import VLLMShardingManager
 
-        rollout_sharding_manager = VLLMShardingManager(
+        from sharding_manager import DetachShardingManager
+        rollout_sharding_manager = DetachShardingManager(
             inference_engine=rollout.inference_engine, device_mesh=rollout_device_mesh
         )
 
@@ -223,6 +229,12 @@ def set_actor_weights_info(self, weights_info):
         self._weights_info = weights_info
 
 
-class AsyncActorRolloutRefWorker(ActorRolloutRefWorker):
-    def __init__(self, *args, **kwargs):
-        raise NotImplementedError
+class DetachAsyncRolloutWorker(AsyncActorRolloutRefWorker, DetachRolloutWorker):
+    def __init__(self, config: DictConfig, role: str):
+        print(f"[DetachAsyncRolloutWorker] {DetachAsyncRolloutWorker.__mro__}")
+        DetachRolloutWorker.__init__(self, config, role)
+
+    @register(dispatch_mode=Dispatch.ONE_TO_ALL)
+    def init_model(self):
+        print(f"[DetachAsyncRolloutWorker] init_model")
+        DetachRolloutWorker.init_model(self)
diff --git a/recipe/one_step_off_policy/main_ppo.py b/recipe/one_step_off_policy/main_ppo.py
index 0a037df17fa..d9d8f0bb849 100644
--- a/recipe/one_step_off_policy/main_ppo.py
+++ b/recipe/one_step_off_policy/main_ppo.py
@@ -62,17 +62,11 @@ def run(self, config):
             assert config.actor_rollout_ref.actor.strategy == config.critic.strategy
             from verl.single_controller.ray import RayWorkerGroup
 
-            from .fsdp_workers import (
-                ActorRolloutRefWorker,
-                AsyncActorRolloutRefWorker,
+            from recipe.one_step_off_policy.fsdp_workers import (
+                DetachActorWorker,
+                DetachRolloutWorker,
+                DetachAsyncRolloutWorker,
                 CriticWorker,
-                RolloutWorker,
-            )
-
-            actor_rollout_cls = (
-                AsyncActorRolloutRefWorker
-                if config.actor_rollout_ref.rollout.mode == "async"
-                else ActorRolloutRefWorker
             )
             ray_worker_group_cls = RayWorkerGroup
 
@@ -80,17 +74,11 @@ def run(self, config):
             assert config.actor_rollout_ref.actor.strategy == config.critic.strategy
             from verl.single_controller.ray.megatron import NVMegatronRayWorkerGroup
 
-            from .megatron_workers import (
-                ActorRolloutRefWorker,
-                AsyncActorRolloutRefWorker,
+            from recipe.one_step_off_policy.megatron_workers import (
+                DetachActorWorker,
+                DetachRolloutWorker,
+                DetachAsyncRolloutWorker,
                 CriticWorker,
-                RolloutWorker,
-            )
-
-            actor_rollout_cls = (
-                AsyncActorRolloutRefWorker
-                if config.actor_rollout_ref.rollout.mode == "async"
-                else ActorRolloutRefWorker
             )
             ray_worker_group_cls = NVMegatronRayWorkerGroup
 
@@ -100,8 +88,9 @@ def run(self, config):
         from .ray_trainer import ResourcePoolManager, Role
 
         role_worker_mapping = {
-            Role.Actor: ray.remote(actor_rollout_cls),
-            Role.Rollout: ray.remote(RolloutWorker),
+            Role.Actor: ray.remote(DetachActorWorker),
+            Role.Rollout: ray.remote(
+                DetachAsyncRolloutWorker if config.actor_rollout_ref.rollout.mode == "async" else DetachRolloutWorker),
             Role.Critic: ray.remote(CriticWorker),
         }
 
diff --git a/recipe/one_step_off_policy/megatron_workers.py b/recipe/one_step_off_policy/megatron_workers.py
index f7b58405b4f..9011f5a6023 100644
--- a/recipe/one_step_off_policy/megatron_workers.py
+++ b/recipe/one_step_off_policy/megatron_workers.py
@@ -27,42 +27,18 @@
 from verl.utils.device import get_device_name, get_torch_device
 from verl.utils.fs import copy_to_local
 from verl.utils.vllm_utils import patch_vllm_moe_model_weight_loader
-from verl.workers.megatron_workers import ActorRolloutRefWorker as ARRWorker
+from verl.workers.megatron_workers import ActorRolloutRefWorker, AsyncActorRolloutRefWorker
 from verl.workers.megatron_workers import CriticWorker, RewardModelWorker
 
 logger = logging.getLogger(__file__)
 logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN"))
 
-__all__ = ["ActorRolloutRefWorker", "AsyncActorRolloutRefWorker", "CriticWorker", "RewardModelWorker", "RolloutWorker"]
+__all__ = ["DetachActorWorker", "DetachRolloutWorker", "DetachAsyncRolloutWorker", "CriticWorker"]
 
 
-class ActorRolloutRefWorker(ARRWorker):
-    def __init__(self, config: DictConfig, role: str):
-        assert role in ["actor", "ref"]
-        tmp_role = "ref" if role == "ref" else "actor_rollout"
-        super().__init__(config, tmp_role)
-        if role == "actor":
-            self._is_rollout = False
-        self.role = role
-
+class DetachNcclSync(ActorRolloutRefWorker):
     def _get_actor_params_generator(self):
-        assert self._is_actor
-        from verl.models.mcore import get_mcore_weight_converter
-        from verl.utils.megatron_utils import per_tensor_generator
-
-        layer_name_mapping = {
-            "qkv_layer_name": "self_attention.linear_qkv.",
-            "gate_proj_layer_name": "linear_fc1.",
-        }
-        weight_converter = get_mcore_weight_converter(self.actor_model_config, self.dtype)
-        generator = per_tensor_generator(
-            self.actor.actor_module,
-            self.actor_model_config,
-            weight_converter,
-            self.tf_config,
-            layer_name_mapping,
-        )
-        return generator
+        pass
 
     @register(dispatch_mode=Dispatch.ONE_TO_ALL, blocking=False)
     def sync_rollout_weights(self):
@@ -106,11 +82,28 @@ def get_actor_weights_info(self):
         return ret
 
 
-class RolloutWorker(ActorRolloutRefWorker):
-    def __init__(self, config: DictConfig, role: str):
-        assert role == "rollout"
-        ARRWorker.__init__(self, config, role)
+class DetachActorWorker(DetachNcclSync):
+    def _get_actor_params_generator(self):
+        assert self._is_actor
+        from verl.models.mcore import get_mcore_weight_converter
+        from verl.utils.megatron_utils import per_tensor_generator
 
+        layer_name_mapping = {
+            "qkv_layer_name": "self_attention.linear_qkv.",
+            "gate_proj_layer_name": "linear_fc1.",
+        }
+        weight_converter = get_mcore_weight_converter(self.actor_model_config, self.dtype)
+        generator = per_tensor_generator(
+            self.actor.actor_module,
+            self.actor_model_config,
+            weight_converter,
+            self.tf_config,
+            layer_name_mapping,
+        )
+        return generator
+
+
+class DetachRolloutWorker(DetachNcclSync):
     @register(dispatch_mode=Dispatch.ONE_TO_ALL)
     def init_model(self):
         if self.config.model.get("external_lib", None) is not None:
@@ -142,12 +135,9 @@ def init_model(self):
         from torch.distributed.device_mesh import init_device_mesh
 
         assert self.config.rollout.name == "vllm"
-        assert self.config.rollout.mode == "sync"
 
         from verl.workers.rollout.vllm_rollout import vLLMRollout
 
-        from .vllm_sharding_manager import VLLMShardingManager
-
         # NOTE(sgm): If the QKV and gate_up projection layer are concate together in actor,
         # we will reorganize their weight format when resharding from actor to rollout.
 
@@ -175,14 +165,16 @@ def init_model(self):
         )
         log_gpu_memory_usage("After building vllm rollout", logger=logger)
 
-        sharding_manager = VLLMShardingManager(
-            inference_engine=rollout.inference_engine,
-            device_mesh=rollout_device_mesh,
+        from sharding_manager import DetachShardingManager
+        rollout_sharding_manager = DetachShardingManager(
+            inference_engine=rollout.inference_engine, device_mesh=rollout_device_mesh
         )
+
         log_gpu_memory_usage("After building sharding manager", logger=logger)
 
-        self.rollout, self.sharding_manager = rollout, sharding_manager
-        self.rollout.sharding_manager = sharding_manager
+        self.rollout = rollout
+        self.sharding_manager = rollout_sharding_manager
+        self.rollout.sharding_manager = rollout_sharding_manager
 
     @register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO, blocking=False)
     def async_generate_sequences(self, *args, **kwargs):
@@ -194,6 +186,11 @@ def set_actor_weights_info(self, weights_info):
         self._weights_info = weights_info
 
 
-class AsyncActorRolloutRefWorker(ActorRolloutRefWorker):
-    def __init__(self, *args, **kwargs):
-        raise NotImplementedError
+class DetachAsyncRolloutWorker(AsyncActorRolloutRefWorker, DetachRolloutWorker):
+    def __init__(self, config: DictConfig, role: str):
+        print(DetachAsyncRolloutWorker.__mro__)
+        DetachRolloutWorker.__init__(self, config, role)
+
+    @register(dispatch_mode=Dispatch.ONE_TO_ALL)
+    def init_model(self):
+        DetachRolloutWorker.init_model(self)
\ No newline at end of file
diff --git a/recipe/one_step_off_policy/vllm_sharding_manager.py b/recipe/one_step_off_policy/sharding_manager.py
similarity index 94%
rename from recipe/one_step_off_policy/vllm_sharding_manager.py
rename to recipe/one_step_off_policy/sharding_manager.py
index c33ba585470..bc3dae69031 100644
--- a/recipe/one_step_off_policy/vllm_sharding_manager.py
+++ b/recipe/one_step_off_policy/sharding_manager.py
@@ -30,14 +30,14 @@
 logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN"))
 
 
-class VLLMShardingManager(BaseShardingManager):
+class DetachShardingManager(BaseShardingManager):
     @check_device_is_available()
     def __init__(self, inference_engine, device_mesh: DeviceMesh):
         self.device_mesh = device_mesh
         self.inference_engine = inference_engine
-        inference_engine.wake_up()
-        assert device_mesh is not None
-        assert inference_engine is not None
+        # inference_engine.wake_up()
+        # assert device_mesh is not None
+        # assert inference_engine is not None
         self.tp_size = self.device_mesh["infer_tp"].size()
         self.tp_rank = self.device_mesh["infer_tp"].get_local_rank()
         self.timing = {}
diff --git a/verl/experimental/agent_loop/agent_loop.py b/verl/experimental/agent_loop/agent_loop.py
index b658526b7d7..29f2b30edb7 100644
--- a/verl/experimental/agent_loop/agent_loop.py
+++ b/verl/experimental/agent_loop/agent_loop.py
@@ -478,7 +478,7 @@ def __init__(self, config: DictConfig, worker_group: RayWorkerGroup):
 
         Args:
             config (DictConfig): trainer config.
-            worker_group (RayWorkerGroup): ActorRolloutRef worker group.
+            worker_group (RayWorkerGroup): AsyncActorRolloutRefWorker worker group.
         """
         self.config = config
         self.worker_group = worker_group
diff --git a/verl/trainer/ppo/ray_trainer.py b/verl/trainer/ppo/ray_trainer.py
index d16c2736bcc..60621021b30 100644
--- a/verl/trainer/ppo/ray_trainer.py
+++ b/verl/trainer/ppo/ray_trainer.py
@@ -438,15 +438,15 @@ def _validate_config(self):
             megatron_dp = n_gpus // (
                 model_parallel_size * config.actor_rollout_ref.actor.megatron.context_parallel_size
             )
-            minimal_bsz = megatron_dp * config.actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu
+            self.minimal_bsz = megatron_dp * config.actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu
         else:
-            minimal_bsz = n_gpus
+            self.minimal_bsz = n_gpus
 
         # 1. Check total batch size for data correctness
         real_train_batch_size = config.data.train_batch_size * config.actor_rollout_ref.rollout.n
-        assert real_train_batch_size % minimal_bsz == 0, (
+        assert real_train_batch_size % self.minimal_bsz == 0, (
             f"real_train_batch_size ({real_train_batch_size}) must be divisible by minimal possible batch size "
-            f"({minimal_bsz})"
+            f"({self.minimal_bsz})"
         )
 
         # A helper function to check "micro_batch_size" vs "micro_batch_size_per_gpu"
diff --git a/verl/workers/rollout/vllm_rollout/vllm_async_server.py b/verl/workers/rollout/vllm_rollout/vllm_async_server.py
index 988dac407d7..8c0d608871f 100644
--- a/verl/workers/rollout/vllm_rollout/vllm_async_server.py
+++ b/verl/workers/rollout/vllm_rollout/vllm_async_server.py
@@ -57,6 +57,12 @@ def _get_model_runner_workers(vllm_config, init_ray: bool = True):
         actor_name for actor_name in ray.util.list_named_actors() if actor_name.startswith(f"{wg_prefix}WorkerDict")
     ]
 
+    print(f"namespace: {namespace}")
+    print(f"wg_prefix: {wg_prefix}")
+    print(f"vllm_dp_size: {vllm_dp_size}")
+    print(f"vllm_dp_rank: {vllm_dp_rank}")
+    print(f"actor_names: {actor_names}")
+
     vllm_tp_size = vllm_config.parallel_config.tensor_parallel_size
     assert len(actor_names) == vllm_dp_size * vllm_tp_size, (
         f"instance_id: {vllm_config.instance_id} has {len(actor_names)} actors, but vllm_dp_size: "
@@ -84,6 +90,7 @@ class ExternalRayDistributedExecutor(Executor):
     uses_ray: bool = False
 
     def _init_executor(self) -> None:
+        print("[ExternalRayDistributedExecutor] Initializing ray actors...")
         self.workers = _get_model_runner_workers(vllm_config=self.vllm_config, init_ray=True)
 
         kwargs = dict(
@@ -93,10 +100,11 @@ def _init_executor(self) -> None:
             distributed_init_method="env://",
             is_driver_worker=True,
         )
+        print(f"ray start instance_id: {self.vllm_config.instance_id} initializes")
         self.collective_rpc("init_worker", args=([kwargs],))
         self.collective_rpc("init_device")
         self.collective_rpc("load_model")
-        print(f"instance_id: {self.vllm_config.instance_id} initializes finished.")
+        print(f"ray instance_id: {self.vllm_config.instance_id} initializes finished.")
 
     def collective_rpc(
         self,
@@ -128,6 +136,7 @@ class ExternalZeroMQDistributedExecutor(Executor):
     uses_ray: bool = False
 
     def _init_executor(self) -> None:
+        print(f"[ExternalZeroMQDistributedExecutor] Initializing ray actors...")
         addresses = os.environ["VERL_VLLM_ZMQ_ADDRESSES"].split(",")
         self.context = zmq.Context()
         self.sockets = []
@@ -143,9 +152,11 @@ def _init_executor(self) -> None:
             distributed_init_method="env://",
             is_driver_worker=True,
         )
+        print(f"ZeroMQ start instance_id: {self.vllm_config.instance_id} initializes")
         self.collective_rpc("init_worker", args=([kwargs],))
         self.collective_rpc("init_device")
         self.collective_rpc("load_model")
+        print(f"ZeroMQ instance_id: {self.vllm_config.instance_id} initializes finished.")
 
     def collective_rpc(
         self,
@@ -264,8 +275,12 @@ async def init_engine(self):
 
         # init async llm engine
         vllm_config = self._create_engine_config(engine_args)
+
+        print(f"AsyncvLLMServer AsyncLLM.from_vllm_config {vllm_config}")
         self.engine = AsyncLLM.from_vllm_config(vllm_config)
 
+        print("AsyncvLLMServer build serving chat")
+
         # build serving chat
         model_config = self.engine.model_config
         BASE_MODEL_PATHS = [BaseModelPath(name=model_name, model_path=model_path)]
@@ -282,6 +297,8 @@ async def init_engine(self):
             tool_parser=config.multi_turn.format,  # hermes, llama3_json, ...
         )
 
+        print("AsyncvLLMServer init_engine success")
+
     def _create_engine_config(self, engine_args: AsyncEngineArgs):
         vllm_config = engine_args.create_engine_config()
         namespace = ray.get_runtime_context().namespace
diff --git a/verl/workers/rollout/vllm_rollout/vllm_rollout_spmd.py b/verl/workers/rollout/vllm_rollout/vllm_rollout_spmd.py
index 5bd571016ac..307e7e77036 100644
--- a/verl/workers/rollout/vllm_rollout/vllm_rollout_spmd.py
+++ b/verl/workers/rollout/vllm_rollout/vllm_rollout_spmd.py
@@ -58,6 +58,7 @@
 logger = logging.getLogger(__file__)
 logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN"))
 
+
 # TODO
 # 1. support pp in vllm
 # 2. passing tokenizer is not necessary? no encoding/decoding is happending here
@@ -108,11 +109,11 @@ def __init__(self, model_path: str, config: DictConfig, tokenizer, model_hf_conf
             if hasattr(model_hf_config, "max_position_embeddings"):
                 max_position_embeddings = model_hf_config.max_position_embeddings
             elif hasattr(model_hf_config, "llm_config") and hasattr(
-                model_hf_config.llm_config, "max_position_embeddings"
+                    model_hf_config.llm_config, "max_position_embeddings"
             ):
                 max_position_embeddings = model_hf_config.llm_config.max_position_embeddings
             elif hasattr(model_hf_config, "text_config") and hasattr(
-                model_hf_config.text_config, "max_position_embeddings"
+                    model_hf_config.text_config, "max_position_embeddings"
             ):
                 max_position_embeddings = model_hf_config.text_config.max_position_embeddings
             if max_position_embeddings is None:
@@ -127,12 +128,12 @@ def __init__(self, model_path: str, config: DictConfig, tokenizer, model_hf_conf
             rope_scaling_factor = rope_scaling_config.get("factor", 1.0)
 
             assert (
-                model_hf_config.max_position_embeddings * rope_scaling_factor
-                >= config.prompt_length + config.response_length
+                    model_hf_config.max_position_embeddings * rope_scaling_factor
+                    >= config.prompt_length + config.response_length
             ), (
-                "model context length should be greater than total sequence length, "
-                + f"got rope_scaling_factor={rope_scaling_factor} and "
-                + f"max_position_embeddings={model_hf_config.max_position_embeddings}"
+                    "model context length should be greater than total sequence length, "
+                    + f"got rope_scaling_factor={rope_scaling_factor} and "
+                    + f"max_position_embeddings={model_hf_config.max_position_embeddings}"
             )
 
         max_model_len = int(config.max_model_len or config.prompt_length + config.response_length)
@@ -267,7 +268,7 @@ def generate_sequences(self, prompts: DataProto, **kwargs) -> DataProto:
         if "multi_modal_data" in non_tensor_batch:
             vllm_inputs = []
             for raw_prompt_ids, multi_modal_data in zip(
-                non_tensor_batch.pop("raw_prompt_ids"), non_tensor_batch.pop("multi_modal_data"), strict=True
+                    non_tensor_batch.pop("raw_prompt_ids"), non_tensor_batch.pop("multi_modal_data"), strict=True
             ):
                 vllm_inputs.append({"prompt_token_ids": raw_prompt_ids, "multi_modal_data": multi_modal_data})
         else:
@@ -389,9 +390,9 @@ def _monkey_patch_compute_logits(model, vocab_size: int):
     original_compute_logits = model.compute_logits
 
     def compute_logits(
-        self,
-        hidden_states: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
+            self,
+            hidden_states: torch.Tensor,
+            sampling_metadata: SamplingMetadata,
     ) -> torch.Tensor:
         logits = original_compute_logits(hidden_states, sampling_metadata)
         logits[..., vocab_size:] = float("-inf")
@@ -458,6 +459,8 @@ def get_zeromq_address(self):
 
     def init_worker(self, all_kwargs: list[dict[str, Any]]):
         """Initialize worker engine."""
+
+        print("[vLLMAsyncRollout] init_worker")
         all_kwargs[0]["rank"] = int(os.environ["RANK"])
         all_kwargs[0]["local_rank"] = 0
 
@@ -468,6 +471,8 @@ def init_worker(self, all_kwargs: list[dict[str, Any]]):
     def load_model(self, *args, **kwargs):
         self.inference_engine.load_model(*args, **kwargs)
 
+        print(f"[vLLMAsyncRollout] load_model {args} {kwargs}")
+
         # inference engine is initialized now, update sharding manager
         self.sharding_manager.inference_engine = self.inference_engine
         self.sharding_manager.model_runner = self.inference_engine.worker.model_runner

From 6ddb460a9507bc74d1b4db2afe1d89c66f9c291c Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Thu, 14 Aug 2025 21:26:07 +0800
Subject: [PATCH 046/182] update shell

---
 tests/special_e2e/run_fully_async_policy.sh | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/tests/special_e2e/run_fully_async_policy.sh b/tests/special_e2e/run_fully_async_policy.sh
index 27c033abc1d..5662d6cb479 100644
--- a/tests/special_e2e/run_fully_async_policy.sh
+++ b/tests/special_e2e/run_fully_async_policy.sh
@@ -13,6 +13,14 @@ MODEL_ID=${MODEL_ID:-Qwen/Qwen2.5-0.5B-Instruct}
 MODEL_PATH=${MODEL_PATH:-${HOME}/models/${MODEL_ID}}
 huggingface-cli download "${MODEL_ID}" --local-dir "${MODEL_PATH}"
 
+
+rollout_mode="async"
+rollout_name="vllm" # sglang or vllm
+if [ "$rollout_mode" = "async" ]; then
+    export VLLM_USE_V1=1
+    return_raw_chat="True"
+fi
+
 # Algorithm parameters
 adv_estimator=grpo
 
@@ -33,11 +41,13 @@ overlong_penalty_factor=1.0
 
 # Training parameters
 loss_agg_mode="token-mean"
-train_prompt_bsz=2
-gen_prompt_bsz=2
+train_prompt_bsz=0
+gen_prompt_bsz=1
 n_resp_per_prompt=3
 train_prompt_mini_bsz=1
 
+total_rollout_steps=10
+
 # Temperature parameters
 temperature=1.0
 top_p=1.0
@@ -67,6 +77,7 @@ common_params=(
     data.max_response_length=${max_response_length}
     data.train_batch_size=${train_prompt_bsz}
     data.gen_batch_size=${gen_prompt_bsz}
+    data.return_raw_chat=${return_raw_chat}
     actor_rollout_ref.rollout.n=${n_resp_per_prompt}
     algorithm.adv_estimator=${adv_estimator}
     algorithm.use_kl_in_reward=${use_kl_in_reward}
@@ -95,6 +106,8 @@ common_params=(
     actor_rollout_ref.rollout.val_kwargs.do_sample=True
     actor_rollout_ref.rollout.val_kwargs.n=1
     actor_rollout_ref.rollout.enable_chunked_prefill=True
+    actor_rollout_ref.rollout.name=${rollout_name}
+    actor_rollout_ref.rollout.mode=${rollout_mode}
     reward_model.reward_manager=dapo
     +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer}
     +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len}
@@ -112,7 +125,7 @@ common_params=(
     trainer.n_gpus_per_node=${n_gpus_training}
     rollout.nnodes=1
     rollout.n_gpus_per_node=${n_gpus_rollout}
-    rollout.total_rollout_steps=10
+    rollout.total_rollout_steps=${total_rollout_steps}
     rollout.total_epochs=2
     # Fully async specific configurations
     async_training.staleness_threshold=${staleness_threshold}

From 12edb900e41c67b7a19ffd46e712a3bb095d1797 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Thu, 14 Aug 2025 23:41:58 +0800
Subject: [PATCH 047/182] stream rollout

---
 recipe/fully_async_policy/fully_async_main.py | 28 ++++++------
 .../fully_async_rollouter.py                  | 36 +++++++--------
 .../fully_async_policy/fully_async_trainer.py |  9 ++--
 recipe/fully_async_policy/message_queue.py    |  4 ++
 ..._manager.py => detach_sharding_manager.py} | 10 +++--
 recipe/one_step_off_policy/fsdp_workers.py    | 45 ++++++++++++++++---
 tests/special_e2e/run_fully_async_policy.sh   |  2 +-
 7 files changed, 88 insertions(+), 46 deletions(-)
 rename recipe/one_step_off_policy/{sharding_manager.py => detach_sharding_manager.py} (92%)

diff --git a/recipe/fully_async_policy/fully_async_main.py b/recipe/fully_async_policy/fully_async_main.py
index 0e43bd6151b..20234847ea8 100644
--- a/recipe/fully_async_policy/fully_async_main.py
+++ b/recipe/fully_async_policy/fully_async_main.py
@@ -177,23 +177,22 @@ def _initialize_components(self, config) -> None:
         self.components["reward_fn"] = reward_fn
         self.components["val_reward_fn"] = val_reward_fn
 
-        self.max_queue_size = ((config.async_training.staleness_threshold + 1)
-                               * config.data.train_batch_size
-                               * config.actor_rollout_ref.rollout.n
-                               ) * 10  # x 10 avoid deadlock
-        print("[ASYNC MAIN] Creating MessageQueue...")
-        message_queue = MessageQueue.remote(config, self.max_queue_size)
-        message_queue_client = MessageQueueClient(message_queue)
-
-        self.components["message_queue"] = message_queue
-        self.components["message_queue_client"] = message_queue_client
-
         print("[ASYNC MAIN] Creating FullyAsyncRollouter...")
         self._create_rollouter(config)
 
         print("[ASYNC MAIN] Creating FullyAsyncTrainer...")
         self._create_trainer(config)
 
+        print("[ASYNC MAIN] Creating MessageQueue...")
+        max_queue_size = ray.get(self.components["rollouter"].get_max_queue_size.remote())
+        message_queue = MessageQueue.remote(config, max_queue_size)
+        message_queue_client = MessageQueueClient(message_queue)
+        self.components["message_queue"] = message_queue
+        self.components["message_queue_client"] = message_queue_client
+
+        ray.get(self.components["rollouter"].set_message_queue_client.remote(self.components["message_queue_client"]))
+        ray.get(self.components["trainer"].set_message_queue_client.remote(self.components["message_queue_client"]))
+
         print("[ASYNC MAIN] Setting up parameter synchronization...")
         from recipe.fully_async_policy.param_sync import ParameterSynchronizer
 
@@ -220,12 +219,10 @@ def _create_rollouter(self, config) -> None:
             resource_pool_manager=create_resource_pool_manager(config, roles=[Role.Rollout]),
             ray_worker_group_cls=self.components["ray_worker_group_cls"],
             processor=self.components["processor"],
-            device_name=config.trainer.device,
-            max_queue_size=self.max_queue_size,
+            device_name=config.trainer.device
         )
 
         ray.get(rollouter.init_workers.remote())
-        ray.get(rollouter.set_message_queue_client.remote(self.components["message_queue_client"]))
         self.components["rollouter"] = rollouter
         print("[ASYNC MAIN] Rollouter created and initialized successfully")
 
@@ -249,7 +246,6 @@ def _create_trainer(self, config) -> None:
         )
 
         ray.get(trainer.init_workers.remote())
-        ray.get(trainer.set_message_queue_client.remote(self.components["message_queue_client"]))
         self.components["trainer"] = trainer
         print("[ASYNC MAIN] FullyAsyncTrainer created and initialized successfully")
 
@@ -279,3 +275,5 @@ def main(config):
 
 if __name__ == "__main__":
     main()
+
+
diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py
index 5f6f2b0d589..a31d903d43a 100644
--- a/recipe/fully_async_policy/fully_async_rollouter.py
+++ b/recipe/fully_async_policy/fully_async_rollouter.py
@@ -43,8 +43,7 @@ def __init__(
             processor=None,
             reward_fn=None,
             val_reward_fn=None,
-            device_name=None,
-            max_queue_size=1000,
+            device_name=None
     ):
         # Store the tokenizer for text processing
         self.tokenizer = tokenizer
@@ -59,7 +58,6 @@ def __init__(
         assert self.config.data.train_batch_size == 0, "train_batch_size must be zero"
         assert self.config.data.gen_batch_size == 1, "gen_batch_size must be one"
 
-
         self.role_worker_mapping = role_worker_mapping
         self.resource_pool_manager = resource_pool_manager
         self.ray_worker_group_cls = ray_worker_group_cls
@@ -125,9 +123,6 @@ def __init__(
         # Parameter synchronization related
         self.param_synchronizer = None
 
-        # queue size
-        self.max_queue_size = max_queue_size
-
         self.async_rollout_manager = None
 
         # 流式处理相关配置
@@ -144,6 +139,8 @@ def __init__(
                                                         config.actor_rollout_ref.actor.ppo_mini_batch_size)
         self.max_required_samples = self.required_samples * (self.staleness_threshold + 1)
 
+        # queue size
+        self.max_queue_size = self.max_required_samples * 10  # x 10 avoid deadlock
 
     async def set_message_queue_client(self, message_queue_client: MessageQueueClient):
         """Set message queue client"""
@@ -159,6 +156,9 @@ def get_rollout_wg(self):
         """Get rollout worker group"""
         return self.rollout_wg
 
+    def get_max_queue_size(self):
+        return self.max_queue_size
+
     async def update_param_version(self, version: int):
         """Update current parameter version"""
         async with self.lock:
@@ -227,7 +227,7 @@ async def _feed_samples(self):
 
             # 检查是否到达最后一步
             if self.global_steps >= self.total_rollout_steps:
-                print("[FullyAsyncRollouter] 达到最大步数，停止添加新样本")
+                print(f"[FullyAsyncRollouter] 达到最大步数，停止添加新样本 {self.global_steps} >= {self.total_rollout_steps}")
                 break
 
             self.global_steps += 1
@@ -334,7 +334,7 @@ async def _consumer_worker(self):
                 data=result["agent_loop_output"],  # 直接存储 AgentLoopOutput
                 rollout_metadata=rollout_metadata,
             )
-            success = self.message_queue_client.put_sample(
+            success = await self.message_queue_client.put_sample(
                 sample=ray.cloudpickle.dumps(queue_sample),
                 param_version=result["param_version"],
             )
@@ -432,13 +432,16 @@ async def _streaming_generation_main(self):
             self.running = False
 
         # 发送终止信号
-        self.message_queue_client.put_sample(
+        await self.message_queue_client.put_sample(
             sample=None,
             param_version=self.current_param_version,
         )
 
-    def fit(self):
-        """Start the async rollouter - entry point that sets up and runs async tasks"""
+    async def fit(self):
+        """
+        Start the async rollouter - entry point that sets up and runs async tasks
+        Main async fit method that coordinates all coroutines"""
+
         print("[FullyAsyncRollouter] Starting FullyAsyncRollouter...")
 
         if self.message_queue_client is None:
@@ -446,11 +449,6 @@ def fit(self):
         if self.param_synchronizer is None:
             raise ValueError("param_synchronizer client not set. Call set_parameter_synchronizer() first.")
 
-        # Run everything in a single async event loop
-        asyncio.run(self._async_fit())
-
-    async def _async_fit(self):
-        """Main async fit method that coordinates all coroutines"""
         # 设置运行状态
         async with self.lock:
             self.running = True
@@ -506,7 +504,7 @@ async def _async_monitor_loop(self):
 
     async def _should_pause_generation(self) -> bool:
         """Determine whether the build should be paused"""
-        queue_stats = self.message_queue_client.get_statistics()
+        queue_stats = self.message_queue_client.get_statistics_sync()
         queue_size = queue_stats["queue_size"]
         current_trainer_version = queue_stats["current_param_version"]
 
@@ -571,7 +569,7 @@ async def resume(self) -> bool:
 
     async def get_statistics(self) -> dict:
         async with self.lock:
-            queue_stats = self.message_queue_client.get_statistics()
+            queue_stats = self.message_queue_client.get_statistics_sync()
             stats = {
                 "is_running": self.running,
                 "total_generated_samples": self.total_generated_samples,
@@ -587,3 +585,5 @@ async def get_statistics(self) -> dict:
             }
 
             return stats
+
+
diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py
index 21b2eda259e..7b1c725f667 100644
--- a/recipe/fully_async_policy/fully_async_trainer.py
+++ b/recipe/fully_async_policy/fully_async_trainer.py
@@ -139,7 +139,7 @@ def _get_samples_from_queue(self) -> tuple[None, None] | tuple[int, Any]:
 
         while len(queue_samples) < self.required_samples:
             # 获取单个样本，会一直等待直到有样本或收到None
-            sample = self.message_queue_client.get_sample()
+            sample = self.message_queue_client.get_sample_sync()
 
             if sample is None:
                 # 检测到结束信号（None），立即退出
@@ -202,7 +202,6 @@ def _assemble_gen_batch_output_from_queue_samples(self, queue_samples: list[Queu
             processing_times.append(sample.rollout_metadata.get("processing_time", 0))
 
         # Use the static method to postprocess AgentLoopOutput list into DataProto
-        from verl.experimental.agent_loop.agent_loop import AgentLoopWorker
 
         batch = postprocess_agent_loop_outputs(agent_loop_outputs, self.tokenizer, self.config)
 
@@ -299,6 +298,9 @@ def _init_models(self):
         self.actor_wg.init_model()
         self.actor_rollout_wg = self.actor_wg  # to be compatible with the functions that not be modified
 
+    def _init_async_rollout_manager(self):
+        pass
+
     def fit(self):
         """
         The training loop of PPO.
@@ -385,7 +387,7 @@ def fit(self):
 
     def get_statistics(self) -> dict:
         """Get training statistics"""
-        queue_stats = self.message_queue_client.get_statistics() if self.message_queue_client else {}
+        queue_stats = self.message_queue_client.get_statistics_sync() if self.message_queue_client else {}
         return {
             "global_steps": self.global_steps,
             "processed_samples": self.processed_samples,
@@ -460,3 +462,4 @@ def _compute_sample_freshness_metrics(self, batch_samples: list[QueueSample]) ->
         except Exception as e:
             logger.error(f"Error computing freshness metrics: {e}")
             return {"freshness/error": str(e)}
+
diff --git a/recipe/fully_async_policy/message_queue.py b/recipe/fully_async_policy/message_queue.py
index 2e8ad6b0e79..9a093296743 100644
--- a/recipe/fully_async_policy/message_queue.py
+++ b/recipe/fully_async_policy/message_queue.py
@@ -311,6 +311,10 @@ def get_samples_sync(self, min_batch_count: int = 1) -> tuple[list[Any], int]:
         """Get batch from queue (sync - deprecated, use get_samples instead)"""
         return ray.get(self.queue_actor.get_samples.remote(min_batch_count))
 
+    def get_sample_sync(self) -> Any | None:
+        """Get single sample from queue (sync - deprecated, use get_sample instead)"""
+        return ray.get(self.queue_actor.get_sample.remote())
+
     def get_statistics_sync(self) -> dict[str, Any]:
         """Get statistics (sync - deprecated, use get_statistics instead)"""
         return ray.get(self.queue_actor.get_statistics.remote())
diff --git a/recipe/one_step_off_policy/sharding_manager.py b/recipe/one_step_off_policy/detach_sharding_manager.py
similarity index 92%
rename from recipe/one_step_off_policy/sharding_manager.py
rename to recipe/one_step_off_policy/detach_sharding_manager.py
index bc3dae69031..6b304baa276 100644
--- a/recipe/one_step_off_policy/sharding_manager.py
+++ b/recipe/one_step_off_policy/detach_sharding_manager.py
@@ -47,12 +47,14 @@ def __init__(self, inference_engine, device_mesh: DeviceMesh):
 
     @GPUMemoryLogger(role="vllm sharding_manager", logger=logger)
     def __enter__(self):
-        get_torch_device().set_rng_state(self.gen_random_states)
+        # get_torch_device().set_rng_state(self.gen_random_states)
+        pass
 
     @GPUMemoryLogger(role="vllm sharding_manager", logger=logger)
     def __exit__(self, exc_type, exc_value, traceback):
-        self.gen_random_states = get_torch_device().get_rng_state()
-        self.inference_engine.reset_prefix_cache()
+        # self.gen_random_states = get_torch_device().get_rng_state()
+        # self.inference_engine.reset_prefix_cache()
+        pass
 
     @GPUMemoryLogger(role="vllm sharding_manager", logger=logger)
     def preprocess_data(self, data: DataProto) -> DataProto:
@@ -72,3 +74,5 @@ def postprocess_data(self, data: DataProto) -> DataProto:
             return data
 
         return data.chunk(chunks=self.tp_size)[self.tp_rank]
+
+
diff --git a/recipe/one_step_off_policy/fsdp_workers.py b/recipe/one_step_off_policy/fsdp_workers.py
index e6ab9d1c241..04c66c8b60a 100644
--- a/recipe/one_step_off_policy/fsdp_workers.py
+++ b/recipe/one_step_off_policy/fsdp_workers.py
@@ -50,6 +50,32 @@
 __all__ = ["DetachActorWorker", "DetachRolloutWorker", "DetachAsyncRolloutWorker", "CriticWorker"]
 
 
+def get_inference_model(rollout):
+    """
+    根据不同类型的inference_engine获取模型对象
+    Args:
+        rollout: rollout对象，包含inference_engine
+    Returns:
+        model: 模型对象
+    """
+    inference_engine = rollout.inference_engine
+    # 判断inference_engine的类型
+    if hasattr(inference_engine, 'llm_engine'):
+        # LLM类型 - vLLMRollout
+        inference_model = (
+            inference_engine.llm_engine.model_executor.driver_worker.worker.model_runner.model
+        )
+    elif hasattr(inference_engine, 'worker'):
+        # WorkerWrapperBase类型 - vLLMAsyncRollout
+        inference_model = inference_engine.worker.model_runner.model
+    else:
+        raise AttributeError(
+            f"Unsupported inference_engine type: {type(inference_engine)}. "
+            f"Expected LLM (with llm_engine attribute) or WorkerWrapperBase (with worker attribute)."
+        )
+    return inference_model
+
+
 class DetachNcclSync(ActorRolloutRefWorker):
 
     def _get_actor_params(self):
@@ -62,9 +88,7 @@ def sync_rollout_weights(self):
 
         params = self._get_actor_params() if self._is_actor else None
         if self._is_rollout:
-            inference_model = (
-                self.rollout.inference_engine.llm_engine.model_executor.driver_worker.worker.model_runner.model
-            )
+            inference_model = get_inference_model(self.rollout)
             patch_vllm_moe_model_weight_loader(inference_model)
         for key, shape, dtype in self._weights_info:
             tensor = torch.empty(shape, dtype=dtype, device=get_torch_device().current_device())
@@ -209,15 +233,15 @@ def init_model(self):
         )
         log_gpu_memory_usage(f"After building {rollout_name} rollout", logger=logger)
 
-        from sharding_manager import DetachShardingManager
-        rollout_sharding_manager = DetachShardingManager(
+        from .detach_sharding_manager import DetachShardingManager
+        sharding_manager = DetachShardingManager(
             inference_engine=rollout.inference_engine, device_mesh=rollout_device_mesh
         )
 
         log_gpu_memory_usage("After building sharding manager", logger=logger)
 
         self.rollout = rollout
-        self.rollout_sharding_manager = rollout_sharding_manager
+        self.rollout_sharding_manager = sharding_manager
 
     @register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO, blocking=False)
     def async_generate_sequences(self, *args, **kwargs):
@@ -238,3 +262,12 @@ def __init__(self, config: DictConfig, role: str):
     def init_model(self):
         print(f"[DetachAsyncRolloutWorker] init_model")
         DetachRolloutWorker.init_model(self)
+
+        self.vllm_tp_size = self.config.rollout.tensor_model_parallel_size
+        self.vllm_dp_rank = int(os.environ["RANK"]) // self.vllm_tp_size
+        self.vllm_tp_rank = int(os.environ["RANK"]) % self.vllm_tp_size
+
+        # used for sleep/wake_up
+        self.rollout.sharding_manager = self.rollout_sharding_manager
+
+
diff --git a/tests/special_e2e/run_fully_async_policy.sh b/tests/special_e2e/run_fully_async_policy.sh
index 5662d6cb479..337f2991a16 100644
--- a/tests/special_e2e/run_fully_async_policy.sh
+++ b/tests/special_e2e/run_fully_async_policy.sh
@@ -46,7 +46,7 @@ gen_prompt_bsz=1
 n_resp_per_prompt=3
 train_prompt_mini_bsz=1
 
-total_rollout_steps=10
+total_rollout_steps=1000
 
 # Temperature parameters
 temperature=1.0

From efa664073cc151a7b1272b509323f511b6bef03b Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Fri, 15 Aug 2025 01:03:25 +0800
Subject: [PATCH 048/182] RolloutSample

---
 recipe/fully_async_policy/fully_async_main.py |  12 +-
 .../fully_async_rollouter.py                  | 199 ++++++++++++------
 .../fully_async_policy/fully_async_trainer.py | 198 +++++++++--------
 recipe/fully_async_policy/message_queue.py    |  36 +++-
 recipe/fully_async_policy/utils.py            |   1 +
 .../detach_sharding_manager.py                |   2 -
 recipe/one_step_off_policy/fsdp_workers.py    |  17 +-
 recipe/one_step_off_policy/main_ppo.py        |  21 +-
 .../one_step_off_policy/megatron_workers.py   |  10 +-
 9 files changed, 287 insertions(+), 209 deletions(-)

diff --git a/recipe/fully_async_policy/fully_async_main.py b/recipe/fully_async_policy/fully_async_main.py
index 20234847ea8..1d4e64b1ca4 100644
--- a/recipe/fully_async_policy/fully_async_main.py
+++ b/recipe/fully_async_policy/fully_async_main.py
@@ -82,21 +82,23 @@ def create_role_worker_mapping(config):
     if config.actor_rollout_ref.actor.strategy == "fsdp2":
         assert config.actor_rollout_ref.actor.strategy == config.critic.strategy
         from recipe.one_step_off_policy.fsdp_workers import (
+            CriticWorker,
             DetachActorWorker,
             DetachAsyncRolloutWorker,
-            CriticWorker,
         )
         from verl.single_controller.ray import RayWorkerGroup
+
         ray_worker_group_cls = RayWorkerGroup
 
     elif config.actor_rollout_ref.actor.strategy == "megatron":
         assert config.actor_rollout_ref.actor.strategy == config.critic.strategy
         from recipe.one_step_off_policy.megatron_workers import (
+            CriticWorker,
             DetachActorWorker,
             DetachAsyncRolloutWorker,
-            CriticWorker,
         )
         from verl.single_controller.ray.megatron import NVMegatronRayWorkerGroup
+
         ray_worker_group_cls = NVMegatronRayWorkerGroup
 
     else:
@@ -120,7 +122,7 @@ def create_role_worker_mapping(config):
 
     # 添加reference policy（如果需要KL loss或reward）
     if config.algorithm.use_kl_in_reward or config.actor_rollout_ref.actor.use_kl_loss:
-        role_worker_mapping[Role.RefPolicy] = ray.remote(ActorRolloutRefWorker)
+        role_worker_mapping[Role.RefPolicy] = ray.remote(DetachActorWorker)
 
     return role_worker_mapping, ray_worker_group_cls
 
@@ -219,7 +221,7 @@ def _create_rollouter(self, config) -> None:
             resource_pool_manager=create_resource_pool_manager(config, roles=[Role.Rollout]),
             ray_worker_group_cls=self.components["ray_worker_group_cls"],
             processor=self.components["processor"],
-            device_name=config.trainer.device
+            device_name=config.trainer.device,
         )
 
         ray.get(rollouter.init_workers.remote())
@@ -275,5 +277,3 @@ def main(config):
 
 if __name__ == "__main__":
     main()
-
-
diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py
index a31d903d43a..2e729228153 100644
--- a/recipe/fully_async_policy/fully_async_rollouter.py
+++ b/recipe/fully_async_policy/fully_async_rollouter.py
@@ -18,8 +18,9 @@
 import ray
 from omegaconf import OmegaConf
 
-from recipe.fully_async_policy.message_queue import MessageQueueClient, QueueSample
+from recipe.fully_async_policy.message_queue import MessageQueueClient, RolloutSample
 from recipe.fully_async_policy.utils import calculate_one_step_size
+from verl import DataProto
 from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup
 from verl.trainer.ppo.ray_trainer import RayPPOTrainer, ResourcePoolManager, Role, WorkerType
 from verl.utils.tracking import ValidationGenerationsLogger
@@ -34,16 +35,16 @@ class FullyAsyncRollouter(RayPPOTrainer):
     """
 
     def __init__(
-            self,
-            config,
-            tokenizer,
-            role_worker_mapping: dict[Role, WorkerType],
-            resource_pool_manager: ResourcePoolManager,
-            ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup,
-            processor=None,
-            reward_fn=None,
-            val_reward_fn=None,
-            device_name=None
+        self,
+        config,
+        tokenizer,
+        role_worker_mapping: dict[Role, WorkerType],
+        resource_pool_manager: ResourcePoolManager,
+        ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup,
+        processor=None,
+        reward_fn=None,
+        val_reward_fn=None,
+        device_name=None,
     ):
         # Store the tokenizer for text processing
         self.tokenizer = tokenizer
@@ -135,8 +136,9 @@ def __init__(
         self.queue_full_pause_count = 0  # 队列满导致的暂停次数
 
         # Calculate the samples needed for a train, used to calculate staleness and interrupt rollout
-        self.required_samples = calculate_one_step_size(self.minimal_bsz,
-                                                        config.actor_rollout_ref.actor.ppo_mini_batch_size)
+        self.required_samples = calculate_one_step_size(
+            self.minimal_bsz, config.actor_rollout_ref.actor.ppo_mini_batch_size
+        )
         self.max_required_samples = self.required_samples * (self.staleness_threshold + 1)
 
         # queue size
@@ -216,34 +218,105 @@ async def _feed_samples(self):
         continuous_iterator = self._create_continuous_iterator()
         sample_count = 0
         for epoch, batch_dict in continuous_iterator:
-            # 准备样本数据
-            sample_id = f"sample_{epoch}_{sample_count}"
-            batch, gen_batch = self._prepare_generate_batch(batch_dict)
+            # 类似 _prepare_generate_batch 的逻辑：分离数据
+            original_batch, gen_data = self._prepare_single_generation_data(batch_dict)
+
+            # 根据 rollout.n 进行重复
+            n_repeats = self.config.actor_rollout_ref.rollout.n
+
+            for rollout_n_index in range(n_repeats):
+                sample_id = f"sample_{epoch}_{sample_count}_{rollout_n_index}"
+
+                partial_rollout_sample = RolloutSample(
+                    original_batch_dict=original_batch,
+                    agent_loop_output=None,  # 待处理后填充
+                    sample_id=sample_id,
+                    epoch=epoch,
+                    rollout_n_index=rollout_n_index,
+                    original_sample_index=sample_count,
+                    processing_time=0.0,  # 待处理后填充
+                    generation_timestamp=0.0,  # 待处理后填充
+                    param_version=0,  # 待处理后填充
+                    _gen_data=gen_data,  # 临时字段，处理完后删除
+                )
 
-            sample_data = {"sample_id": sample_id, "gen_batch": gen_batch, "epoch": epoch, "timestamp": time.time()}
+                # 将生成数据附加到 RolloutSample 中（临时字段）
 
-            await self.pending_samples_queue.put(sample_data)
-            sample_count += 1
+                await self.pending_samples_queue.put(partial_rollout_sample)
 
-            # 检查是否到达最后一步
-            if self.global_steps >= self.total_rollout_steps:
-                print(f"[FullyAsyncRollouter] 达到最大步数，停止添加新样本 {self.global_steps} >= {self.total_rollout_steps}")
-                break
+                # 检查是否到达最后一步
+                if self.global_steps >= self.total_rollout_steps:
+                    print(
+                        f"[FullyAsyncRollouter] 达到最大步数，停止添加新样本 "
+                        f"{self.global_steps} >= {self.total_rollout_steps}"
+                    )
+                    break
+
+                self.global_steps += 1
 
-            self.global_steps += 1
+            sample_count += 1
 
         # 发送结束信号
         await self.pending_samples_queue.put("DONE")
 
+    def _prepare_single_generation_data(self, batch_dict):
+        """
+        类似 ray_trainer._prepare_generate_batch 的逻辑，但针对单个样本
+        分离出用于生成的数据和需要保留的原始数据
+
+        Returns:
+            tuple: (original_batch_dict, gen_data_for_single_sample)
+        """
+        from verl import DataProto
+
+        # 创建完整的 DataProto
+        full_batch = DataProto.from_single_dict(batch_dict)
+
+        # 定义需要传递给生成服务器的字段
+        batch_keys_to_pop = ["input_ids", "attention_mask", "position_ids"]
+        non_tensor_batch_keys_to_pop = ["raw_prompt_ids"]
+
+        # 处理可选字段
+        optional_fields = [
+            "multi_modal_data",
+            "raw_prompt",
+            "tools_kwargs",
+            "interaction_kwargs",
+            "index",
+            "agent_name",
+        ]
+
+        for field in optional_fields:
+            if field in full_batch.non_tensor_batch:
+                non_tensor_batch_keys_to_pop.append(field)
+
+        # 分离数据：gen_batch 用于生成，original_batch 保留原始信息
+        gen_batch = full_batch.pop(
+            batch_keys=batch_keys_to_pop,
+            non_tensor_batch_keys=non_tensor_batch_keys_to_pop,
+        )
+
+        # 添加全局步数到生成数据
+        gen_batch.meta_info["global_steps"] = self.global_steps
+
+        # 保留原始 batch 信息（转换为字典格式以便序列化）
+        original_batch_dict = {
+            "batch": {k: v.clone() if hasattr(v, "clone") else v for k, v in full_batch.batch.items()},
+            "non_tensor_batch": dict(full_batch.non_tensor_batch),
+            "meta_info": dict(full_batch.meta_info),
+        }
+
+        return original_batch_dict, gen_batch
+
     async def _submit_worker(self):
         """流式处理工作协程 - 逐个样本立即提交处理，不等待批次"""
         active_tasks = set()
 
         while True:
-            # 获取待处理样本
-            sample_data = await self.pending_samples_queue.get()
+            # 获取待处理的部分 RolloutSample
+            partial_rollout_sample = await self.pending_samples_queue.get()
 
-            if sample_data == "DONE":
+            if partial_rollout_sample == "DONE":
                 print("收到结束信号，等待剩余任务完成...")
                 # 等待所有活动任务完成
                 if active_tasks:
@@ -261,41 +334,48 @@ async def _submit_worker(self):
 
             # 立即提交单个样本处理
             task = asyncio.create_task(
-                self._process_single_sample_streaming(sample_data), name=f"process_{sample_data['sample_id']}"
+                self._process_single_sample_streaming(partial_rollout_sample),
+                name=f"process_{partial_rollout_sample.sample_id}",
             )
             active_tasks.add(task)
 
             # 标记队列任务完成
             self.pending_samples_queue.task_done()
 
-    async def _process_single_sample_streaming(self, sample_data: dict):
+    async def _process_single_sample_streaming(self, partial_rollout_sample):
         """流式处理单个样本"""
         # 检查是否需要暂停处理
         if await self._should_pause_generation():
-            print(f"[FullyAsyncRollouter] 暂停处理样本 {sample_data['sample_id']}")
+            print(f"[FullyAsyncRollouter] 暂停处理样本 {partial_rollout_sample.sample_id}")
             # 暂停时重新放回队列
-            await self.pending_samples_queue.put(sample_data)
+            await self.pending_samples_queue.put(partial_rollout_sample)
             return
 
         start_time = time.time()
-        # 直接使用AgentLoopManager的单样本异步处理能力
+
+        # 从 RolloutSample 中提取生成数据（临时字段）
+        gen_data = partial_rollout_sample._gen_data
+
+        # 将单个样本数据包装成 DataProto (用于 generate_single_sample_async)
+        gen_batch_single = DataProto.from_items([gen_data])
+
+        # 调用异步生成方法
         agent_loop_output, processing_time = await self.async_rollout_manager.generate_single_sample_async(
-            sample_data["gen_batch"], sample_data["sample_id"]
+            gen_batch_single, partial_rollout_sample.sample_id
         )
         end_time = time.time()
 
-        # 组装最终结果
-        final_result = {
-            "sample_id": sample_data["sample_id"],
-            "agent_loop_output": agent_loop_output,
-            "processing_time": processing_time,
-            "timestamp": time.time(),
-            "param_version": self.current_param_version,
-            "epoch": sample_data["epoch"],
-        }
+        # 直接更新 RolloutSample 对象，填充剩余字段
+        partial_rollout_sample.agent_loop_output = agent_loop_output
+        partial_rollout_sample.processing_time = processing_time
+        partial_rollout_sample.generation_timestamp = time.time()
+        partial_rollout_sample.param_version = self.current_param_version
 
-        # 立即放入结果队列
-        await self.result_queue.put(final_result)
+        # 删除临时字段
+        delattr(partial_rollout_sample, "_gen_data")
+
+        # 直接放入结果队列
+        await self.result_queue.put(partial_rollout_sample)
 
         async with self.lock:
             self.processed_sample_count += 1
@@ -304,7 +384,7 @@ async def _process_single_sample_streaming(self, sample_data: dict):
                 self.max_processing_time = processing_time
 
         print(
-            f"[FullyAsyncRollouter] 样本 {sample_data['sample_id']} 处理完成，"
+            f"[FullyAsyncRollouter] 样本 {partial_rollout_sample.sample_id} 处理完成，"
             f"耗时 {processing_time:.2f}s {end_time - start_time:.2f}s"
         )
 
@@ -317,26 +397,13 @@ async def _consumer_worker(self):
                     if self.result_queue.empty():
                         break
 
-            # 从结果队列获取处理结果
-            result = await self.result_queue.get()
-
-            # 准备rollout metadata
-            rollout_metadata = {
-                "generation_timestamp": result["timestamp"],
-                "rollout_param_version": result["param_version"],
-                "processing_time": result["processing_time"],
-                "epoch": result["epoch"],
-                "agent_loop_metrics": result["agent_loop_output"].metrics.model_dump(),
-            }
+            # 从结果队列获取 RolloutSample
+            rollout_sample = await self.result_queue.get()
 
-            # 直接将 AgentLoopOutput 放入消息队列
-            queue_sample = QueueSample(
-                data=result["agent_loop_output"],  # 直接存储 AgentLoopOutput
-                rollout_metadata=rollout_metadata,
-            )
+            # 直接将 RolloutSample 放入消息队列
             success = await self.message_queue_client.put_sample(
-                sample=ray.cloudpickle.dumps(queue_sample),
-                param_version=result["param_version"],
+                sample=ray.cloudpickle.dumps(rollout_sample),
+                param_version=rollout_sample.param_version,
             )
 
             async with self.lock:
@@ -347,9 +414,9 @@ async def _consumer_worker(self):
                     self.dropped_stale_samples += 1
 
             print(
-                f"[FullyAsyncRollouter] 🔥 消费样本 {result['sample_id']}: "
+                f"[FullyAsyncRollouter] 消费样本 {rollout_sample.sample_id}: "
                 f"{'成功' if success else '失败'}放入到消息队列, "
-                f"处理时间 {result['processing_time']:.2f}s"
+                f"处理时间 {rollout_sample.processing_time:.2f}s"
             )
 
             # 标记结果队列任务完成
@@ -585,5 +652,3 @@ async def get_statistics(self) -> dict:
             }
 
             return stats
-
-
diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py
index 7b1c725f667..ffdf261126f 100644
--- a/recipe/fully_async_policy/fully_async_trainer.py
+++ b/recipe/fully_async_policy/fully_async_trainer.py
@@ -21,7 +21,7 @@
 import ray
 from omegaconf import OmegaConf
 
-from recipe.fully_async_policy.message_queue import MessageQueueClient, QueueSample
+from recipe.fully_async_policy.message_queue import MessageQueueClient, RolloutSample
 from recipe.fully_async_policy.utils import calculate_one_step_size
 from verl.experimental.agent_loop.agent_loop import postprocess_agent_loop_outputs
 from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup
@@ -46,16 +46,16 @@ class FullyAsyncTrainer(RayPPOTrainer):
     """
 
     def __init__(
-            self,
-            config,
-            tokenizer,
-            role_worker_mapping: dict[Role, WorkerType],
-            resource_pool_manager: ResourcePoolManager,
-            ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup,
-            processor=None,
-            reward_fn=None,
-            val_reward_fn=None,
-            device_name=None,
+        self,
+        config,
+        tokenizer,
+        role_worker_mapping: dict[Role, WorkerType],
+        resource_pool_manager: ResourcePoolManager,
+        ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup,
+        processor=None,
+        reward_fn=None,
+        val_reward_fn=None,
+        device_name=None,
     ):
         # Store the tokenizer for text processing
         self.tokenizer = tokenizer
@@ -104,8 +104,9 @@ def __init__(
         self.stale_samples_processed = 0
         self.current_param_version = 0
 
-        self.required_samples = calculate_one_step_size(self.minimal_bsz,
-                                                        config.actor_rollout_ref.actor.ppo_mini_batch_size)
+        self.required_samples = calculate_one_step_size(
+            self.minimal_bsz, config.actor_rollout_ref.actor.ppo_mini_batch_size
+        )
 
     def set_message_queue_client(self, message_queue_client: MessageQueueClient):
         """Set message queue client"""
@@ -128,8 +129,7 @@ def _get_samples_from_queue(self) -> tuple[None, None] | tuple[int, Any]:
             tuple: (epoch, batch_dict, gen_batch_output)
         """
         print(
-            "[FullyAsyncTrainer] "
-            f"Requesting {self.required_samples} samples from queue",
+            f"[FullyAsyncTrainer] Requesting {self.required_samples} samples from queue",
             flush=True,
         )
 
@@ -166,17 +166,18 @@ def _get_samples_from_queue(self) -> tuple[None, None] | tuple[int, Any]:
         )
 
         queue_samples = [ray.cloudpickle.loads(x) for x in queue_samples]
-        # Assemble batch
+        # Assemble batch - now working directly with RolloutSample objects
         batch = self._assemble_gen_batch_output_from_queue_samples(queue_samples)
 
         return 0, batch
 
-    def _assemble_gen_batch_output_from_queue_samples(self, queue_samples: list[QueueSample]):
+    def _assemble_gen_batch_output_from_queue_samples(self, rollout_samples: list[RolloutSample]):
         """
-        Assemble gen_batch_output from queue samples containing AgentLoopOutput
+        Assemble gen_batch_output from RolloutSample objects
+        从 RolloutSample 对象中组装批次，类似 ray_trainer 的 _post_generate_batch 逻辑
 
         Args:
-            queue_samples: List of samples from queue, each containing AgentLoopOutput
+            rollout_samples: List of RolloutSample objects
 
         Returns:
             DataProto: Assembled gen_batch_output
@@ -184,91 +185,89 @@ def _assemble_gen_batch_output_from_queue_samples(self, queue_samples: list[Queu
         start_time = time.time()
 
         import numpy as np
+        import torch
 
-        if not queue_samples:
-            raise ValueError("Empty queue_samples provided for batch assembly")
-
-        print(f"[FullyAsyncTrainer] Assembling batch from {len(queue_samples)} queue samples with AgentLoopOutput")
-
-        # Extract AgentLoopOutput and metadata from all samples
-        agent_loop_outputs = []
-        rollout_metadata_list = []
-        processing_times = []
-
-        for sample in queue_samples:
-            # sample.data is now AgentLoopOutput
-            agent_loop_outputs.append(sample.data)
-            rollout_metadata_list.append(sample.rollout_metadata)
-            processing_times.append(sample.rollout_metadata.get("processing_time", 0))
-
-        # Use the static method to postprocess AgentLoopOutput list into DataProto
-
-        batch = postprocess_agent_loop_outputs(agent_loop_outputs, self.tokenizer, self.config)
-
-        # Apply _post_generate_batch logic here
-        batch = self._post_generate_batch_for_agent_outputs(batch, agent_loop_outputs)
-
-        # Collect timing information and metadata
-        param_versions = []
-        sample_timestamps = []
-        for metadata in rollout_metadata_list:
-            # Extract parameter version and timestamp
-            param_versions.append(metadata.get("rollout_param_version", 0))
-            sample_timestamps.append(metadata.get("generation_timestamp", time.time()))
-
-        # Create meta_info
-        meta_info = {
-            "timing": {"avg_processing_time": np.mean(processing_times) if processing_times else 0},
-            "queue_sample_count": len(queue_samples),
-            "rollout_param_versions": param_versions,
-            "sample_timestamps": sample_timestamps,
-            "param_version_diversity": len(set(param_versions)),
-            "avg_sample_age": np.mean([time.time() - ts for ts in sample_timestamps]),
-        }
+        from verl import DataProto
+        from verl.trainer.ppo.ray_trainer import compute_response_mask
 
-        batch.meta_info.update(meta_info)
+        if not rollout_samples:
+            raise ValueError("Empty rollout_samples provided for batch assembly")
 
-        end_time = time.time()
-        print(
-            f"[FullyAsyncTrainer] Assembled batch with meta_info: "
-            f"{meta_info}, time elapsed: {end_time - start_time:.2f} seconds"
-        )
+        print(f"[FullyAsyncTrainer] Assembling batch from {len(rollout_samples)} RolloutSample objects")
 
-        return batch
+        # 直接处理 RolloutSample 对象
+        processing_times = [rs.processing_time for rs in rollout_samples]
 
-    def _post_generate_batch_for_agent_outputs(self, batch, agent_loop_outputs):
-        """
-        Apply _post_generate_batch logic for AgentLoopOutput
+        # 第一步：从 AgentLoopOutput 创建生成结果的 DataProto
+        agent_loop_outputs = [rs.agent_loop_output for rs in rollout_samples]
+        gen_batch_output = postprocess_agent_loop_outputs(agent_loop_outputs, self.tokenizer, self.config)
 
-        Args:
-            batch: DataProto created from AgentLoopWorker.postprocess_agent_loop_outputs
-            agent_loop_outputs: List of AgentLoopOutput
+        # 第二步：重建原始 batch 信息
+        # 每个 RolloutSample 都是独立的，直接按顺序重建原始数据
+        original_batch_list = []
+        for rs in rollout_samples:
+            original_batch_dict = rs.original_batch_dict
 
-        Returns:
-            DataProto: Processed batch with additional metadata
-        """
-        import uuid
+            # 重建 DataProto
+            original_batch_item = DataProto.from_single_dict(
+                {
+                    **{k: v for k, v in original_batch_dict["batch"].items()},
+                    **{f"__{k}": v for k, v in original_batch_dict["non_tensor_batch"].items()},
+                }
+            )
+            original_batch_item.meta_info.update(original_batch_dict["meta_info"])
+            original_batch_list.append(original_batch_item)
 
-        import numpy as np
-        import torch
+        # 合并所有原始样本为一个批次
+        if original_batch_list:
+            original_batch = DataProto.from_items(original_batch_list)
+        else:
+            # 如果没有原始数据，创建空的 DataProto
+            original_batch = DataProto.from_single_dict({})
 
-        from verl.trainer.ppo.ray_trainer import compute_response_mask
+        # 添加 UID
+        uids = []
+        for rs in rollout_samples:
+            uids.append(f"uid_{rs.sample_id}")
+        original_batch.non_tensor_batch["uid"] = np.array(uids, dtype=object)
 
-        # Add UIDs
-        batch.non_tensor_batch["uid"] = np.array([str(uuid.uuid4()) for _ in range(len(batch.batch))], dtype=object)
+        # 直接合并原始数据和生成结果，不需要 repeat
+        # 因为队列中的每个 RolloutSample 都已经是独立的样本
+        final_batch = original_batch.union(gen_batch_output)
 
-        # response_mask should already be in batch from AgentLoopWorker.postprocess_agent_loop_outputs
-        if "response_mask" not in batch.batch.keys():
-            batch.batch["response_mask"] = compute_response_mask(batch)
+        # 计算 response_mask（如果不存在）
+        if "response_mask" not in final_batch.batch.keys():
+            final_batch.batch["response_mask"] = compute_response_mask(final_batch)
 
-        # Balance the number of valid tokens across DP ranks if needed
+        # 平衡批次（如果配置了）
         if self.config.trainer.balance_batch:
-            self._balance_batch(batch, metrics={})
+            self._balance_batch(final_batch, metrics={})
+
+        # 计算全局有效 token 数
+        if "attention_mask" in final_batch.batch:
+            final_batch.meta_info["global_token_num"] = torch.sum(final_batch.batch["attention_mask"], dim=-1).tolist()
+
+        # 收集统计信息和元数据（直接从 RolloutSample 中获取）
+        param_versions = [rs.param_version for rs in rollout_samples]
+        sample_timestamps = [rs.generation_timestamp for rs in rollout_samples]
+
+        # 创建 meta_info
+        final_batch.meta_info.update(
+            {
+                "rollout_param_versions": param_versions,
+                "sample_timestamps": sample_timestamps,
+                "avg_processing_time": np.mean(processing_times) if processing_times else 0,
+                "max_processing_time": np.max(processing_times) if processing_times else 0,
+                "param_version_diversity": len(set(param_versions)) if param_versions else 0,
+                "avg_sample_age": np.mean([time.time() - ts for ts in sample_timestamps]) if sample_timestamps else 0,
+                "assembly_time": time.time() - start_time,
+            }
+        )
 
-        # compute global_valid tokens
-        batch.meta_info["global_token_num"] = torch.sum(batch.batch["attention_mask"], dim=-1).tolist()
+        print(f"[FullyAsyncTrainer] Batch assembly completed in {time.time() - start_time:.2f}s")
+        print(f"[FullyAsyncTrainer] {final_batch}")
 
-        return batch
+        return final_batch
 
     def _create_actor_rollout_classes(self):
         # create actor
@@ -411,33 +410,29 @@ def _trigger_parameter_sync_after_step(self):
         )
         ray.get(self.param_synchronizer.sync_weights.remote(self.current_param_version))
 
-    def _compute_sample_freshness_metrics(self, batch_samples: list[QueueSample]) -> dict:
+    def _compute_sample_freshness_metrics(self, rollout_samples: list[RolloutSample]) -> dict:
         """
         Compute sample freshness metrics
 
         Args:
-            batch_samples: List of queue samples
+            rollout_samples: List of RolloutSample objects
 
         Returns:
             dict: Dictionary of freshness metrics
         """
-        if not batch_samples:
+        if not rollout_samples:
             return {}
 
         try:
-            # Extract parameter versions and timestamps
+            # Extract parameter versions and timestamps directly from RolloutSample
             sample_ages = []
             sample_latencies = []
             current_time = time.time()
 
-            for sample in batch_samples:
-                # Get information from rollout_metadata
-                if hasattr(sample, "rollout_metadata") and sample.rollout_metadata:
-                    rollout_version = sample.rollout_metadata.get("rollout_param_version", 0)
-                    generation_time = sample.rollout_metadata.get("generation_timestamp", current_time)
-                else:
-                    rollout_version = 0
-                    generation_time = current_time
+            for sample in rollout_samples:
+                # Get information directly from RolloutSample
+                rollout_version = sample.param_version
+                generation_time = sample.generation_timestamp
 
                 age = max(0, self.current_param_version - rollout_version)
                 latency = max(0, current_time - generation_time)
@@ -462,4 +457,3 @@ def _compute_sample_freshness_metrics(self, batch_samples: list[QueueSample]) ->
         except Exception as e:
             logger.error(f"Error computing freshness metrics: {e}")
             return {"freshness/error": str(e)}
-
diff --git a/recipe/fully_async_policy/message_queue.py b/recipe/fully_async_policy/message_queue.py
index 9a093296743..4c3232e561b 100644
--- a/recipe/fully_async_policy/message_queue.py
+++ b/recipe/fully_async_policy/message_queue.py
@@ -25,9 +25,25 @@
 
 
 @dataclass
-class QueueSample:
-    data: Any
-    rollout_metadata: dict[str, Any]
+class RolloutSample:
+    """Enhanced rollout sample containing both original batch info and AgentLoopOutput"""
+
+    # Original batch information (preserved from _prepare_generate_batch)
+    original_batch_dict: dict[str, Any]
+
+    # AgentLoopOutput from generation
+    agent_loop_output: Any  # AgentLoopOutput
+
+    # Metadata
+    sample_id: str
+    epoch: int
+    rollout_n_index: int  # Index within the rollout.n repetitions (0, 1, ..., n-1)
+    original_sample_index: int  # Index of the original sample before repetition
+
+    # Processing metadata
+    processing_time: float
+    generation_timestamp: float
+    param_version: int
 
 
 @ray.remote(num_cpus=2, max_concurrency=20)
@@ -236,13 +252,17 @@ async def get_memory_usage(self) -> dict:
                 sample = list(self.queue)[0]
                 try:
                     sample_size = sys.getsizeof(sample)
-                    if hasattr(sample.data, "batch") and hasattr(sample.data.batch, "__len__"):
-                        # If batch info is available, estimate data size
-                        batch_size = len(sample.data.batch)
-                        sample_size += batch_size * 1000  # Roughly estimate 1KB per batch entry
+                    # Since we now store RolloutSample directly, estimate based on its components
+                    if hasattr(sample, "original_batch_dict") and sample.original_batch_dict:
+                        # Estimate batch data size
+                        batch_data = sample.original_batch_dict.get("batch", {})
+                        sample_size += len(batch_data) * 1000  # Roughly estimate 1KB per batch entry
+                    if hasattr(sample, "agent_loop_output"):
+                        # Estimate AgentLoopOutput size
+                        sample_size += 5000  # Roughly estimate 5KB for AgentLoopOutput
                     total_size = sample_size * sample_count
                 except Exception:
-                    total_size = sample_count * 10000  # Roughly estimate 10KB per sample
+                    total_size = sample_count * 15000  # Roughly estimate 15KB per RolloutSample
 
             return {
                 "queue_samples": sample_count,
diff --git a/recipe/fully_async_policy/utils.py b/recipe/fully_async_policy/utils.py
index 71ae7c7d16d..d9afa0a9ab1 100644
--- a/recipe/fully_async_policy/utils.py
+++ b/recipe/fully_async_policy/utils.py
@@ -15,5 +15,6 @@
 
 # Calculate the number of samples needed
 
+
 def calculate_one_step_size(minimal_bsz, ppo_mini_batch_size):
     return minimal_bsz * ppo_mini_batch_size
diff --git a/recipe/one_step_off_policy/detach_sharding_manager.py b/recipe/one_step_off_policy/detach_sharding_manager.py
index 6b304baa276..a8a7a12c0ba 100644
--- a/recipe/one_step_off_policy/detach_sharding_manager.py
+++ b/recipe/one_step_off_policy/detach_sharding_manager.py
@@ -74,5 +74,3 @@ def postprocess_data(self, data: DataProto) -> DataProto:
             return data
 
         return data.chunk(chunks=self.tp_size)[self.tp_rank]
-
-
diff --git a/recipe/one_step_off_policy/fsdp_workers.py b/recipe/one_step_off_policy/fsdp_workers.py
index 04c66c8b60a..086f109e434 100644
--- a/recipe/one_step_off_policy/fsdp_workers.py
+++ b/recipe/one_step_off_policy/fsdp_workers.py
@@ -39,8 +39,7 @@
 from verl.utils.import_utils import import_external_libs
 from verl.utils.model import get_generation_config, update_model_config
 from verl.utils.vllm_utils import patch_vllm_moe_model_weight_loader
-from verl.workers.fsdp_workers import ActorRolloutRefWorker, AsyncActorRolloutRefWorker
-from verl.workers.fsdp_workers import CriticWorker
+from verl.workers.fsdp_workers import ActorRolloutRefWorker, AsyncActorRolloutRefWorker, CriticWorker
 
 logger = logging.getLogger(__file__)
 logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN"))
@@ -60,12 +59,10 @@ def get_inference_model(rollout):
     """
     inference_engine = rollout.inference_engine
     # 判断inference_engine的类型
-    if hasattr(inference_engine, 'llm_engine'):
+    if hasattr(inference_engine, "llm_engine"):
         # LLM类型 - vLLMRollout
-        inference_model = (
-            inference_engine.llm_engine.model_executor.driver_worker.worker.model_runner.model
-        )
-    elif hasattr(inference_engine, 'worker'):
+        inference_model = inference_engine.llm_engine.model_executor.driver_worker.worker.model_runner.model
+    elif hasattr(inference_engine, "worker"):
         # WorkerWrapperBase类型 - vLLMAsyncRollout
         inference_model = inference_engine.worker.model_runner.model
     else:
@@ -77,7 +74,6 @@ def get_inference_model(rollout):
 
 
 class DetachNcclSync(ActorRolloutRefWorker):
-
     def _get_actor_params(self):
         pass
 
@@ -234,6 +230,7 @@ def init_model(self):
         log_gpu_memory_usage(f"After building {rollout_name} rollout", logger=logger)
 
         from .detach_sharding_manager import DetachShardingManager
+
         sharding_manager = DetachShardingManager(
             inference_engine=rollout.inference_engine, device_mesh=rollout_device_mesh
         )
@@ -260,7 +257,7 @@ def __init__(self, config: DictConfig, role: str):
 
     @register(dispatch_mode=Dispatch.ONE_TO_ALL)
     def init_model(self):
-        print(f"[DetachAsyncRolloutWorker] init_model")
+        print("[DetachAsyncRolloutWorker] init_model")
         DetachRolloutWorker.init_model(self)
 
         self.vllm_tp_size = self.config.rollout.tensor_model_parallel_size
@@ -269,5 +266,3 @@ def init_model(self):
 
         # used for sleep/wake_up
         self.rollout.sharding_manager = self.rollout_sharding_manager
-
-
diff --git a/recipe/one_step_off_policy/main_ppo.py b/recipe/one_step_off_policy/main_ppo.py
index d9d8f0bb849..0dcdbef3705 100644
--- a/recipe/one_step_off_policy/main_ppo.py
+++ b/recipe/one_step_off_policy/main_ppo.py
@@ -60,26 +60,26 @@ def run(self, config):
         # Define worker classes based on the actor strategy.
         if config.actor_rollout_ref.actor.strategy == "fsdp2":
             assert config.actor_rollout_ref.actor.strategy == config.critic.strategy
-            from verl.single_controller.ray import RayWorkerGroup
-
             from recipe.one_step_off_policy.fsdp_workers import (
+                CriticWorker,
                 DetachActorWorker,
-                DetachRolloutWorker,
                 DetachAsyncRolloutWorker,
-                CriticWorker,
+                DetachRolloutWorker,
             )
+            from verl.single_controller.ray import RayWorkerGroup
+
             ray_worker_group_cls = RayWorkerGroup
 
         elif config.actor_rollout_ref.actor.strategy == "megatron":
             assert config.actor_rollout_ref.actor.strategy == config.critic.strategy
-            from verl.single_controller.ray.megatron import NVMegatronRayWorkerGroup
-
             from recipe.one_step_off_policy.megatron_workers import (
+                CriticWorker,
                 DetachActorWorker,
-                DetachRolloutWorker,
                 DetachAsyncRolloutWorker,
-                CriticWorker,
+                DetachRolloutWorker,
             )
+            from verl.single_controller.ray.megatron import NVMegatronRayWorkerGroup
+
             ray_worker_group_cls = NVMegatronRayWorkerGroup
 
         else:
@@ -90,7 +90,8 @@ def run(self, config):
         role_worker_mapping = {
             Role.Actor: ray.remote(DetachActorWorker),
             Role.Rollout: ray.remote(
-                DetachAsyncRolloutWorker if config.actor_rollout_ref.rollout.mode == "async" else DetachRolloutWorker),
+                DetachAsyncRolloutWorker if config.actor_rollout_ref.rollout.mode == "async" else DetachRolloutWorker
+            ),
             Role.Critic: ray.remote(CriticWorker),
         }
 
@@ -132,7 +133,7 @@ def run(self, config):
 
         # Add a reference policy worker if KL loss or KL reward is used.
         if config.algorithm.use_kl_in_reward or config.actor_rollout_ref.actor.use_kl_loss:
-            role_worker_mapping[Role.RefPolicy] = ray.remote(ActorRolloutRefWorker)
+            role_worker_mapping[Role.RefPolicy] = ray.remote(DetachActorWorker)
             mapping[Role.RefPolicy] = global_pool_id
 
         # Load the reward manager for training and validation.
diff --git a/recipe/one_step_off_policy/megatron_workers.py b/recipe/one_step_off_policy/megatron_workers.py
index 9011f5a6023..5b338c5be42 100644
--- a/recipe/one_step_off_policy/megatron_workers.py
+++ b/recipe/one_step_off_policy/megatron_workers.py
@@ -27,8 +27,11 @@
 from verl.utils.device import get_device_name, get_torch_device
 from verl.utils.fs import copy_to_local
 from verl.utils.vllm_utils import patch_vllm_moe_model_weight_loader
-from verl.workers.megatron_workers import ActorRolloutRefWorker, AsyncActorRolloutRefWorker
-from verl.workers.megatron_workers import CriticWorker, RewardModelWorker
+from verl.workers.megatron_workers import (
+    ActorRolloutRefWorker,
+    AsyncActorRolloutRefWorker,
+    CriticWorker,
+)
 
 logger = logging.getLogger(__file__)
 logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN"))
@@ -166,6 +169,7 @@ def init_model(self):
         log_gpu_memory_usage("After building vllm rollout", logger=logger)
 
         from sharding_manager import DetachShardingManager
+
         rollout_sharding_manager = DetachShardingManager(
             inference_engine=rollout.inference_engine, device_mesh=rollout_device_mesh
         )
@@ -193,4 +197,4 @@ def __init__(self, config: DictConfig, role: str):
 
     @register(dispatch_mode=Dispatch.ONE_TO_ALL)
     def init_model(self):
-        DetachRolloutWorker.init_model(self)
\ No newline at end of file
+        DetachRolloutWorker.init_model(self)

From 966f58df6bdb8e8353f329e274f598730ca101c4 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Fri, 15 Aug 2025 10:50:00 +0800
Subject: [PATCH 049/182] RolloutSample

---
 .../fully_async_rollouter.py                  | 95 +++++++++++--------
 .../fully_async_policy/fully_async_trainer.py | 10 +-
 2 files changed, 59 insertions(+), 46 deletions(-)

diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py
index 2e729228153..3be6661c8e1 100644
--- a/recipe/fully_async_policy/fully_async_rollouter.py
+++ b/recipe/fully_async_policy/fully_async_rollouter.py
@@ -35,16 +35,16 @@ class FullyAsyncRollouter(RayPPOTrainer):
     """
 
     def __init__(
-        self,
-        config,
-        tokenizer,
-        role_worker_mapping: dict[Role, WorkerType],
-        resource_pool_manager: ResourcePoolManager,
-        ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup,
-        processor=None,
-        reward_fn=None,
-        val_reward_fn=None,
-        device_name=None,
+            self,
+            config,
+            tokenizer,
+            role_worker_mapping: dict[Role, WorkerType],
+            resource_pool_manager: ResourcePoolManager,
+            ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup,
+            processor=None,
+            reward_fn=None,
+            val_reward_fn=None,
+            device_name=None,
     ):
         # Store the tokenizer for text processing
         self.tokenizer = tokenizer
@@ -217,7 +217,12 @@ def _init_async_rollout_manager(self):
     async def _feed_samples(self):
         continuous_iterator = self._create_continuous_iterator()
         sample_count = 0
+        should_stop = False
+
         for epoch, batch_dict in continuous_iterator:
+            if should_stop:  # 检查停止标志
+                break
+
             # 类似 _prepare_generate_batch 的逻辑：分离数据
             original_batch, gen_data = self._prepare_single_generation_data(batch_dict)
 
@@ -227,6 +232,7 @@ async def _feed_samples(self):
             for rollout_n_index in range(n_repeats):
                 sample_id = f"sample_{epoch}_{sample_count}_{rollout_n_index}"
 
+                # 创建部分 RolloutSample，不包含 _gen_data（因为它不在数据类定义中）
                 partial_rollout_sample = RolloutSample(
                     original_batch_dict=original_batch,
                     agent_loop_output=None,  # 待处理后填充
@@ -237,10 +243,10 @@ async def _feed_samples(self):
                     processing_time=0.0,  # 待处理后填充
                     generation_timestamp=0.0,  # 待处理后填充
                     param_version=0,  # 待处理后填充
-                    _gen_data=gen_data,  # 临时字段，处理完后删除
                 )
 
-                # 将生成数据附加到 RolloutSample 中（临时字段）
+                # 动态添加临时字段（处理完后删除）
+                partial_rollout_sample._gen_data = gen_data
 
                 await self.pending_samples_queue.put(partial_rollout_sample)
 
@@ -250,6 +256,7 @@ async def _feed_samples(self):
                         f"[FullyAsyncRollouter] 达到最大步数，停止添加新样本 "
                         f"{self.global_steps} >= {self.total_rollout_steps}"
                     )
+                    should_stop = True  # 设置停止标志
                     break
 
                 self.global_steps += 1
@@ -258,6 +265,7 @@ async def _feed_samples(self):
 
         # 发送结束信号
         await self.pending_samples_queue.put("DONE")
+        print(f"[FullyAsyncRollouter] 样本添加完成，总共添加了 {self.global_steps} 个步骤的样本")
 
     def _prepare_single_generation_data(self, batch_dict):
         """
@@ -344,11 +352,16 @@ async def _submit_worker(self):
 
     async def _process_single_sample_streaming(self, partial_rollout_sample):
         """流式处理单个样本"""
-        # 检查是否需要暂停处理
-        if await self._should_pause_generation():
-            print(f"[FullyAsyncRollouter] 暂停处理样本 {partial_rollout_sample.sample_id}")
-            # 暂停时重新放回队列
-            await self.pending_samples_queue.put(partial_rollout_sample)
+        # 检查是否需要暂停处理，如果需要暂停则等待resume信号
+        while await self._should_pause_generation() and self.running:
+            print(f"[FullyAsyncRollouter] 暂停处理样本 {partial_rollout_sample.sample_id}，等待resume...")
+            async with self.lock:
+                await self.condition.wait()
+            print(f"[FullyAsyncRollouter] 样本 {partial_rollout_sample.sample_id} 收到resume信号，继续处理")
+
+        # 如果系统已停止，跳过处理
+        if not self.running:
+            print(f"[FullyAsyncRollouter] 系统已停止，跳过样本 {partial_rollout_sample.sample_id}")
             return
 
         start_time = time.time()
@@ -575,33 +588,32 @@ async def _should_pause_generation(self) -> bool:
         queue_size = queue_stats["queue_size"]
         current_trainer_version = queue_stats["current_param_version"]
 
-        async with self.lock:
-            version_diff = self.current_param_version - current_trainer_version
-
-            if version_diff > self.staleness_threshold:
-                print(
-                    "[FullyAsyncRollouter] "
-                    f"Should pause due to version_diff > self.staleness_threshold: "
-                    f"rollout_version={self.current_param_version}, "
-                    f"trainer_version={current_trainer_version}, diff={version_diff}"
-                )
-                return True
+        version_diff = self.current_param_version - current_trainer_version
 
-            if queue_size >= self.max_queue_size:
-                print(
-                    f"[FullyAsyncRollouter] Should pause due to full queue: "
-                    f"size={queue_size}, max={self.max_queue_size}"
-                )
-                return True
+        if version_diff > self.staleness_threshold:
+            print(
+                "[FullyAsyncRollouter] "
+                f"Should pause due to version_diff > self.staleness_threshold: "
+                f"rollout_version={self.current_param_version}, "
+                f"trainer_version={current_trainer_version}, diff={version_diff}"
+            )
+            return True
 
-            if self.train_step_samples >= self.max_required_samples:
-                print(
-                    f"[FullyAsyncRollouter] Should pause due to step_generated_samples >= max_required_samples: "
-                    f"self.step_generated_samples={self.train_step_samples}, max={self.max_required_samples}"
-                )
-                return True
+        if queue_size >= self.max_queue_size:
+            print(
+                f"[FullyAsyncRollouter] Should pause due to full queue: "
+                f"size={queue_size}, max={self.max_queue_size}"
+            )
+            return True
 
-            return False
+        if self.train_step_samples >= self.max_required_samples:
+            print(
+                f"[FullyAsyncRollouter] Should pause due to step_generated_samples >= max_required_samples: "
+                f"self.step_generated_samples={self.train_step_samples}, max={self.max_required_samples}"
+            )
+            return True
+
+        return False
 
     async def pause(self) -> bool:
         """pause rollout
@@ -652,3 +664,4 @@ async def get_statistics(self) -> dict:
             }
 
             return stats
+
diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py
index ffdf261126f..072a26fea35 100644
--- a/recipe/fully_async_policy/fully_async_trainer.py
+++ b/recipe/fully_async_policy/fully_async_trainer.py
@@ -38,7 +38,7 @@
 logger = logging.getLogger(__name__)
 
 
-@ray.remote
+@ray.remote(num_cpus=10)
 class FullyAsyncTrainer(RayPPOTrainer):
     """
     A fully asynchronous PPO trainer that obtains samples from a MessageQueue for training.
@@ -373,14 +373,14 @@ def fit(self):
                                     "statistics/current_param_version": self.current_param_version,
                                 }
                             )
-                batch, reward_extra_infos_dict = self._process_batch_common(batch, metrics, timing_raw)
-                self._log_rollout(batch, reward_extra_infos_dict, timing_raw)
-                self._check_save_checkpoint(is_last_step, timing_raw)
+                # batch, reward_extra_infos_dict = self._process_batch_common(batch, metrics, timing_raw)
+                # self._log_rollout(batch, reward_extra_infos_dict, timing_raw)
+                # self._check_save_checkpoint(is_last_step, timing_raw)
 
             # self._collect_metrics(batch, epoch, metrics, timing_raw)
 
             # Trigger parameter synchronization after training step
-            self._trigger_parameter_sync_after_step()
+            # self._trigger_parameter_sync_after_step()
             print(f"[FullyAsyncTrainer] global_steps: {self.global_steps}")
             self.global_steps += 1
 

From 28809b521a7a0377752f4fe342d263976a2d64ae Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Fri, 15 Aug 2025 18:56:28 +0800
Subject: [PATCH 050/182] success rollout

---
 .../fully_async_rollouter.py                  | 331 +++++++-----------
 .../fully_async_policy/fully_async_trainer.py |  74 ++--
 recipe/fully_async_policy/message_queue.py    |  39 +--
 tests/special_e2e/run_fully_async_policy.sh   |   2 +-
 .../rollout/vllm_rollout/vllm_async_server.py |  20 +-
 .../rollout/vllm_rollout/vllm_rollout_spmd.py |   3 -
 6 files changed, 184 insertions(+), 285 deletions(-)

diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py
index 3be6661c8e1..16b68b3e819 100644
--- a/recipe/fully_async_policy/fully_async_rollouter.py
+++ b/recipe/fully_async_policy/fully_async_rollouter.py
@@ -35,16 +35,16 @@ class FullyAsyncRollouter(RayPPOTrainer):
     """
 
     def __init__(
-            self,
-            config,
-            tokenizer,
-            role_worker_mapping: dict[Role, WorkerType],
-            resource_pool_manager: ResourcePoolManager,
-            ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup,
-            processor=None,
-            reward_fn=None,
-            val_reward_fn=None,
-            device_name=None,
+        self,
+        config,
+        tokenizer,
+        role_worker_mapping: dict[Role, WorkerType],
+        resource_pool_manager: ResourcePoolManager,
+        ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup,
+        processor=None,
+        reward_fn=None,
+        val_reward_fn=None,
+        device_name=None,
     ):
         # Store the tokenizer for text processing
         self.tokenizer = tokenizer
@@ -111,9 +111,9 @@ def __init__(
         self.message_queue_client = None
 
         # Concurrency control
-        self.running = False
         self.paused = False
-        # Initialize async locks directly - asyncio.Lock() creation is synchronous
+
+        # Initialize async locks directly
         self.lock = asyncio.Lock()
         self.condition = asyncio.Condition(self.lock)
 
@@ -126,8 +126,14 @@ def __init__(
 
         self.async_rollout_manager = None
 
-        # 流式处理相关配置
-        self.max_concurrent_samples = async_config.get("max_concurrent_samples", 512)  # 最大并发处理样本数
+        # Calculate the samples needed for a train, used to calculate staleness and interrupt rollout
+        self.required_samples = calculate_one_step_size(
+            self.minimal_bsz, config.actor_rollout_ref.actor.ppo_mini_batch_size
+        )
+        self.max_required_samples = self.required_samples * (self.staleness_threshold + 1)
+
+        # 单次最多扔一次迭代需要的样本
+        self.max_concurrent_samples = self.required_samples
 
         # 流式处理统计
         self.max_processing_time = 0.0  # 最长处理时间
@@ -135,14 +141,9 @@ def __init__(
         self.active_sample_count = 0  # 当前正在处理的样本数
         self.queue_full_pause_count = 0  # 队列满导致的暂停次数
 
-        # Calculate the samples needed for a train, used to calculate staleness and interrupt rollout
-        self.required_samples = calculate_one_step_size(
-            self.minimal_bsz, config.actor_rollout_ref.actor.ppo_mini_batch_size
-        )
-        self.max_required_samples = self.required_samples * (self.staleness_threshold + 1)
-
         # queue size
         self.max_queue_size = self.max_required_samples * 10  # x 10 avoid deadlock
+        print(f"[FullyAsyncRollouter] {self.max_queue_size}")
 
     async def set_message_queue_client(self, message_queue_client: MessageQueueClient):
         """Set message queue client"""
@@ -213,60 +214,6 @@ def _init_async_rollout_manager(self):
             worker_group=self.rollout_wg,
         )
 
-    # 添加样本到待处理队列的协程
-    async def _feed_samples(self):
-        continuous_iterator = self._create_continuous_iterator()
-        sample_count = 0
-        should_stop = False
-
-        for epoch, batch_dict in continuous_iterator:
-            if should_stop:  # 检查停止标志
-                break
-
-            # 类似 _prepare_generate_batch 的逻辑：分离数据
-            original_batch, gen_data = self._prepare_single_generation_data(batch_dict)
-
-            # 根据 rollout.n 进行重复
-            n_repeats = self.config.actor_rollout_ref.rollout.n
-
-            for rollout_n_index in range(n_repeats):
-                sample_id = f"sample_{epoch}_{sample_count}_{rollout_n_index}"
-
-                # 创建部分 RolloutSample，不包含 _gen_data（因为它不在数据类定义中）
-                partial_rollout_sample = RolloutSample(
-                    original_batch_dict=original_batch,
-                    agent_loop_output=None,  # 待处理后填充
-                    sample_id=sample_id,
-                    epoch=epoch,
-                    rollout_n_index=rollout_n_index,
-                    original_sample_index=sample_count,
-                    processing_time=0.0,  # 待处理后填充
-                    generation_timestamp=0.0,  # 待处理后填充
-                    param_version=0,  # 待处理后填充
-                )
-
-                # 动态添加临时字段（处理完后删除）
-                partial_rollout_sample._gen_data = gen_data
-
-                await self.pending_samples_queue.put(partial_rollout_sample)
-
-                # 检查是否到达最后一步
-                if self.global_steps >= self.total_rollout_steps:
-                    print(
-                        f"[FullyAsyncRollouter] 达到最大步数，停止添加新样本 "
-                        f"{self.global_steps} >= {self.total_rollout_steps}"
-                    )
-                    should_stop = True  # 设置停止标志
-                    break
-
-                self.global_steps += 1
-
-            sample_count += 1
-
-        # 发送结束信号
-        await self.pending_samples_queue.put("DONE")
-        print(f"[FullyAsyncRollouter] 样本添加完成，总共添加了 {self.global_steps} 个步骤的样本")
-
     def _prepare_single_generation_data(self, batch_dict):
         """
         类似 ray_trainer._prepare_generate_batch 的逻辑，但针对单个样本
@@ -316,26 +263,87 @@ def _prepare_single_generation_data(self, batch_dict):
 
         return original_batch_dict, gen_batch
 
-    async def _submit_worker(self):
+    # 添加样本到待处理队列的协程
+    async def _feed_samples(self):
+        continuous_iterator = self._create_continuous_iterator()
+        sample_count = 0
+        should_stop = False
+
+        for epoch, batch_dict in continuous_iterator:
+            if should_stop:  # 检查停止标志
+                break
+
+            # 类似 _prepare_generate_batch 的逻辑：分离数据
+            original_batch, gen_data = self._prepare_single_generation_data(batch_dict)
+
+            # 根据 rollout.n 进行重复
+            for rollout_n_index in range(self.config.actor_rollout_ref.rollout.n):
+                sample_id = f"sample_{epoch}_{sample_count}_{rollout_n_index}"
+
+                # 创建部分 RolloutSample，不包含 _gen_data（因为它不在数据类定义中）
+                partial_rollout_sample = RolloutSample(
+                    original_batch_dict=original_batch,
+                    agent_loop_output=None,  # 待处理后填充
+                    sample_id=sample_id,
+                    epoch=epoch,
+                    rollout_n_index=rollout_n_index,
+                    original_sample_index=sample_count,
+                    processing_time=0.0,  # 待处理后填充
+                    generation_timestamp=0.0,  # 待处理后填充
+                    param_version=0,  # 待处理后填充
+                    _gen_data=gen_data,
+                )
+
+                await self.pending_queue.put(partial_rollout_sample)
+
+                # 检查是否到达最后一步
+                if self.global_steps >= self.total_rollout_steps:
+                    print(
+                        f"[FullyAsyncRollouter] 达到最大步数，停止添加新样本 "
+                        f"{self.global_steps} >= {self.total_rollout_steps}"
+                    )
+                    should_stop = True  # 设置停止标志
+                    break
+
+                self.global_steps += 1
+
+            sample_count += 1
+
+        # 发送结束信号
+        await self.pending_queue.put("DONE")
+        print(f"[FullyAsyncRollouter] 样本添加完成，总共添加了 {self.global_steps} 个步骤的样本")
+
+    async def _processor_worker(self):
         """流式处理工作协程 - 逐个样本立即提交处理，不等待批次"""
-        active_tasks = set()
 
         while True:
-            # 获取待处理的部分 RolloutSample
-            partial_rollout_sample = await self.pending_samples_queue.get()
+            partial_rollout_sample = await self.pending_queue.get()
+            self.train_step_samples += 1
 
+            async with self.lock:
+                if await self._should_pause_generation():
+                    # 等待已提交的任务结束
+                    await asyncio.gather(*self.active_tasks, return_exceptions=True)
+                    self.active_tasks = set()
+                    self.paused = True
+
+                while self.paused:
+                    await self.condition.wait()
+
+            # 获取待处理的部分 RolloutSample
             if partial_rollout_sample == "DONE":
                 print("收到结束信号，等待剩余任务完成...")
                 # 等待所有活动任务完成
-                if active_tasks:
-                    await asyncio.gather(*active_tasks, return_exceptions=True)
+                if self.active_tasks:
+                    await asyncio.gather(*self.active_tasks, return_exceptions=True)
                 break
 
             # 检查并发数是否超限
-            while len(active_tasks) >= self.max_concurrent_samples:
-                print(f"达到最大并发数 {self.max_concurrent_samples}，等待任务完成...")
+            while len(self.active_tasks) >= self.max_concurrent_samples:
                 # 等待至少一个任务完成
-                done_tasks, active_tasks = await asyncio.wait(active_tasks, return_when=asyncio.FIRST_COMPLETED)
+                done_tasks, self.active_tasks = await asyncio.wait(
+                    self.active_tasks, return_when=asyncio.FIRST_COMPLETED
+                )
                 # 清理已完成的任务
                 for task in done_tasks:
                     await task
@@ -345,92 +353,51 @@ async def _submit_worker(self):
                 self._process_single_sample_streaming(partial_rollout_sample),
                 name=f"process_{partial_rollout_sample.sample_id}",
             )
-            active_tasks.add(task)
+            self.active_tasks.add(task)
 
             # 标记队列任务完成
-            self.pending_samples_queue.task_done()
+            self.pending_queue.task_done()
 
     async def _process_single_sample_streaming(self, partial_rollout_sample):
         """流式处理单个样本"""
-        # 检查是否需要暂停处理，如果需要暂停则等待resume信号
-        while await self._should_pause_generation() and self.running:
-            print(f"[FullyAsyncRollouter] 暂停处理样本 {partial_rollout_sample.sample_id}，等待resume...")
-            async with self.lock:
-                await self.condition.wait()
-            print(f"[FullyAsyncRollouter] 样本 {partial_rollout_sample.sample_id} 收到resume信号，继续处理")
-
-        # 如果系统已停止，跳过处理
-        if not self.running:
-            print(f"[FullyAsyncRollouter] 系统已停止，跳过样本 {partial_rollout_sample.sample_id}")
-            return
-
-        start_time = time.time()
-
-        # 从 RolloutSample 中提取生成数据（临时字段）
-        gen_data = partial_rollout_sample._gen_data
-
-        # 将单个样本数据包装成 DataProto (用于 generate_single_sample_async)
-        gen_batch_single = DataProto.from_items([gen_data])
 
         # 调用异步生成方法
         agent_loop_output, processing_time = await self.async_rollout_manager.generate_single_sample_async(
-            gen_batch_single, partial_rollout_sample.sample_id
+            partial_rollout_sample._gen_data, partial_rollout_sample.sample_id
         )
-        end_time = time.time()
-
         # 直接更新 RolloutSample 对象，填充剩余字段
         partial_rollout_sample.agent_loop_output = agent_loop_output
         partial_rollout_sample.processing_time = processing_time
         partial_rollout_sample.generation_timestamp = time.time()
         partial_rollout_sample.param_version = self.current_param_version
 
-        # 删除临时字段
-        delattr(partial_rollout_sample, "_gen_data")
-
         # 直接放入结果队列
         await self.result_queue.put(partial_rollout_sample)
 
-        async with self.lock:
-            self.processed_sample_count += 1
-            # 更新最大处理时间统计
-            if processing_time > self.max_processing_time:
-                self.max_processing_time = processing_time
-
-        print(
-            f"[FullyAsyncRollouter] 样本 {partial_rollout_sample.sample_id} 处理完成，"
-            f"耗时 {processing_time:.2f}s {end_time - start_time:.2f}s"
-        )
+        self.processed_sample_count += 1
+        # 更新最大处理时间统计
+        if processing_time > self.max_processing_time:
+            self.max_processing_time = processing_time
+
+        print(f"[FullyAsyncRollouter] process {partial_rollout_sample.sample_id} cost {processing_time:.2f}s")
 
     async def _consumer_worker(self):
         """消费者协程，负责从结果队列获取处理结果并放入消息队列"""
         while True:
-            async with self.lock:
-                if not self.running:
-                    # 如果系统停止但还有结果待处理，继续处理
-                    if self.result_queue.empty():
-                        break
-
             # 从结果队列获取 RolloutSample
             rollout_sample = await self.result_queue.get()
-
             # 直接将 RolloutSample 放入消息队列
             success = await self.message_queue_client.put_sample(
                 sample=ray.cloudpickle.dumps(rollout_sample),
                 param_version=rollout_sample.param_version,
             )
 
-            async with self.lock:
-                if success:
-                    self.total_generated_samples += 1
-                    self.train_step_samples += 1
-                else:
-                    self.dropped_stale_samples += 1
+            if success:
+                self.total_generated_samples += 1
+            else:
+                self.dropped_stale_samples += 1
 
-            print(
-                f"[FullyAsyncRollouter] 消费样本 {rollout_sample.sample_id}: "
-                f"{'成功' if success else '失败'}放入到消息队列, "
-                f"处理时间 {rollout_sample.processing_time:.2f}s"
-            )
+            print(f"[FullyAsyncRollouter] submit {rollout_sample.sample_id} {'success' if success else 'error'}")
 
             # 标记结果队列任务完成
             self.result_queue.task_done()
@@ -473,12 +440,13 @@ async def _streaming_generation_main(self):
         print(f"[FullyAsyncRollouter] 启动流式处理模式，最大并发样本数: {self.max_concurrent_samples}")
 
         # 初始化异步队列
-        self.pending_samples_queue = asyncio.Queue(maxsize=self.max_concurrent_samples)
+        self.pending_queue = asyncio.Queue(maxsize=100)
+        self.active_tasks = set()
         self.result_queue = asyncio.Queue()
 
         # 启动流式处理协程和消费者协程
         self.feed_task = asyncio.create_task(self._feed_samples())
-        self.stream_processor_task = asyncio.create_task(self._submit_worker())
+        self.processor_task = asyncio.create_task(self._processor_worker())
         self.consumer_task = asyncio.create_task(self._consumer_worker())
         # 启动样本添加协程
 
@@ -488,7 +456,7 @@ async def _streaming_generation_main(self):
             print("[FullyAsyncRollouter] 样本添加完成")
 
             # 等待流式处理完成
-            await self.stream_processor_task
+            await self.processor_task
             print("[FullyAsyncRollouter] 流式处理完成")
 
             # 等待结果队列清空
@@ -500,16 +468,13 @@ async def _streaming_generation_main(self):
 
         finally:
             # 取消所有任务
-            if self.stream_processor_task:
-                self.stream_processor_task.cancel()
+            if self.processor_task:
+                self.processor_task.cancel()
             if self.consumer_task:
                 self.consumer_task.cancel()
 
             # 等待任务结束
-            await asyncio.gather(self.stream_processor_task, self.consumer_task, return_exceptions=True)
-
-        async with self.lock:
-            self.running = False
+            await asyncio.gather(self.processor_task, self.consumer_task, return_exceptions=True)
 
         # 发送终止信号
         await self.message_queue_client.put_sample(
@@ -530,9 +495,7 @@ async def fit(self):
             raise ValueError("param_synchronizer client not set. Call set_parameter_synchronizer() first.")
 
         # 设置运行状态
-        async with self.lock:
-            self.running = True
-            self.paused = False
+        self.paused = False
 
         # 创建主要的异步任务
         generation_task = asyncio.create_task(self._streaming_generation_main())
@@ -566,17 +529,12 @@ async def _async_monitor_loop(self):
         check_interval = 5.0
 
         while True:
-            async with self.lock:
-                if not self.running:
-                    break
-
             await asyncio.sleep(check_interval)
-
             # 定期打印统计信息
             current_time = time.time()
             if current_time - last_stats_time >= stats_interval:
                 stats = await self.get_statistics()
-                print(f"[FullyAsyncRollouter] {stats}")
+                print(f"[FullyAsyncRollouter] statistics {stats}")
                 last_stats_time = current_time
 
             if not await self._should_pause_generation():
@@ -600,68 +558,49 @@ async def _should_pause_generation(self) -> bool:
             return True
 
         if queue_size >= self.max_queue_size:
-            print(
-                f"[FullyAsyncRollouter] Should pause due to full queue: "
-                f"size={queue_size}, max={self.max_queue_size}"
-            )
+            print(f"[FullyAsyncRollouter] Should pause due to full queue: size={queue_size}, max={self.max_queue_size}")
             return True
 
-        if self.train_step_samples >= self.max_required_samples:
+        if self.train_step_samples > self.max_required_samples:
             print(
-                f"[FullyAsyncRollouter] Should pause due to step_generated_samples >= max_required_samples: "
-                f"self.step_generated_samples={self.train_step_samples}, max={self.max_required_samples}"
+                f"[FullyAsyncRollouter] Should pause due to "
+                f"step_generated_samples {self.train_step_samples} > max_required_samples {self.max_required_samples} "
             )
             return True
 
         return False
 
-    async def pause(self) -> bool:
+    async def pause(self):
         """pause rollout
         TODO integrated Partial Rollout
         """
         print("[FullyAsyncRollouter] pause")
         async with self.lock:
-            if not self.running:
-                return False
-
-            if self.paused:
-                return True
-
             self.paused = True
-            return True
 
-    async def resume(self) -> bool:
+    async def resume(self):
         """resume rollout
         TODO integrated Partial Rollout
         """
         print("[FullyAsyncRollouter] resume")
         async with self.lock:
-            if not self.running:
-                return False
-
-            if not self.paused:
-                return True
-
             self.paused = False
             self.condition.notify_all()
-            return True
 
     async def get_statistics(self) -> dict:
-        async with self.lock:
-            queue_stats = self.message_queue_client.get_statistics_sync()
-            stats = {
-                "is_running": self.running,
-                "total_generated_samples": self.total_generated_samples,
-                "train_step_samples": self.train_step_samples,
-                "dropped_stale_samples": self.dropped_stale_samples,
-                "current_param_version": self.current_param_version,
-                "queue_size": queue_stats["queue_size"],
-                "queue_max_size": self.max_queue_size,
-                "max_concurrent_samples": self.max_concurrent_samples,
-                "max_processing_time": self.max_processing_time,
-                "pending_samples_queue_size": self.pending_samples_queue.qsize(),
-                "result_queue_size": self.result_queue.qsize(),
-            }
-
-            return stats
+        queue_stats = self.message_queue_client.get_statistics_sync()
+
+        stats = {
+            "current_param_version": self.current_param_version,
+            "total_generated_samples": self.total_generated_samples,
+            "train_step_samples": self.train_step_samples,
+            "dropped_stale_samples": self.dropped_stale_samples,
+            "queue_max_size": self.max_queue_size,
+            "queue_size": queue_stats["queue_size"],
+            "max_concurrent_samples": self.max_concurrent_samples,
+            "pending_queue_size": self.pending_queue.qsize(),
+            "active_tasks_size": len(self.active_tasks),
+            "result_queue_size": self.result_queue.qsize(),
+        }
 
+        return stats
diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py
index 072a26fea35..fd46fc08b26 100644
--- a/recipe/fully_async_policy/fully_async_trainer.py
+++ b/recipe/fully_async_policy/fully_async_trainer.py
@@ -167,9 +167,11 @@ def _get_samples_from_queue(self) -> tuple[None, None] | tuple[int, Any]:
 
         queue_samples = [ray.cloudpickle.loads(x) for x in queue_samples]
         # Assemble batch - now working directly with RolloutSample objects
-        batch = self._assemble_gen_batch_output_from_queue_samples(queue_samples)
-
-        return 0, batch
+        # batch = self._assemble_gen_batch_output_from_queue_samples(queue_samples)
+        # print(f" _assemble_gen_batch_output_from_queue_samples {batch}")
+        return 0, queue_samples
+        #
+        # return 0, batch
 
     def _assemble_gen_batch_output_from_queue_samples(self, rollout_samples: list[RolloutSample]):
         """
@@ -344,39 +346,39 @@ def fit(self):
                     epoch, batch = self._get_samples_from_queue()
                     if batch is None:
                         break
-
-                    # 更新统计信息
-                    self.processed_samples += len(batch) if isinstance(batch, list) else 1
-
-                    # 从meta_info中获取参数版本信息
-                    if hasattr(batch, "meta_info") and batch.meta_info:
-                        rollout_param_versions = batch.meta_info.get("rollout_param_versions", [])
-                        if rollout_param_versions:
-                            # 统计陈旧样本
-                            stale_count = sum(1 for v in rollout_param_versions if self.current_param_version - v > 1)
-                            self.stale_samples_processed += stale_count
-
-                        # 添加新鲜度指标到metrics
-                        if rollout_param_versions:
-                            param_version_diversity = batch.meta_info.get("param_version_diversity", 0)
-                            avg_sample_age = batch.meta_info.get("avg_sample_age", 0)
-
-                            metrics.update(
-                                {
-                                    "freshness/param_version_diversity": param_version_diversity,
-                                    "freshness/avg_sample_age": avg_sample_age,
-                                    "freshness/stale_samples_ratio": stale_count / len(rollout_param_versions)
-                                    if rollout_param_versions
-                                    else 0,
-                                    "statistics/processed_samples": self.processed_samples,
-                                    "statistics/stale_samples_processed": self.stale_samples_processed,
-                                    "statistics/current_param_version": self.current_param_version,
-                                }
-                            )
-                # batch, reward_extra_infos_dict = self._process_batch_common(batch, metrics, timing_raw)
-                # self._log_rollout(batch, reward_extra_infos_dict, timing_raw)
-                # self._check_save_checkpoint(is_last_step, timing_raw)
-
+            #
+            #         # 更新统计信息
+            #         self.processed_samples += len(batch) if isinstance(batch, list) else 1
+            #
+            #         # 从meta_info中获取参数版本信息
+            #         if hasattr(batch, "meta_info") and batch.meta_info:
+            #             rollout_param_versions = batch.meta_info.get("rollout_param_versions", [])
+            #             if rollout_param_versions:
+            #                 # 统计陈旧样本
+            #                 stale_count = sum(1 for v in rollout_param_versions if self.current_param_version - v > 1)
+            #                 self.stale_samples_processed += stale_count
+            #
+            #             # 添加新鲜度指标到metrics
+            #             if rollout_param_versions:
+            #                 param_version_diversity = batch.meta_info.get("param_version_diversity", 0)
+            #                 avg_sample_age = batch.meta_info.get("avg_sample_age", 0)
+            #
+            #                 metrics.update(
+            #                     {
+            #                         "freshness/param_version_diversity": param_version_diversity,
+            #                         "freshness/avg_sample_age": avg_sample_age,
+            #                         "freshness/stale_samples_ratio": stale_count / len(rollout_param_versions)
+            #                         if rollout_param_versions
+            #                         else 0,
+            #                         "statistics/processed_samples": self.processed_samples,
+            #                         "statistics/stale_samples_processed": self.stale_samples_processed,
+            #                         "statistics/current_param_version": self.current_param_version,
+            #                     }
+            #                 )
+            #     batch, reward_extra_infos_dict = self._process_batch_common(batch, metrics, timing_raw)
+            #     self._log_rollout(batch, reward_extra_infos_dict, timing_raw)
+            #     self._check_save_checkpoint(is_last_step, timing_raw)
+            #
             # self._collect_metrics(batch, epoch, metrics, timing_raw)
 
             # Trigger parameter synchronization after training step
diff --git a/recipe/fully_async_policy/message_queue.py b/recipe/fully_async_policy/message_queue.py
index 4c3232e561b..3ece118bc81 100644
--- a/recipe/fully_async_policy/message_queue.py
+++ b/recipe/fully_async_policy/message_queue.py
@@ -45,6 +45,8 @@ class RolloutSample:
     generation_timestamp: float
     param_version: int
 
+    _gen_data: Any
+
 
 @ray.remote(num_cpus=2, max_concurrency=20)
 class MessageQueue:
@@ -71,25 +73,19 @@ def __init__(self, config: DictConfig, max_queue_size: int = 1000):
         self.running = True
 
         # async safe - 在第一次使用时初始化
-        self._lock = None
-        self._consumer_condition = None
+        self._lock = asyncio.Lock()
+        self._consumer_condition = asyncio.Condition(self._lock)
 
         # statistic message
         self.total_produced = 0
         self.total_consumed = 0
         self.dropped_samples = 0
 
-        logger.info(
-            f"MessageQueue initialized with max_queue_size={max_queue_size},"
+        print(
+            f"[MessageQueue] initialized with max_queue_size={max_queue_size},"
             f"staleness_threshold={self.staleness_threshold}"
         )
 
-    async def _ensure_async_primitives(self):
-        """确保异步原语已初始化"""
-        if self._lock is None:
-            self._lock = asyncio.Lock()
-            self._consumer_condition = asyncio.Condition(self._lock)
-
     async def put_sample(self, sample: Any, param_version: int) -> bool:
         """
         Put a batch sample into the queue
@@ -101,8 +97,6 @@ async def put_sample(self, sample: Any, param_version: int) -> bool:
         Returns:
             bool: Whether the sample was successfully put into the queue
         """
-        await self._ensure_async_primitives()
-
         async with self._lock:
             # Check freshness
             staleness = self.current_param_version - param_version
@@ -115,12 +109,12 @@ async def put_sample(self, sample: Any, param_version: int) -> bool:
             if len(self.queue) >= self.max_queue_size:
                 removed = self.queue.popleft()
                 self.dropped_samples += 1
-                logger.warning(f"Queue full, dropped sample {removed}")
+                logger.warning(f"Queue full, dropped sample")
             self.queue.append(sample)
             self.total_produced += 1
 
             # Notify waiting consumers
-            self._consumer_condition.notify()
+            self._consumer_condition.notify_all()
 
             if self.total_produced % 100 == 0:
                 logger.debug(f"MessageQueue stats: produced={self.total_produced}, queue_size={len(self.queue)}")
@@ -137,8 +131,6 @@ async def get_samples(self, min_batch_count: int = 1) -> tuple[list[Any], int]:
         Returns:
             List[Any]: List of retrieved samples
         """
-        await self._ensure_async_primitives()
-
         async with self._lock:
             while len(self.queue) < min_batch_count and self.running:
                 print(f"[MessageQueue] consumer_condition {len(self.queue)}")
@@ -171,10 +163,9 @@ async def get_sample(self) -> Any | None:
         Returns:
             Any: Single sample data or None if queue is closed
         """
-        await self._ensure_async_primitives()
-
         async with self._lock:
             while len(self.queue) == 0 and self.running:
+                print(f"[MessageQueue] consumer_condition {len(self.queue)}")
                 await self._consumer_condition.wait()
 
             # If queue is closed and empty, return None
@@ -188,8 +179,6 @@ async def get_sample(self) -> Any | None:
 
     async def update_param_version(self, version: int):
         """Update current parameter version"""
-        await self._ensure_async_primitives()
-
         async with self._lock:
             old_version = self.current_param_version
             self.current_param_version = version
@@ -197,15 +186,11 @@ async def update_param_version(self, version: int):
 
     async def get_queue_size(self) -> int:
         """Get current queue length"""
-        await self._ensure_async_primitives()
-
         async with self._lock:
             return len(self.queue)
 
     async def get_statistics(self) -> dict[str, Any]:
         """Get queue statistics"""
-        await self._ensure_async_primitives()
-
         async with self._lock:
             return {
                 "queue_size": len(self.queue),
@@ -219,8 +204,6 @@ async def get_statistics(self) -> dict[str, Any]:
 
     async def clear_queue(self):
         """Clear the queue"""
-        await self._ensure_async_primitives()
-
         async with self._lock:
             cleared_count = len(self.queue)
             self.queue.clear()
@@ -228,8 +211,6 @@ async def clear_queue(self):
 
     async def shutdown(self):
         """Shutdown the message queue"""
-        await self._ensure_async_primitives()
-
         async with self._lock:
             self.running = False
             # Notify all waiting coroutines so they can exit
@@ -238,8 +219,6 @@ async def shutdown(self):
 
     async def get_memory_usage(self) -> dict:
         """Get memory usage statistics"""
-        await self._ensure_async_primitives()
-
         async with self._lock:
             # Estimate memory usage of samples in queue
             import sys
diff --git a/tests/special_e2e/run_fully_async_policy.sh b/tests/special_e2e/run_fully_async_policy.sh
index 337f2991a16..a938499a86b 100644
--- a/tests/special_e2e/run_fully_async_policy.sh
+++ b/tests/special_e2e/run_fully_async_policy.sh
@@ -46,7 +46,7 @@ gen_prompt_bsz=1
 n_resp_per_prompt=3
 train_prompt_mini_bsz=1
 
-total_rollout_steps=1000
+total_rollout_steps=50
 
 # Temperature parameters
 temperature=1.0
diff --git a/verl/workers/rollout/vllm_rollout/vllm_async_server.py b/verl/workers/rollout/vllm_rollout/vllm_async_server.py
index 8c0d608871f..a5cc0b83e59 100644
--- a/verl/workers/rollout/vllm_rollout/vllm_async_server.py
+++ b/verl/workers/rollout/vllm_rollout/vllm_async_server.py
@@ -57,12 +57,6 @@ def _get_model_runner_workers(vllm_config, init_ray: bool = True):
         actor_name for actor_name in ray.util.list_named_actors() if actor_name.startswith(f"{wg_prefix}WorkerDict")
     ]
 
-    print(f"namespace: {namespace}")
-    print(f"wg_prefix: {wg_prefix}")
-    print(f"vllm_dp_size: {vllm_dp_size}")
-    print(f"vllm_dp_rank: {vllm_dp_rank}")
-    print(f"actor_names: {actor_names}")
-
     vllm_tp_size = vllm_config.parallel_config.tensor_parallel_size
     assert len(actor_names) == vllm_dp_size * vllm_tp_size, (
         f"instance_id: {vllm_config.instance_id} has {len(actor_names)} actors, but vllm_dp_size: "
@@ -79,7 +73,6 @@ def get_pg_index_and_local_rank(actor_name) -> tuple[int, int]:
     actor_names = sorted(actor_names, key=get_pg_index_and_local_rank)
     actor_names = actor_names[vllm_dp_rank * vllm_tp_size : (vllm_dp_rank + 1) * vllm_tp_size]
     workers: list[WorkerWrapperBase] = [ray.get_actor(actor_name) for actor_name in actor_names]
-    print(f"instance_id: {vllm_config.instance_id} initializes with external actors: {actor_names}")
 
     return workers
 
@@ -90,7 +83,6 @@ class ExternalRayDistributedExecutor(Executor):
     uses_ray: bool = False
 
     def _init_executor(self) -> None:
-        print("[ExternalRayDistributedExecutor] Initializing ray actors...")
         self.workers = _get_model_runner_workers(vllm_config=self.vllm_config, init_ray=True)
 
         kwargs = dict(
@@ -100,11 +92,10 @@ def _init_executor(self) -> None:
             distributed_init_method="env://",
             is_driver_worker=True,
         )
-        print(f"ray start instance_id: {self.vllm_config.instance_id} initializes")
         self.collective_rpc("init_worker", args=([kwargs],))
         self.collective_rpc("init_device")
         self.collective_rpc("load_model")
-        print(f"ray instance_id: {self.vllm_config.instance_id} initializes finished.")
+        print(f"instance_id: {self.vllm_config.instance_id} initializes finished.")
 
     def collective_rpc(
         self,
@@ -136,7 +127,6 @@ class ExternalZeroMQDistributedExecutor(Executor):
     uses_ray: bool = False
 
     def _init_executor(self) -> None:
-        print(f"[ExternalZeroMQDistributedExecutor] Initializing ray actors...")
         addresses = os.environ["VERL_VLLM_ZMQ_ADDRESSES"].split(",")
         self.context = zmq.Context()
         self.sockets = []
@@ -152,11 +142,9 @@ def _init_executor(self) -> None:
             distributed_init_method="env://",
             is_driver_worker=True,
         )
-        print(f"ZeroMQ start instance_id: {self.vllm_config.instance_id} initializes")
         self.collective_rpc("init_worker", args=([kwargs],))
         self.collective_rpc("init_device")
         self.collective_rpc("load_model")
-        print(f"ZeroMQ instance_id: {self.vllm_config.instance_id} initializes finished.")
 
     def collective_rpc(
         self,
@@ -275,12 +263,8 @@ async def init_engine(self):
 
         # init async llm engine
         vllm_config = self._create_engine_config(engine_args)
-
-        print(f"AsyncvLLMServer AsyncLLM.from_vllm_config {vllm_config}")
         self.engine = AsyncLLM.from_vllm_config(vllm_config)
 
-        print("AsyncvLLMServer build serving chat")
-
         # build serving chat
         model_config = self.engine.model_config
         BASE_MODEL_PATHS = [BaseModelPath(name=model_name, model_path=model_path)]
@@ -297,8 +281,6 @@ async def init_engine(self):
             tool_parser=config.multi_turn.format,  # hermes, llama3_json, ...
         )
 
-        print("AsyncvLLMServer init_engine success")
-
     def _create_engine_config(self, engine_args: AsyncEngineArgs):
         vllm_config = engine_args.create_engine_config()
         namespace = ray.get_runtime_context().namespace
diff --git a/verl/workers/rollout/vllm_rollout/vllm_rollout_spmd.py b/verl/workers/rollout/vllm_rollout/vllm_rollout_spmd.py
index 307e7e77036..0d419dcf177 100644
--- a/verl/workers/rollout/vllm_rollout/vllm_rollout_spmd.py
+++ b/verl/workers/rollout/vllm_rollout/vllm_rollout_spmd.py
@@ -460,7 +460,6 @@ def get_zeromq_address(self):
     def init_worker(self, all_kwargs: list[dict[str, Any]]):
         """Initialize worker engine."""
 
-        print("[vLLMAsyncRollout] init_worker")
         all_kwargs[0]["rank"] = int(os.environ["RANK"])
         all_kwargs[0]["local_rank"] = 0
 
@@ -471,8 +470,6 @@ def init_worker(self, all_kwargs: list[dict[str, Any]]):
     def load_model(self, *args, **kwargs):
         self.inference_engine.load_model(*args, **kwargs)
 
-        print(f"[vLLMAsyncRollout] load_model {args} {kwargs}")
-
         # inference engine is initialized now, update sharding manager
         self.sharding_manager.inference_engine = self.inference_engine
         self.sharding_manager.model_runner = self.inference_engine.worker.model_runner

From 1c06296f91aefbc5f25b28c8a5efdc0292f219c6 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Fri, 15 Aug 2025 19:04:20 +0800
Subject: [PATCH 051/182] staleness_samples

---
 .../fully_async_rollouter.py                  | 34 +++++++++----------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py
index 16b68b3e819..fb0787eac69 100644
--- a/recipe/fully_async_policy/fully_async_rollouter.py
+++ b/recipe/fully_async_policy/fully_async_rollouter.py
@@ -35,16 +35,16 @@ class FullyAsyncRollouter(RayPPOTrainer):
     """
 
     def __init__(
-        self,
-        config,
-        tokenizer,
-        role_worker_mapping: dict[Role, WorkerType],
-        resource_pool_manager: ResourcePoolManager,
-        ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup,
-        processor=None,
-        reward_fn=None,
-        val_reward_fn=None,
-        device_name=None,
+            self,
+            config,
+            tokenizer,
+            role_worker_mapping: dict[Role, WorkerType],
+            resource_pool_manager: ResourcePoolManager,
+            ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup,
+            processor=None,
+            reward_fn=None,
+            val_reward_fn=None,
+            device_name=None,
     ):
         # Store the tokenizer for text processing
         self.tokenizer = tokenizer
@@ -103,7 +103,7 @@ def __init__(
 
         # Statistics
         self.total_generated_samples = 0
-        self.train_step_samples = 0
+        self.staleness_samples = 0
         self.dropped_stale_samples = 0
 
         # Worker groups
@@ -167,8 +167,8 @@ async def update_param_version(self, version: int):
         async with self.lock:
             old_version = self.current_param_version
             self.current_param_version = version
-            # every time param change, reset train_step_samples
-            self.train_step_samples = 0
+            # every time param change, reset staleness_samples
+            self.staleness_samples = 0
             print(f"[FullyAsyncRollouter] Parameter version updated from {old_version} to {version}")
 
     def _validate_config(self):
@@ -318,7 +318,7 @@ async def _processor_worker(self):
 
         while True:
             partial_rollout_sample = await self.pending_queue.get()
-            self.train_step_samples += 1
+            self.staleness_samples += 1
 
             async with self.lock:
                 if await self._should_pause_generation():
@@ -561,10 +561,10 @@ async def _should_pause_generation(self) -> bool:
             print(f"[FullyAsyncRollouter] Should pause due to full queue: size={queue_size}, max={self.max_queue_size}")
             return True
 
-        if self.train_step_samples > self.max_required_samples:
+        if self.staleness_samples > self.max_required_samples:
             print(
                 f"[FullyAsyncRollouter] Should pause due to "
-                f"step_generated_samples {self.train_step_samples} > max_required_samples {self.max_required_samples} "
+                f"step_generated_samples {self.staleness_samples} > max_required_samples {self.max_required_samples} "
             )
             return True
 
@@ -593,7 +593,7 @@ async def get_statistics(self) -> dict:
         stats = {
             "current_param_version": self.current_param_version,
             "total_generated_samples": self.total_generated_samples,
-            "train_step_samples": self.train_step_samples,
+            "staleness_samples": self.staleness_samples,
             "dropped_stale_samples": self.dropped_stale_samples,
             "queue_max_size": self.max_queue_size,
             "queue_size": queue_stats["queue_size"],

From 0412861d95c57c74e366da0ef8045e4db0487f45 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Fri, 15 Aug 2025 22:45:42 +0800
Subject: [PATCH 052/182] assemble_batch_from_rollout_samples

---
 recipe/fully_async_policy/batch_utils.py      | 124 +++++++
 .../fully_async_rollouter.py                  |  27 +-
 .../fully_async_policy/fully_async_trainer.py | 104 +-----
 recipe/fully_async_policy/message_queue.py    |  29 +-
 .../unittest/test_batch_utils.py              | 321 ++++++++++++++++++
 recipe/fully_async_policy/utils.py            |  28 +-
 6 files changed, 503 insertions(+), 130 deletions(-)
 create mode 100644 recipe/fully_async_policy/batch_utils.py
 create mode 100644 recipe/fully_async_policy/unittest/test_batch_utils.py

diff --git a/recipe/fully_async_policy/batch_utils.py b/recipe/fully_async_policy/batch_utils.py
new file mode 100644
index 00000000000..806dd9e1579
--- /dev/null
+++ b/recipe/fully_async_policy/batch_utils.py
@@ -0,0 +1,124 @@
+# Copyright 2025 Meituan Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import time
+
+import numpy as np
+import torch
+
+from recipe.fully_async_policy.utils import RolloutSample
+from verl import DataProto
+from verl.experimental.agent_loop.agent_loop import postprocess_agent_loop_outputs
+from verl.trainer.ppo.ray_trainer import compute_response_mask
+
+
+def assemble_batch_from_rollout_samples(
+    rollout_samples: list[RolloutSample], tokenizer, config, balance_batch: bool = False
+) -> DataProto:
+    """
+    Assemble gen_batch_output from RolloutSample objects
+    从 RolloutSample 对象中组装批次，类似 ray_trainer 的 _post_generate_batch 逻辑
+
+    Args:
+        rollout_samples: List of RolloutSample objects
+        tokenizer: Tokenizer instance
+        config: Configuration object containing trainer settings
+        balance_batch: Whether to balance the batch (simplified version)
+
+    Returns:
+        DataProto: Assembled gen_batch_output
+
+    Raises:
+        ValueError: If rollout_samples is empty
+    """
+    start_time = time.time()
+
+    if not rollout_samples:
+        raise ValueError("Empty rollout_samples provided for batch assembly")
+
+    print(f"[BatchUtils] Assembling batch from {len(rollout_samples)} RolloutSample objects")
+
+    # 直接处理 RolloutSample 对象
+    processing_times = [rs.processing_time for rs in rollout_samples]
+
+    # 第一步：从 AgentLoopOutput 创建生成结果的 DataProto
+    agent_loop_outputs = [rs.agent_loop_output for rs in rollout_samples]
+    gen_batch_output = postprocess_agent_loop_outputs(agent_loop_outputs, tokenizer, config)
+
+    # 第二步：重建原始 batch 信息
+    # 每个 RolloutSample 都是独立的，直接按顺序重建原始数据
+    original_batch_list = []
+    for rs in rollout_samples:
+        original_batch_dict = rs.original_batch_dict
+
+        # 重建 DataProto
+        original_batch_item = DataProto.from_single_dict(
+            {
+                **{k: v for k, v in original_batch_dict["batch"].items()},
+                **{f"__{k}": v for k, v in original_batch_dict["non_tensor_batch"].items()},
+            }
+        )
+        original_batch_item.meta_info.update(original_batch_dict["meta_info"])
+        original_batch_list.append(original_batch_item)
+
+    # 合并所有原始样本为一个批次
+    if original_batch_list:
+        original_batch = DataProto.from_items(original_batch_list)
+    else:
+        # 如果没有原始数据，创建空的 DataProto
+        original_batch = DataProto.from_single_dict({})
+
+    # 添加 UID
+    uids = []
+    for rs in rollout_samples:
+        uids.append(f"uid_{rs.sample_id}")
+    original_batch.non_tensor_batch["uid"] = np.array(uids, dtype=object)
+
+    # 直接合并原始数据和生成结果，不需要 repeat
+    # 因为队列中的每个 RolloutSample 都已经是独立的样本
+    final_batch = original_batch.union(gen_batch_output)
+
+    # 计算 response_mask（如果不存在）
+    if "response_mask" not in final_batch.batch.keys():
+        final_batch.batch["response_mask"] = compute_response_mask(final_batch)
+
+    # 简化的批次平衡逻辑（如果需要的话）
+    if balance_batch and hasattr(config, "trainer") and getattr(config.trainer, "balance_batch", False):
+        # 注意：这里简化了批次平衡逻辑，如果需要完整功能需要额外参数
+        print("[BatchUtils] Batch balancing requested but simplified in static function")
+
+    # 计算全局有效 token 数
+    if "attention_mask" in final_batch.batch:
+        final_batch.meta_info["global_token_num"] = torch.sum(final_batch.batch["attention_mask"], dim=-1).tolist()
+
+    # 收集统计信息和元数据（直接从 RolloutSample 中获取）
+    param_versions = [rs.param_version for rs in rollout_samples]
+    sample_timestamps = [rs.generation_timestamp for rs in rollout_samples]
+
+    # 创建 meta_info
+    final_batch.meta_info.update(
+        {
+            "rollout_param_versions": param_versions,
+            "sample_timestamps": sample_timestamps,
+            "avg_processing_time": np.mean(processing_times) if processing_times else 0,
+            "max_processing_time": np.max(processing_times) if processing_times else 0,
+            "param_version_diversity": len(set(param_versions)) if param_versions else 0,
+            "avg_sample_age": np.mean([time.time() - ts for ts in sample_timestamps]) if sample_timestamps else 0,
+            "assembly_time": time.time() - start_time,
+        }
+    )
+
+    print(f"[BatchUtils] Batch assembly completed in {time.time() - start_time:.2f}s")
+
+    return final_batch
diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py
index fb0787eac69..f166582ef73 100644
--- a/recipe/fully_async_policy/fully_async_rollouter.py
+++ b/recipe/fully_async_policy/fully_async_rollouter.py
@@ -18,9 +18,8 @@
 import ray
 from omegaconf import OmegaConf
 
-from recipe.fully_async_policy.message_queue import MessageQueueClient, RolloutSample
-from recipe.fully_async_policy.utils import calculate_one_step_size
-from verl import DataProto
+from recipe.fully_async_policy.message_queue import MessageQueueClient
+from recipe.fully_async_policy.utils import RolloutSample, calculate_one_step_size
 from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup
 from verl.trainer.ppo.ray_trainer import RayPPOTrainer, ResourcePoolManager, Role, WorkerType
 from verl.utils.tracking import ValidationGenerationsLogger
@@ -35,16 +34,16 @@ class FullyAsyncRollouter(RayPPOTrainer):
     """
 
     def __init__(
-            self,
-            config,
-            tokenizer,
-            role_worker_mapping: dict[Role, WorkerType],
-            resource_pool_manager: ResourcePoolManager,
-            ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup,
-            processor=None,
-            reward_fn=None,
-            val_reward_fn=None,
-            device_name=None,
+        self,
+        config,
+        tokenizer,
+        role_worker_mapping: dict[Role, WorkerType],
+        resource_pool_manager: ResourcePoolManager,
+        ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup,
+        processor=None,
+        reward_fn=None,
+        val_reward_fn=None,
+        device_name=None,
     ):
         # Store the tokenizer for text processing
         self.tokenizer = tokenizer
@@ -564,7 +563,7 @@ async def _should_pause_generation(self) -> bool:
         if self.staleness_samples > self.max_required_samples:
             print(
                 f"[FullyAsyncRollouter] Should pause due to "
-                f"step_generated_samples {self.staleness_samples} > max_required_samples {self.max_required_samples} "
+                f"staleness_samples {self.staleness_samples} > max_required_samples {self.max_required_samples} "
             )
             return True
 
diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py
index fd46fc08b26..cbea37c4083 100644
--- a/recipe/fully_async_policy/fully_async_trainer.py
+++ b/recipe/fully_async_policy/fully_async_trainer.py
@@ -21,9 +21,8 @@
 import ray
 from omegaconf import OmegaConf
 
-from recipe.fully_async_policy.message_queue import MessageQueueClient, RolloutSample
-from recipe.fully_async_policy.utils import calculate_one_step_size
-from verl.experimental.agent_loop.agent_loop import postprocess_agent_loop_outputs
+from recipe.fully_async_policy.message_queue import MessageQueueClient
+from recipe.fully_async_policy.utils import RolloutSample, calculate_one_step_size
 from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup
 from verl.trainer.ppo import core_algos
 from verl.trainer.ppo.core_algos import AdvantageEstimator
@@ -166,9 +165,10 @@ def _get_samples_from_queue(self) -> tuple[None, None] | tuple[int, Any]:
         )
 
         queue_samples = [ray.cloudpickle.loads(x) for x in queue_samples]
+        print(queue_samples)
         # Assemble batch - now working directly with RolloutSample objects
-        # batch = self._assemble_gen_batch_output_from_queue_samples(queue_samples)
-        # print(f" _assemble_gen_batch_output_from_queue_samples {batch}")
+        batch = self._assemble_gen_batch_output_from_queue_samples(queue_samples)
+        print(f" _assemble_gen_batch_output_from_queue_samples {batch}")
         return 0, queue_samples
         #
         # return 0, batch
@@ -184,91 +184,21 @@ def _assemble_gen_batch_output_from_queue_samples(self, rollout_samples: list[Ro
         Returns:
             DataProto: Assembled gen_batch_output
         """
-        start_time = time.time()
-
-        import numpy as np
-        import torch
-
-        from verl import DataProto
-        from verl.trainer.ppo.ray_trainer import compute_response_mask
-
-        if not rollout_samples:
-            raise ValueError("Empty rollout_samples provided for batch assembly")
-
-        print(f"[FullyAsyncTrainer] Assembling batch from {len(rollout_samples)} RolloutSample objects")
-
-        # 直接处理 RolloutSample 对象
-        processing_times = [rs.processing_time for rs in rollout_samples]
-
-        # 第一步：从 AgentLoopOutput 创建生成结果的 DataProto
-        agent_loop_outputs = [rs.agent_loop_output for rs in rollout_samples]
-        gen_batch_output = postprocess_agent_loop_outputs(agent_loop_outputs, self.tokenizer, self.config)
-
-        # 第二步：重建原始 batch 信息
-        # 每个 RolloutSample 都是独立的，直接按顺序重建原始数据
-        original_batch_list = []
-        for rs in rollout_samples:
-            original_batch_dict = rs.original_batch_dict
-
-            # 重建 DataProto
-            original_batch_item = DataProto.from_single_dict(
-                {
-                    **{k: v for k, v in original_batch_dict["batch"].items()},
-                    **{f"__{k}": v for k, v in original_batch_dict["non_tensor_batch"].items()},
-                }
-            )
-            original_batch_item.meta_info.update(original_batch_dict["meta_info"])
-            original_batch_list.append(original_batch_item)
-
-        # 合并所有原始样本为一个批次
-        if original_batch_list:
-            original_batch = DataProto.from_items(original_batch_list)
-        else:
-            # 如果没有原始数据，创建空的 DataProto
-            original_batch = DataProto.from_single_dict({})
-
-        # 添加 UID
-        uids = []
-        for rs in rollout_samples:
-            uids.append(f"uid_{rs.sample_id}")
-        original_batch.non_tensor_batch["uid"] = np.array(uids, dtype=object)
-
-        # 直接合并原始数据和生成结果，不需要 repeat
-        # 因为队列中的每个 RolloutSample 都已经是独立的样本
-        final_batch = original_batch.union(gen_batch_output)
-
-        # 计算 response_mask（如果不存在）
-        if "response_mask" not in final_batch.batch.keys():
-            final_batch.batch["response_mask"] = compute_response_mask(final_batch)
+        from recipe.fully_async_policy.batch_utils import assemble_batch_from_rollout_samples
+
+        # 使用静态函数进行批次组装
+        final_batch = assemble_batch_from_rollout_samples(
+            rollout_samples=rollout_samples,
+            tokenizer=self.tokenizer,
+            config=self.config,
+            balance_batch=False,  # 不使用静态函数的简化版本
+        )
 
-        # 平衡批次（如果配置了）
+        # 如果需要完整的批次平衡，在这里调用
         if self.config.trainer.balance_batch:
             self._balance_batch(final_batch, metrics={})
 
-        # 计算全局有效 token 数
-        if "attention_mask" in final_batch.batch:
-            final_batch.meta_info["global_token_num"] = torch.sum(final_batch.batch["attention_mask"], dim=-1).tolist()
-
-        # 收集统计信息和元数据（直接从 RolloutSample 中获取）
-        param_versions = [rs.param_version for rs in rollout_samples]
-        sample_timestamps = [rs.generation_timestamp for rs in rollout_samples]
-
-        # 创建 meta_info
-        final_batch.meta_info.update(
-            {
-                "rollout_param_versions": param_versions,
-                "sample_timestamps": sample_timestamps,
-                "avg_processing_time": np.mean(processing_times) if processing_times else 0,
-                "max_processing_time": np.max(processing_times) if processing_times else 0,
-                "param_version_diversity": len(set(param_versions)) if param_versions else 0,
-                "avg_sample_age": np.mean([time.time() - ts for ts in sample_timestamps]) if sample_timestamps else 0,
-                "assembly_time": time.time() - start_time,
-            }
-        )
-
-        print(f"[FullyAsyncTrainer] Batch assembly completed in {time.time() - start_time:.2f}s")
         print(f"[FullyAsyncTrainer] {final_batch}")
-
         return final_batch
 
     def _create_actor_rollout_classes(self):
@@ -336,10 +266,10 @@ def fit(self):
         # Use queue mode, no need for traditional dataloader iterator
         # Initialize to get the first batch of data
         while True:
-            metrics = {}
+            # metrics = {}
             timing_raw = {}
 
-            is_last_step = False
+            # is_last_step = False
 
             with marked_timer("step", timing_raw):
                 with marked_timer("gen", timing_raw, color="red"):
diff --git a/recipe/fully_async_policy/message_queue.py b/recipe/fully_async_policy/message_queue.py
index 3ece118bc81..b2761f95749 100644
--- a/recipe/fully_async_policy/message_queue.py
+++ b/recipe/fully_async_policy/message_queue.py
@@ -15,7 +15,6 @@
 import asyncio
 import logging
 from collections import deque
-from dataclasses import dataclass
 from typing import Any
 
 import ray
@@ -24,30 +23,6 @@
 logger = logging.getLogger(__name__)
 
 
-@dataclass
-class RolloutSample:
-    """Enhanced rollout sample containing both original batch info and AgentLoopOutput"""
-
-    # Original batch information (preserved from _prepare_generate_batch)
-    original_batch_dict: dict[str, Any]
-
-    # AgentLoopOutput from generation
-    agent_loop_output: Any  # AgentLoopOutput
-
-    # Metadata
-    sample_id: str
-    epoch: int
-    rollout_n_index: int  # Index within the rollout.n repetitions (0, 1, ..., n-1)
-    original_sample_index: int  # Index of the original sample before repetition
-
-    # Processing metadata
-    processing_time: float
-    generation_timestamp: float
-    param_version: int
-
-    _gen_data: Any
-
-
 @ray.remote(num_cpus=2, max_concurrency=20)
 class MessageQueue:
     """
@@ -107,9 +82,9 @@ async def put_sample(self, sample: Any, param_version: int) -> bool:
 
             # If queue is full, remove the oldest sample (rarely happens)
             if len(self.queue) >= self.max_queue_size:
-                removed = self.queue.popleft()
+                self.queue.popleft()
                 self.dropped_samples += 1
-                logger.warning(f"Queue full, dropped sample")
+                logger.warning("Queue full, dropped sample")
             self.queue.append(sample)
             self.total_produced += 1
 
diff --git a/recipe/fully_async_policy/unittest/test_batch_utils.py b/recipe/fully_async_policy/unittest/test_batch_utils.py
new file mode 100644
index 00000000000..c2593f83ec7
--- /dev/null
+++ b/recipe/fully_async_policy/unittest/test_batch_utils.py
@@ -0,0 +1,321 @@
+#!/usr/bin/env python3
+
+# Copyright 2025 Meituan Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import sys
+import time
+import unittest
+from dataclasses import dataclass
+from unittest.mock import MagicMock
+
+import numpy as np
+import torch
+
+sys.path.append(os.path.join(os.path.dirname(__file__), "..", "..", ".."))
+
+from recipe.fully_async_policy.batch_utils import assemble_batch_from_rollout_samples
+from recipe.fully_async_policy.message_queue import RolloutSample
+from verl import DataProto
+
+
+@dataclass
+class MockAgentLoopOutput:
+    """Mock AgentLoopOutput for testing"""
+
+    prompt_ids: list[int]
+    response_ids: list[int]
+    response_mask: list[int]
+    num_turns: int = 1
+    metrics: dict = None
+
+    def __post_init__(self):
+        if self.metrics is None:
+            self.metrics = {}
+
+
+class MockConfig:
+    """Mock configuration object"""
+
+    def __init__(self):
+        self.trainer = MockTrainerConfig()
+
+
+class MockTrainerConfig:
+    """Mock trainer configuration"""
+
+    def __init__(self):
+        self.balance_batch = False
+
+
+class TestBatchUtils(unittest.TestCase):
+    def setUp(self):
+        """设置测试环境"""
+        self.tokenizer = MagicMock()
+        self.config = MockConfig()
+
+        # Mock postprocess_agent_loop_outputs function
+        self.mock_postprocess = MagicMock()
+
+        # Patch the postprocess function
+        import recipe.fully_async_policy.batch_utils as batch_utils_module
+
+        self.original_postprocess = batch_utils_module.postprocess_agent_loop_outputs
+        batch_utils_module.postprocess_agent_loop_outputs = self.mock_postprocess
+
+        # Mock compute_response_mask function
+        self.original_compute_response_mask = batch_utils_module.compute_response_mask
+        batch_utils_module.compute_response_mask = MagicMock(return_value=torch.ones(2, 128, dtype=torch.int64))
+
+    def tearDown(self):
+        """清理测试环境"""
+        import recipe.fully_async_policy.batch_utils as batch_utils_module
+
+        batch_utils_module.postprocess_agent_loop_outputs = self.original_postprocess
+        batch_utils_module.compute_response_mask = self.original_compute_response_mask
+
+    def create_mock_rollout_sample(self, sample_id: str, param_version: int = 1) -> RolloutSample:
+        """创建测试用的 RolloutSample"""
+        # 创建 mock AgentLoopOutput
+        agent_loop_output = MockAgentLoopOutput(
+            prompt_ids=[151644, 8948, 198] + list(range(100)),  # 简化的prompt_ids
+            response_ids=[14374, 14822] + list(range(50)),  # 简化的response_ids
+            response_mask=[1] * 52,  # response_mask
+            num_turns=1,
+            metrics={"generate_time": 0.5},
+        )
+
+        # 创建原始batch信息
+        original_batch_dict = {
+            "batch": {},  # 空的tensor batch用于测试
+            "non_tensor_batch": {
+                "data_source": np.array(["openai/gsm8k"], dtype=object),
+                "ability": np.array(["math"], dtype=object),
+                "reward_model": np.array([{"ground_truth": "6", "style": "rule"}], dtype=object),
+                "extra_info": np.array(
+                    [{"answer": "test answer", "index": 4570, "question": "test question", "split": "train"}],
+                    dtype=object,
+                ),
+                "raw_prompt_ids": np.array([[151644, 8948, 198]], dtype=object),
+                "raw_prompt": np.array([[{"content": "test content", "role": "user"}]], dtype=object),
+                "tools_kwargs": np.array([{}], dtype=object),
+                "interaction_kwargs": np.array([{}], dtype=object),
+                "index": np.array([4570], dtype=object),
+            },
+            "meta_info": {"global_steps": 1},
+        }
+
+        return RolloutSample(
+            original_batch_dict=original_batch_dict,
+            agent_loop_output=agent_loop_output,
+            sample_id=sample_id,
+            epoch=0,
+            rollout_n_index=0,
+            original_sample_index=0,
+            processing_time=0.5,
+            generation_timestamp=time.time(),
+            param_version=param_version,
+            _gen_data=None,
+        )
+
+    def test_assemble_batch_empty_input(self):
+        """测试空输入的情况"""
+        with self.assertRaises(ValueError) as context:
+            assemble_batch_from_rollout_samples([], self.tokenizer, self.config)
+
+        self.assertIn("Empty rollout_samples", str(context.exception))
+
+    def test_assemble_batch_single_sample(self):
+        """测试单个样本的批次组装"""
+        # 设置mock返回值
+        mock_gen_batch = DataProto(
+            batch=torch.nn.utils.rnn.pad_sequence(
+                [
+                    torch.tensor([151644, 8948, 198] + list(range(100))),
+                ],
+                batch_first=True,
+                padding_value=0,
+            ),
+            non_tensor_batch={"__test_key": np.array(["test_value"], dtype=object)},
+            meta_info={"test_meta": "test_value"},
+        )
+        self.mock_postprocess.return_value = mock_gen_batch
+
+        # 创建测试样本
+        rollout_samples = [self.create_mock_rollout_sample("sample_1")]
+
+        # 调用函数
+        result = assemble_batch_from_rollout_samples(
+            rollout_samples=rollout_samples, tokenizer=self.tokenizer, config=self.config
+        )
+
+        # 验证结果
+        self.assertIsInstance(result, DataProto)
+        self.assertIn("uid", result.non_tensor_batch)
+        self.assertEqual(result.non_tensor_batch["uid"][0], "uid_sample_1")
+
+        # 验证meta_info包含预期字段
+        expected_fields = [
+            "rollout_param_versions",
+            "sample_timestamps",
+            "avg_processing_time",
+            "max_processing_time",
+            "param_version_diversity",
+            "avg_sample_age",
+            "assembly_time",
+        ]
+        for field in expected_fields:
+            self.assertIn(field, result.meta_info)
+
+        # 验证统计信息
+        self.assertEqual(result.meta_info["rollout_param_versions"], [1])
+        self.assertEqual(result.meta_info["avg_processing_time"], 0.5)
+        self.assertEqual(result.meta_info["param_version_diversity"], 1)
+
+    def test_assemble_batch_multiple_samples(self):
+        """测试多个样本的批次组装"""
+        # 设置mock返回值
+        mock_gen_batch = DataProto(
+            batch=torch.nn.utils.rnn.pad_sequence(
+                [
+                    torch.tensor([151644, 8948, 198] + list(range(100))),
+                    torch.tensor([151644, 8948, 198] + list(range(90))),
+                ],
+                batch_first=True,
+                padding_value=0,
+            ),
+            non_tensor_batch={"__test_key": np.array(["test_value1", "test_value2"], dtype=object)},
+            meta_info={"test_meta": "test_value"},
+        )
+        self.mock_postprocess.return_value = mock_gen_batch
+
+        # 创建测试样本
+        rollout_samples = [
+            self.create_mock_rollout_sample("sample_1", param_version=1),
+            self.create_mock_rollout_sample("sample_2", param_version=2),
+        ]
+
+        # 调用函数
+        result = assemble_batch_from_rollout_samples(
+            rollout_samples=rollout_samples, tokenizer=self.tokenizer, config=self.config
+        )
+
+        # 验证结果
+        self.assertIsInstance(result, DataProto)
+        self.assertEqual(len(result.non_tensor_batch["uid"]), 2)
+        self.assertListEqual(list(result.non_tensor_batch["uid"]), ["uid_sample_1", "uid_sample_2"])
+
+        # 验证多样本统计
+        self.assertEqual(result.meta_info["rollout_param_versions"], [1, 2])
+        self.assertEqual(result.meta_info["param_version_diversity"], 2)  # 两个不同版本
+        self.assertEqual(result.meta_info["avg_processing_time"], 0.5)
+
+    def test_assemble_batch_with_balance_batch_flag(self):
+        """测试启用balance_batch标志的情况"""
+        # 设置mock返回值
+        mock_gen_batch = DataProto(
+            batch=torch.nn.utils.rnn.pad_sequence(
+                [
+                    torch.tensor([151644, 8948, 198] + list(range(100))),
+                ],
+                batch_first=True,
+                padding_value=0,
+            ),
+            non_tensor_batch={"__test_key": np.array(["test_value"], dtype=object)},
+            meta_info={"test_meta": "test_value"},
+        )
+        self.mock_postprocess.return_value = mock_gen_batch
+
+        # 设置config启用balance_batch
+        self.config.trainer.balance_batch = True
+
+        # 创建测试样本
+        rollout_samples = [self.create_mock_rollout_sample("sample_1")]
+
+        # 调用函数
+        result = assemble_batch_from_rollout_samples(
+            rollout_samples=rollout_samples, tokenizer=self.tokenizer, config=self.config, balance_batch=True
+        )
+
+        # 验证结果（主要验证没有抛出异常）
+        self.assertIsInstance(result, DataProto)
+
+    def test_assemble_batch_attention_mask_processing(self):
+        """测试attention_mask处理逻辑"""
+        # 设置mock返回值，包含attention_mask
+        mock_gen_batch = DataProto(
+            batch={
+                "attention_mask": torch.ones(2, 128, dtype=torch.int64),
+                "input_ids": torch.randint(0, 1000, (2, 128)),
+            },
+            non_tensor_batch={"__test_key": np.array(["test_value1", "test_value2"], dtype=object)},
+            meta_info={"test_meta": "test_value"},
+        )
+        self.mock_postprocess.return_value = mock_gen_batch
+
+        # 创建测试样本
+        rollout_samples = [
+            self.create_mock_rollout_sample("sample_1"),
+            self.create_mock_rollout_sample("sample_2"),
+        ]
+
+        # 调用函数
+        result = assemble_batch_from_rollout_samples(
+            rollout_samples=rollout_samples, tokenizer=self.tokenizer, config=self.config
+        )
+
+        # 验证global_token_num被正确计算
+        self.assertIn("global_token_num", result.meta_info)
+        self.assertIsInstance(result.meta_info["global_token_num"], list)
+
+    def test_mock_postprocess_called_correctly(self):
+        """测试postprocess_agent_loop_outputs被正确调用"""
+        # 设置mock返回值
+        mock_gen_batch = DataProto(
+            batch=torch.nn.utils.rnn.pad_sequence(
+                [
+                    torch.tensor([151644, 8948, 198] + list(range(100))),
+                ],
+                batch_first=True,
+                padding_value=0,
+            ),
+            non_tensor_batch={"__test_key": np.array(["test_value"], dtype=object)},
+            meta_info={"test_meta": "test_value"},
+        )
+        self.mock_postprocess.return_value = mock_gen_batch
+
+        # 创建测试样本
+        rollout_samples = [self.create_mock_rollout_sample("sample_1")]
+
+        # 调用函数
+        result = assemble_batch_from_rollout_samples(
+            rollout_samples=rollout_samples, tokenizer=self.tokenizer, config=self.config
+        )
+
+        print(result)
+
+        # 验证postprocess_agent_loop_outputs被调用
+        self.mock_postprocess.assert_called_once()
+        call_args = self.mock_postprocess.call_args
+
+        # 验证调用参数
+        agent_loop_outputs, tokenizer, config = call_args[0]
+        self.assertEqual(len(agent_loop_outputs), 1)
+        self.assertEqual(tokenizer, self.tokenizer)
+        self.assertEqual(config, self.config)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/recipe/fully_async_policy/utils.py b/recipe/fully_async_policy/utils.py
index d9afa0a9ab1..a2e7d5e6c4c 100644
--- a/recipe/fully_async_policy/utils.py
+++ b/recipe/fully_async_policy/utils.py
@@ -11,10 +11,34 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from dataclasses import dataclass
+from typing import Any
 
 
 # Calculate the number of samples needed
-
-
 def calculate_one_step_size(minimal_bsz, ppo_mini_batch_size):
     return minimal_bsz * ppo_mini_batch_size
+
+
+@dataclass
+class RolloutSample:
+    """Enhanced rollout sample containing both original batch info and AgentLoopOutput"""
+
+    # Original batch information (preserved from _prepare_generate_batch)
+    original_batch_dict: dict[str, Any]
+
+    # AgentLoopOutput from generation
+    agent_loop_output: Any  # AgentLoopOutput
+
+    # Metadata
+    sample_id: str
+    epoch: int
+    rollout_n_index: int  # Index within the rollout.n repetitions (0, 1, ..., n-1)
+    original_sample_index: int  # Index of the original sample before repetition
+
+    # Processing metadata
+    processing_time: float
+    generation_timestamp: float
+    param_version: int
+
+    _gen_data: Any

From 936a6720240e4963e1777452f71acfcad5b28c1f Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Sat, 16 Aug 2025 01:02:07 +0800
Subject: [PATCH 053/182] assemble_batch_from_rollout_samples

---
 .../{batch_utils.py => detach_utils.py}       |  95 ++-
 .../fully_async_rollouter.py                  |  62 +-
 .../fully_async_policy/fully_async_trainer.py |  43 +-
 .../unittest/test_batch_utils.py              | 675 +++++++++++++-----
 recipe/fully_async_policy/utils.py            |  44 --
 5 files changed, 582 insertions(+), 337 deletions(-)
 rename recipe/fully_async_policy/{batch_utils.py => detach_utils.py} (63%)
 delete mode 100644 recipe/fully_async_policy/utils.py

diff --git a/recipe/fully_async_policy/batch_utils.py b/recipe/fully_async_policy/detach_utils.py
similarity index 63%
rename from recipe/fully_async_policy/batch_utils.py
rename to recipe/fully_async_policy/detach_utils.py
index 806dd9e1579..202cfdcf783 100644
--- a/recipe/fully_async_policy/batch_utils.py
+++ b/recipe/fully_async_policy/detach_utils.py
@@ -11,20 +11,79 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 import time
+from dataclasses import dataclass
+from typing import Any
 
 import numpy as np
 import torch
 
-from recipe.fully_async_policy.utils import RolloutSample
 from verl import DataProto
 from verl.experimental.agent_loop.agent_loop import postprocess_agent_loop_outputs
 from verl.trainer.ppo.ray_trainer import compute_response_mask
 
 
+# Calculate the number of samples needed
+def calculate_one_step_size(minimal_bsz, ppo_mini_batch_size):
+    return minimal_bsz * ppo_mini_batch_size
+
+
+@dataclass
+class RolloutSample:
+    """Enhanced rollout sample containing both original batch info and AgentLoopOutput"""
+
+    # Original batch information
+    full_batch: Any
+
+    # AgentLoopOutput from generation
+    agent_loop_output: Any  # AgentLoopOutput
+
+    # Metadata
+    sample_id: str
+    epoch: int
+    rollout_n_index: int  # Index within the rollout.n repetitions (0, 1, ..., n-1)
+    original_sample_index: int  # Index of the original sample before repetition
+
+    # Processing metadata
+    processing_time: float
+    generation_timestamp: float
+    param_version: int
+
+
+def prepare_single_generation_data(batch_dict, global_steps) -> DataProto:
+    """
+    类似 ray_trainer._prepare_generate_batch 的逻辑，但针对单个样本
+    分离出用于生成的数据和需要保留的原始数据
+
+    Returns:
+        tuple: (original_batch_dict, gen_data_for_single_sample)
+    """
+
+    # 创建完整的 DataProto
+    full_batch = DataProto.from_single_dict(batch_dict)
+
+    # batch : TensorDict { input_ids, attention_mask, position_ids}
+    # non_tensor_batch: raw_prompt_ids, raw_prompt,
+    #                   multi_modal_data, tools_kwargs, interaction_kwargs, index, agent_name,
+    #                   data_source, ability, reward_model
+    # meta_info: {}
+
+    # 定义需要传递给生成服务器的字段
+    batch_keys_to_pop = ["input_ids", "attention_mask", "position_ids"]
+    non_tensor_batch_keys_to_pop = ["raw_prompt_ids"]
+
+    full_batch.pop(
+        batch_keys=batch_keys_to_pop,
+        non_tensor_batch_keys=non_tensor_batch_keys_to_pop,
+    )
+    # 添加全局步数到生成数据
+    full_batch.meta_info["global_steps"] = global_steps
+
+    return full_batch
+
+
 def assemble_batch_from_rollout_samples(
-    rollout_samples: list[RolloutSample], tokenizer, config, balance_batch: bool = False
+    rollout_samples: list[RolloutSample], tokenizer, config, balance_batch=None
 ) -> DataProto:
     """
     Assemble gen_batch_output from RolloutSample objects
@@ -60,17 +119,11 @@ def assemble_batch_from_rollout_samples(
     # 每个 RolloutSample 都是独立的，直接按顺序重建原始数据
     original_batch_list = []
     for rs in rollout_samples:
-        original_batch_dict = rs.original_batch_dict
-
-        # 重建 DataProto
-        original_batch_item = DataProto.from_single_dict(
-            {
-                **{k: v for k, v in original_batch_dict["batch"].items()},
-                **{f"__{k}": v for k, v in original_batch_dict["non_tensor_batch"].items()},
-            }
-        )
-        original_batch_item.meta_info.update(original_batch_dict["meta_info"])
-        original_batch_list.append(original_batch_item)
+        item = rs.full_batch.to_items()[0]
+        original_batch_list.append(item)
+
+    print("=" * 300)
+    print(original_batch_list)
 
     # 合并所有原始样本为一个批次
     if original_batch_list:
@@ -79,6 +132,9 @@ def assemble_batch_from_rollout_samples(
         # 如果没有原始数据，创建空的 DataProto
         original_batch = DataProto.from_single_dict({})
 
+    print("=" * 300)
+    print(original_batch)
+
     # 添加 UID
     uids = []
     for rs in rollout_samples:
@@ -87,16 +143,21 @@ def assemble_batch_from_rollout_samples(
 
     # 直接合并原始数据和生成结果，不需要 repeat
     # 因为队列中的每个 RolloutSample 都已经是独立的样本
-    final_batch = original_batch.union(gen_batch_output)
+    if original_batch.batch is None:
+        final_batch = gen_batch_output
+        # 将 original_batch 的 non_tensor_batch 和 meta_info 合并到 final_batch
+        for key, value in original_batch.non_tensor_batch.items():
+            final_batch.non_tensor_batch[key] = value
+        final_batch.meta_info.update(original_batch.meta_info)
 
     # 计算 response_mask（如果不存在）
     if "response_mask" not in final_batch.batch.keys():
         final_batch.batch["response_mask"] = compute_response_mask(final_batch)
 
     # 简化的批次平衡逻辑（如果需要的话）
-    if balance_batch and hasattr(config, "trainer") and getattr(config.trainer, "balance_batch", False):
+    if balance_batch:
         # 注意：这里简化了批次平衡逻辑，如果需要完整功能需要额外参数
-        print("[BatchUtils] Batch balancing requested but simplified in static function")
+        balance_batch(final_batch, metrics={})
 
     # 计算全局有效 token 数
     if "attention_mask" in final_batch.batch:
diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py
index f166582ef73..888068b12b6 100644
--- a/recipe/fully_async_policy/fully_async_rollouter.py
+++ b/recipe/fully_async_policy/fully_async_rollouter.py
@@ -18,8 +18,12 @@
 import ray
 from omegaconf import OmegaConf
 
+from recipe.fully_async_policy.detach_utils import (
+    RolloutSample,
+    calculate_one_step_size,
+    prepare_single_generation_data,
+)
 from recipe.fully_async_policy.message_queue import MessageQueueClient
-from recipe.fully_async_policy.utils import RolloutSample, calculate_one_step_size
 from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup
 from verl.trainer.ppo.ray_trainer import RayPPOTrainer, ResourcePoolManager, Role, WorkerType
 from verl.utils.tracking import ValidationGenerationsLogger
@@ -213,55 +217,6 @@ def _init_async_rollout_manager(self):
             worker_group=self.rollout_wg,
         )
 
-    def _prepare_single_generation_data(self, batch_dict):
-        """
-        类似 ray_trainer._prepare_generate_batch 的逻辑，但针对单个样本
-        分离出用于生成的数据和需要保留的原始数据
-
-        Returns:
-            tuple: (original_batch_dict, gen_data_for_single_sample)
-        """
-        from verl import DataProto
-
-        # 创建完整的 DataProto
-        full_batch = DataProto.from_single_dict(batch_dict)
-
-        # 定义需要传递给生成服务器的字段
-        batch_keys_to_pop = ["input_ids", "attention_mask", "position_ids"]
-        non_tensor_batch_keys_to_pop = ["raw_prompt_ids"]
-
-        # 处理可选字段
-        optional_fields = [
-            "multi_modal_data",
-            "raw_prompt",
-            "tools_kwargs",
-            "interaction_kwargs",
-            "index",
-            "agent_name",
-        ]
-
-        for field in optional_fields:
-            if field in full_batch.non_tensor_batch:
-                non_tensor_batch_keys_to_pop.append(field)
-
-        # 分离数据：gen_batch 用于生成，original_batch 保留原始信息
-        gen_batch = full_batch.pop(
-            batch_keys=batch_keys_to_pop,
-            non_tensor_batch_keys=non_tensor_batch_keys_to_pop,
-        )
-
-        # 添加全局步数到生成数据
-        gen_batch.meta_info["global_steps"] = self.global_steps
-
-        # 保留原始 batch 信息（转换为字典格式以便序列化）
-        original_batch_dict = {
-            "batch": {k: v.clone() if hasattr(v, "clone") else v for k, v in full_batch.batch.items()},
-            "non_tensor_batch": dict(full_batch.non_tensor_batch),
-            "meta_info": dict(full_batch.meta_info),
-        }
-
-        return original_batch_dict, gen_batch
-
     # 添加样本到待处理队列的协程
     async def _feed_samples(self):
         continuous_iterator = self._create_continuous_iterator()
@@ -273,7 +228,7 @@ async def _feed_samples(self):
                 break
 
             # 类似 _prepare_generate_batch 的逻辑：分离数据
-            original_batch, gen_data = self._prepare_single_generation_data(batch_dict)
+            full_batch = prepare_single_generation_data(batch_dict, self.global_steps)
 
             # 根据 rollout.n 进行重复
             for rollout_n_index in range(self.config.actor_rollout_ref.rollout.n):
@@ -281,7 +236,7 @@ async def _feed_samples(self):
 
                 # 创建部分 RolloutSample，不包含 _gen_data（因为它不在数据类定义中）
                 partial_rollout_sample = RolloutSample(
-                    original_batch_dict=original_batch,
+                    full_batch=full_batch,
                     agent_loop_output=None,  # 待处理后填充
                     sample_id=sample_id,
                     epoch=epoch,
@@ -290,7 +245,6 @@ async def _feed_samples(self):
                     processing_time=0.0,  # 待处理后填充
                     generation_timestamp=0.0,  # 待处理后填充
                     param_version=0,  # 待处理后填充
-                    _gen_data=gen_data,
                 )
 
                 await self.pending_queue.put(partial_rollout_sample)
@@ -362,7 +316,7 @@ async def _process_single_sample_streaming(self, partial_rollout_sample):
 
         # 调用异步生成方法
         agent_loop_output, processing_time = await self.async_rollout_manager.generate_single_sample_async(
-            partial_rollout_sample._gen_data, partial_rollout_sample.sample_id
+            partial_rollout_sample.full_batch, partial_rollout_sample.sample_id
         )
         # 直接更新 RolloutSample 对象，填充剩余字段
         partial_rollout_sample.agent_loop_output = agent_loop_output
diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py
index cbea37c4083..c9a495c60ed 100644
--- a/recipe/fully_async_policy/fully_async_trainer.py
+++ b/recipe/fully_async_policy/fully_async_trainer.py
@@ -21,8 +21,12 @@
 import ray
 from omegaconf import OmegaConf
 
+from recipe.fully_async_policy.detach_utils import (
+    RolloutSample,
+    assemble_batch_from_rollout_samples,
+    calculate_one_step_size,
+)
 from recipe.fully_async_policy.message_queue import MessageQueueClient
-from recipe.fully_async_policy.utils import RolloutSample, calculate_one_step_size
 from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup
 from verl.trainer.ppo import core_algos
 from verl.trainer.ppo.core_algos import AdvantageEstimator
@@ -167,39 +171,12 @@ def _get_samples_from_queue(self) -> tuple[None, None] | tuple[int, Any]:
         queue_samples = [ray.cloudpickle.loads(x) for x in queue_samples]
         print(queue_samples)
         # Assemble batch - now working directly with RolloutSample objects
-        batch = self._assemble_gen_batch_output_from_queue_samples(queue_samples)
-        print(f" _assemble_gen_batch_output_from_queue_samples {batch}")
-        return 0, queue_samples
-        #
-        # return 0, batch
-
-    def _assemble_gen_batch_output_from_queue_samples(self, rollout_samples: list[RolloutSample]):
-        """
-        Assemble gen_batch_output from RolloutSample objects
-        从 RolloutSample 对象中组装批次，类似 ray_trainer 的 _post_generate_batch 逻辑
-
-        Args:
-            rollout_samples: List of RolloutSample objects
-
-        Returns:
-            DataProto: Assembled gen_batch_output
-        """
-        from recipe.fully_async_policy.batch_utils import assemble_batch_from_rollout_samples
-
-        # 使用静态函数进行批次组装
-        final_batch = assemble_batch_from_rollout_samples(
-            rollout_samples=rollout_samples,
-            tokenizer=self.tokenizer,
-            config=self.config,
-            balance_batch=False,  # 不使用静态函数的简化版本
-        )
-
-        # 如果需要完整的批次平衡，在这里调用
         if self.config.trainer.balance_batch:
-            self._balance_batch(final_batch, metrics={})
-
-        print(f"[FullyAsyncTrainer] {final_batch}")
-        return final_batch
+            batch = assemble_batch_from_rollout_samples(queue_samples, self._balance_batch)
+        else:
+            batch = assemble_batch_from_rollout_samples(queue_samples)
+        print(f" _assemble_gen_batch_output_from_queue_samples {batch}")
+        return 0, batch
 
     def _create_actor_rollout_classes(self):
         # create actor
diff --git a/recipe/fully_async_policy/unittest/test_batch_utils.py b/recipe/fully_async_policy/unittest/test_batch_utils.py
index c2593f83ec7..ddde3a4ad92 100644
--- a/recipe/fully_async_policy/unittest/test_batch_utils.py
+++ b/recipe/fully_async_policy/unittest/test_batch_utils.py
@@ -23,14 +23,22 @@
 
 import numpy as np
 import torch
+from tensordict import TensorDict
 
 sys.path.append(os.path.join(os.path.dirname(__file__), "..", "..", ".."))
 
-from recipe.fully_async_policy.batch_utils import assemble_batch_from_rollout_samples
-from recipe.fully_async_policy.message_queue import RolloutSample
+from recipe.fully_async_policy.detach_utils import RolloutSample, assemble_batch_from_rollout_samples
 from verl import DataProto
 
 
+@dataclass
+class MockAgentLoopMetrics:
+    """Mock AgentLoopMetrics for testing"""
+
+    generate_sequences: float = 0.5
+    tool_calls: float = 0.0
+
+
 @dataclass
 class MockAgentLoopOutput:
     """Mock AgentLoopOutput for testing"""
@@ -39,11 +47,11 @@ class MockAgentLoopOutput:
     response_ids: list[int]
     response_mask: list[int]
     num_turns: int = 1
-    metrics: dict = None
+    metrics: MockAgentLoopMetrics = None
 
     def __post_init__(self):
         if self.metrics is None:
-            self.metrics = {}
+            self.metrics = MockAgentLoopMetrics()
 
 
 class MockConfig:
@@ -70,131 +78,406 @@ def setUp(self):
         self.mock_postprocess = MagicMock()
 
         # Patch the postprocess function
-        import recipe.fully_async_policy.batch_utils as batch_utils_module
+        import recipe.fully_async_policy.detach_utils as detach_utils_module
 
-        self.original_postprocess = batch_utils_module.postprocess_agent_loop_outputs
-        batch_utils_module.postprocess_agent_loop_outputs = self.mock_postprocess
+        self.original_postprocess = detach_utils_module.postprocess_agent_loop_outputs
+        detach_utils_module.postprocess_agent_loop_outputs = self.mock_postprocess
 
         # Mock compute_response_mask function
-        self.original_compute_response_mask = batch_utils_module.compute_response_mask
-        batch_utils_module.compute_response_mask = MagicMock(return_value=torch.ones(2, 128, dtype=torch.int64))
+        self.original_compute_response_mask = detach_utils_module.compute_response_mask
+        detach_utils_module.compute_response_mask = MagicMock(return_value=torch.ones(2, 128, dtype=torch.int64))
 
     def tearDown(self):
         """清理测试环境"""
-        import recipe.fully_async_policy.batch_utils as batch_utils_module
+        import recipe.fully_async_policy.detach_utils as detach_utils_module
 
-        batch_utils_module.postprocess_agent_loop_outputs = self.original_postprocess
-        batch_utils_module.compute_response_mask = self.original_compute_response_mask
+        detach_utils_module.postprocess_agent_loop_outputs = self.original_postprocess
+        detach_utils_module.compute_response_mask = self.original_compute_response_mask
 
     def create_mock_rollout_sample(self, sample_id: str, param_version: int = 1) -> RolloutSample:
         """创建测试用的 RolloutSample"""
         # 创建 mock AgentLoopOutput
         agent_loop_output = MockAgentLoopOutput(
-            prompt_ids=[151644, 8948, 198] + list(range(100)),  # 简化的prompt_ids
-            response_ids=[14374, 14822] + list(range(50)),  # 简化的response_ids
-            response_mask=[1] * 52,  # response_mask
-            num_turns=1,
-            metrics={"generate_time": 0.5},
+            prompt_ids=[
+                151644,
+                8948,
+                198,
+                2610,
+                525,
+                1207,
+                16948,
+                11,
+                3465,
+                553,
+                54364,
+                14817,
+                13,
+                1446,
+                525,
+                264,
+                10950,
+                17847,
+                13,
+                151645,
+                198,
+                151644,
+                872,
+                198,
+                24732,
+                21189,
+                264,
+                400,
+                16,
+                17,
+                40358,
+                817,
+                2254,
+                13,
+                758,
+                279,
+                1156,
+                2003,
+                11,
+                566,
+                37102,
+                264,
+                4843,
+                315,
+                432,
+                26,
+                304,
+                279,
+                2086,
+                2003,
+                11,
+                566,
+                37102,
+                264,
+                8338,
+                315,
+                1128,
+                566,
+                702,
+                2115,
+                13,
+                2585,
+                1753,
+                3220,
+                1558,
+                566,
+                614,
+                2115,
+                311,
+                6248,
+                279,
+                2254,
+                30,
+                6771,
+                594,
+                1744,
+                3019,
+                553,
+                3019,
+                323,
+                2550,
+                279,
+                1590,
+                4226,
+                1283,
+                330,
+                820,
+                3263,
+                151645,
+                198,
+                151644,
+                77091,
+                198,
+            ],
+            response_ids=[
+                14374,
+                14822,
+                14319,
+                12,
+                8304,
+                74216,
+                510,
+                16,
+                13,
+                4127,
+                40358,
+                25,
+                400,
+                16,
+                17,
+                198,
+                17,
+                13,
+                5512,
+                2003,
+                18024,
+                510,
+                262,
+                481,
+                8364,
+                37102,
+                264,
+                4843,
+                315,
+                279,
+                400,
+                16,
+                17,
+                624,
+                262,
+                481,
+                25783,
+                7391,
+                284,
+                57960,
+                37018,
+                90,
+                16,
+                15170,
+                18,
+                92,
+                1124,
+                15136,
+                32882,
+                16,
+                17,
+                284,
+                32882,
+                19,
+                66426,
+                18,
+                13,
+                10657,
+                3311,
+                1283,
+                1156,
+                2003,
+                25,
+                400,
+                16,
+                17,
+                481,
+                32882,
+                19,
+                284,
+                32882,
+                23,
+                66426,
+                19,
+                13,
+                10440,
+                2003,
+                18024,
+                510,
+                262,
+                481,
+                8364,
+                37102,
+                264,
+                8338,
+                315,
+                279,
+                9664,
+                3311,
+                1283,
+                279,
+                1156,
+                2003,
+                624,
+                262,
+                481,
+                11487,
+                2115,
+                284,
+                400,
+                23,
+                481,
+                400,
+                19,
+                284,
+                400,
+                19,
+                198,
+                262,
+                481,
+                25783,
+                7391,
+                2049,
+                57960,
+                37018,
+                90,
+                16,
+                15170,
+                19,
+                92,
+                1124,
+                15136,
+                32882,
+                19,
+                284,
+                32882,
+                16,
+                66426,
+                20,
+                13,
+                13023,
+                3311,
+                2115,
+                510,
+                262,
+                481,
+                8364,
+                702,
+                3322,
+                369,
+                264,
+                2480,
+                2003,
+                311,
+                6248,
+                279,
+                2254,
+                2041,
+                32821,
+                894,
+                803,
+                40358,
+                382,
+                43434,
+                510,
+                24732,
+                702,
+                3070,
+                65039,
+                23,
+                334,
+                2115,
+                13,
+                1260,
+                686,
+                614,
+                3322,
+                3220,
+                311,
+                6248,
+                279,
+                2254,
+                2041,
+                32821,
+                894,
+                803,
+                40358,
+                13,
+                151645,
+            ],
+            response_mask=[1] * 175,  # 真实的response长度
+            num_turns=2,
+            metrics=MockAgentLoopMetrics(generate_sequences=1.6468379497528076, tool_calls=0.0),
         )
 
-        # 创建原始batch信息
-        original_batch_dict = {
-            "batch": {},  # 空的tensor batch用于测试
-            "non_tensor_batch": {
-                "data_source": np.array(["openai/gsm8k"], dtype=object),
-                "ability": np.array(["math"], dtype=object),
-                "reward_model": np.array([{"ground_truth": "6", "style": "rule"}], dtype=object),
-                "extra_info": np.array(
-                    [{"answer": "test answer", "index": 4570, "question": "test question", "split": "train"}],
+        # 创建mock _gen_data
+        mock_gen_data = DataProto(
+            non_tensor_batch={
+                "raw_prompt": np.array(
+                    [
+                        [
+                            {
+                                "content": "Tom receives a $12 allowance per month.",
+                                "role": "user",
+                            }
+                        ]
+                    ],
                     dtype=object,
                 ),
-                "raw_prompt_ids": np.array([[151644, 8948, 198]], dtype=object),
-                "raw_prompt": np.array([[{"content": "test content", "role": "user"}]], dtype=object),
                 "tools_kwargs": np.array([{}], dtype=object),
                 "interaction_kwargs": np.array([{}], dtype=object),
                 "index": np.array([4570], dtype=object),
             },
-            "meta_info": {"global_steps": 1},
-        }
+            meta_info={"global_steps": 1},
+        )
 
         return RolloutSample(
-            original_batch_dict=original_batch_dict,
+            full_batch=mock_gen_data,
             agent_loop_output=agent_loop_output,
             sample_id=sample_id,
             epoch=0,
             rollout_n_index=0,
             original_sample_index=0,
-            processing_time=0.5,
+            processing_time=1.6468379497528076,
             generation_timestamp=time.time(),
             param_version=param_version,
-            _gen_data=None,
         )
 
-    def test_assemble_batch_empty_input(self):
-        """测试空输入的情况"""
-        with self.assertRaises(ValueError) as context:
-            assemble_batch_from_rollout_samples([], self.tokenizer, self.config)
-
-        self.assertIn("Empty rollout_samples", str(context.exception))
-
-    def test_assemble_batch_single_sample(self):
-        """测试单个样本的批次组装"""
-        # 设置mock返回值
-        mock_gen_batch = DataProto(
-            batch=torch.nn.utils.rnn.pad_sequence(
-                [
-                    torch.tensor([151644, 8948, 198] + list(range(100))),
-                ],
-                batch_first=True,
-                padding_value=0,
-            ),
-            non_tensor_batch={"__test_key": np.array(["test_value"], dtype=object)},
-            meta_info={"test_meta": "test_value"},
-        )
-        self.mock_postprocess.return_value = mock_gen_batch
-
-        # 创建测试样本
-        rollout_samples = [self.create_mock_rollout_sample("sample_1")]
-
-        # 调用函数
-        result = assemble_batch_from_rollout_samples(
-            rollout_samples=rollout_samples, tokenizer=self.tokenizer, config=self.config
-        )
-
-        # 验证结果
-        self.assertIsInstance(result, DataProto)
-        self.assertIn("uid", result.non_tensor_batch)
-        self.assertEqual(result.non_tensor_batch["uid"][0], "uid_sample_1")
-
-        # 验证meta_info包含预期字段
-        expected_fields = [
-            "rollout_param_versions",
-            "sample_timestamps",
-            "avg_processing_time",
-            "max_processing_time",
-            "param_version_diversity",
-            "avg_sample_age",
-            "assembly_time",
-        ]
-        for field in expected_fields:
-            self.assertIn(field, result.meta_info)
-
-        # 验证统计信息
-        self.assertEqual(result.meta_info["rollout_param_versions"], [1])
-        self.assertEqual(result.meta_info["avg_processing_time"], 0.5)
-        self.assertEqual(result.meta_info["param_version_diversity"], 1)
+    # def test_assemble_batch_empty_input(self):
+    #     """测试空输入的情况"""
+    #     with self.assertRaises(ValueError) as context:
+    #         assemble_batch_from_rollout_samples([], self.tokenizer, self.config)
+    #
+    #     self.assertIn("Empty rollout_samples", str(context.exception))
+    #
+    # def test_assemble_batch_single_sample(self):
+    #     """测试单个样本的批次组装"""
+    #     # 设置mock返回值 - 使用正确的TensorDict格式
+    #     mock_gen_batch = DataProto(
+    #         batch=TensorDict({
+    #             "input_ids": torch.randint(0, 1000, (1, 256)),
+    #             "attention_mask": torch.ones(1, 256, dtype=torch.int64),
+    #             "position_ids": torch.arange(256).unsqueeze(0),
+    #             "prompts": torch.randint(0, 1000, (1, 128)),
+    #             "responses": torch.randint(0, 1000, (1, 128)),
+    #             "response_mask": torch.ones(1, 128, dtype=torch.int64),
+    #         }, batch_size=1),
+    #         non_tensor_batch={"__test_key": np.array(["test_value"], dtype=object)},
+    #         meta_info={"test_meta": "test_value"}
+    #     )
+    #     self.mock_postprocess.return_value = mock_gen_batch
+    #
+    #     # 创建测试样本
+    #     rollout_samples = [self.create_mock_rollout_sample("sample_1")]
+    #
+    #     # 调用函数
+    #     result = assemble_batch_from_rollout_samples(
+    #         rollout_samples=rollout_samples,
+    #         tokenizer=self.tokenizer,
+    #         config=self.config
+    #     )
+    #
+    #     # 验证结果
+    #     self.assertIsInstance(result, DataProto)
+    #     self.assertIn("uid", result.non_tensor_batch)
+    #     self.assertEqual(result.non_tensor_batch["uid"][0], "uid_sample_1")
+    #
+    #     # 验证meta_info包含预期字段
+    #     expected_fields = [
+    #         "rollout_param_versions", "sample_timestamps", "avg_processing_time",
+    #         "max_processing_time", "param_version_diversity", "avg_sample_age", "assembly_time"
+    #     ]
+    #     for field in expected_fields:
+    #         self.assertIn(field, result.meta_info)
+    #
+    #     # 验证统计信息
+    #     self.assertEqual(result.meta_info["rollout_param_versions"], [1])
+    #     self.assertAlmostEqual(result.meta_info["avg_processing_time"], 1.6468379497528076, places=5)
+    #     self.assertEqual(result.meta_info["param_version_diversity"], 1)
 
     def test_assemble_batch_multiple_samples(self):
         """测试多个样本的批次组装"""
-        # 设置mock返回值
+        # 设置mock返回值 - 使用正确的TensorDict格式
         mock_gen_batch = DataProto(
-            batch=torch.nn.utils.rnn.pad_sequence(
-                [
-                    torch.tensor([151644, 8948, 198] + list(range(100))),
-                    torch.tensor([151644, 8948, 198] + list(range(90))),
-                ],
-                batch_first=True,
-                padding_value=0,
+            batch=TensorDict(
+                {
+                    "input_ids": torch.randint(0, 1000, (2, 256)),
+                    "attention_mask": torch.ones(2, 256, dtype=torch.int64),
+                    "position_ids": torch.arange(256).unsqueeze(0).repeat(2, 1),
+                    "prompts": torch.randint(0, 1000, (2, 128)),
+                    "responses": torch.randint(0, 1000, (2, 128)),
+                    "response_mask": torch.ones(2, 128, dtype=torch.int64),
+                },
+                batch_size=2,
             ),
             non_tensor_batch={"__test_key": np.array(["test_value1", "test_value2"], dtype=object)},
             meta_info={"test_meta": "test_value"},
@@ -207,6 +490,8 @@ def test_assemble_batch_multiple_samples(self):
             self.create_mock_rollout_sample("sample_2", param_version=2),
         ]
 
+        print(rollout_samples)
+
         # 调用函数
         result = assemble_batch_from_rollout_samples(
             rollout_samples=rollout_samples, tokenizer=self.tokenizer, config=self.config
@@ -220,101 +505,113 @@ def test_assemble_batch_multiple_samples(self):
         # 验证多样本统计
         self.assertEqual(result.meta_info["rollout_param_versions"], [1, 2])
         self.assertEqual(result.meta_info["param_version_diversity"], 2)  # 两个不同版本
-        self.assertEqual(result.meta_info["avg_processing_time"], 0.5)
-
-    def test_assemble_batch_with_balance_batch_flag(self):
-        """测试启用balance_batch标志的情况"""
-        # 设置mock返回值
-        mock_gen_batch = DataProto(
-            batch=torch.nn.utils.rnn.pad_sequence(
-                [
-                    torch.tensor([151644, 8948, 198] + list(range(100))),
-                ],
-                batch_first=True,
-                padding_value=0,
-            ),
-            non_tensor_batch={"__test_key": np.array(["test_value"], dtype=object)},
-            meta_info={"test_meta": "test_value"},
-        )
-        self.mock_postprocess.return_value = mock_gen_batch
-
-        # 设置config启用balance_batch
-        self.config.trainer.balance_batch = True
-
-        # 创建测试样本
-        rollout_samples = [self.create_mock_rollout_sample("sample_1")]
-
-        # 调用函数
-        result = assemble_batch_from_rollout_samples(
-            rollout_samples=rollout_samples, tokenizer=self.tokenizer, config=self.config, balance_batch=True
-        )
-
-        # 验证结果（主要验证没有抛出异常）
-        self.assertIsInstance(result, DataProto)
-
-    def test_assemble_batch_attention_mask_processing(self):
-        """测试attention_mask处理逻辑"""
-        # 设置mock返回值，包含attention_mask
-        mock_gen_batch = DataProto(
-            batch={
-                "attention_mask": torch.ones(2, 128, dtype=torch.int64),
-                "input_ids": torch.randint(0, 1000, (2, 128)),
-            },
-            non_tensor_batch={"__test_key": np.array(["test_value1", "test_value2"], dtype=object)},
-            meta_info={"test_meta": "test_value"},
-        )
-        self.mock_postprocess.return_value = mock_gen_batch
-
-        # 创建测试样本
-        rollout_samples = [
-            self.create_mock_rollout_sample("sample_1"),
-            self.create_mock_rollout_sample("sample_2"),
-        ]
-
-        # 调用函数
-        result = assemble_batch_from_rollout_samples(
-            rollout_samples=rollout_samples, tokenizer=self.tokenizer, config=self.config
-        )
-
-        # 验证global_token_num被正确计算
-        self.assertIn("global_token_num", result.meta_info)
-        self.assertIsInstance(result.meta_info["global_token_num"], list)
-
-    def test_mock_postprocess_called_correctly(self):
-        """测试postprocess_agent_loop_outputs被正确调用"""
-        # 设置mock返回值
-        mock_gen_batch = DataProto(
-            batch=torch.nn.utils.rnn.pad_sequence(
-                [
-                    torch.tensor([151644, 8948, 198] + list(range(100))),
-                ],
-                batch_first=True,
-                padding_value=0,
-            ),
-            non_tensor_batch={"__test_key": np.array(["test_value"], dtype=object)},
-            meta_info={"test_meta": "test_value"},
-        )
-        self.mock_postprocess.return_value = mock_gen_batch
-
-        # 创建测试样本
-        rollout_samples = [self.create_mock_rollout_sample("sample_1")]
-
-        # 调用函数
-        result = assemble_batch_from_rollout_samples(
-            rollout_samples=rollout_samples, tokenizer=self.tokenizer, config=self.config
-        )
-
-        print(result)
-
-        # 验证postprocess_agent_loop_outputs被调用
-        self.mock_postprocess.assert_called_once()
-        call_args = self.mock_postprocess.call_args
-
-        # 验证调用参数
-        agent_loop_outputs, tokenizer, config = call_args[0]
-        self.assertEqual(len(agent_loop_outputs), 1)
-        self.assertEqual(tokenizer, self.tokenizer)
-        self.assertEqual(config, self.config)
+        self.assertAlmostEqual(result.meta_info["avg_processing_time"], 1.6468379497528076, places=5)
+
+    # def test_assemble_batch_with_balance_batch_flag(self):
+    #     """测试启用balance_batch标志的情况"""
+    #     # 设置mock返回值 - 使用正确的TensorDict格式
+    #     mock_gen_batch = DataProto(
+    #         batch=TensorDict({
+    #             "input_ids": torch.randint(0, 1000, (1, 256)),
+    #             "attention_mask": torch.ones(1, 256, dtype=torch.int64),
+    #             "position_ids": torch.arange(256).unsqueeze(0),
+    #             "prompts": torch.randint(0, 1000, (1, 128)),
+    #             "responses": torch.randint(0, 1000, (1, 128)),
+    #             "response_mask": torch.ones(1, 128, dtype=torch.int64),
+    #         }, batch_size=1),
+    #         non_tensor_batch={"__test_key": np.array(["test_value"], dtype=object)},
+    #         meta_info={"test_meta": "test_value"}
+    #     )
+    #     self.mock_postprocess.return_value = mock_gen_batch
+    #
+    #     # 设置config启用balance_batch
+    #     self.config.trainer.balance_batch = True
+    #
+    #     # 创建测试样本
+    #     rollout_samples = [self.create_mock_rollout_sample("sample_1")]
+    #
+    #     # 调用函数
+    #     result = assemble_batch_from_rollout_samples(
+    #         rollout_samples=rollout_samples,
+    #         tokenizer=self.tokenizer,
+    #         config=self.config,
+    #         balance_batch=True
+    #     )
+    #
+    #     # 验证结果（主要验证没有抛出异常）
+    #     self.assertIsInstance(result, DataProto)
+    #
+    # def test_assemble_batch_attention_mask_processing(self):
+    #     """测试attention_mask处理逻辑"""
+    #     # 设置mock返回值 - 使用正确的TensorDict格式
+    #     mock_gen_batch = DataProto(
+    #         batch=TensorDict({
+    #             "input_ids": torch.randint(0, 1000, (2, 256)),
+    #             "attention_mask": torch.ones(2, 256, dtype=torch.int64),
+    #             "position_ids": torch.arange(256).unsqueeze(0).repeat(2, 1),
+    #             "prompts": torch.randint(0, 1000, (2, 128)),
+    #             "responses": torch.randint(0, 1000, (2, 128)),
+    #             "response_mask": torch.ones(2, 128, dtype=torch.int64),
+    #         }, batch_size=2),
+    #         non_tensor_batch={"__test_key": np.array(["test_value1", "test_value2"], dtype=object)},
+    #         meta_info={"test_meta": "test_value"}
+    #     )
+    #     self.mock_postprocess.return_value = mock_gen_batch
+    #
+    #     # 创建测试样本
+    #     rollout_samples = [
+    #         self.create_mock_rollout_sample("sample_1"),
+    #         self.create_mock_rollout_sample("sample_2"),
+    #     ]
+    #
+    #     # 调用函数
+    #     result = assemble_batch_from_rollout_samples(
+    #         rollout_samples=rollout_samples,
+    #         tokenizer=self.tokenizer,
+    #         config=self.config
+    #     )
+    #
+    #     # 验证global_token_num被正确计算
+    #     self.assertIn("global_token_num", result.meta_info)
+    #     self.assertIsInstance(result.meta_info["global_token_num"], list)
+    #
+    # def test_mock_postprocess_called_correctly(self):
+    #     """测试postprocess_agent_loop_outputs被正确调用"""
+    #     # 设置mock返回值 - 使用正确的TensorDict格式
+    #     mock_gen_batch = DataProto(
+    #         batch=TensorDict({
+    #             "input_ids": torch.randint(0, 1000, (1, 256)),
+    #             "attention_mask": torch.ones(1, 256, dtype=torch.int64),
+    #             "position_ids": torch.arange(256).unsqueeze(0),
+    #             "prompts": torch.randint(0, 1000, (1, 128)),
+    #             "responses": torch.randint(0, 1000, (1, 128)),
+    #             "response_mask": torch.ones(1, 128, dtype=torch.int64),
+    #         }, batch_size=1),
+    #         non_tensor_batch={"__test_key": np.array(["test_value"], dtype=object)},
+    #         meta_info={"test_meta": "test_value"}
+    #     )
+    #     self.mock_postprocess.return_value = mock_gen_batch
+    #
+    #     # 创建测试样本
+    #     rollout_samples = [self.create_mock_rollout_sample("sample_1")]
+    #
+    #     # 调用函数
+    #     result = assemble_batch_from_rollout_samples(
+    #         rollout_samples=rollout_samples,
+    #         tokenizer=self.tokenizer,
+    #         config=self.config
+    #     )
+    #
+    #     # 验证postprocess_agent_loop_outputs被调用
+    #     self.mock_postprocess.assert_called_once()
+    #     call_args = self.mock_postprocess.call_args
+    #
+    #     # 验证调用参数
+    #     agent_loop_outputs, tokenizer, config = call_args[0]
+    #     self.assertEqual(len(agent_loop_outputs), 1)
+    #     self.assertEqual(tokenizer, self.tokenizer)
+    #     self.assertEqual(config, self.config)
+    #
 
 
 if __name__ == "__main__":
diff --git a/recipe/fully_async_policy/utils.py b/recipe/fully_async_policy/utils.py
deleted file mode 100644
index a2e7d5e6c4c..00000000000
--- a/recipe/fully_async_policy/utils.py
+++ /dev/null
@@ -1,44 +0,0 @@
-# Copyright 2025 Meituan Ltd. and/or its affiliates
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from dataclasses import dataclass
-from typing import Any
-
-
-# Calculate the number of samples needed
-def calculate_one_step_size(minimal_bsz, ppo_mini_batch_size):
-    return minimal_bsz * ppo_mini_batch_size
-
-
-@dataclass
-class RolloutSample:
-    """Enhanced rollout sample containing both original batch info and AgentLoopOutput"""
-
-    # Original batch information (preserved from _prepare_generate_batch)
-    original_batch_dict: dict[str, Any]
-
-    # AgentLoopOutput from generation
-    agent_loop_output: Any  # AgentLoopOutput
-
-    # Metadata
-    sample_id: str
-    epoch: int
-    rollout_n_index: int  # Index within the rollout.n repetitions (0, 1, ..., n-1)
-    original_sample_index: int  # Index of the original sample before repetition
-
-    # Processing metadata
-    processing_time: float
-    generation_timestamp: float
-    param_version: int
-
-    _gen_data: Any

From 7763c689122f3b9b2df9968f7f2f8053ccf4200e Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Sat, 16 Aug 2025 01:40:27 +0800
Subject: [PATCH 054/182] train success

---
 .../fully_async_policy/fully_async_trainer.py | 90 +++++++++++--------
 1 file changed, 52 insertions(+), 38 deletions(-)

diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py
index c9a495c60ed..07694498378 100644
--- a/recipe/fully_async_policy/fully_async_trainer.py
+++ b/recipe/fully_async_policy/fully_async_trainer.py
@@ -172,9 +172,9 @@ def _get_samples_from_queue(self) -> tuple[None, None] | tuple[int, Any]:
         print(queue_samples)
         # Assemble batch - now working directly with RolloutSample objects
         if self.config.trainer.balance_batch:
-            batch = assemble_batch_from_rollout_samples(queue_samples, self._balance_batch)
+            batch = assemble_batch_from_rollout_samples(queue_samples, self.tokenizer, self.config, self._balance_batch)
         else:
-            batch = assemble_batch_from_rollout_samples(queue_samples)
+            batch = assemble_batch_from_rollout_samples(queue_samples, self.tokenizer, self.config, None)
         print(f" _assemble_gen_batch_output_from_queue_samples {batch}")
         return 0, batch
 
@@ -243,49 +243,63 @@ def fit(self):
         # Use queue mode, no need for traditional dataloader iterator
         # Initialize to get the first batch of data
         while True:
-            # metrics = {}
+            metrics = {}
             timing_raw = {}
 
-            # is_last_step = False
-
             with marked_timer("step", timing_raw):
                 with marked_timer("gen", timing_raw, color="red"):
                     epoch, batch = self._get_samples_from_queue()
                     if batch is None:
                         break
-            #
-            #         # 更新统计信息
-            #         self.processed_samples += len(batch) if isinstance(batch, list) else 1
-            #
-            #         # 从meta_info中获取参数版本信息
-            #         if hasattr(batch, "meta_info") and batch.meta_info:
-            #             rollout_param_versions = batch.meta_info.get("rollout_param_versions", [])
-            #             if rollout_param_versions:
-            #                 # 统计陈旧样本
-            #                 stale_count = sum(1 for v in rollout_param_versions if self.current_param_version - v > 1)
-            #                 self.stale_samples_processed += stale_count
-            #
-            #             # 添加新鲜度指标到metrics
-            #             if rollout_param_versions:
-            #                 param_version_diversity = batch.meta_info.get("param_version_diversity", 0)
-            #                 avg_sample_age = batch.meta_info.get("avg_sample_age", 0)
-            #
-            #                 metrics.update(
-            #                     {
-            #                         "freshness/param_version_diversity": param_version_diversity,
-            #                         "freshness/avg_sample_age": avg_sample_age,
-            #                         "freshness/stale_samples_ratio": stale_count / len(rollout_param_versions)
-            #                         if rollout_param_versions
-            #                         else 0,
-            #                         "statistics/processed_samples": self.processed_samples,
-            #                         "statistics/stale_samples_processed": self.stale_samples_processed,
-            #                         "statistics/current_param_version": self.current_param_version,
-            #                     }
-            #                 )
-            #     batch, reward_extra_infos_dict = self._process_batch_common(batch, metrics, timing_raw)
-            #     self._log_rollout(batch, reward_extra_infos_dict, timing_raw)
-            #     self._check_save_checkpoint(is_last_step, timing_raw)
-            #
+
+                    # 更新统计信息
+                    self.processed_samples += len(batch) if isinstance(batch, list) else 1
+
+                    # 从meta_info中获取参数版本信息
+                    if hasattr(batch, "meta_info") and batch.meta_info:
+                        # meta_info={'metrics': [{'generate_sequences': 1.8240885734558105, 'tool_calls': 0.0},
+                        # {'generate_sequences': 2.5197629928588867, 'tool_calls': 0.0},
+                        # {'generate_sequences': 3.5084900856018066, 'tool_calls': 0.0},
+                        # {'generate_sequences': 2.4329097270965576, 'tool_calls': 0.0},
+                        # {'generate_sequences': 3.0567924976348877, 'tool_calls': 0.0},
+                        # {'generate_sequences': 4.271160840988159, 'tool_calls': 0.0}],
+                        # 'global_steps': 22,
+                        # 'global_token_num': [588, 517, 422, 406, 355, 288],
+                        # 'rollout_param_versions': [0, 0, 0, 0, 0, 0],
+                        # 'sample_timestamps': [1755278023.7771623, 1755278024.101492, 1755278024.3597627,
+                        #                       1755278024.4885263, 1755278025.1039019, 1755278025.555585],
+                        # 'avg_processing_time': 2.935534119606018,
+                        # 'max_processing_time': 4.271160840988159,
+                        # 'param_version_diversity': 1,
+                        # 'avg_sample_age': 1.0503787994384766,
+                        # 'assembly_time': 0.05373978614807129})
+                        rollout_param_versions = batch.meta_info.get("rollout_param_versions", [])
+                        if rollout_param_versions:
+                            # 统计陈旧样本
+                            stale_count = sum(1 for v in rollout_param_versions if self.current_param_version - v > 1)
+                            self.stale_samples_processed += stale_count
+
+                        # 添加新鲜度指标到metrics
+                        if rollout_param_versions:
+                            param_version_diversity = batch.meta_info.get("param_version_diversity", 0)
+                            avg_sample_age = batch.meta_info.get("avg_sample_age", 0)
+
+                            metrics.update(
+                                {
+                                    "freshness/param_version_diversity": param_version_diversity,
+                                    "freshness/avg_sample_age": avg_sample_age,
+                                    "freshness/stale_samples_ratio": stale_count / len(rollout_param_versions)
+                                    if rollout_param_versions
+                                    else 0,
+                                    "statistics/processed_samples": self.processed_samples,
+                                    "statistics/stale_samples_processed": self.stale_samples_processed,
+                                    "statistics/current_param_version": self.current_param_version,
+                                }
+                            )
+                batch, reward_extra_infos_dict = self._process_batch_common(batch, metrics, timing_raw)
+                self._log_rollout(batch, reward_extra_infos_dict, timing_raw)
+                self._check_save_checkpoint(False, timing_raw)
+
             # self._collect_metrics(batch, epoch, metrics, timing_raw)
 
             # Trigger parameter synchronization after training step

From d8212d9d7f4ca167e6923a4c27674b38e9fc3096 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Mon, 18 Aug 2025 10:45:30 +0800
Subject: [PATCH 055/182] refactor log

---
 recipe/fully_async_policy/detach_utils.py          | 10 ++++------
 recipe/fully_async_policy/fully_async_rollouter.py |  4 ++++
 recipe/fully_async_policy/fully_async_trainer.py   |  4 ++--
 tests/special_e2e/run_fully_async_policy.sh        |  6 +++---
 4 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/recipe/fully_async_policy/detach_utils.py b/recipe/fully_async_policy/detach_utils.py
index 202cfdcf783..a76f42d7362 100644
--- a/recipe/fully_async_policy/detach_utils.py
+++ b/recipe/fully_async_policy/detach_utils.py
@@ -122,8 +122,8 @@ def assemble_batch_from_rollout_samples(
         item = rs.full_batch.to_items()[0]
         original_batch_list.append(item)
 
-    print("=" * 300)
-    print(original_batch_list)
+    # print("=" * 300)
+    # print(original_batch_list)
 
     # 合并所有原始样本为一个批次
     if original_batch_list:
@@ -132,8 +132,8 @@ def assemble_batch_from_rollout_samples(
         # 如果没有原始数据，创建空的 DataProto
         original_batch = DataProto.from_single_dict({})
 
-    print("=" * 300)
-    print(original_batch)
+    # print("=" * 300)
+    # print(original_batch)
 
     # 添加 UID
     uids = []
@@ -154,9 +154,7 @@ def assemble_batch_from_rollout_samples(
     if "response_mask" not in final_batch.batch.keys():
         final_batch.batch["response_mask"] = compute_response_mask(final_batch)
 
-    # 简化的批次平衡逻辑（如果需要的话）
     if balance_batch:
-        # 注意：这里简化了批次平衡逻辑，如果需要完整功能需要额外参数
         balance_batch(final_batch, metrics={})
 
     # 计算全局有效 token 数
diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py
index 888068b12b6..939f7a45b93 100644
--- a/recipe/fully_async_policy/fully_async_rollouter.py
+++ b/recipe/fully_async_policy/fully_async_rollouter.py
@@ -134,6 +134,10 @@ def __init__(
             self.minimal_bsz, config.actor_rollout_ref.actor.ppo_mini_batch_size
         )
         self.max_required_samples = self.required_samples * (self.staleness_threshold + 1)
+        print(
+            f"[FullyAsyncRollouter] required_samples : {self.required_samples} "
+            f"max_required_samples: {self.max_required_samples}"
+        )
 
         # 单次最多扔一次迭代需要的样本
         self.max_concurrent_samples = self.required_samples
diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py
index 07694498378..ce8735bd8cc 100644
--- a/recipe/fully_async_policy/fully_async_trainer.py
+++ b/recipe/fully_async_policy/fully_async_trainer.py
@@ -169,13 +169,13 @@ def _get_samples_from_queue(self) -> tuple[None, None] | tuple[int, Any]:
         )
 
         queue_samples = [ray.cloudpickle.loads(x) for x in queue_samples]
-        print(queue_samples)
+        # print(queue_samples)
         # Assemble batch - now working directly with RolloutSample objects
         if self.config.trainer.balance_batch:
             batch = assemble_batch_from_rollout_samples(queue_samples, self.tokenizer, self.config, self._balance_batch)
         else:
             batch = assemble_batch_from_rollout_samples(queue_samples, self.tokenizer, self.config, None)
-        print(f" _assemble_gen_batch_output_from_queue_samples {batch}")
+        # print(f" _assemble_gen_batch_output_from_queue_samples {batch}")
         return 0, batch
 
     def _create_actor_rollout_classes(self):
diff --git a/tests/special_e2e/run_fully_async_policy.sh b/tests/special_e2e/run_fully_async_policy.sh
index a938499a86b..c48f7b7507c 100644
--- a/tests/special_e2e/run_fully_async_policy.sh
+++ b/tests/special_e2e/run_fully_async_policy.sh
@@ -44,9 +44,9 @@ loss_agg_mode="token-mean"
 train_prompt_bsz=0
 gen_prompt_bsz=1
 n_resp_per_prompt=3
-train_prompt_mini_bsz=1
+train_prompt_mini_bsz=32
 
-total_rollout_steps=50
+total_rollout_steps=5000
 
 # Temperature parameters
 temperature=1.0
@@ -60,7 +60,7 @@ n_gpus_rollout=2
 n_gpus_training=$((NUM_GPUS - n_gpus_rollout))
 
 # Async training specific configurations
-staleness_threshold=3
+staleness_threshold=30000
 
 exp_name="$(basename "${MODEL_ID,,}")-fully-async-policy-${ACTOR_STRATEGY}-minimal"
 

From 25740b20d116bdfcab825a6045c104bbb9d84f0e Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Mon, 18 Aug 2025 12:55:35 +0800
Subject: [PATCH 056/182] stop system run

---
 .../fully_async_rollouter.py                  | 37 ++++++++++------
 .../fully_async_policy/fully_async_trainer.py |  4 +-
 recipe/fully_async_policy/message_queue.py    | 44 -------------------
 tests/special_e2e/run_fully_async_policy.sh   |  2 +-
 4 files changed, 26 insertions(+), 61 deletions(-)

diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py
index 939f7a45b93..1e6101a6f5d 100644
--- a/recipe/fully_async_policy/fully_async_rollouter.py
+++ b/recipe/fully_async_policy/fully_async_rollouter.py
@@ -38,16 +38,16 @@ class FullyAsyncRollouter(RayPPOTrainer):
     """
 
     def __init__(
-        self,
-        config,
-        tokenizer,
-        role_worker_mapping: dict[Role, WorkerType],
-        resource_pool_manager: ResourcePoolManager,
-        ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup,
-        processor=None,
-        reward_fn=None,
-        val_reward_fn=None,
-        device_name=None,
+            self,
+            config,
+            tokenizer,
+            role_worker_mapping: dict[Role, WorkerType],
+            resource_pool_manager: ResourcePoolManager,
+            ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup,
+            processor=None,
+            reward_fn=None,
+            val_reward_fn=None,
+            device_name=None,
     ):
         # Store the tokenizer for text processing
         self.tokenizer = tokenizer
@@ -115,6 +115,7 @@ def __init__(
 
         # Concurrency control
         self.paused = False
+        self.running = True
 
         # Initialize async locks directly
         self.lock = asyncio.Lock()
@@ -279,9 +280,9 @@ async def _processor_worker(self):
 
             async with self.lock:
                 if await self._should_pause_generation():
-                    # 等待已提交的任务结束
+                    print("等待已提交的任务结束")
                     await asyncio.gather(*self.active_tasks, return_exceptions=True)
-                    self.active_tasks = set()
+                    self.active_tasks.clear()
                     self.paused = True
 
                 while self.paused:
@@ -293,6 +294,7 @@ async def _processor_worker(self):
                 # 等待所有活动任务完成
                 if self.active_tasks:
                     await asyncio.gather(*self.active_tasks, return_exceptions=True)
+                    self.active_tasks.clear()
                 break
 
             # 检查并发数是否超限
@@ -393,7 +395,6 @@ async def _streaming_generation_main(self):
             self._init_async_rollout_manager()
 
         # 启动流式处理循环
-        """流式样本生成主循环 - 优化版本，确保先完成的样本优先进入队列"""
         print(f"[FullyAsyncRollouter] 启动流式处理模式，最大并发样本数: {self.max_concurrent_samples}")
 
         # 初始化异步队列
@@ -439,6 +440,9 @@ async def _streaming_generation_main(self):
             param_version=self.current_param_version,
         )
 
+        async with self.lock:
+            self.running = False
+
     async def fit(self):
         """
         Start the async rollouter - entry point that sets up and runs async tasks
@@ -452,7 +456,9 @@ async def fit(self):
             raise ValueError("param_synchronizer client not set. Call set_parameter_synchronizer() first.")
 
         # 设置运行状态
-        self.paused = False
+        async with self.lock:
+            self.paused = False
+            self.running = True
 
         # 创建主要的异步任务
         generation_task = asyncio.create_task(self._streaming_generation_main())
@@ -486,6 +492,9 @@ async def _async_monitor_loop(self):
         check_interval = 5.0
 
         while True:
+            async with self.lock:
+                if not self.running:
+                    break
             await asyncio.sleep(check_interval)
             # 定期打印统计信息
             current_time = time.time()
diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py
index ce8735bd8cc..784a3318166 100644
--- a/recipe/fully_async_policy/fully_async_trainer.py
+++ b/recipe/fully_async_policy/fully_async_trainer.py
@@ -146,8 +146,8 @@ def _get_samples_from_queue(self) -> tuple[None, None] | tuple[int, Any]:
 
             if sample is None:
                 # 检测到结束信号（None），立即退出
-                logger.info(
-                    f"Detected termination signal (None), stopping sample collection. "
+                print(
+                    f"[FullyAsyncTrainer] Detected termination signal (None), stopping sample collection. "
                     f"Collected {len(queue_samples)}/{self.required_samples} samples"
                 )
                 break
diff --git a/recipe/fully_async_policy/message_queue.py b/recipe/fully_async_policy/message_queue.py
index b2761f95749..fc1c133412e 100644
--- a/recipe/fully_async_policy/message_queue.py
+++ b/recipe/fully_async_policy/message_queue.py
@@ -96,40 +96,6 @@ async def put_sample(self, sample: Any, param_version: int) -> bool:
 
             return True
 
-    async def get_samples(self, min_batch_count: int = 1) -> tuple[list[Any], int]:
-        """
-        Get batch samples from the queue, wait until enough samples are available
-
-        Args:
-            min_batch_count: Get samples at once when sample count meets min_batch
-
-        Returns:
-            List[Any]: List of retrieved samples
-        """
-        async with self._lock:
-            while len(self.queue) < min_batch_count and self.running:
-                print(f"[MessageQueue] consumer_condition {len(self.queue)}")
-                if len(self.queue) > 0 and self.queue[-1] is None:
-                    return [], len(self.queue)
-                await self._consumer_condition.wait()
-
-            # If queue is closed and doesn't have enough samples, return empty list
-            if not self.running and len(self.queue) < min_batch_count:
-                return [], len(self.queue)
-
-            # Get specified number of samples
-            batch_count = min(min_batch_count, len(self.queue))
-            samples = []
-            for _ in range(batch_count):
-                if self.queue:
-                    data = self.queue.popleft()
-                    if data is None:
-                        return [], len(self.queue)
-                    else:
-                        samples.append(data)
-
-            self.total_consumed += len(samples)
-            return samples, len(self.queue)
 
     async def get_sample(self) -> Any | None:
         """
@@ -140,7 +106,6 @@ async def get_sample(self) -> Any | None:
         """
         async with self._lock:
             while len(self.queue) == 0 and self.running:
-                print(f"[MessageQueue] consumer_condition {len(self.queue)}")
                 await self._consumer_condition.wait()
 
             # If queue is closed and empty, return None
@@ -236,11 +201,6 @@ async def put_sample(self, sample: Any, param_version: int) -> bool:
         future = self.queue_actor.put_sample.remote(sample, param_version)
         return await asyncio.wrap_future(future.future())
 
-    async def get_samples(self, min_batch_count: int = 1) -> tuple[list[Any], int]:
-        """Get batch from queue, wait until enough samples are available (async)"""
-        future = self.queue_actor.get_samples.remote(min_batch_count)
-        return await asyncio.wrap_future(future.future())
-
     async def get_sample(self) -> Any | None:
         """Get single sample from queue, wait until one is available (async)"""
         future = self.queue_actor.get_sample.remote()
@@ -281,10 +241,6 @@ def put_sample_sync(self, sample: Any, param_version: int) -> bool:
         """Put batch into queue (sync - deprecated, use put_sample instead)"""
         return ray.get(self.queue_actor.put_sample.remote(sample, param_version))
 
-    def get_samples_sync(self, min_batch_count: int = 1) -> tuple[list[Any], int]:
-        """Get batch from queue (sync - deprecated, use get_samples instead)"""
-        return ray.get(self.queue_actor.get_samples.remote(min_batch_count))
-
     def get_sample_sync(self) -> Any | None:
         """Get single sample from queue (sync - deprecated, use get_sample instead)"""
         return ray.get(self.queue_actor.get_sample.remote())
diff --git a/tests/special_e2e/run_fully_async_policy.sh b/tests/special_e2e/run_fully_async_policy.sh
index c48f7b7507c..7674fcd08cd 100644
--- a/tests/special_e2e/run_fully_async_policy.sh
+++ b/tests/special_e2e/run_fully_async_policy.sh
@@ -44,7 +44,7 @@ loss_agg_mode="token-mean"
 train_prompt_bsz=0
 gen_prompt_bsz=1
 n_resp_per_prompt=3
-train_prompt_mini_bsz=32
+train_prompt_mini_bsz=256
 
 total_rollout_steps=5000
 

From 737a8ce967bddfea10e48445938b1424f08b9e51 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Mon, 18 Aug 2025 15:23:54 +0800
Subject: [PATCH 057/182] system run suceess trigger_parameter_sync_step

---
 .../config/fully_async_ppo_trainer.yaml       |  1 +
 .../fully_async_rollouter.py                  | 66 +++++++++++--------
 .../fully_async_policy/fully_async_trainer.py | 65 +++++++++++-------
 recipe/fully_async_policy/message_queue.py    | 16 ++---
 recipe/fully_async_policy/param_sync.py       |  2 +-
 tests/special_e2e/run_fully_async_policy.sh   |  6 +-
 6 files changed, 90 insertions(+), 66 deletions(-)

diff --git a/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml b/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml
index 665f7a8be89..a1dbaa7a79b 100644
--- a/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml
+++ b/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml
@@ -11,6 +11,7 @@ defaults:
 async_training:
   # 新鲜度控制 (Freshness Control)
   staleness_threshold: 3              # 样本新鲜度阈值
+  trigger_parameter_sync_step: 10
 
 # Rollout配置
 rollout:
diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py
index 1e6101a6f5d..62ce3c24347 100644
--- a/recipe/fully_async_policy/fully_async_rollouter.py
+++ b/recipe/fully_async_policy/fully_async_rollouter.py
@@ -134,7 +134,8 @@ def __init__(
         self.required_samples = calculate_one_step_size(
             self.minimal_bsz, config.actor_rollout_ref.actor.ppo_mini_batch_size
         )
-        self.max_required_samples = self.required_samples * (self.staleness_threshold + 1)
+        self.max_required_samples = self.required_samples * (
+                    self.staleness_threshold + 1) * config.async_training.trigger_parameter_sync_step
         print(
             f"[FullyAsyncRollouter] required_samples : {self.required_samples} "
             f"max_required_samples: {self.max_required_samples}"
@@ -153,6 +154,11 @@ def __init__(
         self.max_queue_size = self.max_required_samples * 10  # x 10 avoid deadlock
         print(f"[FullyAsyncRollouter] {self.max_queue_size}")
 
+        # 初始化异步队列
+        self.pending_queue = asyncio.Queue(maxsize=100)
+        self.active_tasks = set()
+        self.result_queue = asyncio.Queue()
+
     async def set_message_queue_client(self, message_queue_client: MessageQueueClient):
         """Set message queue client"""
         async with self.lock:
@@ -284,35 +290,37 @@ async def _processor_worker(self):
                     await asyncio.gather(*self.active_tasks, return_exceptions=True)
                     self.active_tasks.clear()
                     self.paused = True
-
                 while self.paused:
                     await self.condition.wait()
 
             # 获取待处理的部分 RolloutSample
-            if partial_rollout_sample == "DONE":
-                print("收到结束信号，等待剩余任务完成...")
-                # 等待所有活动任务完成
-                if self.active_tasks:
-                    await asyncio.gather(*self.active_tasks, return_exceptions=True)
-                    self.active_tasks.clear()
-                break
+            async with self.lock:
+                if partial_rollout_sample == "DONE":
+                    print("收到结束信号，等待剩余任务完成...")
+                    # 等待所有活动任务完成
+                    if self.active_tasks:
+                        await asyncio.gather(*self.active_tasks, return_exceptions=True)
+                        self.active_tasks.clear()
+                    break
 
             # 检查并发数是否超限
-            while len(self.active_tasks) >= self.max_concurrent_samples:
-                # 等待至少一个任务完成
-                done_tasks, self.active_tasks = await asyncio.wait(
-                    self.active_tasks, return_when=asyncio.FIRST_COMPLETED
-                )
-                # 清理已完成的任务
-                for task in done_tasks:
-                    await task
+            async with self.lock:
+                while len(self.active_tasks) >= self.max_concurrent_samples:
+                    # 等待至少一个任务完成
+                    done_tasks, self.active_tasks = await asyncio.wait(
+                        self.active_tasks, return_when=asyncio.FIRST_COMPLETED
+                    )
+                    # 清理已完成的任务
+                    for task in done_tasks:
+                        await task
 
             # 立即提交单个样本处理
-            task = asyncio.create_task(
-                self._process_single_sample_streaming(partial_rollout_sample),
-                name=f"process_{partial_rollout_sample.sample_id}",
-            )
-            self.active_tasks.add(task)
+            async with self.lock:
+                task = asyncio.create_task(
+                    self._process_single_sample_streaming(partial_rollout_sample),
+                    name=f"process_{partial_rollout_sample.sample_id}",
+                )
+                self.active_tasks.add(task)
 
             # 标记队列任务完成
             self.pending_queue.task_done()
@@ -350,13 +358,12 @@ async def _consumer_worker(self):
                 sample=ray.cloudpickle.dumps(rollout_sample),
                 param_version=rollout_sample.param_version,
             )
-
             if success:
                 self.total_generated_samples += 1
             else:
                 self.dropped_stale_samples += 1
 
-            print(f"[FullyAsyncRollouter] submit {rollout_sample.sample_id} {'success' if success else 'error'}")
+            # print(f"[FullyAsyncRollouter] submit {rollout_sample.sample_id} {'success' if success else 'error'}")
 
             # 标记结果队列任务完成
             self.result_queue.task_done()
@@ -397,11 +404,6 @@ async def _streaming_generation_main(self):
         # 启动流式处理循环
         print(f"[FullyAsyncRollouter] 启动流式处理模式，最大并发样本数: {self.max_concurrent_samples}")
 
-        # 初始化异步队列
-        self.pending_queue = asyncio.Queue(maxsize=100)
-        self.active_tasks = set()
-        self.result_queue = asyncio.Queue()
-
         # 启动流式处理协程和消费者协程
         self.feed_task = asyncio.create_task(self._feed_samples())
         self.processor_task = asyncio.create_task(self._processor_worker())
@@ -507,6 +509,8 @@ async def _async_monitor_loop(self):
                 await self.resume()
 
     async def _should_pause_generation(self) -> bool:
+        if self.paused:
+            return True
         """Determine whether the build should be paused"""
         queue_stats = self.message_queue_client.get_statistics_sync()
         queue_size = queue_stats["queue_size"]
@@ -543,6 +547,10 @@ async def pause(self):
         print("[FullyAsyncRollouter] pause")
         async with self.lock:
             self.paused = True
+            if self.active_tasks:
+                await asyncio.gather(*self.active_tasks, return_exceptions=True)
+                self.active_tasks.clear()
+            print("[FullyAsyncRollouter] All active tasks completed")
 
     async def resume(self):
         """resume rollout
diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py
index 784a3318166..e84cecee387 100644
--- a/recipe/fully_async_policy/fully_async_trainer.py
+++ b/recipe/fully_async_policy/fully_async_trainer.py
@@ -15,6 +15,7 @@
 import logging
 import time
 import warnings
+from pprint import pprint
 from typing import Any
 
 import numpy as np
@@ -49,16 +50,16 @@ class FullyAsyncTrainer(RayPPOTrainer):
     """
 
     def __init__(
-        self,
-        config,
-        tokenizer,
-        role_worker_mapping: dict[Role, WorkerType],
-        resource_pool_manager: ResourcePoolManager,
-        ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup,
-        processor=None,
-        reward_fn=None,
-        val_reward_fn=None,
-        device_name=None,
+            self,
+            config,
+            tokenizer,
+            role_worker_mapping: dict[Role, WorkerType],
+            resource_pool_manager: ResourcePoolManager,
+            ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup,
+            processor=None,
+            reward_fn=None,
+            val_reward_fn=None,
+            device_name=None,
     ):
         # Store the tokenizer for text processing
         self.tokenizer = tokenizer
@@ -107,6 +108,9 @@ def __init__(
         self.stale_samples_processed = 0
         self.current_param_version = 0
 
+        self.local_trigger_step = 1
+        self.trigger_parameter_sync_step = config.async_training.trigger_parameter_sync_step
+
         self.required_samples = calculate_one_step_size(
             self.minimal_bsz, config.actor_rollout_ref.actor.ppo_mini_batch_size
         )
@@ -302,11 +306,36 @@ def fit(self):
 
             # self._collect_metrics(batch, epoch, metrics, timing_raw)
 
+            pprint(metrics)
+
             # Trigger parameter synchronization after training step
-            # self._trigger_parameter_sync_after_step()
-            print(f"[FullyAsyncTrainer] global_steps: {self.global_steps}")
+            print(f"[FullyAsyncTrainer] global_steps: {self.global_steps}"
+                  f"[FullyAsyncTrainer] _trigger_parameter_sync_after_step {self.local_trigger_step} {self.trigger_parameter_sync_step}")
+            self._trigger_parameter_sync_after_step()
             self.global_steps += 1
 
+    def _trigger_parameter_sync_after_step(self):
+        """
+        Trigger parameter synchronization after training step
+        This ensures rollouter always uses the latest trained parameters
+        """
+        print("[FullyAsyncTrainer] Trigger parameter synchronization after training step")
+        if self.local_trigger_step >= self.trigger_parameter_sync_step:
+            print(f"[FullyAsyncTrainer] Trigger start run")
+            self.local_trigger_step = 1
+            print(f"[FullyAsyncTrainer] {self.current_param_version}")
+            self.current_param_version = self.current_param_version + 1
+            print(
+                f"[FullyAsyncTrainer] Triggering parameter sync after "
+                f"training step {self.global_steps}, version: {self.current_param_version}"
+            )
+            ray.get(self.param_synchronizer.sync_weights.remote(self.current_param_version))
+            return
+        else:
+            print(f"[FullyAsyncTrainer] Trigger {self.local_trigger_step}")
+            self.local_trigger_step += 1
+            return
+
     def get_statistics(self) -> dict:
         """Get training statistics"""
         queue_stats = self.message_queue_client.get_statistics_sync() if self.message_queue_client else {}
@@ -321,18 +350,6 @@ def get_statistics(self) -> dict:
             "queue_dropped_samples": queue_stats.get("dropped_samples", 0),
         }
 
-    def _trigger_parameter_sync_after_step(self):
-        """
-        Trigger parameter synchronization after training step
-        This ensures rollouter always uses the latest trained parameters
-        """
-        self.current_param_version = self.current_param_version + 1
-        print(
-            f"[FullyAsyncTrainer] Triggering parameter sync after "
-            f"training step {self.global_steps}, version: {self.current_param_version}"
-        )
-        ray.get(self.param_synchronizer.sync_weights.remote(self.current_param_version))
-
     def _compute_sample_freshness_metrics(self, rollout_samples: list[RolloutSample]) -> dict:
         """
         Compute sample freshness metrics
diff --git a/recipe/fully_async_policy/message_queue.py b/recipe/fully_async_policy/message_queue.py
index fc1c133412e..2a12cb21c90 100644
--- a/recipe/fully_async_policy/message_queue.py
+++ b/recipe/fully_async_policy/message_queue.py
@@ -77,7 +77,7 @@ async def put_sample(self, sample: Any, param_version: int) -> bool:
             staleness = self.current_param_version - param_version
             if staleness > self.staleness_threshold:
                 self.dropped_samples += 1
-                logger.debug(f"Dropped stale sample: staleness={staleness}, threshold={self.staleness_threshold}")
+                print(f"Dropped stale sample: staleness={staleness}, threshold={self.staleness_threshold}")
                 return False
 
             # If queue is full, remove the oldest sample (rarely happens)
@@ -92,11 +92,10 @@ async def put_sample(self, sample: Any, param_version: int) -> bool:
             self._consumer_condition.notify_all()
 
             if self.total_produced % 100 == 0:
-                logger.debug(f"MessageQueue stats: produced={self.total_produced}, queue_size={len(self.queue)}")
+                print(f"MessageQueue stats: produced={self.total_produced}, queue_size={len(self.queue)}")
 
             return True
 
-
     async def get_sample(self) -> Any | None:
         """
         Get a single sample from the queue, wait until one is available
@@ -122,7 +121,7 @@ async def update_param_version(self, version: int):
         async with self._lock:
             old_version = self.current_param_version
             self.current_param_version = version
-            logger.debug(f"Parameter version updated from {old_version} to {version}")
+            print(f"Parameter version updated from {old_version} to {version}")
 
     async def get_queue_size(self) -> int:
         """Get current queue length"""
@@ -206,11 +205,6 @@ async def get_sample(self) -> Any | None:
         future = self.queue_actor.get_sample.remote()
         return await asyncio.wrap_future(future.future())
 
-    async def update_param_version(self, version: int):
-        """Update parameter version (async)"""
-        future = self.queue_actor.update_param_version.remote(version)
-        await asyncio.wrap_future(future.future())
-
     async def get_queue_size(self) -> int:
         """Get queue size (async)"""
         future = self.queue_actor.get_queue_size.remote()
@@ -248,3 +242,7 @@ def get_sample_sync(self) -> Any | None:
     def get_statistics_sync(self) -> dict[str, Any]:
         """Get statistics (sync - deprecated, use get_statistics instead)"""
         return ray.get(self.queue_actor.get_statistics.remote())
+
+    def update_param_version_sync(self, version: int):
+        """Update parameter version (async)"""
+        return ray.get(self.queue_actor.update_param_version.remote(version))
diff --git a/recipe/fully_async_policy/param_sync.py b/recipe/fully_async_policy/param_sync.py
index 7e40e755a12..53ced11956c 100644
--- a/recipe/fully_async_policy/param_sync.py
+++ b/recipe/fully_async_policy/param_sync.py
@@ -80,7 +80,7 @@ def sync_weights(self, version):
         ray.get(self.rollouter.pause.remote())
 
         # Update MQ version
-        self.mq_client.update_param_version(version)
+        self.mq_client.update_param_version_sync(version)
 
         # sync weights
         self.actor_wg.sync_rollout_weights()
diff --git a/tests/special_e2e/run_fully_async_policy.sh b/tests/special_e2e/run_fully_async_policy.sh
index 7674fcd08cd..1b2df475598 100644
--- a/tests/special_e2e/run_fully_async_policy.sh
+++ b/tests/special_e2e/run_fully_async_policy.sh
@@ -44,7 +44,7 @@ loss_agg_mode="token-mean"
 train_prompt_bsz=0
 gen_prompt_bsz=1
 n_resp_per_prompt=3
-train_prompt_mini_bsz=256
+train_prompt_mini_bsz=32
 
 total_rollout_steps=5000
 
@@ -56,11 +56,11 @@ val_top_p=0.7
 
 # Fully async specific parameters
 # Allocate 2 GPUs for rollout, remaining for training
-n_gpus_rollout=2
+n_gpus_rollout=4
 n_gpus_training=$((NUM_GPUS - n_gpus_rollout))
 
 # Async training specific configurations
-staleness_threshold=30000
+staleness_threshold=0
 
 exp_name="$(basename "${MODEL_ID,,}")-fully-async-policy-${ACTOR_STRATEGY}-minimal"
 

From defd61f369ad24acdde6b91c9d72b7e6637ce9ff Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Mon, 18 Aug 2025 15:24:09 +0800
Subject: [PATCH 058/182] system run suceess trigger_parameter_sync_step

---
 recipe/fully_async_policy/fully_async_trainer.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py
index e84cecee387..3021c536eca 100644
--- a/recipe/fully_async_policy/fully_async_trainer.py
+++ b/recipe/fully_async_policy/fully_async_trainer.py
@@ -309,8 +309,9 @@ def fit(self):
             pprint(metrics)
 
             # Trigger parameter synchronization after training step
-            print(f"[FullyAsyncTrainer] global_steps: {self.global_steps}"
-                  f"[FullyAsyncTrainer] _trigger_parameter_sync_after_step {self.local_trigger_step} {self.trigger_parameter_sync_step}")
+            print(f"[FullyAsyncTrainer] global_steps: {self.global_steps} "
+                  f"local_trigger_step: {self.local_trigger_step} "
+                  f"trigger_parameter_sync_step: {self.trigger_parameter_sync_step}")
             self._trigger_parameter_sync_after_step()
             self.global_steps += 1
 
@@ -319,11 +320,8 @@ def _trigger_parameter_sync_after_step(self):
         Trigger parameter synchronization after training step
         This ensures rollouter always uses the latest trained parameters
         """
-        print("[FullyAsyncTrainer] Trigger parameter synchronization after training step")
         if self.local_trigger_step >= self.trigger_parameter_sync_step:
-            print(f"[FullyAsyncTrainer] Trigger start run")
             self.local_trigger_step = 1
-            print(f"[FullyAsyncTrainer] {self.current_param_version}")
             self.current_param_version = self.current_param_version + 1
             print(
                 f"[FullyAsyncTrainer] Triggering parameter sync after "

From e86625b459dfa9d62560c90a41b471ca24083c3b Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Mon, 18 Aug 2025 18:04:29 +0800
Subject: [PATCH 059/182] All active tasks completed

---
 .../config/fully_async_ppo_trainer.yaml       |  2 +-
 .../fully_async_rollouter.py                  | 61 ++++++++++++-------
 tests/special_e2e/run_fully_async_policy.sh   |  4 +-
 3 files changed, 42 insertions(+), 25 deletions(-)

diff --git a/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml b/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml
index a1dbaa7a79b..8ccb6d36b71 100644
--- a/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml
+++ b/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml
@@ -11,7 +11,7 @@ defaults:
 async_training:
   # 新鲜度控制 (Freshness Control)
   staleness_threshold: 3              # 样本新鲜度阈值
-  trigger_parameter_sync_step: 10
+  trigger_parameter_sync_step: 1      # >=1
 
 # Rollout配置
 rollout:
diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py
index 62ce3c24347..e501a6ab142 100644
--- a/recipe/fully_async_policy/fully_async_rollouter.py
+++ b/recipe/fully_async_policy/fully_async_rollouter.py
@@ -135,11 +135,9 @@ def __init__(
             self.minimal_bsz, config.actor_rollout_ref.actor.ppo_mini_batch_size
         )
         self.max_required_samples = self.required_samples * (
-                    self.staleness_threshold + 1) * config.async_training.trigger_parameter_sync_step
-        print(
-            f"[FullyAsyncRollouter] required_samples : {self.required_samples} "
-            f"max_required_samples: {self.max_required_samples}"
-        )
+                self.staleness_threshold + 1) * config.async_training.trigger_parameter_sync_step
+        print(f"[FullyAsyncRollouter] required_samples : {self.required_samples} "
+              f"max_required_samples: {self.max_required_samples}")
 
         # 单次最多扔一次迭代需要的样本
         self.max_concurrent_samples = self.required_samples
@@ -183,7 +181,8 @@ async def update_param_version(self, version: int):
             self.current_param_version = version
             # every time param change, reset staleness_samples
             self.staleness_samples = 0
-            print(f"[FullyAsyncRollouter] Parameter version updated from {old_version} to {version}")
+            print(f"[FullyAsyncRollouter][Public][update_param_version] "
+                  f"Parameter version updated from {old_version} to {version}")
 
     def _validate_config(self):
         # Validate asynchronous training configuration
@@ -263,7 +262,8 @@ async def _feed_samples(self):
                 # 检查是否到达最后一步
                 if self.global_steps >= self.total_rollout_steps:
                     print(
-                        f"[FullyAsyncRollouter] 达到最大步数，停止添加新样本 "
+                        f"[FullyAsyncRollouter][Feed] "
+                        f"达到最大步数，停止添加新样本 "
                         f"{self.global_steps} >= {self.total_rollout_steps}"
                     )
                     should_stop = True  # 设置停止标志
@@ -275,7 +275,8 @@ async def _feed_samples(self):
 
         # 发送结束信号
         await self.pending_queue.put("DONE")
-        print(f"[FullyAsyncRollouter] 样本添加完成，总共添加了 {self.global_steps} 个步骤的样本")
+        print(f"[FullyAsyncRollouter][Feed] "
+              f"样本添加完成，总共添加了 {self.global_steps} 个步骤的样本")
 
     async def _processor_worker(self):
         """流式处理工作协程 - 逐个样本立即提交处理，不等待批次"""
@@ -286,9 +287,11 @@ async def _processor_worker(self):
 
             async with self.lock:
                 if await self._should_pause_generation():
-                    print("等待已提交的任务结束")
-                    await asyncio.gather(*self.active_tasks, return_exceptions=True)
-                    self.active_tasks.clear()
+                    print(f"[FullyAsyncRollouter][Processor] 等待已提交的任务结束 "
+                          f"{[t.get_name() for t in self.active_tasks]}")
+                    if self.active_tasks:
+                        await asyncio.gather(*self.active_tasks, return_exceptions=True)
+                        self.active_tasks.clear()
                     self.paused = True
                 while self.paused:
                     await self.condition.wait()
@@ -296,7 +299,7 @@ async def _processor_worker(self):
             # 获取待处理的部分 RolloutSample
             async with self.lock:
                 if partial_rollout_sample == "DONE":
-                    print("收到结束信号，等待剩余任务完成...")
+                    print(f"[FullyAsyncRollouter][Processor] 收到结束信号，等待剩余任务完成...")
                     # 等待所有活动任务完成
                     if self.active_tasks:
                         await asyncio.gather(*self.active_tasks, return_exceptions=True)
@@ -316,6 +319,9 @@ async def _processor_worker(self):
 
             # 立即提交单个样本处理
             async with self.lock:
+                # pause结束后，获取到锁，还需要判断是否是暂停阶段，否则继续等待
+                while self.paused:
+                    await self.condition.wait()
                 task = asyncio.create_task(
                     self._process_single_sample_streaming(partial_rollout_sample),
                     name=f"process_{partial_rollout_sample.sample_id}",
@@ -346,7 +352,7 @@ async def _process_single_sample_streaming(self, partial_rollout_sample):
         if processing_time > self.max_processing_time:
             self.max_processing_time = processing_time
 
-        print(f"[FullyAsyncRollouter] process {partial_rollout_sample.sample_id} cost {processing_time:.2f}s")
+        print(f"[FullyAsyncRollouter] rollout {partial_rollout_sample.sample_id} cost {processing_time:.2f}s")
 
     async def _consumer_worker(self):
         """消费者协程，负责从结果队列获取处理结果并放入消息队列"""
@@ -502,11 +508,14 @@ async def _async_monitor_loop(self):
             current_time = time.time()
             if current_time - last_stats_time >= stats_interval:
                 stats = await self.get_statistics()
-                print(f"[FullyAsyncRollouter] statistics {stats}")
+                print(f"[FullyAsyncRollouter][MonitorLoop] {stats}")
                 last_stats_time = current_time
 
             if not await self._should_pause_generation():
-                await self.resume()
+                async with self.lock:
+                    print(f"[FullyAsyncRollouter][MonitorLoop] trigger resume")
+                    self.paused = False
+                    self.condition.notify_all()
 
     async def _should_pause_generation(self) -> bool:
         if self.paused:
@@ -520,20 +529,23 @@ async def _should_pause_generation(self) -> bool:
 
         if version_diff > self.staleness_threshold:
             print(
-                "[FullyAsyncRollouter] "
-                f"Should pause due to version_diff > self.staleness_threshold: "
+                "[FullyAsyncRollouter][ShouldPause] "
+                f"due to version_diff > self.staleness_threshold: "
                 f"rollout_version={self.current_param_version}, "
                 f"trainer_version={current_trainer_version}, diff={version_diff}"
             )
             return True
 
         if queue_size >= self.max_queue_size:
-            print(f"[FullyAsyncRollouter] Should pause due to full queue: size={queue_size}, max={self.max_queue_size}")
+            print(
+                "[FullyAsyncRollouter][ShouldPause] "
+                f" due to full queue: size={queue_size}, max={self.max_queue_size}")
             return True
 
         if self.staleness_samples > self.max_required_samples:
             print(
-                f"[FullyAsyncRollouter] Should pause due to "
+                "[FullyAsyncRollouter][ShouldPause] "
+                f"due to "
                 f"staleness_samples {self.staleness_samples} > max_required_samples {self.max_required_samples} "
             )
             return True
@@ -544,19 +556,24 @@ async def pause(self):
         """pause rollout
         TODO integrated Partial Rollout
         """
-        print("[FullyAsyncRollouter] pause")
+        print("[FullyAsyncRollouter][Public] pause")
         async with self.lock:
             self.paused = True
             if self.active_tasks:
+                print(f"[FullyAsyncRollouter][Pause] "
+                      f"{[t.get_name() for t in self.active_tasks]}")
                 await asyncio.gather(*self.active_tasks, return_exceptions=True)
                 self.active_tasks.clear()
-            print("[FullyAsyncRollouter] All active tasks completed")
+            print("[FullyAsyncRollouter][Pause] All active tasks completed")
+
+        # print("[FullyAsyncRollouter][Public] pause sleep 10")
+        # await asyncio.sleep(10)
 
     async def resume(self):
         """resume rollout
         TODO integrated Partial Rollout
         """
-        print("[FullyAsyncRollouter] resume")
+        print("[FullyAsyncRollouter][Public] resume")
         async with self.lock:
             self.paused = False
             self.condition.notify_all()
diff --git a/tests/special_e2e/run_fully_async_policy.sh b/tests/special_e2e/run_fully_async_policy.sh
index 1b2df475598..ebcf07b43f7 100644
--- a/tests/special_e2e/run_fully_async_policy.sh
+++ b/tests/special_e2e/run_fully_async_policy.sh
@@ -46,7 +46,7 @@ gen_prompt_bsz=1
 n_resp_per_prompt=3
 train_prompt_mini_bsz=32
 
-total_rollout_steps=5000
+total_rollout_steps=50000
 
 # Temperature parameters
 temperature=1.0
@@ -60,7 +60,7 @@ n_gpus_rollout=4
 n_gpus_training=$((NUM_GPUS - n_gpus_rollout))
 
 # Async training specific configurations
-staleness_threshold=0
+staleness_threshold=10
 
 exp_name="$(basename "${MODEL_ID,,}")-fully-async-policy-${ACTOR_STRATEGY}-minimal"
 

From ed4d5720885d59df617c363ff26f26c1ddfe595f Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Mon, 18 Aug 2025 18:05:05 +0800
Subject: [PATCH 060/182] pause submit task

---
 recipe/fully_async_policy/fully_async_rollouter.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py
index e501a6ab142..18dd6a5319d 100644
--- a/recipe/fully_async_policy/fully_async_rollouter.py
+++ b/recipe/fully_async_policy/fully_async_rollouter.py
@@ -560,11 +560,11 @@ async def pause(self):
         async with self.lock:
             self.paused = True
             if self.active_tasks:
-                print(f"[FullyAsyncRollouter][Pause] "
+                print(f"[FullyAsyncRollouter][Public][Pause] "
                       f"{[t.get_name() for t in self.active_tasks]}")
                 await asyncio.gather(*self.active_tasks, return_exceptions=True)
                 self.active_tasks.clear()
-            print("[FullyAsyncRollouter][Pause] All active tasks completed")
+            print("[FullyAsyncRollouter][Public][Pause] All active tasks completed")
 
         # print("[FullyAsyncRollouter][Public] pause sleep 10")
         # await asyncio.sleep(10)

From bd99e16411e7f86977f061825bf328ad6c0d00d6 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Mon, 18 Aug 2025 19:37:48 +0800
Subject: [PATCH 061/182] steam rollout

---
 .../config/fully_async_ppo_trainer.yaml       |  2 +-
 .../dapo_7b_math_fsdp2_4_12.sh                | 40 +++++++++++++------
 .../fully_async_rollouter.py                  | 17 ++++----
 .../fully_async_policy/fully_async_trainer.py |  7 +---
 tests/special_e2e/run_fully_async_policy.sh   | 15 +++----
 5 files changed, 44 insertions(+), 37 deletions(-)

diff --git a/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml b/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml
index 8ccb6d36b71..9d0a8c67383 100644
--- a/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml
+++ b/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml
@@ -11,7 +11,7 @@ defaults:
 async_training:
   # 新鲜度控制 (Freshness Control)
   staleness_threshold: 3              # 样本新鲜度阈值
-  trigger_parameter_sync_step: 1      # >=1
+  trigger_parameter_sync_step: 10      # >=1 train 每次训练一个batch, 迭代多少次后触发更新
 
 # Rollout配置
 rollout:
diff --git a/recipe/fully_async_policy/dapo_7b_math_fsdp2_4_12.sh b/recipe/fully_async_policy/dapo_7b_math_fsdp2_4_12.sh
index 5c2ac5e6017..86cd25affe2 100644
--- a/recipe/fully_async_policy/dapo_7b_math_fsdp2_4_12.sh
+++ b/recipe/fully_async_policy/dapo_7b_math_fsdp2_4_12.sh
@@ -21,7 +21,14 @@ CKPTS_DIR=./ckpts/${project_name}/${exp_name}
 TRAIN_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/dapo-math-17k.parquet
 TEST_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/aime-2024.parquet
 
-
+rollout_mode="async"
+rollout_name="vllm" # sglang or vllm
+if [ "$rollout_mode" = "async" ]; then
+    export VLLM_USE_V1=1
+    return_raw_chat="True"
+fi
+
+# Algorithm parameters
 adv_estimator=grpo
 
 use_kl_in_reward=False
@@ -32,20 +39,16 @@ kl_loss_coef=0.0
 clip_ratio_low=0.2
 clip_ratio_high=0.28
 
+# Response length parameters
 max_prompt_length=$((1024 * 2))
 max_response_length=$((1024 * 8))
 enable_overlong_buffer=True
 overlong_buffer_len=$((1024 * 4))
 overlong_penalty_factor=1.0
 
+# Training parameters
 loss_agg_mode="token-mean"
 
-train_prompt_bsz=2
-gen_prompt_bsz=4
-n_resp_per_prompt=16
-train_prompt_mini_bsz=32
-train_sync_weight_steps=64
-
 # Algorithm
 temperature=1.0
 top_p=1.0
@@ -62,14 +65,21 @@ gen_tp=1
 sp_size=1
 fsdp_size=2
 
-staleness_threshold=3
-
 NNODES=${NNODES:-1}
 NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
 
+# Fully async specific parameters
 n_gpus_rollout=4
 n_gpus_training=$((NGPUS_PER_NODE - n_gpus_rollout))
 
+train_prompt_bsz=0
+gen_prompt_bsz=1
+n_resp_per_prompt=16
+train_prompt_mini_bsz=32
+staleness_threshold=10
+total_rollout_steps=$(((512*16*100)))
+trigger_parameter_sync_step=32
+
 /home/hadoop-djst-algoplat/miniconda3/bin/python -m recipe.fully_async_policy.fully_async_main \
     data.train_files="${TRAIN_FILE}" \
     data.val_files="${TEST_FILE}" \
@@ -79,6 +89,7 @@ n_gpus_training=$((NGPUS_PER_NODE - n_gpus_rollout))
     data.max_response_length=${max_response_length} \
     data.train_batch_size=${train_prompt_bsz} \
     data.gen_batch_size=${gen_prompt_bsz} \
+    data.return_raw_chat=${return_raw_chat} \
     actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
     algorithm.adv_estimator=${adv_estimator} \
     algorithm.use_kl_in_reward=${use_kl_in_reward} \
@@ -125,6 +136,8 @@ n_gpus_training=$((NGPUS_PER_NODE - n_gpus_rollout))
     actor_rollout_ref.ref.fsdp_config.param_offload=${ref_offload} \
     actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \
     actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \
+    actor_rollout_ref.rollout.name=${rollout_name} \
+    actor_rollout_ref.rollout.mode=${rollout_mode} \
     reward_model.reward_manager=dapo \
     +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
     +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
@@ -135,7 +148,7 @@ n_gpus_training=$((NGPUS_PER_NODE - n_gpus_rollout))
     trainer.project_name="${project_name}" \
     trainer.experiment_name="${exp_name}" \
     trainer.val_before_train=True \
-    trainer.test_freq=10 \
+    trainer.test_freq=-1 \
     trainer.save_freq=-1 \
     trainer.default_local_dir="${CKPTS_DIR}" \
     trainer.resume_mode=auto \
@@ -143,6 +156,7 @@ n_gpus_training=$((NGPUS_PER_NODE - n_gpus_rollout))
     trainer.n_gpus_per_node="${n_gpus_training}" \
     rollout.nnodes="${NNODES}" \
     rollout.n_gpus_per_node="${n_gpus_rollout}" \
-    rollout.total_rollout_steps=100 \
-    rollout.total_epochs=2 \
-    async_training.staleness_threshold=${staleness_threshold}
+    rollout.total_rollout_steps="${total_rollout_steps}" \
+    rollout.total_epochs=10 \
+    async_training.staleness_threshold="${staleness_threshold}" \
+    async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}"
diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py
index 18dd6a5319d..85860d7c5e9 100644
--- a/recipe/fully_async_policy/fully_async_rollouter.py
+++ b/recipe/fully_async_policy/fully_async_rollouter.py
@@ -157,6 +157,9 @@ def __init__(
         self.active_tasks = set()
         self.result_queue = asyncio.Queue()
 
+        # 通过 pause 和 resume 控制 monitor_loop 中，是否进行 尝试恢复 操作
+        self.monitor_loop_trigger = True
+
     async def set_message_queue_client(self, message_queue_client: MessageQueueClient):
         """Set message queue client"""
         async with self.lock:
@@ -287,8 +290,7 @@ async def _processor_worker(self):
 
             async with self.lock:
                 if await self._should_pause_generation():
-                    print(f"[FullyAsyncRollouter][Processor] 等待已提交的任务结束 "
-                          f"{[t.get_name() for t in self.active_tasks]}")
+                    print(f"[FullyAsyncRollouter][Processor] 等待已提交的任务结束")
                     if self.active_tasks:
                         await asyncio.gather(*self.active_tasks, return_exceptions=True)
                         self.active_tasks.clear()
@@ -511,7 +513,8 @@ async def _async_monitor_loop(self):
                 print(f"[FullyAsyncRollouter][MonitorLoop] {stats}")
                 last_stats_time = current_time
 
-            if not await self._should_pause_generation():
+            # pause 和 resume 直接，不进行恢复操作
+            if self.monitor_loop_trigger and not await self._should_pause_generation():
                 async with self.lock:
                     print(f"[FullyAsyncRollouter][MonitorLoop] trigger resume")
                     self.paused = False
@@ -560,14 +563,11 @@ async def pause(self):
         async with self.lock:
             self.paused = True
             if self.active_tasks:
-                print(f"[FullyAsyncRollouter][Public][Pause] "
-                      f"{[t.get_name() for t in self.active_tasks]}")
+                print(f"[FullyAsyncRollouter][Public][Pause]")
                 await asyncio.gather(*self.active_tasks, return_exceptions=True)
                 self.active_tasks.clear()
             print("[FullyAsyncRollouter][Public][Pause] All active tasks completed")
-
-        # print("[FullyAsyncRollouter][Public] pause sleep 10")
-        # await asyncio.sleep(10)
+        self.monitor_loop_trigger = False
 
     async def resume(self):
         """resume rollout
@@ -577,6 +577,7 @@ async def resume(self):
         async with self.lock:
             self.paused = False
             self.condition.notify_all()
+        self.monitor_loop_trigger = True
 
     async def get_statistics(self) -> dict:
         queue_stats = self.message_queue_client.get_statistics_sync()
diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py
index 3021c536eca..cc0c2378021 100644
--- a/recipe/fully_async_policy/fully_async_trainer.py
+++ b/recipe/fully_async_policy/fully_async_trainer.py
@@ -158,7 +158,7 @@ def _get_samples_from_queue(self) -> tuple[None, None] | tuple[int, Any]:
 
             queue_samples.append(sample)
 
-            if len(queue_samples) % 10 == 0 or len(queue_samples) >= self.required_samples:
+            if len(queue_samples) % 10 == 0:
                 print(f"[FullyAsyncTrainer] Collected {len(queue_samples)}/{self.required_samples} samples")
 
         consumer_end = time.time()
@@ -323,14 +323,9 @@ def _trigger_parameter_sync_after_step(self):
         if self.local_trigger_step >= self.trigger_parameter_sync_step:
             self.local_trigger_step = 1
             self.current_param_version = self.current_param_version + 1
-            print(
-                f"[FullyAsyncTrainer] Triggering parameter sync after "
-                f"training step {self.global_steps}, version: {self.current_param_version}"
-            )
             ray.get(self.param_synchronizer.sync_weights.remote(self.current_param_version))
             return
         else:
-            print(f"[FullyAsyncTrainer] Trigger {self.local_trigger_step}")
             self.local_trigger_step += 1
             return
 
diff --git a/tests/special_e2e/run_fully_async_policy.sh b/tests/special_e2e/run_fully_async_policy.sh
index ebcf07b43f7..8e0b82ddefc 100644
--- a/tests/special_e2e/run_fully_async_policy.sh
+++ b/tests/special_e2e/run_fully_async_policy.sh
@@ -41,12 +41,6 @@ overlong_penalty_factor=1.0
 
 # Training parameters
 loss_agg_mode="token-mean"
-train_prompt_bsz=0
-gen_prompt_bsz=1
-n_resp_per_prompt=3
-train_prompt_mini_bsz=32
-
-total_rollout_steps=50000
 
 # Temperature parameters
 temperature=1.0
@@ -55,11 +49,14 @@ top_k=-1
 val_top_p=0.7
 
 # Fully async specific parameters
-# Allocate 2 GPUs for rollout, remaining for training
-n_gpus_rollout=4
+n_gpus_rollout=6
 n_gpus_training=$((NUM_GPUS - n_gpus_rollout))
 
-# Async training specific configurations
+train_prompt_bsz=0
+gen_prompt_bsz=1
+n_resp_per_prompt=3
+train_prompt_mini_bsz=32
+total_rollout_steps=50000
 staleness_threshold=10
 
 exp_name="$(basename "${MODEL_ID,,}")-fully-async-policy-${ACTOR_STRATEGY}-minimal"

From c59055ca7e4ed7eabe46604669312533b25e5260 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Mon, 18 Aug 2025 19:37:57 +0800
Subject: [PATCH 062/182] steam rollout

---
 .../fully_async_rollouter.py                  | 44 ++++++++++---------
 .../fully_async_policy/fully_async_trainer.py | 28 ++++++------
 2 files changed, 39 insertions(+), 33 deletions(-)

diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py
index 85860d7c5e9..ac1253454d7 100644
--- a/recipe/fully_async_policy/fully_async_rollouter.py
+++ b/recipe/fully_async_policy/fully_async_rollouter.py
@@ -38,16 +38,16 @@ class FullyAsyncRollouter(RayPPOTrainer):
     """
 
     def __init__(
-            self,
-            config,
-            tokenizer,
-            role_worker_mapping: dict[Role, WorkerType],
-            resource_pool_manager: ResourcePoolManager,
-            ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup,
-            processor=None,
-            reward_fn=None,
-            val_reward_fn=None,
-            device_name=None,
+        self,
+        config,
+        tokenizer,
+        role_worker_mapping: dict[Role, WorkerType],
+        resource_pool_manager: ResourcePoolManager,
+        ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup,
+        processor=None,
+        reward_fn=None,
+        val_reward_fn=None,
+        device_name=None,
     ):
         # Store the tokenizer for text processing
         self.tokenizer = tokenizer
@@ -134,10 +134,13 @@ def __init__(
         self.required_samples = calculate_one_step_size(
             self.minimal_bsz, config.actor_rollout_ref.actor.ppo_mini_batch_size
         )
-        self.max_required_samples = self.required_samples * (
-                self.staleness_threshold + 1) * config.async_training.trigger_parameter_sync_step
-        print(f"[FullyAsyncRollouter] required_samples : {self.required_samples} "
-              f"max_required_samples: {self.max_required_samples}")
+        self.max_required_samples = (
+            self.required_samples * (self.staleness_threshold + 1) * config.async_training.trigger_parameter_sync_step
+        )
+        print(
+            f"[FullyAsyncRollouter] required_samples : {self.required_samples} "
+            f"max_required_samples: {self.max_required_samples}"
+        )
 
         # 单次最多扔一次迭代需要的样本
         self.max_concurrent_samples = self.required_samples
@@ -184,8 +187,10 @@ async def update_param_version(self, version: int):
             self.current_param_version = version
             # every time param change, reset staleness_samples
             self.staleness_samples = 0
-            print(f"[FullyAsyncRollouter][Public][update_param_version] "
-                  f"Parameter version updated from {old_version} to {version}")
+            print(
+                f"[FullyAsyncRollouter][Public][update_param_version] "
+                f"Parameter version updated from {old_version} to {version}"
+            )
 
     def _validate_config(self):
         # Validate asynchronous training configuration
@@ -278,8 +283,7 @@ async def _feed_samples(self):
 
         # 发送结束信号
         await self.pending_queue.put("DONE")
-        print(f"[FullyAsyncRollouter][Feed] "
-              f"样本添加完成，总共添加了 {self.global_steps} 个步骤的样本")
+        print(f"[FullyAsyncRollouter][Feed] 样本添加完成，总共添加了 {self.global_steps} 个步骤的样本")
 
     async def _processor_worker(self):
         """流式处理工作协程 - 逐个样本立即提交处理，不等待批次"""
@@ -541,8 +545,8 @@ async def _should_pause_generation(self) -> bool:
 
         if queue_size >= self.max_queue_size:
             print(
-                "[FullyAsyncRollouter][ShouldPause] "
-                f" due to full queue: size={queue_size}, max={self.max_queue_size}")
+                f"[FullyAsyncRollouter][ShouldPause]  due to full queue: size={queue_size}, max={self.max_queue_size}"
+            )
             return True
 
         if self.staleness_samples > self.max_required_samples:
diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py
index cc0c2378021..402ccf6c926 100644
--- a/recipe/fully_async_policy/fully_async_trainer.py
+++ b/recipe/fully_async_policy/fully_async_trainer.py
@@ -50,16 +50,16 @@ class FullyAsyncTrainer(RayPPOTrainer):
     """
 
     def __init__(
-            self,
-            config,
-            tokenizer,
-            role_worker_mapping: dict[Role, WorkerType],
-            resource_pool_manager: ResourcePoolManager,
-            ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup,
-            processor=None,
-            reward_fn=None,
-            val_reward_fn=None,
-            device_name=None,
+        self,
+        config,
+        tokenizer,
+        role_worker_mapping: dict[Role, WorkerType],
+        resource_pool_manager: ResourcePoolManager,
+        ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup,
+        processor=None,
+        reward_fn=None,
+        val_reward_fn=None,
+        device_name=None,
     ):
         # Store the tokenizer for text processing
         self.tokenizer = tokenizer
@@ -309,9 +309,11 @@ def fit(self):
             pprint(metrics)
 
             # Trigger parameter synchronization after training step
-            print(f"[FullyAsyncTrainer] global_steps: {self.global_steps} "
-                  f"local_trigger_step: {self.local_trigger_step} "
-                  f"trigger_parameter_sync_step: {self.trigger_parameter_sync_step}")
+            print(
+                f"[FullyAsyncTrainer] global_steps: {self.global_steps} "
+                f"local_trigger_step: {self.local_trigger_step} "
+                f"trigger_parameter_sync_step: {self.trigger_parameter_sync_step}"
+            )
             self._trigger_parameter_sync_after_step()
             self.global_steps += 1
 

From 5f1302e26bb80fab0ac6bd2b76b0c06b7115fbfb Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Mon, 18 Aug 2025 20:36:30 +0800
Subject: [PATCH 063/182] fully log

---
 recipe/fully_async_policy/detach_utils.py     |  16 ++-
 .../fully_async_rollouter.py                  |   7 +
 .../fully_async_policy/fully_async_trainer.py | 134 ++++--------------
 recipe/fully_async_policy/message_queue.py    |   2 +-
 4 files changed, 44 insertions(+), 115 deletions(-)

diff --git a/recipe/fully_async_policy/detach_utils.py b/recipe/fully_async_policy/detach_utils.py
index a76f42d7362..426a51ae35e 100644
--- a/recipe/fully_async_policy/detach_utils.py
+++ b/recipe/fully_async_policy/detach_utils.py
@@ -163,18 +163,22 @@ def assemble_batch_from_rollout_samples(
 
     # 收集统计信息和元数据（直接从 RolloutSample 中获取）
     param_versions = [rs.param_version for rs in rollout_samples]
-    sample_timestamps = [rs.generation_timestamp for rs in rollout_samples]
+
+    processing_time_stats = {
+        "avg_processing_time": np.mean(processing_times),
+        "max_processing_time": np.max(processing_times),
+        "min_processing_time": np.min(processing_times),
+        "tp50_processing_time": np.percentile(processing_times, 50),  # 中位数
+        "tp99_processing_time": np.percentile(processing_times, 99),  # 99百分位
+        "tp95_processing_time": np.percentile(processing_times, 95),  # 95百分位也很有用
+    }
 
     # 创建 meta_info
     final_batch.meta_info.update(
         {
             "rollout_param_versions": param_versions,
-            "sample_timestamps": sample_timestamps,
-            "avg_processing_time": np.mean(processing_times) if processing_times else 0,
-            "max_processing_time": np.max(processing_times) if processing_times else 0,
             "param_version_diversity": len(set(param_versions)) if param_versions else 0,
-            "avg_sample_age": np.mean([time.time() - ts for ts in sample_timestamps]) if sample_timestamps else 0,
-            "assembly_time": time.time() - start_time,
+            **processing_time_stats,
         }
     )
 
diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py
index ac1253454d7..cad1542afc4 100644
--- a/recipe/fully_async_policy/fully_async_rollouter.py
+++ b/recipe/fully_async_policy/fully_async_rollouter.py
@@ -241,6 +241,8 @@ async def _feed_samples(self):
         sample_count = 0
         should_stop = False
 
+        progress_bar = tqdm(total=self.total_rollout_steps, initial=self.global_steps, desc="Training Progress")
+
         for epoch, batch_dict in continuous_iterator:
             if should_stop:  # 检查停止标志
                 break
@@ -277,11 +279,13 @@ async def _feed_samples(self):
                     should_stop = True  # 设置停止标志
                     break
 
+                progress_bar.update(1)
                 self.global_steps += 1
 
             sample_count += 1
 
         # 发送结束信号
+        progress_bar.close()
         await self.pending_queue.put("DONE")
         print(f"[FullyAsyncRollouter][Feed] 样本添加完成，总共添加了 {self.global_steps} 个步骤的样本")
 
@@ -597,6 +601,9 @@ async def get_statistics(self) -> dict:
             "pending_queue_size": self.pending_queue.qsize(),
             "active_tasks_size": len(self.active_tasks),
             "result_queue_size": self.result_queue.qsize(),
+            "max_required_samples": self.max_required_samples,
+            "required_samples": self.required_samples,
+            "staleness_threshold": self.staleness_threshold,
         }
 
         return stats
diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py
index 402ccf6c926..ed18e209c96 100644
--- a/recipe/fully_async_policy/fully_async_trainer.py
+++ b/recipe/fully_async_policy/fully_async_trainer.py
@@ -146,7 +146,7 @@ def _get_samples_from_queue(self) -> tuple[None, None] | tuple[int, Any]:
 
         while len(queue_samples) < self.required_samples:
             # 获取单个样本，会一直等待直到有样本或收到None
-            sample = self.message_queue_client.get_sample_sync()
+            sample, queue_len = self.message_queue_client.get_sample_sync()
 
             if sample is None:
                 # 检测到结束信号（None），立即退出
@@ -159,7 +159,9 @@ def _get_samples_from_queue(self) -> tuple[None, None] | tuple[int, Any]:
             queue_samples.append(sample)
 
             if len(queue_samples) % 10 == 0:
-                print(f"[FullyAsyncTrainer] Collected {len(queue_samples)}/{self.required_samples} samples")
+                print(
+                    f"[FullyAsyncTrainer] Collected {len(queue_samples)}/{self.required_samples} samples. mq_len: {queue_len}"
+                )
 
         consumer_end = time.time()
 
@@ -256,58 +258,36 @@ def fit(self):
                     if batch is None:
                         break
 
-                    # 更新统计信息
-                    self.processed_samples += len(batch) if isinstance(batch, list) else 1
-
                     # 从meta_info中获取参数版本信息
                     if hasattr(batch, "meta_info") and batch.meta_info:
-                        # meta_info={'metrics': [{'generate_sequences': 1.8240885734558105, 'tool_calls': 0.0},
-                        # {'generate_sequences': 2.5197629928588867, 'tool_calls': 0.0},
-                        # {'generate_sequences': 3.5084900856018066, 'tool_calls': 0.0},
-                        # {'generate_sequences': 2.4329097270965576, 'tool_calls': 0.0},
-                        # {'generate_sequences': 3.0567924976348877, 'tool_calls': 0.0},
-                        # {'generate_sequences': 4.271160840988159, 'tool_calls': 0.0}],
-                        # 'global_steps': 22,
-                        # 'global_token_num': [588, 517, 422, 406, 355, 288],
-                        # 'rollout_param_versions': [0, 0, 0, 0, 0, 0],
-                        # 'sample_timestamps': [1755278023.7771623, 1755278024.101492, 1755278024.3597627,
-                        #                       1755278024.4885263, 1755278025.1039019, 1755278025.555585],
-                        # 'avg_processing_time': 2.935534119606018,
-                        # 'max_processing_time': 4.271160840988159,
-                        # 'param_version_diversity': 1,
-                        # 'avg_sample_age': 1.0503787994384766,
-                        # 'assembly_time': 0.05373978614807129})
-                        rollout_param_versions = batch.meta_info.get("rollout_param_versions", [])
-                        if rollout_param_versions:
-                            # 统计陈旧样本
-                            stale_count = sum(1 for v in rollout_param_versions if self.current_param_version - v > 1)
-                            self.stale_samples_processed += stale_count
-
-                        # 添加新鲜度指标到metrics
-                        if rollout_param_versions:
-                            param_version_diversity = batch.meta_info.get("param_version_diversity", 0)
-                            avg_sample_age = batch.meta_info.get("avg_sample_age", 0)
-
-                            metrics.update(
-                                {
-                                    "freshness/param_version_diversity": param_version_diversity,
-                                    "freshness/avg_sample_age": avg_sample_age,
-                                    "freshness/stale_samples_ratio": stale_count / len(rollout_param_versions)
-                                    if rollout_param_versions
-                                    else 0,
-                                    "statistics/processed_samples": self.processed_samples,
-                                    "statistics/stale_samples_processed": self.stale_samples_processed,
-                                    "statistics/current_param_version": self.current_param_version,
-                                }
-                            )
+                        # 统计陈旧样本
+                        rollout_param_versions = batch.meta_info["rollout_param_versions"]
+                        stale_count = sum(1 for v in rollout_param_versions if self.current_param_version - v > 1)
+                        self.stale_samples_processed += stale_count
+                        metrics.update(
+                            {
+                                "fully_async/stale_samples_ratio": stale_count / len(rollout_param_versions),
+                                "fully_async/stale_samples_processed": self.stale_samples_processed,
+                                "fully_async/current_param_version": self.current_param_version,
+                            }
+                        )
+                        for metric in [
+                            "avg_processing_time",
+                            "max_processing_time",
+                            "min_processing_time",
+                            "tp50_processing_time",
+                            "tp99_processing_time",
+                            "tp95_processing_time",
+                            "param_version_diversity",
+                        ]:
+                            metrics[f"fully_async/{metric}"] = batch.meta_info.get(metric, 0)
+
                 batch, reward_extra_infos_dict = self._process_batch_common(batch, metrics, timing_raw)
                 self._log_rollout(batch, reward_extra_infos_dict, timing_raw)
                 self._check_save_checkpoint(False, timing_raw)
 
             # self._collect_metrics(batch, epoch, metrics, timing_raw)
-
             pprint(metrics)
-
             # Trigger parameter synchronization after training step
             print(
                 f"[FullyAsyncTrainer] global_steps: {self.global_steps} "
@@ -330,65 +310,3 @@ def _trigger_parameter_sync_after_step(self):
         else:
             self.local_trigger_step += 1
             return
-
-    def get_statistics(self) -> dict:
-        """Get training statistics"""
-        queue_stats = self.message_queue_client.get_statistics_sync() if self.message_queue_client else {}
-        return {
-            "global_steps": self.global_steps,
-            "processed_samples": self.processed_samples,
-            "stale_samples_processed": self.stale_samples_processed,
-            "current_param_version": self.current_param_version,
-            "queue_size": queue_stats.get("queue_size", 0),
-            "queue_total_produced": queue_stats.get("total_produced", 0),
-            "queue_total_consumed": queue_stats.get("total_consumed", 0),
-            "queue_dropped_samples": queue_stats.get("dropped_samples", 0),
-        }
-
-    def _compute_sample_freshness_metrics(self, rollout_samples: list[RolloutSample]) -> dict:
-        """
-        Compute sample freshness metrics
-
-        Args:
-            rollout_samples: List of RolloutSample objects
-
-        Returns:
-            dict: Dictionary of freshness metrics
-        """
-        if not rollout_samples:
-            return {}
-
-        try:
-            # Extract parameter versions and timestamps directly from RolloutSample
-            sample_ages = []
-            sample_latencies = []
-            current_time = time.time()
-
-            for sample in rollout_samples:
-                # Get information directly from RolloutSample
-                rollout_version = sample.param_version
-                generation_time = sample.generation_timestamp
-
-                age = max(0, self.current_param_version - rollout_version)
-                latency = max(0, current_time - generation_time)
-
-                sample_ages.append(age)
-                sample_latencies.append(latency)
-
-            if not sample_ages:
-                return {}
-
-            return {
-                "freshness/avg_sample_age": np.mean(sample_ages),
-                "freshness/max_sample_age": max(sample_ages),
-                "freshness/min_sample_age": min(sample_ages),
-                "freshness/avg_sample_latency": np.mean(sample_latencies),
-                "freshness/max_sample_latency": max(sample_latencies),
-                "freshness/min_sample_latency": min(sample_latencies),
-                "freshness/stale_samples_ratio": sum(1 for age in sample_ages if age > 1) / len(sample_ages),
-                "freshness/sample_count": len(sample_ages),
-            }
-
-        except Exception as e:
-            logger.error(f"Error computing freshness metrics: {e}")
-            return {"freshness/error": str(e)}
diff --git a/recipe/fully_async_policy/message_queue.py b/recipe/fully_async_policy/message_queue.py
index 2a12cb21c90..012445d45ed 100644
--- a/recipe/fully_async_policy/message_queue.py
+++ b/recipe/fully_async_policy/message_queue.py
@@ -114,7 +114,7 @@ async def get_sample(self) -> Any | None:
             # Get one sample
             data = self.queue.popleft()
             self.total_consumed += 1
-            return data
+            return data, len(self.queue)
 
     async def update_param_version(self, version: int):
         """Update current parameter version"""

From 42789e85be3ab32dbbc52d4bbeb9cb3353e7f6bb Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Mon, 18 Aug 2025 20:51:23 +0800
Subject: [PATCH 064/182] fully async log

---
 .../fully_async_rollouter.py                  |  11 +-
 .../fully_async_policy/fully_async_trainer.py |   5 +-
 .../unittest/ray_async_resource_config.py     |   4 +-
 .../unittest/test_asyncio_message_queue.py    | 407 ------------------
 .../unittest/test_batch_utils.py              | 278 +-----------
 5 files changed, 12 insertions(+), 693 deletions(-)
 delete mode 100644 recipe/fully_async_policy/unittest/test_asyncio_message_queue.py

diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py
index cad1542afc4..f349b5e06ed 100644
--- a/recipe/fully_async_policy/fully_async_rollouter.py
+++ b/recipe/fully_async_policy/fully_async_rollouter.py
@@ -17,6 +17,7 @@
 
 import ray
 from omegaconf import OmegaConf
+from tqdm import tqdm
 
 from recipe.fully_async_policy.detach_utils import (
     RolloutSample,
@@ -298,7 +299,7 @@ async def _processor_worker(self):
 
             async with self.lock:
                 if await self._should_pause_generation():
-                    print(f"[FullyAsyncRollouter][Processor] 等待已提交的任务结束")
+                    print("[FullyAsyncRollouter][Processor] 等待已提交的任务结束")
                     if self.active_tasks:
                         await asyncio.gather(*self.active_tasks, return_exceptions=True)
                         self.active_tasks.clear()
@@ -309,7 +310,7 @@ async def _processor_worker(self):
             # 获取待处理的部分 RolloutSample
             async with self.lock:
                 if partial_rollout_sample == "DONE":
-                    print(f"[FullyAsyncRollouter][Processor] 收到结束信号，等待剩余任务完成...")
+                    print("[FullyAsyncRollouter][Processor] 收到结束信号，等待剩余任务完成...")
                     # 等待所有活动任务完成
                     if self.active_tasks:
                         await asyncio.gather(*self.active_tasks, return_exceptions=True)
@@ -524,7 +525,7 @@ async def _async_monitor_loop(self):
             # pause 和 resume 直接，不进行恢复操作
             if self.monitor_loop_trigger and not await self._should_pause_generation():
                 async with self.lock:
-                    print(f"[FullyAsyncRollouter][MonitorLoop] trigger resume")
+                    print("[FullyAsyncRollouter][MonitorLoop] trigger resume")
                     self.paused = False
                     self.condition.notify_all()
 
@@ -571,7 +572,7 @@ async def pause(self):
         async with self.lock:
             self.paused = True
             if self.active_tasks:
-                print(f"[FullyAsyncRollouter][Public][Pause]")
+                print("[FullyAsyncRollouter][Public][Pause]")
                 await asyncio.gather(*self.active_tasks, return_exceptions=True)
                 self.active_tasks.clear()
             print("[FullyAsyncRollouter][Public][Pause] All active tasks completed")
@@ -595,7 +596,7 @@ async def get_statistics(self) -> dict:
             "total_generated_samples": self.total_generated_samples,
             "staleness_samples": self.staleness_samples,
             "dropped_stale_samples": self.dropped_stale_samples,
-            "queue_max_size": self.max_queue_size,
+            "max_queue_size": self.max_queue_size,
             "queue_size": queue_stats["queue_size"],
             "max_concurrent_samples": self.max_concurrent_samples,
             "pending_queue_size": self.pending_queue.qsize(),
diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py
index ed18e209c96..b82b1c4d5d2 100644
--- a/recipe/fully_async_policy/fully_async_trainer.py
+++ b/recipe/fully_async_policy/fully_async_trainer.py
@@ -18,12 +18,10 @@
 from pprint import pprint
 from typing import Any
 
-import numpy as np
 import ray
 from omegaconf import OmegaConf
 
 from recipe.fully_async_policy.detach_utils import (
-    RolloutSample,
     assemble_batch_from_rollout_samples,
     calculate_one_step_size,
 )
@@ -160,7 +158,8 @@ def _get_samples_from_queue(self) -> tuple[None, None] | tuple[int, Any]:
 
             if len(queue_samples) % 10 == 0:
                 print(
-                    f"[FullyAsyncTrainer] Collected {len(queue_samples)}/{self.required_samples} samples. mq_len: {queue_len}"
+                    f"[FullyAsyncTrainer] Collected {len(queue_samples)}/{self.required_samples} samples. "
+                    f"mq_len: {queue_len}"
                 )
 
         consumer_end = time.time()
diff --git a/recipe/fully_async_policy/unittest/ray_async_resource_config.py b/recipe/fully_async_policy/unittest/ray_async_resource_config.py
index 40e85c9f1bd..930f8c5169f 100644
--- a/recipe/fully_async_policy/unittest/ray_async_resource_config.py
+++ b/recipe/fully_async_policy/unittest/ray_async_resource_config.py
@@ -349,10 +349,10 @@ async def main():
         # 压力测试
         await run_resource_stress_test()
 
-        print("\n✅ 所有测试完成!")
+        print("\n所有测试完成!")
 
     except Exception as e:
-        print(f"❌ 测试执行失败: {e}")
+        print(f"测试执行失败: {e}")
         import traceback
 
         traceback.print_exc()
diff --git a/recipe/fully_async_policy/unittest/test_asyncio_message_queue.py b/recipe/fully_async_policy/unittest/test_asyncio_message_queue.py
deleted file mode 100644
index 33e0d9db04d..00000000000
--- a/recipe/fully_async_policy/unittest/test_asyncio_message_queue.py
+++ /dev/null
@@ -1,407 +0,0 @@
-# Copyright 2025 Meituan Ltd. and/or its affiliates
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# 测试使用 asyncio 的 MessageQueue
-# 对比 @ray.remote(num_cpus, max_concurrency) 参数的实际效果
-
-import asyncio
-import random
-
-# 导入修改后的 MessageQueue
-import time
-from dataclasses import dataclass
-
-import ray
-from omegaconf import DictConfig
-
-from recipe.fully_async_policy.message_queue import MessageQueue, MessageQueueClient, QueueSample
-
-
-@dataclass
-class TestConfig:
-    """测试配置"""
-
-    async_training: dict
-
-
-def create_test_config() -> DictConfig:
-    """创建测试配置"""
-    from omegaconf import OmegaConf
-
-    config_dict = {"async_training": {"staleness_threshold": 3}}
-    return OmegaConf.create(config_dict)
-
-
-class AsyncMessageQueueTester:
-    """异步消息队列测试器"""
-
-    def __init__(self):
-        self.config = create_test_config()
-
-    async def test_basic_async_operations(self):
-        """测试基本异步操作"""
-        print("\n🧪 测试基本异步操作")
-        print("=" * 50)
-
-        # 创建MessageQueue Actor
-        queue_actor = MessageQueue.remote(self.config, max_queue_size=100)
-        client = MessageQueueClient(queue_actor)
-
-        # 测试异步放入样本
-        test_samples = [
-            QueueSample(
-                data={"task_id": f"task_{i}", "content": f"测试数据_{i}"},
-                rollout_metadata={"timestamp": time.time(), "version": 1},
-            )
-            for i in range(10)
-        ]
-
-        # 异步并发放入样本
-        put_tasks = []
-        for i, sample in enumerate(test_samples):
-            task = asyncio.create_task(client.put_sample(sample, param_version=1), name=f"put_task_{i}")
-            put_tasks.append(task)
-
-        # 等待所有放入任务完成
-        put_results = await asyncio.gather(*put_tasks)
-        successful_puts = sum(put_results)
-
-        print(f"✅ 成功放入 {successful_puts}/{len(test_samples)} 个样本")
-
-        # 异步获取统计信息
-        stats = await client.get_statistics()
-        print(f"📊 队列统计: {stats}")
-
-        # 异步获取样本
-        samples_batch, queue_size = await client.get_samples(min_batch_count=5)
-        print(f"📦 获取了 {len(samples_batch)} 个样本，剩余队列大小: {queue_size}")
-
-        # 清理
-        await client.shutdown()
-
-        return successful_puts
-
-    async def test_concurrent_producers_consumers(self):
-        """测试并发生产者和消费者"""
-        print("\n🏭 测试并发生产者和消费者")
-        print("=" * 50)
-
-        # 创建 MessageQueue Actor
-        queue_actor = MessageQueue.remote(self.config, max_queue_size=200)
-        client = MessageQueueClient(queue_actor)
-
-        # 生产者协程
-        async def producer(producer_id: int, sample_count: int):
-            """生产者协程"""
-            produced = 0
-            for i in range(sample_count):
-                sample = QueueSample(
-                    data={
-                        "producer_id": producer_id,
-                        "task_id": f"producer_{producer_id}_task_{i}",
-                        "content": f"来自生产者{producer_id}的数据{i}",
-                    },
-                    rollout_metadata={"producer_timestamp": time.time(), "producer_id": producer_id},
-                )
-
-                success = await client.put_sample(sample, param_version=1)
-                if success:
-                    produced += 1
-
-                # 模拟生产间隔
-                await asyncio.sleep(random.uniform(0.01, 0.1))
-
-            print(f"🏭 生产者{producer_id} 完成，成功生产 {produced} 个样本")
-            return produced
-
-        # 消费者协程
-        async def consumer(consumer_id: int, target_count: int):
-            """消费者协程"""
-            consumed = 0
-            start_time = time.time()
-
-            while consumed < target_count:
-                try:
-                    # 尝试获取样本，设置超时
-                    sample = await asyncio.wait_for(client.get_sample(), timeout=2.0)
-
-                    if sample is not None:
-                        consumed += 1
-
-                        if consumed % 10 == 0:
-                            print(f"🍽️  消费者{consumer_id} 已消费 {consumed} 个样本")
-                    else:
-                        print(f"⚠️ 消费者{consumer_id} 收到空样本，队列可能已关闭")
-                        break
-
-                except asyncio.TimeoutError:
-                    print(f"⏰ 消费者{consumer_id} 超时，检查队列状态...")
-                    stats = await client.get_statistics()
-                    if stats["queue_size"] == 0:
-                        print(f"📭 队列为空，消费者{consumer_id} 等待...")
-                        await asyncio.sleep(0.5)
-                    continue
-
-                # 模拟处理时间
-                await asyncio.sleep(random.uniform(0.02, 0.05))
-
-            elapsed = time.time() - start_time
-            print(f"🍽️  消费者{consumer_id} 完成，消费了 {consumed} 个样本，耗时 {elapsed:.2f}s")
-            return consumed
-
-        # 启动并发生产者和消费者
-        num_producers = 3
-        num_consumers = 2
-        samples_per_producer = 20
-
-        # 创建生产者任务
-        producer_tasks = [
-            asyncio.create_task(producer(i, samples_per_producer), name=f"producer_{i}") for i in range(num_producers)
-        ]
-
-        # 创建消费者任务
-        total_expected_samples = num_producers * samples_per_producer
-        samples_per_consumer = total_expected_samples // num_consumers
-
-        consumer_tasks = [
-            asyncio.create_task(
-                consumer(i, samples_per_consumer + (5 if i == 0 else 0)),  # 第一个消费者多处理一些
-                name=f"consumer_{i}",
-            )
-            for i in range(num_consumers)
-        ]
-
-        # 等待所有任务完成
-        start_time = time.time()
-
-        producer_results = await asyncio.gather(*producer_tasks, return_exceptions=True)
-        consumer_results = await asyncio.gather(*consumer_tasks, return_exceptions=True)
-
-        end_time = time.time()
-
-        # 统计结果
-        total_produced = sum(r for r in producer_results if isinstance(r, int))
-        total_consumed = sum(r for r in consumer_results if isinstance(r, int))
-
-        print("\n📈 并发测试结果:")
-        print(f"   总生产样本: {total_produced}")
-        print(f"   总消费样本: {total_consumed}")
-        print(f"   总耗时: {end_time - start_time:.2f}s")
-        print(f"   生产效率: {total_produced / (end_time - start_time):.2f} samples/s")
-        print(f"   消费效率: {total_consumed / (end_time - start_time):.2f} samples/s")
-
-        # 最终统计
-        final_stats = await client.get_statistics()
-        print(f"📊 最终队列统计: {final_stats}")
-
-        # 清理
-        await client.shutdown()
-
-        return total_produced, total_consumed
-
-    async def compare_resource_configurations(self):
-        """对比不同资源配置的效果"""
-        print("\n⚡ 对比不同资源配置的效果")
-        print("=" * 50)
-
-        # 测试配置列表
-        configs = [
-            {"name": "默认配置", "num_cpus": None, "max_concurrency": None, "decorator": ray.remote},
-            {
-                "name": "高CPU低并发",
-                "num_cpus": 4,
-                "max_concurrency": 5,
-                "decorator": lambda: ray.remote(num_cpus=4, max_concurrency=5),
-            },
-            {
-                "name": "低CPU高并发",
-                "num_cpus": 1,
-                "max_concurrency": 20,
-                "decorator": lambda: ray.remote(num_cpus=1, max_concurrency=20),
-            },
-            {
-                "name": "平衡配置",
-                "num_cpus": 2,
-                "max_concurrency": 10,
-                "decorator": lambda: ray.remote(num_cpus=2, max_concurrency=10),
-            },
-        ]
-
-        results = {}
-
-        for config in configs:
-            print(f"\n🧪 测试配置: {config['name']}")
-            print(f"   num_cpus: {config['num_cpus']}")
-            print(f"   max_concurrency: {config['max_concurrency']}")
-
-            # 动态创建MessageQueue类
-            if config["num_cpus"] is None:
-                QueueClass = MessageQueue
-            else:
-                QueueClass = config["decorator"]()(MessageQueue)
-
-            # 创建queue实例
-            queue_actor = QueueClass.remote(self.config, max_queue_size=100)
-            client = MessageQueueClient(queue_actor)
-
-            # 执行性能测试
-            start_time = time.time()
-
-            # 并发放入大量样本
-            sample_count = 50
-            put_tasks = []
-
-            for i in range(sample_count):
-                sample = QueueSample(
-                    data={
-                        "task_id": f"perf_test_{i}",
-                        "config": config["name"],
-                        "data_size": random.randint(100, 1000),
-                    },
-                    rollout_metadata={"config_test": True},
-                )
-
-                task = asyncio.create_task(client.put_sample(sample, param_version=1))
-                put_tasks.append(task)
-
-                # 模拟流式到达
-                if i % 10 == 0:
-                    await asyncio.sleep(0.01)
-
-            # 等待所有put完成
-            put_results = await asyncio.gather(*put_tasks)
-            put_time = time.time() - start_time
-
-            # 获取所有样本
-            get_start_time = time.time()
-            all_samples = []
-
-            while True:
-                samples_batch, queue_size = await client.get_samples(min_batch_count=1)
-                if not samples_batch:
-                    break
-                all_samples.extend(samples_batch)
-
-                if queue_size == 0:
-                    break
-
-            get_time = time.time() - get_start_time
-            total_time = time.time() - start_time
-
-            successful_puts = sum(put_results)
-
-            # 记录结果
-            results[config["name"]] = {
-                "successful_puts": successful_puts,
-                "retrieved_samples": len(all_samples),
-                "put_time": put_time,
-                "get_time": get_time,
-                "total_time": total_time,
-                "put_throughput": successful_puts / put_time if put_time > 0 else 0,
-                "get_throughput": len(all_samples) / get_time if get_time > 0 else 0,
-                "total_throughput": (successful_puts + len(all_samples)) / total_time if total_time > 0 else 0,
-            }
-
-            print(f"   ✅ 放入: {successful_puts}/{sample_count}")
-            print(f"   📦 获取: {len(all_samples)}")
-            print(f"   ⏱️  放入耗时: {put_time:.3f}s")
-            print(f"   ⏱️  获取耗时: {get_time:.3f}s")
-            print(f"   🚀 放入吞吐量: {successful_puts / put_time:.2f} ops/s")
-
-            # 清理
-            await client.shutdown()
-
-            # 间隔
-            await asyncio.sleep(1)
-
-        # 生成对比报告
-        print("\n📊 资源配置对比报告")
-        print("=" * 80)
-        print(f"{'配置名称':<15} {'放入吞吐量':<12} {'获取吞吐量':<12} {'总吞吐量':<12} {'总耗时':<10}")
-        print("-" * 80)
-
-        best_config = ""
-        best_throughput = 0
-
-        for config_name, result in results.items():
-            put_throughput = result["put_throughput"]
-            get_throughput = result["get_throughput"]
-            total_throughput = result["total_throughput"]
-            total_time = result["total_time"]
-
-            print(
-                f"{config_name:<15} {put_throughput:<12.2f} {get_throughput:<12.2f} "
-                f"{total_throughput:<12.2f} {total_time:<10.3f}s"
-            )
-
-            if total_throughput > best_throughput:
-                best_throughput = total_throughput
-                best_config = config_name
-
-        print(f"\n🏆 最佳配置: {best_config} (总吞吐量: {best_throughput:.2f} ops/s)")
-
-        return results
-
-
-async def main():
-    """主函数"""
-    # 初始化Ray
-    if not ray.is_initialized():
-        ray.init(
-            num_cpus=8,
-            object_store_memory=1000000000,  # 1GB
-            ignore_reinit_error=True,
-        )
-
-    print("🎯 异步MessageQueue测试")
-    print(f"Ray集群资源: {ray.cluster_resources()}")
-
-    tester = AsyncMessageQueueTester()
-
-    try:
-        # 基本异步操作测试
-        await tester.test_basic_async_operations()
-
-        # 并发生产者消费者测试
-        await tester.test_concurrent_producers_consumers()
-
-        # 资源配置对比测试
-        await tester.compare_resource_configurations()
-
-        print("\n✅ 所有测试完成!")
-
-        # 总结
-        print("\n📋 总结:")
-        print("1. 使用 asyncio 后的优势:")
-        print("   - 真正的异步等待，不阻塞事件循环")
-        print("   - 更好的并发性能")
-        print("   - 与Ray的异步接口完美集成")
-
-        print("\n2. 资源配置建议:")
-        print("   - num_cpus: 控制CPU资源分配，影响计算密集型任务")
-        print("   - max_concurrency: 控制并发数，影响I/O密集型任务")
-        print("   - 对于MessageQueue: 推荐 num_cpus=2, max_concurrency=20")
-
-    except Exception as e:
-        print(f"❌ 测试失败: {e}")
-        import traceback
-
-        traceback.print_exc()
-
-    finally:
-        ray.shutdown()
-
-
-if __name__ == "__main__":
-    asyncio.run(main())
diff --git a/recipe/fully_async_policy/unittest/test_batch_utils.py b/recipe/fully_async_policy/unittest/test_batch_utils.py
index ddde3a4ad92..b9351c46c28 100644
--- a/recipe/fully_async_policy/unittest/test_batch_utils.py
+++ b/recipe/fully_async_policy/unittest/test_batch_utils.py
@@ -98,282 +98,8 @@ def create_mock_rollout_sample(self, sample_id: str, param_version: int = 1) ->
         """创建测试用的 RolloutSample"""
         # 创建 mock AgentLoopOutput
         agent_loop_output = MockAgentLoopOutput(
-            prompt_ids=[
-                151644,
-                8948,
-                198,
-                2610,
-                525,
-                1207,
-                16948,
-                11,
-                3465,
-                553,
-                54364,
-                14817,
-                13,
-                1446,
-                525,
-                264,
-                10950,
-                17847,
-                13,
-                151645,
-                198,
-                151644,
-                872,
-                198,
-                24732,
-                21189,
-                264,
-                400,
-                16,
-                17,
-                40358,
-                817,
-                2254,
-                13,
-                758,
-                279,
-                1156,
-                2003,
-                11,
-                566,
-                37102,
-                264,
-                4843,
-                315,
-                432,
-                26,
-                304,
-                279,
-                2086,
-                2003,
-                11,
-                566,
-                37102,
-                264,
-                8338,
-                315,
-                1128,
-                566,
-                702,
-                2115,
-                13,
-                2585,
-                1753,
-                3220,
-                1558,
-                566,
-                614,
-                2115,
-                311,
-                6248,
-                279,
-                2254,
-                30,
-                6771,
-                594,
-                1744,
-                3019,
-                553,
-                3019,
-                323,
-                2550,
-                279,
-                1590,
-                4226,
-                1283,
-                330,
-                820,
-                3263,
-                151645,
-                198,
-                151644,
-                77091,
-                198,
-            ],
-            response_ids=[
-                14374,
-                14822,
-                14319,
-                12,
-                8304,
-                74216,
-                510,
-                16,
-                13,
-                4127,
-                40358,
-                25,
-                400,
-                16,
-                17,
-                198,
-                17,
-                13,
-                5512,
-                2003,
-                18024,
-                510,
-                262,
-                481,
-                8364,
-                37102,
-                264,
-                4843,
-                315,
-                279,
-                400,
-                16,
-                17,
-                624,
-                262,
-                481,
-                25783,
-                7391,
-                284,
-                57960,
-                37018,
-                90,
-                16,
-                15170,
-                18,
-                92,
-                1124,
-                15136,
-                32882,
-                16,
-                17,
-                284,
-                32882,
-                19,
-                66426,
-                18,
-                13,
-                10657,
-                3311,
-                1283,
-                1156,
-                2003,
-                25,
-                400,
-                16,
-                17,
-                481,
-                32882,
-                19,
-                284,
-                32882,
-                23,
-                66426,
-                19,
-                13,
-                10440,
-                2003,
-                18024,
-                510,
-                262,
-                481,
-                8364,
-                37102,
-                264,
-                8338,
-                315,
-                279,
-                9664,
-                3311,
-                1283,
-                279,
-                1156,
-                2003,
-                624,
-                262,
-                481,
-                11487,
-                2115,
-                284,
-                400,
-                23,
-                481,
-                400,
-                19,
-                284,
-                400,
-                19,
-                198,
-                262,
-                481,
-                25783,
-                7391,
-                2049,
-                57960,
-                37018,
-                90,
-                16,
-                15170,
-                19,
-                92,
-                1124,
-                15136,
-                32882,
-                19,
-                284,
-                32882,
-                16,
-                66426,
-                20,
-                13,
-                13023,
-                3311,
-                2115,
-                510,
-                262,
-                481,
-                8364,
-                702,
-                3322,
-                369,
-                264,
-                2480,
-                2003,
-                311,
-                6248,
-                279,
-                2254,
-                2041,
-                32821,
-                894,
-                803,
-                40358,
-                382,
-                43434,
-                510,
-                24732,
-                702,
-                3070,
-                65039,
-                23,
-                334,
-                2115,
-                13,
-                1260,
-                686,
-                614,
-                3322,
-                3220,
-                311,
-                6248,
-                279,
-                2254,
-                2041,
-                32821,
-                894,
-                803,
-                40358,
-                13,
-                151645,
-            ],
+            prompt_ids=torch.randint(0, 32000, (175,)).tolist(),
+            response_ids=torch.randint(0, 32000, (175,)).tolist(),
             response_mask=[1] * 175,  # 真实的response长度
             num_turns=2,
             metrics=MockAgentLoopMetrics(generate_sequences=1.6468379497528076, tool_calls=0.0),

From a1c0f5c6edfb2d3f8a938fcc43c25e921ab7377c Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Mon, 18 Aug 2025 20:52:18 +0800
Subject: [PATCH 065/182] ruff format

---
 .../rollout/vllm_rollout/vllm_rollout_spmd.py | 22 +++++++++----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/verl/workers/rollout/vllm_rollout/vllm_rollout_spmd.py b/verl/workers/rollout/vllm_rollout/vllm_rollout_spmd.py
index 0d419dcf177..071dd917119 100644
--- a/verl/workers/rollout/vllm_rollout/vllm_rollout_spmd.py
+++ b/verl/workers/rollout/vllm_rollout/vllm_rollout_spmd.py
@@ -109,11 +109,11 @@ def __init__(self, model_path: str, config: DictConfig, tokenizer, model_hf_conf
             if hasattr(model_hf_config, "max_position_embeddings"):
                 max_position_embeddings = model_hf_config.max_position_embeddings
             elif hasattr(model_hf_config, "llm_config") and hasattr(
-                    model_hf_config.llm_config, "max_position_embeddings"
+                model_hf_config.llm_config, "max_position_embeddings"
             ):
                 max_position_embeddings = model_hf_config.llm_config.max_position_embeddings
             elif hasattr(model_hf_config, "text_config") and hasattr(
-                    model_hf_config.text_config, "max_position_embeddings"
+                model_hf_config.text_config, "max_position_embeddings"
             ):
                 max_position_embeddings = model_hf_config.text_config.max_position_embeddings
             if max_position_embeddings is None:
@@ -128,12 +128,12 @@ def __init__(self, model_path: str, config: DictConfig, tokenizer, model_hf_conf
             rope_scaling_factor = rope_scaling_config.get("factor", 1.0)
 
             assert (
-                    model_hf_config.max_position_embeddings * rope_scaling_factor
-                    >= config.prompt_length + config.response_length
+                model_hf_config.max_position_embeddings * rope_scaling_factor
+                >= config.prompt_length + config.response_length
             ), (
-                    "model context length should be greater than total sequence length, "
-                    + f"got rope_scaling_factor={rope_scaling_factor} and "
-                    + f"max_position_embeddings={model_hf_config.max_position_embeddings}"
+                "model context length should be greater than total sequence length, "
+                + f"got rope_scaling_factor={rope_scaling_factor} and "
+                + f"max_position_embeddings={model_hf_config.max_position_embeddings}"
             )
 
         max_model_len = int(config.max_model_len or config.prompt_length + config.response_length)
@@ -268,7 +268,7 @@ def generate_sequences(self, prompts: DataProto, **kwargs) -> DataProto:
         if "multi_modal_data" in non_tensor_batch:
             vllm_inputs = []
             for raw_prompt_ids, multi_modal_data in zip(
-                    non_tensor_batch.pop("raw_prompt_ids"), non_tensor_batch.pop("multi_modal_data"), strict=True
+                non_tensor_batch.pop("raw_prompt_ids"), non_tensor_batch.pop("multi_modal_data"), strict=True
             ):
                 vllm_inputs.append({"prompt_token_ids": raw_prompt_ids, "multi_modal_data": multi_modal_data})
         else:
@@ -390,9 +390,9 @@ def _monkey_patch_compute_logits(model, vocab_size: int):
     original_compute_logits = model.compute_logits
 
     def compute_logits(
-            self,
-            hidden_states: torch.Tensor,
-            sampling_metadata: SamplingMetadata,
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
     ) -> torch.Tensor:
         logits = original_compute_logits(hidden_states, sampling_metadata)
         logits[..., vocab_size:] = float("-inf")

From 26b55d96625e7e9c9f58f86618182b81ab2e5e4a Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Tue, 19 Aug 2025 10:33:20 +0800
Subject: [PATCH 066/182] update log

---
 .../config/fully_async_ppo_trainer.yaml       |  2 +-
 .../fully_async_rollouter.py                  | 42 ++++++++++---------
 2 files changed, 24 insertions(+), 20 deletions(-)

diff --git a/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml b/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml
index 9d0a8c67383..f1c4a1c602f 100644
--- a/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml
+++ b/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml
@@ -11,7 +11,7 @@ defaults:
 async_training:
   # 新鲜度控制 (Freshness Control)
   staleness_threshold: 3              # 样本新鲜度阈值
-  trigger_parameter_sync_step: 10      # >=1 train 每次训练一个batch, 迭代多少次后触发更新
+  trigger_parameter_sync_step: 32      # >=1 train 每次训练一个batch, 迭代多少次后触发更新
 
 # Rollout配置
 rollout:
diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py
index f349b5e06ed..092ff2add17 100644
--- a/recipe/fully_async_policy/fully_async_rollouter.py
+++ b/recipe/fully_async_policy/fully_async_rollouter.py
@@ -39,16 +39,16 @@ class FullyAsyncRollouter(RayPPOTrainer):
     """
 
     def __init__(
-        self,
-        config,
-        tokenizer,
-        role_worker_mapping: dict[Role, WorkerType],
-        resource_pool_manager: ResourcePoolManager,
-        ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup,
-        processor=None,
-        reward_fn=None,
-        val_reward_fn=None,
-        device_name=None,
+            self,
+            config,
+            tokenizer,
+            role_worker_mapping: dict[Role, WorkerType],
+            resource_pool_manager: ResourcePoolManager,
+            ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup,
+            processor=None,
+            reward_fn=None,
+            val_reward_fn=None,
+            device_name=None,
     ):
         # Store the tokenizer for text processing
         self.tokenizer = tokenizer
@@ -136,7 +136,8 @@ def __init__(
             self.minimal_bsz, config.actor_rollout_ref.actor.ppo_mini_batch_size
         )
         self.max_required_samples = (
-            self.required_samples * (self.staleness_threshold + 1) * config.async_training.trigger_parameter_sync_step
+                self.required_samples * (
+                self.staleness_threshold + 1) * config.async_training.trigger_parameter_sync_step
         )
         print(
             f"[FullyAsyncRollouter] required_samples : {self.required_samples} "
@@ -242,7 +243,8 @@ async def _feed_samples(self):
         sample_count = 0
         should_stop = False
 
-        progress_bar = tqdm(total=self.total_rollout_steps, initial=self.global_steps, desc="Training Progress")
+        progress_bar = tqdm(total=self.total_rollout_steps / self.required_samples, initial=self.global_steps,
+                            desc="Training Progress")
 
         for epoch, batch_dict in continuous_iterator:
             if should_stop:  # 检查停止标志
@@ -280,7 +282,8 @@ async def _feed_samples(self):
                     should_stop = True  # 设置停止标志
                     break
 
-                progress_bar.update(1)
+                if self.global_steps % self.required_samples == 0:
+                    progress_bar.update(1)
                 self.global_steps += 1
 
             sample_count += 1
@@ -363,7 +366,7 @@ async def _process_single_sample_streaming(self, partial_rollout_sample):
         if processing_time > self.max_processing_time:
             self.max_processing_time = processing_time
 
-        print(f"[FullyAsyncRollouter] rollout {partial_rollout_sample.sample_id} cost {processing_time:.2f}s")
+        # print(f"[FullyAsyncRollouter] rollout {partial_rollout_sample.sample_id} cost {processing_time:.2f}s")
 
     async def _consumer_worker(self):
         """消费者协程，负责从结果队列获取处理结果并放入消息队列"""
@@ -523,11 +526,12 @@ async def _async_monitor_loop(self):
                 last_stats_time = current_time
 
             # pause 和 resume 直接，不进行恢复操作
-            if self.monitor_loop_trigger and not await self._should_pause_generation():
-                async with self.lock:
-                    print("[FullyAsyncRollouter][MonitorLoop] trigger resume")
-                    self.paused = False
-                    self.condition.notify_all()
+            if self.monitor_loop_trigger and self.paused:
+                if await self._should_pause_generation():
+                    async with self.lock:
+                        print("[FullyAsyncRollouter][MonitorLoop] trigger resume")
+                        self.paused = False
+                        self.condition.notify_all()
 
     async def _should_pause_generation(self) -> bool:
         if self.paused:

From 749d4df3e33bf19070f255d206444f334c764b11 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Wed, 20 Aug 2025 15:55:48 +0800
Subject: [PATCH 067/182] partial rollout

---
 recipe/fully_async_policy/detach_utils.py     |  5 +-
 .../fully_async_rollouter.py                  | 74 ++++++++++---------
 verl/experimental/agent_loop/__init__.py      |  3 +-
 verl/experimental/agent_loop/agent_loop.py    | 38 ++++++++--
 .../partial_single_turn_agent_loop.py         | 68 +++++++++++++++++
 .../rollout/vllm_rollout/vllm_async_server.py | 52 ++++++++++++-
 6 files changed, 194 insertions(+), 46 deletions(-)
 create mode 100644 verl/experimental/agent_loop/partial_single_turn_agent_loop.py

diff --git a/recipe/fully_async_policy/detach_utils.py b/recipe/fully_async_policy/detach_utils.py
index 426a51ae35e..3ac998bc82a 100644
--- a/recipe/fully_async_policy/detach_utils.py
+++ b/recipe/fully_async_policy/detach_utils.py
@@ -46,7 +46,6 @@ class RolloutSample:
 
     # Processing metadata
     processing_time: float
-    generation_timestamp: float
     param_version: int
 
 
@@ -76,6 +75,10 @@ def prepare_single_generation_data(batch_dict, global_steps) -> DataProto:
         batch_keys=batch_keys_to_pop,
         non_tensor_batch_keys=non_tensor_batch_keys_to_pop,
     )
+
+    # 设置使用支持partial的agent
+    full_batch.non_tensor_batch["agent_name"] = np.array(["partial_single_turn_agent"] * len(full_batch), dtype=object)
+
     # 添加全局步数到生成数据
     full_batch.meta_info["global_steps"] = global_steps
 
diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py
index 092ff2add17..2500a215d9a 100644
--- a/recipe/fully_async_policy/fully_async_rollouter.py
+++ b/recipe/fully_async_policy/fully_async_rollouter.py
@@ -39,16 +39,16 @@ class FullyAsyncRollouter(RayPPOTrainer):
     """
 
     def __init__(
-            self,
-            config,
-            tokenizer,
-            role_worker_mapping: dict[Role, WorkerType],
-            resource_pool_manager: ResourcePoolManager,
-            ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup,
-            processor=None,
-            reward_fn=None,
-            val_reward_fn=None,
-            device_name=None,
+        self,
+        config,
+        tokenizer,
+        role_worker_mapping: dict[Role, WorkerType],
+        resource_pool_manager: ResourcePoolManager,
+        ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup,
+        processor=None,
+        reward_fn=None,
+        val_reward_fn=None,
+        device_name=None,
     ):
         # Store the tokenizer for text processing
         self.tokenizer = tokenizer
@@ -136,8 +136,7 @@ def __init__(
             self.minimal_bsz, config.actor_rollout_ref.actor.ppo_mini_batch_size
         )
         self.max_required_samples = (
-                self.required_samples * (
-                self.staleness_threshold + 1) * config.async_training.trigger_parameter_sync_step
+            self.required_samples * (self.staleness_threshold + 1) * config.async_training.trigger_parameter_sync_step
         )
         print(
             f"[FullyAsyncRollouter] required_samples : {self.required_samples} "
@@ -161,6 +160,7 @@ def __init__(
         self.pending_queue = asyncio.Queue(maxsize=100)
         self.active_tasks = set()
         self.result_queue = asyncio.Queue()
+        self.cancel_queue = asyncio.Queue()
 
         # 通过 pause 和 resume 控制 monitor_loop 中，是否进行 尝试恢复 操作
         self.monitor_loop_trigger = True
@@ -243,8 +243,9 @@ async def _feed_samples(self):
         sample_count = 0
         should_stop = False
 
-        progress_bar = tqdm(total=self.total_rollout_steps / self.required_samples, initial=self.global_steps,
-                            desc="Training Progress")
+        progress_bar = tqdm(
+            total=self.total_rollout_steps / self.required_samples, initial=self.global_steps, desc="Training Progress"
+        )
 
         for epoch, batch_dict in continuous_iterator:
             if should_stop:  # 检查停止标志
@@ -257,8 +258,7 @@ async def _feed_samples(self):
             for rollout_n_index in range(self.config.actor_rollout_ref.rollout.n):
                 sample_id = f"sample_{epoch}_{sample_count}_{rollout_n_index}"
 
-                # 创建部分 RolloutSample，不包含 _gen_data（因为它不在数据类定义中）
-                partial_rollout_sample = RolloutSample(
+                rollout_sample = RolloutSample(
                     full_batch=full_batch,
                     agent_loop_output=None,  # 待处理后填充
                     sample_id=sample_id,
@@ -266,11 +266,10 @@ async def _feed_samples(self):
                     rollout_n_index=rollout_n_index,
                     original_sample_index=sample_count,
                     processing_time=0.0,  # 待处理后填充
-                    generation_timestamp=0.0,  # 待处理后填充
                     param_version=0,  # 待处理后填充
                 )
 
-                await self.pending_queue.put(partial_rollout_sample)
+                await self.pending_queue.put(rollout_sample)
 
                 # 检查是否到达最后一步
                 if self.global_steps >= self.total_rollout_steps:
@@ -297,7 +296,11 @@ async def _processor_worker(self):
         """流式处理工作协程 - 逐个样本立即提交处理，不等待批次"""
 
         while True:
-            partial_rollout_sample = await self.pending_queue.get()
+            if not self.cancel_queue.empty():
+                print(f"self.cancel_queue {self.cancel_queue.qsize()}")
+                rollout_sample = await self.cancel_queue.get()
+            else:
+                rollout_sample = await self.pending_queue.get()
             self.staleness_samples += 1
 
             async with self.lock:
@@ -312,7 +315,7 @@ async def _processor_worker(self):
 
             # 获取待处理的部分 RolloutSample
             async with self.lock:
-                if partial_rollout_sample == "DONE":
+                if rollout_sample == "DONE":
                     print("[FullyAsyncRollouter][Processor] 收到结束信号，等待剩余任务完成...")
                     # 等待所有活动任务完成
                     if self.active_tasks:
@@ -337,37 +340,40 @@ async def _processor_worker(self):
                 while self.paused:
                     await self.condition.wait()
                 task = asyncio.create_task(
-                    self._process_single_sample_streaming(partial_rollout_sample),
-                    name=f"process_{partial_rollout_sample.sample_id}",
+                    self._process_single_sample_streaming(rollout_sample),
+                    name=rollout_sample.sample_id,
                 )
                 self.active_tasks.add(task)
 
             # 标记队列任务完成
             self.pending_queue.task_done()
 
-    async def _process_single_sample_streaming(self, partial_rollout_sample):
+    async def _process_single_sample_streaming(self, rollout_sample: RolloutSample):
         """流式处理单个样本"""
 
         # 调用异步生成方法
         agent_loop_output, processing_time = await self.async_rollout_manager.generate_single_sample_async(
-            partial_rollout_sample.full_batch, partial_rollout_sample.sample_id
+            rollout_sample.full_batch, rollout_sample.agent_loop_output
         )
         # 直接更新 RolloutSample 对象，填充剩余字段
-        partial_rollout_sample.agent_loop_output = agent_loop_output
-        partial_rollout_sample.processing_time = processing_time
-        partial_rollout_sample.generation_timestamp = time.time()
-        partial_rollout_sample.param_version = self.current_param_version
+        rollout_sample.agent_loop_output = agent_loop_output
+        rollout_sample.processing_time += processing_time
+        rollout_sample.param_version = self.current_param_version
 
-        # 直接放入结果队列
-        await self.result_queue.put(partial_rollout_sample)
+        print(f"[FullyAsyncRollouter] rollout {rollout_sample.sample_id} cost {processing_time:.2f}s")
+
+        if agent_loop_output.is_cancel:
+            # 放入 cancel 队列中，等待恢复生成
+            await self.cancel_queue.put(rollout_sample)
+        else:
+            # 否则放入结果队列
+            await self.result_queue.put(rollout_sample)
 
         self.processed_sample_count += 1
         # 更新最大处理时间统计
         if processing_time > self.max_processing_time:
             self.max_processing_time = processing_time
 
-        # print(f"[FullyAsyncRollouter] rollout {partial_rollout_sample.sample_id} cost {processing_time:.2f}s")
-
     async def _consumer_worker(self):
         """消费者协程，负责从结果队列获取处理结果并放入消息队列"""
         while True:
@@ -575,6 +581,8 @@ async def pause(self):
         print("[FullyAsyncRollouter][Public] pause")
         async with self.lock:
             self.paused = True
+            # 取消rollout所有任务
+            # await self.async_rollout_manager.cancel()
             if self.active_tasks:
                 print("[FullyAsyncRollouter][Public][Pause]")
                 await asyncio.gather(*self.active_tasks, return_exceptions=True)
@@ -586,7 +594,6 @@ async def resume(self):
         """resume rollout
         TODO integrated Partial Rollout
         """
-        print("[FullyAsyncRollouter][Public] resume")
         async with self.lock:
             self.paused = False
             self.condition.notify_all()
@@ -609,6 +616,7 @@ async def get_statistics(self) -> dict:
             "max_required_samples": self.max_required_samples,
             "required_samples": self.required_samples,
             "staleness_threshold": self.staleness_threshold,
+            "cancel_queue_size": self.cancel_queue.qsize(),
         }
 
         return stats
diff --git a/verl/experimental/agent_loop/__init__.py b/verl/experimental/agent_loop/__init__.py
index a39171db764..0d131dd1d3a 100644
--- a/verl/experimental/agent_loop/__init__.py
+++ b/verl/experimental/agent_loop/__init__.py
@@ -14,8 +14,9 @@
 
 from .agent_loop import AgentLoopBase, AgentLoopManager
 from .single_turn_agent_loop import SingleTurnAgentLoop
+from .partial_single_turn_agent_loop import PartialSingleTurnAgentLoop
 from .tool_agent_loop import ToolAgentLoop
 
-_ = [SingleTurnAgentLoop, ToolAgentLoop]
+_ = [SingleTurnAgentLoop, ToolAgentLoop, PartialSingleTurnAgentLoop]
 
 __all__ = ["AgentLoopBase", "AgentLoopManager"]
diff --git a/verl/experimental/agent_loop/agent_loop.py b/verl/experimental/agent_loop/agent_loop.py
index 29f2b30edb7..dcb7184df5d 100644
--- a/verl/experimental/agent_loop/agent_loop.py
+++ b/verl/experimental/agent_loop/agent_loop.py
@@ -18,7 +18,7 @@
 import random
 import time
 from abc import ABC, abstractmethod
-from typing import Any
+from typing import Any, Optional
 
 import hydra
 import numpy as np
@@ -104,6 +104,15 @@ async def generate(
         )
         return output
 
+    async def generate_for_partial(self, request_id, prompt_ids, sampling_params):
+        server = self._choose_server(request_id)
+        output = await server.generate_for_partial.remote(
+            request_id=request_id,
+            prompt_ids=prompt_ids,
+            sampling_params=sampling_params,
+        )
+        return output
+
 
 class AgentLoopMetrics(BaseModel):
     """Agent loop performance metrics."""
@@ -125,6 +134,8 @@ class AgentLoopOutput(BaseModel):
     """Number of chat turns, including user, assistant, tool."""
     metrics: AgentLoopMetrics
     """Auxiliary performance metrics"""
+    is_cancel: bool = False
+    """Indicates whether the request was interrupted"""
 
 
 # make hydra.utils.instantiate happy
@@ -169,12 +180,15 @@ def init_class(cls, config: DictConfig, tokenizer: AutoTokenizer, **kwargs):
         cls._class_initialized = True
 
     @abstractmethod
-    async def run(self, messages: list[dict[str, Any]], sampling_params: dict[str, Any]) -> AgentLoopOutput:
+    async def run(
+        self, messages: list[dict[str, Any]], sampling_params: dict[str, Any], partial_output: Optional[AgentLoopOutput]
+    ) -> AgentLoopOutput:
         """Run agent loop to interact with LLM server and environment.
 
         Args:
             messages (List[Dict[str, Any]]): Input messages.
             sampling_params (Dict[str, Any]): LLM sampling params.
+            partial_output: Optional[AgentLoopOutput]: already rollout result.
 
         Returns:
             AgentLoopOutput: Agent loop output.
@@ -368,11 +382,14 @@ async def generate_sequences(self, batch: DataProto) -> DataProto:
         output = postprocess_agent_loop_outputs(outputs, self.tokenizer, self.config)
         return output
 
-    async def generate_sequences_no_post(self, batch: DataProto) -> list[AgentLoopOutput]:
+    async def generate_sequences_no_post(
+        self, batch: DataProto, partial_output: Optional[AgentLoopOutput]
+    ) -> list[AgentLoopOutput]:
         """Generate sequences from agent loop.
 
         Args:
             batch (DataProto): Input batch.
+            partial_output: Optional[AgentLoopOutput]: already rollout result.
 
         Returns:
             list[AgentLoopOutput]: List of agent loop outputs, one per sample in the batch.
@@ -413,7 +430,9 @@ async def generate_sequences_no_post(self, batch: DataProto) -> list[AgentLoopOu
 
         for agent_name, messages, trajectory in zip(agent_names, raw_prompts, trajectory_info, strict=True):
             tasks.append(
-                asyncio.create_task(self._run_agent_loop(agent_name, messages.tolist(), sampling_params, trajectory))
+                asyncio.create_task(
+                    self._run_agent_loop(agent_name, messages.tolist(), sampling_params, trajectory, partial_output)
+                )
             )
         outputs = await asyncio.gather(*tasks)
 
@@ -425,6 +444,7 @@ async def _run_agent_loop(
         messages: list[dict[str, Any]],
         sampling_params: dict[str, Any],
         trajectory: dict[str, Any],
+        partial_output: Optional[AgentLoopOutput],
     ) -> AgentLoopOutput:
         with rollout_trace_attr(
             step=trajectory["step"],
@@ -444,7 +464,7 @@ async def _run_agent_loop(
                 server_manager=self.server_manager,
                 tokenizer=self.tokenizer,
             )
-            output = await agent_loop.run(messages, sampling_params)
+            output = await agent_loop.run(messages, sampling_params, partial_output)
             return output
 
 
@@ -582,13 +602,15 @@ def generate_sequences(self, prompts: DataProto) -> DataProto:
         output.meta_info = {"timing": timing}
         return output
 
-    async def generate_single_sample_async(self, sample: DataProto, sample_id: str) -> tuple[AgentLoopOutput, float]:
+    async def generate_single_sample_async(
+        self, sample: DataProto, partial_output: Optional[AgentLoopOutput]
+    ) -> tuple[AgentLoopOutput, float]:
         """
         异步处理单个样本 - 用于流式推理的核心方法
 
         Args:
             sample: 单个样本数据
-            sample_id: 样本ID
+            partial_output: Optional[AgentLoopOutput]: already rollout result.
 
         Returns:
             tuple[AgentLoopOutput, float]: 处理结果和处理时间
@@ -599,7 +621,7 @@ async def generate_single_sample_async(self, sample: DataProto, sample_id: str)
         worker = self._select_best_worker()
 
         # 异步处理单个样本 - 使用无后处理版本获取原始AgentLoopOutput
-        output_future = worker.generate_sequences_no_post.remote(sample)
+        output_future = worker.generate_sequences_no_post.remote(sample, partial_output)
         outputs = await asyncio.wrap_future(output_future.future())
 
         processing_time = time.time() - start_time
diff --git a/verl/experimental/agent_loop/partial_single_turn_agent_loop.py b/verl/experimental/agent_loop/partial_single_turn_agent_loop.py
new file mode 100644
index 00000000000..fd2a7292e67
--- /dev/null
+++ b/verl/experimental/agent_loop/partial_single_turn_agent_loop.py
@@ -0,0 +1,68 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+import os
+from typing import Any, Optional
+from uuid import uuid4
+
+from verl.experimental.agent_loop.agent_loop import AgentLoopBase, AgentLoopOutput, register
+from verl.utils.profiler import simple_timer
+
+logger = logging.getLogger(__file__)
+logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN"))
+
+
+@register("partial_single_turn_agent")
+class PartialSingleTurnAgentLoop(AgentLoopBase):
+    """Naive agent loop that only do single turn chat completion."""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.prompt_length = self.config.actor_rollout_ref.rollout.prompt_length
+        self.response_length = self.config.actor_rollout_ref.rollout.response_length
+
+    async def run(
+        self, messages: list[dict[str, Any]], sampling_params: dict[str, Any], output: Optional[AgentLoopOutput]
+    ) -> AgentLoopOutput:
+        if not output:
+            prompt_ids = await self.loop.run_in_executor(
+                None, lambda: self.tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=True)
+            )
+        else:
+            # 恢复暂停的样本，结果直接添加到 prompt_ids 后面
+            prompt_ids = output.prompt_ids + output.response_ids
+
+        metrics = {}
+        request_id = uuid4().hex
+        with simple_timer("generate_sequences", metrics):
+            response_ids, is_cancel = await self.server_manager.generate_for_partial(
+                request_id=request_id, prompt_ids=prompt_ids, sampling_params=sampling_params
+            )
+
+        if not output:
+            response_mask = [1] * len(response_ids)
+        # 暂停待恢复样本, 把输出结果加到 response_ids 后，并重置 response_mask
+        else:
+            prompt_ids = output.prompt_ids
+            response_ids = output.response_ids + response_ids
+            response_mask = [1] * len(response_ids)
+
+        return AgentLoopOutput(
+            prompt_ids=prompt_ids,
+            response_ids=response_ids[: self.response_length],
+            response_mask=response_mask[: self.response_length],
+            num_turns=2,
+            metrics=metrics,
+            is_cancel=is_cancel,
+        )
diff --git a/verl/workers/rollout/vllm_rollout/vllm_async_server.py b/verl/workers/rollout/vllm_rollout/vllm_async_server.py
index a5cc0b83e59..3c238912cca 100644
--- a/verl/workers/rollout/vllm_rollout/vllm_async_server.py
+++ b/verl/workers/rollout/vllm_rollout/vllm_async_server.py
@@ -11,17 +11,20 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import asyncio
 import logging
 import os
 import pickle
-from typing import Any, Callable, Optional
+from contextlib import ExitStack
+from typing import Any, Callable, Optional, Coroutine, Sequence
 
 import ray
 import zmq
-from omegaconf import DictConfig
+from omegaconf import DictConfig, ListConfig
 from starlette.requests import Request
 from starlette.responses import JSONResponse, StreamingResponse
-from vllm import SamplingParams
+from vllm import SamplingParams, RequestOutput
+from vllm.config import CompilationConfig, CompilationLevel
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.entrypoints.logger import RequestLogger
 from vllm.entrypoints.openai.protocol import ChatCompletionRequest, ChatCompletionResponse, ErrorResponse
@@ -204,6 +207,9 @@ def __init__(self, config: DictConfig, vllm_dp_size: int, vllm_dp_rank: int, wg_
         self.vllm_dp_rank = vllm_dp_rank
         self.wg_prefix = wg_prefix
         self.engine: AsyncLLM = None
+        # for cancel
+        self.cancel_event: dict[str, asyncio.Event] = {}
+        self.req_output: dict[str, Optional[RequestOutput]] = {}
 
     async def init_engine(self):
         """Init vLLM AsyncLLM engine."""
@@ -326,6 +332,46 @@ async def generate(self, prompt_ids: list[int], sampling_params: dict[str, Any],
 
         return final_res.outputs[0].token_ids
 
+    async def _generate_step(self, prompt_ids: list[int], sampling_params: dict[str, Any], request_id: str):
+        max_tokens = self.max_model_len - len(prompt_ids)
+        sampling_params = SamplingParams(max_tokens=max_tokens, **sampling_params)
+        prompt = TokensPrompt(prompt_token_ids=prompt_ids)
+        generator = self.engine.generate(prompt=prompt, sampling_params=sampling_params, request_id=request_id)
+
+        # Get final response
+        self.req_output[request_id]: Optional[RequestOutput] = None
+        async for output in generator:
+            self.req_output[request_id] = output
+        assert self.req_output[request_id] is not None
+
+    async def generate_for_partial(
+        self, prompt_ids: list[int], sampling_params: dict[str, Any], request_id: str
+    ) -> tuple[Sequence[int], bool] | tuple[str, bool]:
+        with ExitStack() as stack:
+            stack.callback(lambda: self.cancel_event.pop(request_id, None))
+            stack.callback(lambda: self.req_output.pop(request_id, None))
+
+            self.cancel_event[request_id] = asyncio.Event()
+            cancel_handle = asyncio.create_task(self.cancel_event[request_id].wait())
+            generation_handle = asyncio.create_task(self._generate_step(prompt_ids, sampling_params, request_id))
+
+            done, pend = await asyncio.wait([generation_handle, cancel_handle], return_when=asyncio.FIRST_COMPLETED)
+
+            for task in done:
+                await task
+
+            for task in pend:
+                task.cancel()
+
+            token_ids = self.req_output[request_id].outputs[0].token_ids
+            is_cancel = generation_handle not in done
+            return token_ids, is_cancel
+
+    async def cancel(self):
+        for request_id in self.cancel_event:
+            self.cancel_event[request_id].set()
+            print(f"[ExternalRayDistributedExecutor] cancel request_id {request_id}")
+
     async def wake_up(self):
         if self.config.rollout.free_cache_engine:
             await self.engine.wake_up()

From f5364bedbb918957dd601959038f235589569dcc Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Wed, 20 Aug 2025 16:38:25 +0800
Subject: [PATCH 068/182] partial rollout cancel

---
 .../config/fully_async_ppo_trainer.yaml       |  2 +-
 .../fully_async_rollouter.py                  | 10 ++----
 verl/experimental/agent_loop/agent_loop.py    |  5 +++
 .../rollout/vllm_rollout/vllm_async_server.py | 32 +++++++++++--------
 4 files changed, 27 insertions(+), 22 deletions(-)

diff --git a/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml b/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml
index f1c4a1c602f..e33ebdb4408 100644
--- a/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml
+++ b/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml
@@ -11,7 +11,7 @@ defaults:
 async_training:
   # 新鲜度控制 (Freshness Control)
   staleness_threshold: 3              # 样本新鲜度阈值
-  trigger_parameter_sync_step: 32      # >=1 train 每次训练一个batch, 迭代多少次后触发更新
+  trigger_parameter_sync_step: 1      # >=1 train 每次训练一个batch, 迭代多少次后触发更新
 
 # Rollout配置
 rollout:
diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py
index 2500a215d9a..ce616be95de 100644
--- a/recipe/fully_async_policy/fully_async_rollouter.py
+++ b/recipe/fully_async_policy/fully_async_rollouter.py
@@ -528,7 +528,7 @@ async def _async_monitor_loop(self):
             current_time = time.time()
             if current_time - last_stats_time >= stats_interval:
                 stats = await self.get_statistics()
-                print(f"[FullyAsyncRollouter][MonitorLoop] {stats}")
+                pprint(stats)
                 last_stats_time = current_time
 
             # pause 和 resume 直接，不进行恢复操作
@@ -578,22 +578,18 @@ async def pause(self):
         """pause rollout
         TODO integrated Partial Rollout
         """
-        print("[FullyAsyncRollouter][Public] pause")
+        print("[FullyAsyncRollouter][Public][Pause]")
         async with self.lock:
             self.paused = True
             # 取消rollout所有任务
-            # await self.async_rollout_manager.cancel()
+            self.async_rollout_manager.cancel()
             if self.active_tasks:
-                print("[FullyAsyncRollouter][Public][Pause]")
                 await asyncio.gather(*self.active_tasks, return_exceptions=True)
                 self.active_tasks.clear()
             print("[FullyAsyncRollouter][Public][Pause] All active tasks completed")
         self.monitor_loop_trigger = False
 
     async def resume(self):
-        """resume rollout
-        TODO integrated Partial Rollout
-        """
         async with self.lock:
             self.paused = False
             self.condition.notify_all()
diff --git a/verl/experimental/agent_loop/agent_loop.py b/verl/experimental/agent_loop/agent_loop.py
index dcb7184df5d..668a7cc0d62 100644
--- a/verl/experimental/agent_loop/agent_loop.py
+++ b/verl/experimental/agent_loop/agent_loop.py
@@ -673,3 +673,8 @@ def wake_up(self):
     def sleep(self):
         """Sleep all rollout server instances."""
         ray.get([server.sleep.remote() for server in self.async_llm_servers])
+
+    def cancel(self):
+        """Cancel all rollout tasks."""
+        ray.get([server.cancel.remote() for server in self.async_llm_servers])
+
diff --git a/verl/workers/rollout/vllm_rollout/vllm_async_server.py b/verl/workers/rollout/vllm_rollout/vllm_async_server.py
index 3c238912cca..2f62a9bd6b7 100644
--- a/verl/workers/rollout/vllm_rollout/vllm_async_server.py
+++ b/verl/workers/rollout/vllm_rollout/vllm_async_server.py
@@ -208,6 +208,8 @@ def __init__(self, config: DictConfig, vllm_dp_size: int, vllm_dp_rank: int, wg_
         self.wg_prefix = wg_prefix
         self.engine: AsyncLLM = None
         # for cancel
+
+        self.lock = asyncio.Lock()
         self.cancel_event: dict[str, asyncio.Event] = {}
         self.req_output: dict[str, Optional[RequestOutput]] = {}
 
@@ -347,30 +349,32 @@ async def _generate_step(self, prompt_ids: list[int], sampling_params: dict[str,
     async def generate_for_partial(
         self, prompt_ids: list[int], sampling_params: dict[str, Any], request_id: str
     ) -> tuple[Sequence[int], bool] | tuple[str, bool]:
-        with ExitStack() as stack:
-            stack.callback(lambda: self.cancel_event.pop(request_id, None))
-            stack.callback(lambda: self.req_output.pop(request_id, None))
-
+        # 设置中断标志
+        async with self.lock:
             self.cancel_event[request_id] = asyncio.Event()
             cancel_handle = asyncio.create_task(self.cancel_event[request_id].wait())
-            generation_handle = asyncio.create_task(self._generate_step(prompt_ids, sampling_params, request_id))
 
-            done, pend = await asyncio.wait([generation_handle, cancel_handle], return_when=asyncio.FIRST_COMPLETED)
+        generation_handle = asyncio.create_task(self._generate_step(prompt_ids, sampling_params, request_id))
+        done, pend = await asyncio.wait([generation_handle, cancel_handle], return_when=asyncio.FIRST_COMPLETED)
 
-            for task in done:
-                await task
+        for task in done:
+            await task
 
-            for task in pend:
-                task.cancel()
+        for task in pend:
+            task.cancel()
 
+        async with self.lock:
             token_ids = self.req_output[request_id].outputs[0].token_ids
             is_cancel = generation_handle not in done
-            return token_ids, is_cancel
+            self.cancel_event.pop(request_id, None)
+            self.req_output.pop(request_id, None)
+        return token_ids, is_cancel
 
     async def cancel(self):
-        for request_id in self.cancel_event:
-            self.cancel_event[request_id].set()
-            print(f"[ExternalRayDistributedExecutor] cancel request_id {request_id}")
+        async with self.lock:
+            for request_id in self.cancel_event:
+                self.cancel_event[request_id].set()
+                print(f"[ExternalRayDistributedExecutor] cancel request_id {request_id}")
 
     async def wake_up(self):
         if self.config.rollout.free_cache_engine:

From f547a22a65ee0b3ac00f51949b3145fcd50037f3 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Wed, 20 Aug 2025 17:24:28 +0800
Subject: [PATCH 069/182] partial rollout cancel debug

---
 .../config/fully_async_ppo_trainer.yaml               |  1 +
 recipe/fully_async_policy/fully_async_rollouter.py    | 11 ++++++++---
 verl/experimental/agent_loop/agent_loop.py            |  8 ++++----
 3 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml b/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml
index e33ebdb4408..30f5ec4bf87 100644
--- a/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml
+++ b/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml
@@ -12,6 +12,7 @@ async_training:
   # 新鲜度控制 (Freshness Control)
   staleness_threshold: 3              # 样本新鲜度阈值
   trigger_parameter_sync_step: 1      # >=1 train 每次训练一个batch, 迭代多少次后触发更新
+  partial_rollout: True              # 同步参数时，是否中断 rollout
 
 # Rollout配置
 rollout:
diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py
index ce616be95de..8a69536fd58 100644
--- a/recipe/fully_async_policy/fully_async_rollouter.py
+++ b/recipe/fully_async_policy/fully_async_rollouter.py
@@ -312,6 +312,7 @@ async def _processor_worker(self):
                     self.paused = True
                 while self.paused:
                     await self.condition.wait()
+                    print("等待已提交的任务结束 condition")
 
             # 获取待处理的部分 RolloutSample
             async with self.lock:
@@ -339,6 +340,7 @@ async def _processor_worker(self):
                 # pause结束后，获取到锁，还需要判断是否是暂停阶段，否则继续等待
                 while self.paused:
                     await self.condition.wait()
+                    print("立即提交单个样本处理 condition")
                 task = asyncio.create_task(
                     self._process_single_sample_streaming(rollout_sample),
                     name=rollout_sample.sample_id,
@@ -528,14 +530,15 @@ async def _async_monitor_loop(self):
             current_time = time.time()
             if current_time - last_stats_time >= stats_interval:
                 stats = await self.get_statistics()
+                print("[FullyAsyncRollouter][MonitorLoop][Statistics]")
                 pprint(stats)
                 last_stats_time = current_time
 
             # pause 和 resume 直接，不进行恢复操作
             if self.monitor_loop_trigger and self.paused:
-                if await self._should_pause_generation():
+                if not await self._should_pause_generation():
                     async with self.lock:
-                        print("[FullyAsyncRollouter][MonitorLoop] trigger resume")
+                        print("[FullyAsyncRollouter][MonitorLoop][Resume]")
                         self.paused = False
                         self.condition.notify_all()
 
@@ -582,7 +585,8 @@ async def pause(self):
         async with self.lock:
             self.paused = True
             # 取消rollout所有任务
-            self.async_rollout_manager.cancel()
+            if self.config.async_training.partial_rollout:
+                await self.async_rollout_manager.cancel_async()
             if self.active_tasks:
                 await asyncio.gather(*self.active_tasks, return_exceptions=True)
                 self.active_tasks.clear()
@@ -590,6 +594,7 @@ async def pause(self):
         self.monitor_loop_trigger = False
 
     async def resume(self):
+        print("[FullyAsyncRollouter][Public][Resume]")
         async with self.lock:
             self.paused = False
             self.condition.notify_all()
diff --git a/verl/experimental/agent_loop/agent_loop.py b/verl/experimental/agent_loop/agent_loop.py
index 668a7cc0d62..6bd90fe9b44 100644
--- a/verl/experimental/agent_loop/agent_loop.py
+++ b/verl/experimental/agent_loop/agent_loop.py
@@ -674,7 +674,7 @@ def sleep(self):
         """Sleep all rollout server instances."""
         ray.get([server.sleep.remote() for server in self.async_llm_servers])
 
-    def cancel(self):
-        """Cancel all rollout tasks."""
-        ray.get([server.cancel.remote() for server in self.async_llm_servers])
-
+    async def cancel_async(self):
+        """Cancel all rollout tasks asynchronously."""
+        futures = [server.cancel.remote() for server in self.async_llm_servers]
+        await asyncio.gather(*[asyncio.wrap_future(future.future()) for future in futures], return_exceptions=True)

From a3e11f9441484de2c86cc90b6897602ca1ba89e9 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Wed, 20 Aug 2025 19:14:53 +0800
Subject: [PATCH 070/182] partial rollout cancel success

---
 .../fully_async_rollouter.py                  | 20 +++++++++++--------
 .../rollout/vllm_rollout/vllm_async_server.py |  1 -
 2 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py
index 8a69536fd58..1d1490b32ce 100644
--- a/recipe/fully_async_policy/fully_async_rollouter.py
+++ b/recipe/fully_async_policy/fully_async_rollouter.py
@@ -296,23 +296,22 @@ async def _processor_worker(self):
         """流式处理工作协程 - 逐个样本立即提交处理，不等待批次"""
 
         while True:
+            simple_from_cancel_queue = False
             if not self.cancel_queue.empty():
-                print(f"self.cancel_queue {self.cancel_queue.qsize()}")
                 rollout_sample = await self.cancel_queue.get()
+                simple_from_cancel_queue = True
             else:
                 rollout_sample = await self.pending_queue.get()
-            self.staleness_samples += 1
+                self.staleness_samples += 1
 
             async with self.lock:
                 if await self._should_pause_generation():
-                    print("[FullyAsyncRollouter][Processor] 等待已提交的任务结束")
                     if self.active_tasks:
                         await asyncio.gather(*self.active_tasks, return_exceptions=True)
                         self.active_tasks.clear()
                     self.paused = True
                 while self.paused:
                     await self.condition.wait()
-                    print("等待已提交的任务结束 condition")
 
             # 获取待处理的部分 RolloutSample
             async with self.lock:
@@ -340,7 +339,6 @@ async def _processor_worker(self):
                 # pause结束后，获取到锁，还需要判断是否是暂停阶段，否则继续等待
                 while self.paused:
                     await self.condition.wait()
-                    print("立即提交单个样本处理 condition")
                 task = asyncio.create_task(
                     self._process_single_sample_streaming(rollout_sample),
                     name=rollout_sample.sample_id,
@@ -348,7 +346,10 @@ async def _processor_worker(self):
                 self.active_tasks.add(task)
 
             # 标记队列任务完成
-            self.pending_queue.task_done()
+            if simple_from_cancel_queue:
+                self.cancel_queue.task_done()
+            else:
+                self.pending_queue.task_done()
 
     async def _process_single_sample_streaming(self, rollout_sample: RolloutSample):
         """流式处理单个样本"""
@@ -362,7 +363,10 @@ async def _process_single_sample_streaming(self, rollout_sample: RolloutSample):
         rollout_sample.processing_time += processing_time
         rollout_sample.param_version = self.current_param_version
 
-        print(f"[FullyAsyncRollouter] rollout {rollout_sample.sample_id} cost {processing_time:.2f}s")
+        # print(
+        #     f"[FullyAsyncRollouter] rollout {rollout_sample.sample_id} "
+        #     f"cost {processing_time:.2f}s  "
+        #     f"response_len: {len(rollout_sample.agent_loop_output.response_ids)}")
 
         if agent_loop_output.is_cancel:
             # 放入 cancel 队列中，等待恢复生成
@@ -579,7 +583,7 @@ async def _should_pause_generation(self) -> bool:
 
     async def pause(self):
         """pause rollout
-        TODO integrated Partial Rollout
+        TODO async_rollout_manager clear kv cache
         """
         print("[FullyAsyncRollouter][Public][Pause]")
         async with self.lock:
diff --git a/verl/workers/rollout/vllm_rollout/vllm_async_server.py b/verl/workers/rollout/vllm_rollout/vllm_async_server.py
index 2f62a9bd6b7..06e8626ad42 100644
--- a/verl/workers/rollout/vllm_rollout/vllm_async_server.py
+++ b/verl/workers/rollout/vllm_rollout/vllm_async_server.py
@@ -374,7 +374,6 @@ async def cancel(self):
         async with self.lock:
             for request_id in self.cancel_event:
                 self.cancel_event[request_id].set()
-                print(f"[ExternalRayDistributedExecutor] cancel request_id {request_id}")
 
     async def wake_up(self):
         if self.config.rollout.free_cache_engine:

From 1fdd90d460fbac9956bd28265b9e2df1fa7ac0d7 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Thu, 21 Aug 2025 20:42:40 +0800
Subject: [PATCH 071/182] partial rollout cancel debug

---
 .../config/fully_async_ppo_trainer.yaml       |  6 +-
 .../dapo_7b_math_fsdp2_4_12.sh                |  8 +-
 .../fully_async_rollouter.py                  | 80 +++++++++++--------
 verl/experimental/agent_loop/agent_loop.py    |  5 ++
 .../rollout/vllm_rollout/vllm_async_server.py | 11 ++-
 5 files changed, 71 insertions(+), 39 deletions(-)

diff --git a/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml b/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml
index 30f5ec4bf87..0714e107ee4 100644
--- a/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml
+++ b/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml
@@ -10,9 +10,9 @@ defaults:
 
 async_training:
   # 新鲜度控制 (Freshness Control)
-  staleness_threshold: 3              # 样本新鲜度阈值
-  trigger_parameter_sync_step: 1      # >=1 train 每次训练一个batch, 迭代多少次后触发更新
-  partial_rollout: True              # 同步参数时，是否中断 rollout
+  staleness_threshold: 1              # 样本新鲜度阈值
+  trigger_parameter_sync_step: 4     # >=1 train 每次训练一个batch, 迭代多少次后触发更新
+  partial_rollout: True               # 同步参数时，是否中断 rollout
 
 # Rollout配置
 rollout:
diff --git a/recipe/fully_async_policy/dapo_7b_math_fsdp2_4_12.sh b/recipe/fully_async_policy/dapo_7b_math_fsdp2_4_12.sh
index 86cd25affe2..fe490af24ea 100644
--- a/recipe/fully_async_policy/dapo_7b_math_fsdp2_4_12.sh
+++ b/recipe/fully_async_policy/dapo_7b_math_fsdp2_4_12.sh
@@ -76,9 +76,10 @@ train_prompt_bsz=0
 gen_prompt_bsz=1
 n_resp_per_prompt=16
 train_prompt_mini_bsz=32
-staleness_threshold=10
+staleness_threshold=1
 total_rollout_steps=$(((512*16*100)))
-trigger_parameter_sync_step=32
+trigger_parameter_sync_step=4
+partial_rollout=True
 
 /home/hadoop-djst-algoplat/miniconda3/bin/python -m recipe.fully_async_policy.fully_async_main \
     data.train_files="${TRAIN_FILE}" \
@@ -159,4 +160,5 @@ trigger_parameter_sync_step=32
     rollout.total_rollout_steps="${total_rollout_steps}" \
     rollout.total_epochs=10 \
     async_training.staleness_threshold="${staleness_threshold}" \
-    async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}"
+    async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \
+    async_training.partial_rollout="${partial_rollout}"
diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py
index 1d1490b32ce..0982954db75 100644
--- a/recipe/fully_async_policy/fully_async_rollouter.py
+++ b/recipe/fully_async_policy/fully_async_rollouter.py
@@ -62,6 +62,8 @@ def __init__(
         assert not self.hybrid_engine
         assert self.config.data.train_batch_size == 0, "train_batch_size must be zero"
         assert self.config.data.gen_batch_size == 1, "gen_batch_size must be one"
+        assert self.config.async_training.staleness_threshold >= 0, "staleness_threshold must larger than 0"
+        assert self.config.async_training.trigger_parameter_sync_step >= 1, "trigger_parameter_sync_step must larger than 1"
 
         self.role_worker_mapping = role_worker_mapping
         self.resource_pool_manager = resource_pool_manager
@@ -304,33 +306,46 @@ async def _processor_worker(self):
                 rollout_sample = await self.pending_queue.get()
                 self.staleness_samples += 1
 
-            async with self.lock:
-                if await self._should_pause_generation():
-                    if self.active_tasks:
-                        await asyncio.gather(*self.active_tasks, return_exceptions=True)
-                        self.active_tasks.clear()
+            # 判断是否需要暂停
+            # self.paused 由 pause() 和 self._should_pause_generation() 负责修改
+            if self.paused or await self._should_pause_generation():
+                print("[FullyAsyncRollouter][Processor] 收到暂停信号，等待剩余任务完成...")
+                while self.active_tasks:
+                    async with self.lock:
+                        # 获取锁后，active_tasks 数量会发生变化，需要再次校验
+                        if self.active_tasks:
+                            done_tasks, self.active_tasks = await asyncio.wait(
+                                self.active_tasks, return_when=asyncio.FIRST_COMPLETED
+                            )
+                        for task in done_tasks:
+                            await task
+                async with self.lock:
                     self.paused = True
-                while self.paused:
-                    await self.condition.wait()
+
+                async with self.lock:
+                    while self.paused:
+                        await self.condition.wait()
 
             # 获取待处理的部分 RolloutSample
-            async with self.lock:
-                if rollout_sample == "DONE":
-                    print("[FullyAsyncRollouter][Processor] 收到结束信号，等待剩余任务完成...")
-                    # 等待所有活动任务完成
-                    if self.active_tasks:
-                        await asyncio.gather(*self.active_tasks, return_exceptions=True)
-                        self.active_tasks.clear()
-                    break
+            if rollout_sample == "DONE":
+                print("[FullyAsyncRollouter][Processor] 收到结束信号，等待剩余任务完成...")
+                while self.active_tasks:
+                    async with self.lock:
+                        if self.active_tasks:
+                            done_tasks, self.active_tasks = await asyncio.wait(
+                                self.active_tasks, return_when=asyncio.FIRST_COMPLETED
+                            )
+                        for task in done_tasks:
+                            await task
+                break
 
             # 检查并发数是否超限
-            async with self.lock:
-                while len(self.active_tasks) >= self.max_concurrent_samples:
-                    # 等待至少一个任务完成
-                    done_tasks, self.active_tasks = await asyncio.wait(
-                        self.active_tasks, return_when=asyncio.FIRST_COMPLETED
-                    )
-                    # 清理已完成的任务
+            while len(self.active_tasks) >= self.max_concurrent_samples:
+                async with self.lock:
+                    if self.active_tasks:
+                        done_tasks, self.active_tasks = await asyncio.wait(
+                            self.active_tasks, return_when=asyncio.FIRST_COMPLETED
+                        )
                     for task in done_tasks:
                         await task
 
@@ -353,7 +368,6 @@ async def _processor_worker(self):
 
     async def _process_single_sample_streaming(self, rollout_sample: RolloutSample):
         """流式处理单个样本"""
-
         # 调用异步生成方法
         agent_loop_output, processing_time = await self.async_rollout_manager.generate_single_sample_async(
             rollout_sample.full_batch, rollout_sample.agent_loop_output
@@ -366,7 +380,9 @@ async def _process_single_sample_streaming(self, rollout_sample: RolloutSample):
         # print(
         #     f"[FullyAsyncRollouter] rollout {rollout_sample.sample_id} "
         #     f"cost {processing_time:.2f}s  "
-        #     f"response_len: {len(rollout_sample.agent_loop_output.response_ids)}")
+        #     f"len: {len(rollout_sample.agent_loop_output.response_ids)} "
+        #     f"cancel: {agent_loop_output.is_cancel} "
+        # )
 
         if agent_loop_output.is_cancel:
             # 放入 cancel 队列中，等待恢复生成
@@ -538,17 +554,14 @@ async def _async_monitor_loop(self):
                 pprint(stats)
                 last_stats_time = current_time
 
-            # pause 和 resume 直接，不进行恢复操作
-            if self.monitor_loop_trigger and self.paused:
+            # pause 和 resume 之间，不进行恢复操作
+            if self.monitor_loop_trigger:
                 if not await self._should_pause_generation():
                     async with self.lock:
-                        print("[FullyAsyncRollouter][MonitorLoop][Resume]")
                         self.paused = False
                         self.condition.notify_all()
 
     async def _should_pause_generation(self) -> bool:
-        if self.paused:
-            return True
         """Determine whether the build should be paused"""
         queue_stats = self.message_queue_client.get_statistics_sync()
         queue_size = queue_stats["queue_size"]
@@ -594,15 +607,18 @@ async def pause(self):
             if self.active_tasks:
                 await asyncio.gather(*self.active_tasks, return_exceptions=True)
                 self.active_tasks.clear()
-            print("[FullyAsyncRollouter][Public][Pause] All active tasks completed")
-        self.monitor_loop_trigger = False
+                print("[FullyAsyncRollouter][Public][Pause] All active tasks completed")
+            self.monitor_loop_trigger = False
 
     async def resume(self):
         print("[FullyAsyncRollouter][Public][Resume]")
         async with self.lock:
             self.paused = False
+            self.monitor_loop_trigger = True
             self.condition.notify_all()
-        self.monitor_loop_trigger = True
+
+            if self.config.async_training.partial_rollout:
+                await self.async_rollout_manager.resume_async()
 
     async def get_statistics(self) -> dict:
         queue_stats = self.message_queue_client.get_statistics_sync()
diff --git a/verl/experimental/agent_loop/agent_loop.py b/verl/experimental/agent_loop/agent_loop.py
index 6bd90fe9b44..83ba95c8662 100644
--- a/verl/experimental/agent_loop/agent_loop.py
+++ b/verl/experimental/agent_loop/agent_loop.py
@@ -678,3 +678,8 @@ async def cancel_async(self):
         """Cancel all rollout tasks asynchronously."""
         futures = [server.cancel.remote() for server in self.async_llm_servers]
         await asyncio.gather(*[asyncio.wrap_future(future.future()) for future in futures], return_exceptions=True)
+
+    async def resume_async(self):
+        """Cancel all rollout tasks asynchronously."""
+        futures = [server.resume.remote() for server in self.async_llm_servers]
+        await asyncio.gather(*[asyncio.wrap_future(future.future()) for future in futures], return_exceptions=True)
diff --git a/verl/workers/rollout/vllm_rollout/vllm_async_server.py b/verl/workers/rollout/vllm_rollout/vllm_async_server.py
index 06e8626ad42..7ce640e33cb 100644
--- a/verl/workers/rollout/vllm_rollout/vllm_async_server.py
+++ b/verl/workers/rollout/vllm_rollout/vllm_async_server.py
@@ -209,6 +209,7 @@ def __init__(self, config: DictConfig, vllm_dp_size: int, vllm_dp_rank: int, wg_
         self.engine: AsyncLLM = None
         # for cancel
 
+        self.paused = False
         self.lock = asyncio.Lock()
         self.cancel_event: dict[str, asyncio.Event] = {}
         self.req_output: dict[str, Optional[RequestOutput]] = {}
@@ -351,10 +352,13 @@ async def generate_for_partial(
     ) -> tuple[Sequence[int], bool] | tuple[str, bool]:
         # 设置中断标志
         async with self.lock:
+            if self.paused:
+                # cancel 后， 所有任务直接返回，等待下次提交
+                return [], True
             self.cancel_event[request_id] = asyncio.Event()
             cancel_handle = asyncio.create_task(self.cancel_event[request_id].wait())
+            generation_handle = asyncio.create_task(self._generate_step(prompt_ids, sampling_params, request_id))
 
-        generation_handle = asyncio.create_task(self._generate_step(prompt_ids, sampling_params, request_id))
         done, pend = await asyncio.wait([generation_handle, cancel_handle], return_when=asyncio.FIRST_COMPLETED)
 
         for task in done:
@@ -372,9 +376,14 @@ async def generate_for_partial(
 
     async def cancel(self):
         async with self.lock:
+            self.paused = True
             for request_id in self.cancel_event:
                 self.cancel_event[request_id].set()
 
+    async def resume(self):
+        async with self.lock:
+            self.paused = False
+
     async def wake_up(self):
         if self.config.rollout.free_cache_engine:
             await self.engine.wake_up()

From ea0020569bb67cb00b6f3cec355a72d8a344f419 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Thu, 21 Aug 2025 21:04:46 +0800
Subject: [PATCH 072/182] partial rollout banchmark time

---
 ...sdp2_4_12.sh => dapo_7b_math_fsdp2_4_4.sh} |   6 +-
 .../dapo_7b_math_fsdp2_colocate.sh            | 136 ++++++++++++++++++
 recipe/fully_async_policy/fully_async_main.py |   3 +
 verl/trainer/main_ppo.py                      |   3 +
 4 files changed, 145 insertions(+), 3 deletions(-)
 rename recipe/fully_async_policy/{dapo_7b_math_fsdp2_4_12.sh => dapo_7b_math_fsdp2_4_4.sh} (98%)
 create mode 100644 recipe/fully_async_policy/dapo_7b_math_fsdp2_colocate.sh

diff --git a/recipe/fully_async_policy/dapo_7b_math_fsdp2_4_12.sh b/recipe/fully_async_policy/dapo_7b_math_fsdp2_4_4.sh
similarity index 98%
rename from recipe/fully_async_policy/dapo_7b_math_fsdp2_4_12.sh
rename to recipe/fully_async_policy/dapo_7b_math_fsdp2_4_4.sh
index fe490af24ea..bb3eb5cc88b 100644
--- a/recipe/fully_async_policy/dapo_7b_math_fsdp2_4_12.sh
+++ b/recipe/fully_async_policy/dapo_7b_math_fsdp2_4_4.sh
@@ -2,7 +2,7 @@
 set -xeuo pipefail
 
 project_name='DAPO'
-exp_name='DAPO-Qwen2.5-7b-MATH-0527a1-fsdp2-one-step-off-4-12'
+exp_name='DAPO-Qwen2.5-7b-MATH-0527a1-fsdp2-fully-async-4-4'
 
 # Ray
 # RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
@@ -77,8 +77,8 @@ gen_prompt_bsz=1
 n_resp_per_prompt=16
 train_prompt_mini_bsz=32
 staleness_threshold=1
-total_rollout_steps=$(((512*16*100)))
-trigger_parameter_sync_step=4
+total_rollout_steps=$(((512*16*10)))
+trigger_parameter_sync_step=24
 partial_rollout=True
 
 /home/hadoop-djst-algoplat/miniconda3/bin/python -m recipe.fully_async_policy.fully_async_main \
diff --git a/recipe/fully_async_policy/dapo_7b_math_fsdp2_colocate.sh b/recipe/fully_async_policy/dapo_7b_math_fsdp2_colocate.sh
new file mode 100644
index 00000000000..938a6d65c32
--- /dev/null
+++ b/recipe/fully_async_policy/dapo_7b_math_fsdp2_colocate.sh
@@ -0,0 +1,136 @@
+#!/usr/bin/env bash
+set -xeuo pipefail
+
+project_name='DAPO'
+exp_name='DAPO-Qwen2.5-7b-MATH-0527a1-fsdp2-colocate'
+
+adv_estimator=grpo
+
+use_kl_in_reward=False
+kl_coef=0.0
+use_kl_loss=False
+kl_loss_coef=0.0
+
+clip_ratio_low=0.2
+clip_ratio_high=0.28
+
+max_prompt_length=$((1024 * 2))
+max_response_length=$((1024 * 8))
+enable_overlong_buffer=True
+overlong_buffer_len=$((1024 * 4))
+overlong_penalty_factor=1.0
+
+loss_agg_mode="token-mean"
+
+train_prompt_bsz=512
+n_resp_per_prompt=16
+train_prompt_mini_bsz=32
+
+# Ray
+# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
+# WORKING_DIR=${WORKING_DIR:-"${PWD}"}
+# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
+NNODES=${NNODES:-1}
+NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
+# Paths
+RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
+# very important! please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface
+MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen2.5-Math-7B"}
+CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"}
+TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"}
+TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"}
+
+MODEL_PATH=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/modelscope/hub/models/Qwen/Qwen2___5-Math-7B
+CKPTS_DIR=./ckpts/${project_name}/${exp_name}
+TRAIN_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/dapo-math-17k.parquet
+TEST_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/aime-2024.parquet
+# Algorithm
+temperature=1.0
+top_p=1.0
+top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
+val_top_p=0.7
+
+# Performance Related Parameter
+use_dynamic_bsz=True
+actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
+infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
+offload=True
+gen_tp=1
+sp_size=1
+fsdp_size=2
+
+# reference run wandb: https://wandb.ai/verl-org/DAPO%20Reproduction%20on%20verl/runs/ow47vvon?nw=nwusertongyuxuan361
+
+/home/hadoop-djst-algoplat/miniconda3/bin/python -m verl.trainer.main_ppo \
+    data.train_files="${TRAIN_FILE}" \
+    data.val_files="${TEST_FILE}" \
+    data.prompt_key=prompt \
+    data.truncation='left' \
+    data.max_prompt_length=${max_prompt_length} \
+    data.max_response_length=${max_response_length} \
+    data.train_batch_size=${train_prompt_bsz} \
+    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
+    algorithm.adv_estimator=${adv_estimator} \
+    algorithm.use_kl_in_reward=${use_kl_in_reward} \
+    algorithm.kl_ctrl.kl_coef=${kl_coef} \
+    actor_rollout_ref.actor.strategy=fsdp2 \
+    critic.strategy=fsdp2 \
+    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
+    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
+    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
+    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
+    actor_rollout_ref.actor.clip_ratio_c=10.0 \
+    actor_rollout_ref.model.use_remove_padding=True \
+    +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \
+    actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
+    actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+    actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+    actor_rollout_ref.model.path="${MODEL_PATH}" \
+    actor_rollout_ref.model.enable_gradient_checkpointing=True \
+    actor_rollout_ref.actor.optim.lr=1e-6 \
+    actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
+    actor_rollout_ref.actor.optim.weight_decay=0.1 \
+    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
+    actor_rollout_ref.actor.fsdp_config.param_offload=${offload} \
+    actor_rollout_ref.actor.fsdp_config.optimizer_offload=${offload} \
+    actor_rollout_ref.actor.entropy_coeff=0 \
+    actor_rollout_ref.actor.grad_clip=1.0 \
+    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
+    actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \
+    actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
+    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
+    actor_rollout_ref.rollout.enable_chunked_prefill=True \
+    actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
+    actor_rollout_ref.rollout.temperature=${temperature} \
+    actor_rollout_ref.rollout.top_p=${top_p} \
+    actor_rollout_ref.rollout.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
+    actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
+    actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
+    actor_rollout_ref.rollout.val_kwargs.n=1 \
+    actor_rollout_ref.ref.fsdp_config.param_offload=${offload} \
+    actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \
+    actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \
+    reward_model.reward_manager=dapo \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
+    +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
+    trainer.logger=['console','tensorboard'] \
+    trainer.project_name="${project_name}" \
+    trainer.experiment_name="${exp_name}" \
+    trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
+    trainer.nnodes="${NNODES}" \
+    trainer.val_before_train=True \
+    trainer.test_freq=-1 \
+    trainer.save_freq=-1 \
+    trainer.total_epochs=10 \
+    trainer.total_training_steps=10 \
+    trainer.default_local_dir="${CKPTS_DIR}" \
+    trainer.resume_mode=auto \
+    trainer.log_val_generations=10
diff --git a/recipe/fully_async_policy/fully_async_main.py b/recipe/fully_async_policy/fully_async_main.py
index 1d4e64b1ca4..cce57501a3b 100644
--- a/recipe/fully_async_policy/fully_async_main.py
+++ b/recipe/fully_async_policy/fully_async_main.py
@@ -272,7 +272,10 @@ def main(config):
     # Ensure async training config exists
     if not hasattr(config, "async_training"):
         raise RuntimeError("must set async_training config")
+    from time import time
+    start_time = time()
     run_ppo(config, task_runner_class=FullyAsyncTaskRunner)
+    print(f"total time: {time() - start_time:.2f} seconds")
 
 
 if __name__ == "__main__":
diff --git a/verl/trainer/main_ppo.py b/verl/trainer/main_ppo.py
index fa12105f07f..7b34cbfaf23 100644
--- a/verl/trainer/main_ppo.py
+++ b/verl/trainer/main_ppo.py
@@ -37,7 +37,10 @@ def main(config):
     Args:
         config_dict: Hydra configuration dictionary containing training parameters.
     """
+    from time import time
+    start_time = time()
     run_ppo(config)
+    print(f"total time: {time() - start_time:.2f} seconds")
 
 
 # Define a function to run the PPO-like training process

From eb67390abd654bc0abf991bf1f5d623f639a1382 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Mon, 25 Aug 2025 10:37:29 +0800
Subject: [PATCH 073/182] eval code

---
 .../dapo_7b_math_fsdp2_4_4.sh                 |   5 +-
 .../dapo_7b_math_fsdp2_server.sh              | 148 +++++++
 recipe/fully_async_policy/detach_utils.py     |   8 +-
 .../fully_async_rollouter.py                  |  97 +++--
 .../fully_async_policy/fully_async_trainer.py |  19 +-
 recipe/fully_async_policy/message_queue.py    |  22 ++
 recipe/fully_async_policy/param_sync.py       |   5 +-
 .../unittest/ray_async_resource_config.py     | 366 ------------------
 tests/special_e2e/run_fully_async_policy.sh   |  16 +-
 verl/experimental/agent_loop/agent_loop.py    |   3 +-
 .../agent_loop/single_turn_agent_loop.py      |   6 +-
 .../agent_loop/tool_agent_loop.py             |   7 +-
 12 files changed, 275 insertions(+), 427 deletions(-)
 create mode 100644 recipe/fully_async_policy/dapo_7b_math_fsdp2_server.sh
 delete mode 100644 recipe/fully_async_policy/unittest/ray_async_resource_config.py

diff --git a/recipe/fully_async_policy/dapo_7b_math_fsdp2_4_4.sh b/recipe/fully_async_policy/dapo_7b_math_fsdp2_4_4.sh
index bb3eb5cc88b..936d9475d4d 100644
--- a/recipe/fully_async_policy/dapo_7b_math_fsdp2_4_4.sh
+++ b/recipe/fully_async_policy/dapo_7b_math_fsdp2_4_4.sh
@@ -75,10 +75,11 @@ n_gpus_training=$((NGPUS_PER_NODE - n_gpus_rollout))
 train_prompt_bsz=0
 gen_prompt_bsz=1
 n_resp_per_prompt=16
-train_prompt_mini_bsz=32
+train_prompt_mini_bsz=64
 staleness_threshold=1
 total_rollout_steps=$(((512*16*10)))
-trigger_parameter_sync_step=24
+test_freq=-1
+trigger_parameter_sync_step=32
 partial_rollout=True
 
 /home/hadoop-djst-algoplat/miniconda3/bin/python -m recipe.fully_async_policy.fully_async_main \
diff --git a/recipe/fully_async_policy/dapo_7b_math_fsdp2_server.sh b/recipe/fully_async_policy/dapo_7b_math_fsdp2_server.sh
new file mode 100644
index 00000000000..087dea05121
--- /dev/null
+++ b/recipe/fully_async_policy/dapo_7b_math_fsdp2_server.sh
@@ -0,0 +1,148 @@
+#!/usr/bin/env bash
+set -xeuo pipefail
+
+project_name='DAPO'
+exp_name='DAPO-Qwen2.5-7b-MATH-0527a1-fsdp2-server'
+
+
+rollout_mode="async"
+rollout_name="vllm" # sglang or vllm
+if [ "$rollout_mode" = "async" ]; then
+    export VLLM_USE_V1=1
+    return_raw_chat="True"
+fi
+
+
+adv_estimator=grpo
+
+use_kl_in_reward=False
+kl_coef=0.0
+use_kl_loss=False
+kl_loss_coef=0.0
+
+clip_ratio_low=0.2
+clip_ratio_high=0.28
+
+max_prompt_length=$((1024 * 2))
+max_response_length=$((1024 * 8))
+enable_overlong_buffer=True
+overlong_buffer_len=$((1024 * 4))
+overlong_penalty_factor=1.0
+
+loss_agg_mode="token-mean"
+
+train_prompt_bsz=512
+n_resp_per_prompt=16
+train_prompt_mini_bsz=32
+
+# Ray
+# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
+# WORKING_DIR=${WORKING_DIR:-"${PWD}"}
+# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
+NNODES=${NNODES:-1}
+NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
+# Paths
+RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
+# very important! please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface
+MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen2.5-Math-7B"}
+CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"}
+TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"}
+TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"}
+
+MODEL_PATH=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/modelscope/hub/models/Qwen/Qwen2___5-Math-7B
+CKPTS_DIR=./ckpts/${project_name}/${exp_name}
+TRAIN_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/dapo-math-17k.parquet
+TEST_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/aime-2024.parquet
+# Algorithm
+temperature=1.0
+top_p=1.0
+top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
+val_top_p=0.7
+
+# Performance Related Parameter
+use_dynamic_bsz=True
+actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
+infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
+offload=True
+gen_tp=1
+sp_size=1
+fsdp_size=2
+
+# reference run wandb: https://wandb.ai/verl-org/DAPO%20Reproduction%20on%20verl/runs/ow47vvon?nw=nwusertongyuxuan361
+
+/home/hadoop-djst-algoplat/miniconda3/bin/python -m verl.trainer.main_ppo \
+    data.train_files="${TRAIN_FILE}" \
+    data.val_files="${TEST_FILE}" \
+    data.prompt_key=prompt \
+    data.truncation='left' \
+    data.max_prompt_length=${max_prompt_length} \
+    data.max_response_length=${max_response_length} \
+    data.train_batch_size=${train_prompt_bsz} \
+    data.return_raw_chat=${return_raw_chat} \
+    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
+    algorithm.adv_estimator=${adv_estimator} \
+    algorithm.use_kl_in_reward=${use_kl_in_reward} \
+    algorithm.kl_ctrl.kl_coef=${kl_coef} \
+    actor_rollout_ref.actor.strategy=fsdp2 \
+    critic.strategy=fsdp2 \
+    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
+    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
+    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
+    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
+    actor_rollout_ref.actor.clip_ratio_c=10.0 \
+    actor_rollout_ref.model.use_remove_padding=True \
+    +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \
+    actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
+    actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+    actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+    actor_rollout_ref.model.path="${MODEL_PATH}" \
+    actor_rollout_ref.model.enable_gradient_checkpointing=True \
+    actor_rollout_ref.actor.optim.lr=1e-6 \
+    actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
+    actor_rollout_ref.actor.optim.weight_decay=0.1 \
+    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
+    actor_rollout_ref.actor.fsdp_config.param_offload=${offload} \
+    actor_rollout_ref.actor.fsdp_config.optimizer_offload=${offload} \
+    actor_rollout_ref.actor.entropy_coeff=0 \
+    actor_rollout_ref.actor.grad_clip=1.0 \
+    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
+    actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \
+    actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
+    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
+    actor_rollout_ref.rollout.enable_chunked_prefill=True \
+    actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
+    actor_rollout_ref.rollout.temperature=${temperature} \
+    actor_rollout_ref.rollout.top_p=${top_p} \
+    actor_rollout_ref.rollout.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
+    actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
+    actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
+    actor_rollout_ref.rollout.val_kwargs.n=1 \
+    actor_rollout_ref.rollout.name=${rollout_name} \
+    actor_rollout_ref.rollout.mode=${rollout_mode} \
+    actor_rollout_ref.ref.fsdp_config.param_offload=${offload} \
+    actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \
+    actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \
+    reward_model.reward_manager=dapo \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
+    +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
+    trainer.logger=['console','tensorboard'] \
+    trainer.project_name="${project_name}" \
+    trainer.experiment_name="${exp_name}" \
+    trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
+    trainer.nnodes="${NNODES}" \
+    trainer.val_before_train=True \
+    trainer.test_freq=-1 \
+    trainer.save_freq=-1 \
+    trainer.total_epochs=10 \
+    trainer.total_training_steps=10 \
+    trainer.default_local_dir="${CKPTS_DIR}" \
+    trainer.resume_mode=auto \
+    trainer.log_val_generations=10
diff --git a/recipe/fully_async_policy/detach_utils.py b/recipe/fully_async_policy/detach_utils.py
index 3ac998bc82a..b8b359f9669 100644
--- a/recipe/fully_async_policy/detach_utils.py
+++ b/recipe/fully_async_policy/detach_utils.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 import time
 from dataclasses import dataclass
-from typing import Any
+from typing import Any, Dict
 
 import numpy as np
 import torch
@@ -49,6 +49,12 @@ class RolloutSample:
     param_version: int
 
 
+@dataclass
+class ValidateMetrics:
+    timing_raw: Dict[str, Any]
+    metrics: Dict[str, Any]
+
+
 def prepare_single_generation_data(batch_dict, global_steps) -> DataProto:
     """
     类似 ray_trainer._prepare_generate_batch 的逻辑，但针对单个样本
diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py
index 0982954db75..0770c7bc6c1 100644
--- a/recipe/fully_async_policy/fully_async_rollouter.py
+++ b/recipe/fully_async_policy/fully_async_rollouter.py
@@ -22,11 +22,12 @@
 from recipe.fully_async_policy.detach_utils import (
     RolloutSample,
     calculate_one_step_size,
-    prepare_single_generation_data,
+    prepare_single_generation_data, ValidateMetrics,
 )
 from recipe.fully_async_policy.message_queue import MessageQueueClient
 from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup
 from verl.trainer.ppo.ray_trainer import RayPPOTrainer, ResourcePoolManager, Role, WorkerType
+from verl.utils.profiler import marked_timer
 from verl.utils.tracking import ValidationGenerationsLogger
 
 
@@ -145,7 +146,7 @@ def __init__(
             f"max_required_samples: {self.max_required_samples}"
         )
 
-        # 单次最多扔一次迭代需要的样本
+        # 单次最多扔一次更新需要的样本
         self.max_concurrent_samples = self.required_samples
 
         # 流式处理统计
@@ -167,6 +168,15 @@ def __init__(
         # 通过 pause 和 resume 控制 monitor_loop 中，是否进行 尝试恢复 操作
         self.monitor_loop_trigger = True
 
+        self.update_param_version_time = 0
+        self.global_steps = 0
+
+        self.progress_bar = tqdm(
+            total=self.total_rollout_steps / (
+                    self.required_samples * self.config.async_training.trigger_parameter_sync_step),
+            initial=self.global_steps, desc="Training Progress"
+        )
+
     async def set_message_queue_client(self, message_queue_client: MessageQueueClient):
         """Set message queue client"""
         async with self.lock:
@@ -195,6 +205,17 @@ async def update_param_version(self, version: int):
                 f"[FullyAsyncRollouter][Public][update_param_version] "
                 f"Parameter version updated from {old_version} to {version}"
             )
+            timing_raw = {}
+            self.update_param_version_time += 1
+            if (self.val_reward_fn is not None
+                    and self.config.trainer.test_freq > 0
+                    and (self.is_last_step or self.global_steps % self.config.trainer.test_freq == 0)):
+                with marked_timer("testing", timing_raw, color="green"):
+                    val_metrics: dict = self._validate()
+                    data = ValidateMetrics(timing_raw=timing_raw, metrics=val_metrics)
+                    self.message_queue_client.put_validate(ray.cloudpickle.dumps(data))
+            if version > 0:
+                self.progress_bar.update(1)
 
     def _validate_config(self):
         # Validate asynchronous training configuration
@@ -245,10 +266,6 @@ async def _feed_samples(self):
         sample_count = 0
         should_stop = False
 
-        progress_bar = tqdm(
-            total=self.total_rollout_steps / self.required_samples, initial=self.global_steps, desc="Training Progress"
-        )
-
         for epoch, batch_dict in continuous_iterator:
             if should_stop:  # 检查停止标志
                 break
@@ -275,6 +292,7 @@ async def _feed_samples(self):
 
                 # 检查是否到达最后一步
                 if self.global_steps >= self.total_rollout_steps:
+                    self.is_last_step = True
                     print(
                         f"[FullyAsyncRollouter][Feed] "
                         f"达到最大步数，停止添加新样本 "
@@ -283,15 +301,13 @@ async def _feed_samples(self):
                     should_stop = True  # 设置停止标志
                     break
 
-                if self.global_steps % self.required_samples == 0:
-                    progress_bar.update(1)
                 self.global_steps += 1
 
             sample_count += 1
 
         # 发送结束信号
         progress_bar.close()
-        await self.pending_queue.put("DONE")
+        await self.pending_should_stopqueue.put("DONE")
         print(f"[FullyAsyncRollouter][Feed] 样本添加完成，总共添加了 {self.global_steps} 个步骤的样本")
 
     async def _processor_worker(self):
@@ -377,12 +393,12 @@ async def _process_single_sample_streaming(self, rollout_sample: RolloutSample):
         rollout_sample.processing_time += processing_time
         rollout_sample.param_version = self.current_param_version
 
-        # print(
-        #     f"[FullyAsyncRollouter] rollout {rollout_sample.sample_id} "
-        #     f"cost {processing_time:.2f}s  "
-        #     f"len: {len(rollout_sample.agent_loop_output.response_ids)} "
-        #     f"cancel: {agent_loop_output.is_cancel} "
-        # )
+        print(
+            f"[FullyAsyncRollouter] rollout {rollout_sample.sample_id} "
+            f"cost {processing_time:.2f}s  "
+            f"len: {len(rollout_sample.agent_loop_output.response_ids)} "
+            f"cancel: {agent_loop_output.is_cancel} "
+        )
 
         if agent_loop_output.is_cancel:
             # 放入 cancel 队列中，等待恢复生成
@@ -411,8 +427,6 @@ async def _consumer_worker(self):
             else:
                 self.dropped_stale_samples += 1
 
-            # print(f"[FullyAsyncRollouter] submit {rollout_sample.sample_id} {'success' if success else 'error'}")
-
             # 标记结果队列任务完成
             self.result_queue.task_done()
 
@@ -427,20 +441,19 @@ async def _streaming_generation_main(self):
             config=OmegaConf.to_container(self.config, resolve=True),
         )
 
-        self.global_steps = 0
-
         # load checkpoint before doing anything
         self._load_checkpoint()
 
         # perform validation before training
         # currently, we only support validation using the reward_function.
-        if self.val_reward_fn is not None and self.config.trainer.get("val_before_train", True):
-            val_metrics = self._validate()
-            assert val_metrics, f"{val_metrics=}"
-            pprint(f"[FullyAsyncRollouter] Initial validation metrics: {val_metrics}")
-            self.logger.log(data=val_metrics, step=self.global_steps)
-            if self.config.trainer.get("val_only", False):
-                return
+        async with self.lock:
+            if self.val_reward_fn is not None and self.config.trainer.get("val_before_train", True):
+                print("Initial validation metric")
+                val_metrics = self._validate()
+                assert val_metrics, f"{val_metrics=}"
+                pprint(f"[FullyAsyncRollouter] Initial validation metrics: {val_metrics}")
+                if self.config.trainer.get("val_only", False):
+                    return
 
         # we start from step 1
         self.global_steps += 1
@@ -570,26 +583,30 @@ async def _should_pause_generation(self) -> bool:
         version_diff = self.current_param_version - current_trainer_version
 
         if version_diff > self.staleness_threshold:
-            print(
-                "[FullyAsyncRollouter][ShouldPause] "
-                f"due to version_diff > self.staleness_threshold: "
-                f"rollout_version={self.current_param_version}, "
-                f"trainer_version={current_trainer_version}, diff={version_diff}"
-            )
+            if not self.paused:
+                print(
+                    "[FullyAsyncRollouter][ShouldPause] "
+                    f"due to version_diff > self.staleness_threshold: "
+                    f"rollout_version={self.current_param_version}, "
+                    f"trainer_version={current_trainer_version}, diff={version_diff}"
+                )
             return True
 
         if queue_size >= self.max_queue_size:
-            print(
-                f"[FullyAsyncRollouter][ShouldPause]  due to full queue: size={queue_size}, max={self.max_queue_size}"
-            )
+            if not self.paused:
+                print(
+                    f"[FullyAsyncRollouter][ShouldPause]  "
+                    f"due to full queue: size={queue_size}, max={self.max_queue_size}"
+                )
             return True
 
         if self.staleness_samples > self.max_required_samples:
-            print(
-                "[FullyAsyncRollouter][ShouldPause] "
-                f"due to "
-                f"staleness_samples {self.staleness_samples} > max_required_samples {self.max_required_samples} "
-            )
+            if not self.paused:
+                print(
+                    "[FullyAsyncRollouter][ShouldPause] "
+                    f"due to "
+                    f"staleness_samples {self.staleness_samples} > max_required_samples {self.max_required_samples} "
+                )
             return True
 
         return False
diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py
index b82b1c4d5d2..d6d44babb2a 100644
--- a/recipe/fully_async_policy/fully_async_trainer.py
+++ b/recipe/fully_async_policy/fully_async_trainer.py
@@ -15,6 +15,7 @@
 import logging
 import time
 import warnings
+from datetime import datetime
 from pprint import pprint
 from typing import Any
 
@@ -23,7 +24,7 @@
 
 from recipe.fully_async_policy.detach_utils import (
     assemble_batch_from_rollout_samples,
-    calculate_one_step_size,
+    calculate_one_step_size, ValidateMetrics,
 )
 from recipe.fully_async_policy.message_queue import MessageQueueClient
 from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup
@@ -156,7 +157,7 @@ def _get_samples_from_queue(self) -> tuple[None, None] | tuple[int, Any]:
 
             queue_samples.append(sample)
 
-            if len(queue_samples) % 10 == 0:
+            if len(queue_samples) % 64 == 0:
                 print(
                     f"[FullyAsyncTrainer] Collected {len(queue_samples)}/{self.required_samples} samples. "
                     f"mq_len: {queue_len}"
@@ -251,6 +252,12 @@ def fit(self):
             metrics = {}
             timing_raw = {}
 
+            val_data = self.message_queue_client.get_validate_sync()
+            if val_data:
+                val_data: ValidateMetrics = ray.cloudpickle.loads(val_data)
+                metrics.update(val_data.metrics)
+                timing_raw.update(val_data.timing_raw)
+
             with marked_timer("step", timing_raw):
                 with marked_timer("gen", timing_raw, color="red"):
                     epoch, batch = self._get_samples_from_queue()
@@ -285,13 +292,17 @@ def fit(self):
                 self._log_rollout(batch, reward_extra_infos_dict, timing_raw)
                 self._check_save_checkpoint(False, timing_raw)
 
-            # self._collect_metrics(batch, epoch, metrics, timing_raw)
+            self._collect_metrics(batch, 0, metrics, timing_raw)
             pprint(metrics)
             # Trigger parameter synchronization after training step
+
+            time_str = datetime.now().strftime("%H:%M:%S.%f")[:-3]
+
             print(
                 f"[FullyAsyncTrainer] global_steps: {self.global_steps} "
                 f"local_trigger_step: {self.local_trigger_step} "
-                f"trigger_parameter_sync_step: {self.trigger_parameter_sync_step}"
+                f"trigger_parameter_sync_step: {self.trigger_parameter_sync_step} "
+                f"{time_str}"
             )
             self._trigger_parameter_sync_after_step()
             self.global_steps += 1
diff --git a/recipe/fully_async_policy/message_queue.py b/recipe/fully_async_policy/message_queue.py
index 012445d45ed..6a425c50478 100644
--- a/recipe/fully_async_policy/message_queue.py
+++ b/recipe/fully_async_policy/message_queue.py
@@ -36,6 +36,9 @@ def __init__(self, config: DictConfig, max_queue_size: int = 1000):
         self.queue = deque(maxlen=max_queue_size)
         self.current_param_version = 0
 
+        self.val_queue = deque()
+
+
         try:
             if hasattr(config, "async_training") and config.async_training is not None:
                 self.staleness_threshold = getattr(config.async_training, "staleness_threshold", 3)
@@ -188,6 +191,18 @@ async def get_memory_usage(self) -> dict:
                 "estimated_memory_mb": total_size / (1024 * 1024),
             }
 
+    async def put_validate(self, data):
+        async with self._lock:
+            self.val_queue.append(data)
+
+    async def get_validate(self):
+        async with self._lock:
+            if self.val_queue:
+                return self.val_queue.popleft()
+            else:
+                return None
+
+
 
 class MessageQueueClient:
     """Asyncio-compatible MessageQueue client for communicating with MessageQueue Actor"""
@@ -200,6 +215,13 @@ async def put_sample(self, sample: Any, param_version: int) -> bool:
         future = self.queue_actor.put_sample.remote(sample, param_version)
         return await asyncio.wrap_future(future.future())
 
+    async def put_validate(self, data: Any) -> bool:
+        future = self.queue_actor.put_validate.remote(data)
+        return await asyncio.wrap_future(future.future())
+
+    def get_validate_sync(self) -> Any | None:
+        return ray.get(self.queue_actor.get_validate.remote())
+
     async def get_sample(self) -> Any | None:
         """Get single sample from queue, wait until one is available (async)"""
         future = self.queue_actor.get_sample.remote()
diff --git a/recipe/fully_async_policy/param_sync.py b/recipe/fully_async_policy/param_sync.py
index 53ced11956c..7e75865ebd5 100644
--- a/recipe/fully_async_policy/param_sync.py
+++ b/recipe/fully_async_policy/param_sync.py
@@ -86,8 +86,9 @@ def sync_weights(self, version):
         self.actor_wg.sync_rollout_weights()
         ray.get(self.rollout_wg.sync_rollout_weights())
 
-        # Update rollout version
-        ray.get(self.rollouter.update_param_version.remote(version))
+        # Async Update rollout version
+        self.rollouter.update_param_version.remote(version)
+
         ray.get(self.rollouter.resume.remote())
         end_time = time.time()
 
diff --git a/recipe/fully_async_policy/unittest/ray_async_resource_config.py b/recipe/fully_async_policy/unittest/ray_async_resource_config.py
deleted file mode 100644
index 930f8c5169f..00000000000
--- a/recipe/fully_async_policy/unittest/ray_async_resource_config.py
+++ /dev/null
@@ -1,366 +0,0 @@
-# Copyright 2025 Meituan Ltd. and/or its affiliates
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import asyncio
-import random
-import time
-
-import ray
-
-
-# 配置1: 默认配置
-class DefaultStreamingActor:
-    """默认配置的流式处理Actor"""
-
-    def __init__(self, actor_id: str):
-        self.actor_id = actor_id
-        self.processed_count = 0
-        self.start_time = time.time()
-        self.max_concurrent_tasks = 0
-        self.current_tasks = 0
-
-    async def process_data_async(self, data_item: dict) -> dict:
-        """异步处理数据"""
-        self.current_tasks += 1
-        self.max_concurrent_tasks = max(self.max_concurrent_tasks, self.current_tasks)
-
-        try:
-            task_id = data_item["id"]
-            processing_time = random.uniform(1, 3)
-
-            print(f"[{self.actor_id}] 开始处理 {task_id} (当前并发: {self.current_tasks})")
-
-            # CPU密集型任务模拟
-            await asyncio.sleep(processing_time * 0.5)  # I/O部分
-
-            # 模拟CPU计算
-            total = 0
-            for i in range(int(processing_time * 100000)):  # CPU密集计算
-                total += i * 0.001
-
-            await asyncio.sleep(processing_time * 0.5)  # 更多I/O
-
-            self.processed_count += 1
-
-            result = {
-                "id": task_id,
-                "actor_id": self.actor_id,
-                "processing_time": processing_time,
-                "processed_count": self.processed_count,
-                "max_concurrent": self.max_concurrent_tasks,
-                "compute_result": total,
-                "completed_at": time.time(),
-            }
-
-            print(f"[{self.actor_id}] 完成处理 {task_id} (耗时: {processing_time:.1f}s)")
-            return result
-
-        finally:
-            self.current_tasks -= 1
-
-    def get_stats(self) -> dict:
-        return {
-            "actor_id": self.actor_id,
-            "processed_count": self.processed_count,
-            "max_concurrent_tasks": self.max_concurrent_tasks,
-            "uptime": time.time() - self.start_time,
-        }
-
-
-# 配置2: 只设置 num_cpus
-@ray.remote(num_cpus=4)
-class HighCpuStreamingActor(DefaultStreamingActor):
-    """高CPU配置的Actor"""
-
-    pass
-
-
-# 配置3: 只设置 max_concurrency
-@ray.remote(max_concurrency=5)
-class HighConcurrencyStreamingActor(DefaultStreamingActor):
-    """高并发配置的Actor"""
-
-    pass
-
-
-# 配置4: 同时设置两者
-@ray.remote(num_cpus=4, max_concurrency=8)
-class OptimalStreamingActor(DefaultStreamingActor):
-    """最优配置的Actor"""
-
-    pass
-
-
-# 配置5: 极端低配置
-@ray.remote(num_cpus=1, max_concurrency=2)
-class LowResourceStreamingActor(DefaultStreamingActor):
-    """低资源配置的Actor"""
-
-    pass
-
-
-class RayStreamingSystemTest:
-    """Ray流式处理系统测试"""
-
-    def __init__(self):
-        self.test_data = []
-        self.results = {}
-
-    def generate_test_data(self, count: int = 20) -> list[dict]:
-        """生成测试数据"""
-        return [
-            {"id": f"task_{i:03d}", "content": f"测试数据_{i}", "priority": random.choice(["high", "normal", "low"])}
-            for i in range(count)
-        ]
-
-    async def test_actor_configuration(self, actor_class, config_name: str, test_data: list[dict]) -> dict:
-        """测试特定配置的Actor"""
-        print(f"\n{'=' * 60}")
-        print(f"测试配置: {config_name}")
-        print(f"{'=' * 60}")
-
-        # 创建Actor实例
-        actor = actor_class.remote(config_name)
-
-        start_time = time.time()
-
-        # 并发提交所有任务
-        print(f"提交 {len(test_data)} 个任务...")
-        task_futures = []
-
-        for i, data_item in enumerate(test_data):
-            future = actor.process_data_async.remote(data_item)
-            task_futures.append(future)
-
-            # 模拟流式数据到达
-            if i < len(test_data) - 1:
-                await asyncio.sleep(0.1)  # 100ms间隔
-
-        print("所有任务已提交，等待完成...")
-
-        # 等待所有任务完成
-        try:
-            results = await asyncio.gather(*[asyncio.wrap_future(future.future()) for future in task_futures])
-        except Exception as e:
-            print(f"任务执行出错: {e}")
-            results = []
-
-        end_time = time.time()
-        total_time = end_time - start_time
-
-        # 获取Actor统计信息
-        stats = ray.get(actor.get_stats.remote())
-
-        # 计算性能指标
-        performance_metrics = {
-            "config_name": config_name,
-            "total_tasks": len(test_data),
-            "completed_tasks": len(results),
-            "total_time": total_time,
-            "throughput": len(results) / total_time if total_time > 0 else 0,
-            "avg_processing_time": sum(r.get("processing_time", 0) for r in results) / len(results) if results else 0,
-            "max_concurrent_tasks": stats["max_concurrent_tasks"],
-            "actor_stats": stats,
-            "success_rate": len(results) / len(test_data) if test_data else 0,
-        }
-
-        print(f"✅ 完成测试 {config_name}:")
-        print(f"   总任务数: {performance_metrics['total_tasks']}")
-        print(f"   完成任务数: {performance_metrics['completed_tasks']}")
-        print(f"   总耗时: {performance_metrics['total_time']:.2f}s")
-        print(f"   吞吐量: {performance_metrics['throughput']:.2f} tasks/s")
-        print(f"   最大并发: {performance_metrics['max_concurrent_tasks']}")
-        print(f"   成功率: {performance_metrics['success_rate'] * 100:.1f}%")
-
-        return performance_metrics
-
-    async def run_comprehensive_test(self):
-        """运行综合测试"""
-        print("🚀 开始Ray异步资源配置测试")
-        print(f"Ray集群状态: {ray.cluster_resources()}")
-
-        # 生成测试数据
-        test_data = self.generate_test_data(15)  # 15个任务便于观察
-
-        # 测试配置列表
-        test_configs = [
-            (DefaultStreamingActor, "默认配置 (无特殊设置)"),
-            (HighCpuStreamingActor, "高CPU配置 (num_cpus=4)"),
-            (HighConcurrencyStreamingActor, "高并发配置 (max_concurrency=5)"),
-            (OptimalStreamingActor, "最优配置 (num_cpus=4, max_concurrency=8)"),
-            (LowResourceStreamingActor, "低资源配置 (num_cpus=1, max_concurrency=2)"),
-        ]
-
-        results = {}
-
-        # 逐个测试各种配置
-        for actor_class, config_name in test_configs:
-            try:
-                result = await self.test_actor_configuration(actor_class, config_name, test_data)
-                results[config_name] = result
-
-                # 测试间隔
-                await asyncio.sleep(2)
-
-            except Exception as e:
-                print(f"❌ 测试 {config_name} 失败: {e}")
-                results[config_name] = {"error": str(e)}
-
-        # 生成对比报告
-        self.generate_comparison_report(results)
-
-        return results
-
-    def generate_comparison_report(self, results: dict):
-        """生成对比报告"""
-        print(f"\n{'=' * 80}")
-        print("📊 配置对比报告")
-        print(f"{'=' * 80}")
-
-        # 表头
-        print(f"{'配置名称':<25} {'吞吐量':<12} {'最大并发':<10} {'平均处理时间':<15} {'成功率':<10}")
-        print("-" * 80)
-
-        # 数据行
-        best_throughput = 0
-        best_config = ""
-
-        for config_name, result in results.items():
-            if "error" in result:
-                print(f"{config_name:<25} {'错误':<12} {'':<10} {'':<15} {'':<10}")
-                continue
-
-            throughput = result.get("throughput", 0)
-            max_concurrent = result.get("max_concurrent_tasks", 0)
-            avg_time = result.get("avg_processing_time", 0)
-            success_rate = result.get("success_rate", 0)
-
-            print(
-                f"{config_name:<25} {throughput:<12.2f} {max_concurrent:<10} "
-                f"{avg_time:<15.2f} {success_rate * 100:<10.1f}%"
-            )
-
-            if throughput > best_throughput:
-                best_throughput = throughput
-                best_config = config_name
-
-        print(f"\n🏆 最佳配置: {best_config} (吞吐量: {best_throughput:.2f} tasks/s)")
-
-        # 详细分析
-        print("\n📋 配置分析:")
-        print("1. num_cpus 作用:")
-        print("   - 资源预留: 确保Actor有足够计算资源")
-        print("   - 节点选择: Ray选择有足够CPU的节点")
-        print("   - 避免资源竞争: 防止过度调度")
-
-        print("\n2. max_concurrency 作用:")
-        print("   - 并发控制: 限制Actor内同时执行的任务数")
-        print("   - 内存保护: 防止过多并发导致内存溢出")
-        print("   - 性能调优: 平衡并发度和资源利用率")
-
-        print("\n3. 建议配置:")
-        print("   - CPU密集型任务: 设置较高的num_cpus，适中的max_concurrency")
-        print("   - I/O密集型任务: 设置较低的num_cpus，较高的max_concurrency")
-        print("   - 混合型任务: 平衡两个参数，根据实际测试调优")
-
-
-async def run_resource_stress_test():
-    """运行资源压力测试"""
-    print(f"\n{'=' * 60}")
-    print("🔥 资源压力测试")
-    print(f"{'=' * 60}")
-
-    # 创建多个不同配置的Actor
-    actors = {
-        "高并发低CPU": OptimalStreamingActor.remote("stress_test_1"),
-        "低并发高CPU": ray.remote(num_cpus=8, max_concurrency=2)(DefaultStreamingActor).remote("stress_test_2"),
-        "平衡配置": ray.remote(num_cpus=2, max_concurrency=4)(DefaultStreamingActor).remote("stress_test_3"),
-    }
-
-    # 大量并发任务
-    heavy_workload = [{"id": f"heavy_{i}", "content": f"重载任务_{i}"} for i in range(50)]
-
-    print("提交大量并发任务，观察资源使用...")
-
-    all_futures = []
-    for actor_name, actor in actors.items():
-        print(f"向 {actor_name} 提交任务...")
-        for task in heavy_workload[:15]:  # 每个Actor处理15个任务
-            future = actor.process_data_async.remote(task)
-            all_futures.append((actor_name, future))
-
-    # 等待完成并记录时间
-    start_time = time.time()
-    results = []
-
-    for actor_name, future in all_futures:
-        try:
-            result = await asyncio.wrap_future(future.future())
-            results.append((actor_name, result))
-        except Exception as e:
-            print(f"{actor_name} 任务失败: {e}")
-
-    end_time = time.time()
-
-    print(f"压力测试完成，总耗时: {end_time - start_time:.2f}s")
-    print(f"完成任务数: {len(results)}")
-
-    # 按Actor分组统计
-    actor_stats = {}
-    for actor_name, result in results:
-        if actor_name not in actor_stats:
-            actor_stats[actor_name] = []
-        actor_stats[actor_name].append(result)
-
-    for actor_name, actor_results in actor_stats.items():
-        avg_time = sum(r["processing_time"] for r in actor_results) / len(actor_results)
-        print(f"{actor_name}: 完成 {len(actor_results)} 个任务, 平均耗时 {avg_time:.2f}s")
-
-
-async def main():
-    """主函数"""
-    # 初始化Ray
-    if not ray.is_initialized():
-        ray.init(
-            num_cpus=16,  # 设置足够的CPU资源
-            object_store_memory=2000000000,  # 2GB
-            ignore_reinit_error=True,
-        )
-
-    print("🎯 Ray异步资源配置测试")
-    print(f"可用资源: {ray.cluster_resources()}")
-
-    try:
-        # 基础配置测试
-        test_system = RayStreamingSystemTest()
-        await test_system.run_comprehensive_test()
-
-        # 压力测试
-        await run_resource_stress_test()
-
-        print("\n所有测试完成!")
-
-    except Exception as e:
-        print(f"测试执行失败: {e}")
-        import traceback
-
-        traceback.print_exc()
-
-    finally:
-        # 清理资源
-        ray.shutdown()
-
-
-if __name__ == "__main__":
-    asyncio.run(main())
diff --git a/tests/special_e2e/run_fully_async_policy.sh b/tests/special_e2e/run_fully_async_policy.sh
index 8e0b82ddefc..04c17c98c24 100644
--- a/tests/special_e2e/run_fully_async_policy.sh
+++ b/tests/special_e2e/run_fully_async_policy.sh
@@ -49,15 +49,18 @@ top_k=-1
 val_top_p=0.7
 
 # Fully async specific parameters
-n_gpus_rollout=6
+n_gpus_rollout=4
 n_gpus_training=$((NUM_GPUS - n_gpus_rollout))
 
 train_prompt_bsz=0
 gen_prompt_bsz=1
-n_resp_per_prompt=3
-train_prompt_mini_bsz=32
-total_rollout_steps=50000
-staleness_threshold=10
+n_resp_per_prompt=16
+train_prompt_mini_bsz=64
+staleness_threshold=1
+total_rollout_steps=$(((512*16*10)))
+test_freq=2
+trigger_parameter_sync_step=2
+partial_rollout=True
 
 exp_name="$(basename "${MODEL_ID,,}")-fully-async-policy-${ACTOR_STRATEGY}-minimal"
 
@@ -114,7 +117,7 @@ common_params=(
     trainer.logger=['console']
     trainer.project_name='verl-test-fully-async'
     trainer.experiment_name="${exp_name}"
-    trainer.val_before_train=False
+    trainer.val_before_train=True
     trainer.test_freq=-1
     trainer.save_freq=-1
     trainer.resume_mode=disable
@@ -126,6 +129,7 @@ common_params=(
     rollout.total_epochs=2
     # Fully async specific configurations
     async_training.staleness_threshold=${staleness_threshold}
+    async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}"
 )
 
 if [ "${ACTOR_STRATEGY}" == "fsdp2" ]; then
diff --git a/verl/experimental/agent_loop/agent_loop.py b/verl/experimental/agent_loop/agent_loop.py
index 83ba95c8662..41dc6967b5c 100644
--- a/verl/experimental/agent_loop/agent_loop.py
+++ b/verl/experimental/agent_loop/agent_loop.py
@@ -444,7 +444,7 @@ async def _run_agent_loop(
         messages: list[dict[str, Any]],
         sampling_params: dict[str, Any],
         trajectory: dict[str, Any],
-        partial_output: Optional[AgentLoopOutput],
+        partial_output: Optional[AgentLoopOutput] = None,
     ) -> AgentLoopOutput:
         with rollout_trace_attr(
             step=trajectory["step"],
@@ -456,7 +456,6 @@ async def _run_agent_loop(
             assert agent_name in _agent_loop_registry, (
                 f"Agent loop {agent_name} not registered, registered agent loops: {_agent_loop_registry.keys()}"
             )
-
             agent_loop_config = _agent_loop_registry[agent_name]
             agent_loop = hydra.utils.instantiate(
                 config=agent_loop_config,
diff --git a/verl/experimental/agent_loop/single_turn_agent_loop.py b/verl/experimental/agent_loop/single_turn_agent_loop.py
index 411388e7321..492c1894cc5 100644
--- a/verl/experimental/agent_loop/single_turn_agent_loop.py
+++ b/verl/experimental/agent_loop/single_turn_agent_loop.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 import logging
 import os
-from typing import Any
+from typing import Any, Optional
 from uuid import uuid4
 
 from verl.experimental.agent_loop.agent_loop import AgentLoopBase, AgentLoopOutput, register
@@ -32,7 +32,9 @@ def __init__(self, *args, **kwargs):
         self.prompt_length = self.config.actor_rollout_ref.rollout.prompt_length
         self.response_length = self.config.actor_rollout_ref.rollout.response_length
 
-    async def run(self, messages: list[dict[str, Any]], sampling_params: dict[str, Any]) -> AgentLoopOutput:
+    async def run(self, messages: list[dict[str, Any]],
+                  sampling_params: dict[str, Any],
+                  output: Optional[AgentLoopOutput]) -> AgentLoopOutput:
         metrics = {}
         request_id = uuid4().hex
         prompt_ids = await self.loop.run_in_executor(
diff --git a/verl/experimental/agent_loop/tool_agent_loop.py b/verl/experimental/agent_loop/tool_agent_loop.py
index 3437c0be5ab..a0642048dc7 100644
--- a/verl/experimental/agent_loop/tool_agent_loop.py
+++ b/verl/experimental/agent_loop/tool_agent_loop.py
@@ -15,7 +15,7 @@
 import json
 import logging
 import os
-from typing import Any
+from typing import Any, Optional
 from uuid import uuid4
 
 from verl.experimental.agent_loop.agent_loop import AgentLoopBase, AgentLoopOutput, register
@@ -56,7 +56,10 @@ def init_class(cls, config, tokenizer, **kwargs):
         cls.system_prompt = tokenizer.apply_chat_template([{}], add_generation_prompt=False, tokenize=True)
 
     @rollout_trace_op
-    async def run(self, messages: list[dict[str, Any]], sampling_params: dict[str, Any]) -> AgentLoopOutput:
+    async def run(self,
+                  messages: list[dict[str, Any]],
+                  sampling_params: dict[str, Any],
+                  output: Optional[AgentLoopOutput]) -> AgentLoopOutput:
         metrics = {}
         request_id = uuid4().hex
         prompt_ids = await self.loop.run_in_executor(

From 43883aefa98ece80a1425663bf92bb806ecd1df3 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Mon, 25 Aug 2025 11:50:14 +0800
Subject: [PATCH 074/182] fix FullyAsyncRollouter

---
 recipe/fully_async_policy/fully_async_rollouter.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py
index 0770c7bc6c1..71645f793df 100644
--- a/recipe/fully_async_policy/fully_async_rollouter.py
+++ b/recipe/fully_async_policy/fully_async_rollouter.py
@@ -307,7 +307,7 @@ async def _feed_samples(self):
 
         # 发送结束信号
         progress_bar.close()
-        await self.pending_should_stopqueue.put("DONE")
+        await self.pending_queue.put("DONE")
         print(f"[FullyAsyncRollouter][Feed] 样本添加完成，总共添加了 {self.global_steps} 个步骤的样本")
 
     async def _processor_worker(self):

From b22826586840d3922edcaccba937bbb0385ccba6 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Mon, 25 Aug 2025 19:33:22 +0800
Subject: [PATCH 075/182] group batch

---
 .../dapo_7b_math_fsdp2_4_4.sh                 |  13 ++-
 .../dapo_7b_math_fsdp2_colocate.sh            |  13 ++-
 recipe/fully_async_policy/detach_utils.py     |  81 ++++++-------
 recipe/fully_async_policy/fully_async_main.py |   1 +
 .../fully_async_rollouter.py                  | 110 ++++++++----------
 .../unittest/test_batch_utils.py              |   2 +-
 tests/special_e2e/run_fully_async_policy.sh   |  20 ++--
 verl/experimental/agent_loop/agent_loop.py    |  46 ++++----
 .../partial_single_turn_agent_loop.py         |   9 +-
 9 files changed, 144 insertions(+), 151 deletions(-)

diff --git a/recipe/fully_async_policy/dapo_7b_math_fsdp2_4_4.sh b/recipe/fully_async_policy/dapo_7b_math_fsdp2_4_4.sh
index bb3eb5cc88b..3bcc82c9cef 100644
--- a/recipe/fully_async_policy/dapo_7b_math_fsdp2_4_4.sh
+++ b/recipe/fully_async_policy/dapo_7b_math_fsdp2_4_4.sh
@@ -65,7 +65,7 @@ gen_tp=1
 sp_size=1
 fsdp_size=2
 
-NNODES=${NNODES:-1}
+NNODES=${NNODES:-2}
 NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
 
 # Fully async specific parameters
@@ -75,13 +75,18 @@ n_gpus_training=$((NGPUS_PER_NODE - n_gpus_rollout))
 train_prompt_bsz=0
 gen_prompt_bsz=1
 n_resp_per_prompt=16
-train_prompt_mini_bsz=32
+train_prompt_mini_bsz=64
 staleness_threshold=1
 total_rollout_steps=$(((512*16*10)))
-trigger_parameter_sync_step=24
+trigger_parameter_sync_step=32
 partial_rollout=True
 
-/home/hadoop-djst-algoplat/miniconda3/bin/python -m recipe.fully_async_policy.fully_async_main \
+PYTHON_INTERPRETER="/home/hadoop-djst-algoplat/miniconda3/bin/python"
+if [ ! -x "$PYTHON_INTERPRETER" ]; then
+    PYTHON_INTERPRETER="python3"
+fi
+
+$PYTHON_INTERPRETER -m recipe.fully_async_policy.fully_async_main \
     data.train_files="${TRAIN_FILE}" \
     data.val_files="${TEST_FILE}" \
     data.prompt_key=prompt \
diff --git a/recipe/fully_async_policy/dapo_7b_math_fsdp2_colocate.sh b/recipe/fully_async_policy/dapo_7b_math_fsdp2_colocate.sh
index 938a6d65c32..951db892651 100644
--- a/recipe/fully_async_policy/dapo_7b_math_fsdp2_colocate.sh
+++ b/recipe/fully_async_policy/dapo_7b_math_fsdp2_colocate.sh
@@ -30,7 +30,7 @@ train_prompt_mini_bsz=32
 # RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
 # WORKING_DIR=${WORKING_DIR:-"${PWD}"}
 # RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
-NNODES=${NNODES:-1}
+NNODES=${NNODES:-2}
 NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
 # Paths
 RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
@@ -61,7 +61,12 @@ fsdp_size=2
 
 # reference run wandb: https://wandb.ai/verl-org/DAPO%20Reproduction%20on%20verl/runs/ow47vvon?nw=nwusertongyuxuan361
 
-/home/hadoop-djst-algoplat/miniconda3/bin/python -m verl.trainer.main_ppo \
+PYTHON_INTERPRETER="/home/hadoop-djst-algoplat/miniconda3/bin/python"
+if [ ! -x "$PYTHON_INTERPRETER" ]; then
+    PYTHON_INTERPRETER="python3"
+fi
+
+$PYTHON_INTERPRETER -m verl.trainer.main_ppo \
     data.train_files="${TRAIN_FILE}" \
     data.val_files="${TEST_FILE}" \
     data.prompt_key=prompt \
@@ -127,10 +132,10 @@ fsdp_size=2
     trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
     trainer.nnodes="${NNODES}" \
     trainer.val_before_train=True \
-    trainer.test_freq=-1 \
+    trainer.test_freq=10 \
     trainer.save_freq=-1 \
     trainer.total_epochs=10 \
-    trainer.total_training_steps=10 \
+    trainer.total_training_steps=100 \
     trainer.default_local_dir="${CKPTS_DIR}" \
     trainer.resume_mode=auto \
     trainer.log_val_generations=10
diff --git a/recipe/fully_async_policy/detach_utils.py b/recipe/fully_async_policy/detach_utils.py
index 3ac998bc82a..e3371e59acc 100644
--- a/recipe/fully_async_policy/detach_utils.py
+++ b/recipe/fully_async_policy/detach_utils.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 import time
 from dataclasses import dataclass
-from typing import Any
+from typing import Any, List
 
 import numpy as np
 import torch
@@ -36,20 +36,18 @@ class RolloutSample:
     full_batch: Any
 
     # AgentLoopOutput from generation
-    agent_loop_output: Any  # AgentLoopOutput
+    agent_loop_output_list: List[Any]  # AgentLoopOutput
 
     # Metadata
     sample_id: str
     epoch: int
-    rollout_n_index: int  # Index within the rollout.n repetitions (0, 1, ..., n-1)
-    original_sample_index: int  # Index of the original sample before repetition
 
     # Processing metadata
-    processing_time: float
+    processing_times: List[float]
     param_version: int
 
 
-def prepare_single_generation_data(batch_dict, global_steps) -> DataProto:
+def prepare_single_generation_data(batch_dict, global_steps, rollout_n) -> DataProto:
     """
     类似 ray_trainer._prepare_generate_batch 的逻辑，但针对单个样本
     分离出用于生成的数据和需要保留的原始数据
@@ -81,10 +79,35 @@ def prepare_single_generation_data(batch_dict, global_steps) -> DataProto:
 
     # 添加全局步数到生成数据
     full_batch.meta_info["global_steps"] = global_steps
-
+    full_batch = full_batch.repeat(repeat_times=rollout_n, interleave=True)
     return full_batch
 
 
+def merge_rollout_sample(config, tokenizer, rs: RolloutSample):
+    # 第一步：从 AgentLoopOutput 创建生成结果的 DataProto
+    gen_batch_output = postprocess_agent_loop_outputs(rs.agent_loop_output_list, tokenizer, config)
+
+    # 第二步：添加 uid
+    rs.full_batch.non_tensor_batch["uid"] = np.array([f"uid_{rs.sample_id}"] * len(rs.full_batch), dtype=object)
+
+    # 第二步：合并batch
+    # 将 original_batch 的 non_tensor_batch 和 meta_info 合并到 final_batch
+    for key, value in rs.full_batch.non_tensor_batch.items():
+        gen_batch_output.non_tensor_batch[key] = value
+    gen_batch_output.meta_info.update(rs.full_batch.meta_info)
+
+    # 第三步，设置 full_batch
+    rs.full_batch = gen_batch_output
+    rs.processing_times = []
+    for agent_loop in rs.agent_loop_output_list:
+        rs.processing_times.append(agent_loop.metrics.generate_sequences)
+
+    # 第四步，清空 agent_loop_output_list
+    rs.agent_loop_output_list = []
+
+    return rs
+
+
 def assemble_batch_from_rollout_samples(
     rollout_samples: list[RolloutSample], tokenizer, config, balance_batch=None
 ) -> DataProto:
@@ -111,47 +134,13 @@ def assemble_batch_from_rollout_samples(
 
     print(f"[BatchUtils] Assembling batch from {len(rollout_samples)} RolloutSample objects")
 
-    # 直接处理 RolloutSample 对象
-    processing_times = [rs.processing_time for rs in rollout_samples]
-
-    # 第一步：从 AgentLoopOutput 创建生成结果的 DataProto
-    agent_loop_outputs = [rs.agent_loop_output for rs in rollout_samples]
-    gen_batch_output = postprocess_agent_loop_outputs(agent_loop_outputs, tokenizer, config)
-
-    # 第二步：重建原始 batch 信息
-    # 每个 RolloutSample 都是独立的，直接按顺序重建原始数据
-    original_batch_list = []
-    for rs in rollout_samples:
-        item = rs.full_batch.to_items()[0]
-        original_batch_list.append(item)
-
-    # print("=" * 300)
-    # print(original_batch_list)
-
-    # 合并所有原始样本为一个批次
-    if original_batch_list:
-        original_batch = DataProto.from_items(original_batch_list)
-    else:
-        # 如果没有原始数据，创建空的 DataProto
-        original_batch = DataProto.from_single_dict({})
-
-    # print("=" * 300)
-    # print(original_batch)
+    rollout_samples_batch = []
+    processing_times = []
 
-    # 添加 UID
-    uids = []
     for rs in rollout_samples:
-        uids.append(f"uid_{rs.sample_id}")
-    original_batch.non_tensor_batch["uid"] = np.array(uids, dtype=object)
-
-    # 直接合并原始数据和生成结果，不需要 repeat
-    # 因为队列中的每个 RolloutSample 都已经是独立的样本
-    if original_batch.batch is None:
-        final_batch = gen_batch_output
-        # 将 original_batch 的 non_tensor_batch 和 meta_info 合并到 final_batch
-        for key, value in original_batch.non_tensor_batch.items():
-            final_batch.non_tensor_batch[key] = value
-        final_batch.meta_info.update(original_batch.meta_info)
+        rollout_samples_batch.append(rs.full_batch)
+        processing_times.extend(rs.processing_times)
+    final_batch = DataProto.concat(rollout_samples_batch)
 
     # 计算 response_mask（如果不存在）
     if "response_mask" not in final_batch.batch.keys():
diff --git a/recipe/fully_async_policy/fully_async_main.py b/recipe/fully_async_policy/fully_async_main.py
index cce57501a3b..c86ade05301 100644
--- a/recipe/fully_async_policy/fully_async_main.py
+++ b/recipe/fully_async_policy/fully_async_main.py
@@ -273,6 +273,7 @@ def main(config):
     if not hasattr(config, "async_training"):
         raise RuntimeError("must set async_training config")
     from time import time
+
     start_time = time()
     run_ppo(config, task_runner_class=FullyAsyncTaskRunner)
     print(f"total time: {time() - start_time:.2f} seconds")
diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py
index 0982954db75..e846f13c3be 100644
--- a/recipe/fully_async_policy/fully_async_rollouter.py
+++ b/recipe/fully_async_policy/fully_async_rollouter.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 import asyncio
 import time
-from pprint import pprint
+from pprint import pformat
 
 import ray
 from omegaconf import OmegaConf
@@ -23,6 +23,7 @@
     RolloutSample,
     calculate_one_step_size,
     prepare_single_generation_data,
+    merge_rollout_sample,
 )
 from recipe.fully_async_policy.message_queue import MessageQueueClient
 from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup
@@ -63,7 +64,9 @@ def __init__(
         assert self.config.data.train_batch_size == 0, "train_batch_size must be zero"
         assert self.config.data.gen_batch_size == 1, "gen_batch_size must be one"
         assert self.config.async_training.staleness_threshold >= 0, "staleness_threshold must larger than 0"
-        assert self.config.async_training.trigger_parameter_sync_step >= 1, "trigger_parameter_sync_step must larger than 1"
+        assert self.config.async_training.trigger_parameter_sync_step >= 1, (
+            "trigger_parameter_sync_step must larger than 1"
+        )
 
         self.role_worker_mapping = role_worker_mapping
         self.resource_pool_manager = resource_pool_manager
@@ -149,7 +152,6 @@ def __init__(
         self.max_concurrent_samples = self.required_samples
 
         # 流式处理统计
-        self.max_processing_time = 0.0  # 最长处理时间
         self.processed_sample_count = 0  # 已处理的样本计数
         self.active_sample_count = 0  # 当前正在处理的样本数
         self.queue_full_pause_count = 0  # 队列满导致的暂停次数
@@ -243,54 +245,39 @@ def _init_async_rollout_manager(self):
     async def _feed_samples(self):
         continuous_iterator = self._create_continuous_iterator()
         sample_count = 0
-        should_stop = False
-
-        progress_bar = tqdm(
-            total=self.total_rollout_steps / self.required_samples, initial=self.global_steps, desc="Training Progress"
-        )
 
         for epoch, batch_dict in continuous_iterator:
-            if should_stop:  # 检查停止标志
-                break
-
             # 类似 _prepare_generate_batch 的逻辑：分离数据
-            full_batch = prepare_single_generation_data(batch_dict, self.global_steps)
-
-            # 根据 rollout.n 进行重复
-            for rollout_n_index in range(self.config.actor_rollout_ref.rollout.n):
-                sample_id = f"sample_{epoch}_{sample_count}_{rollout_n_index}"
-
-                rollout_sample = RolloutSample(
-                    full_batch=full_batch,
-                    agent_loop_output=None,  # 待处理后填充
-                    sample_id=sample_id,
-                    epoch=epoch,
-                    rollout_n_index=rollout_n_index,
-                    original_sample_index=sample_count,
-                    processing_time=0.0,  # 待处理后填充
-                    param_version=0,  # 待处理后填充
-                )
+            full_batch = prepare_single_generation_data(
+                batch_dict, self.global_steps, self.config.actor_rollout_ref.rollout.n
+            )
 
-                await self.pending_queue.put(rollout_sample)
+            sample_id = f"sample_{epoch}_{sample_count}"
 
-                # 检查是否到达最后一步
-                if self.global_steps >= self.total_rollout_steps:
-                    print(
-                        f"[FullyAsyncRollouter][Feed] "
-                        f"达到最大步数，停止添加新样本 "
-                        f"{self.global_steps} >= {self.total_rollout_steps}"
-                    )
-                    should_stop = True  # 设置停止标志
-                    break
+            rollout_sample = RolloutSample(
+                full_batch=full_batch,
+                agent_loop_output_list=[None] * self.config.actor_rollout_ref.rollout.n,  # 待处理后填充
+                sample_id=sample_id,
+                epoch=epoch,
+                param_version=0,  # 待处理后填充
+                processing_times=[],
+            )
+
+            await self.pending_queue.put(rollout_sample)
 
-                if self.global_steps % self.required_samples == 0:
-                    progress_bar.update(1)
-                self.global_steps += 1
+            # 检查是否到达最后一步
+            if self.global_steps >= self.total_rollout_steps:
+                print(
+                    f"[FullyAsyncRollouter][Feed] "
+                    f"达到最大步数，停止添加新样本 "
+                    f"{self.global_steps} >= {self.total_rollout_steps}"
+                )
+                break
 
+            self.global_steps += 1
             sample_count += 1
 
         # 发送结束信号
-        progress_bar.close()
         await self.pending_queue.put("DONE")
         print(f"[FullyAsyncRollouter][Feed] 样本添加完成，总共添加了 {self.global_steps} 个步骤的样本")
 
@@ -369,22 +356,29 @@ async def _processor_worker(self):
     async def _process_single_sample_streaming(self, rollout_sample: RolloutSample):
         """流式处理单个样本"""
         # 调用异步生成方法
-        agent_loop_output, processing_time = await self.async_rollout_manager.generate_single_sample_async(
-            rollout_sample.full_batch, rollout_sample.agent_loop_output
+        agent_loop_output_list = await self.async_rollout_manager.generate_single_sample_async(
+            rollout_sample.full_batch, rollout_sample.agent_loop_output_list
         )
         # 直接更新 RolloutSample 对象，填充剩余字段
-        rollout_sample.agent_loop_output = agent_loop_output
-        rollout_sample.processing_time += processing_time
+        rollout_sample.agent_loop_output_list = agent_loop_output_list
         rollout_sample.param_version = self.current_param_version
 
-        # print(
-        #     f"[FullyAsyncRollouter] rollout {rollout_sample.sample_id} "
-        #     f"cost {processing_time:.2f}s  "
-        #     f"len: {len(rollout_sample.agent_loop_output.response_ids)} "
-        #     f"cancel: {agent_loop_output.is_cancel} "
-        # )
+        is_cancel = False
+        # 收集所有信息
+        for agent_loop in agent_loop_output_list:
+            if is_cancel == False and agent_loop.is_cancel:
+                is_cancel = True
 
-        if agent_loop_output.is_cancel:
+        rollout_data = {
+            "cost": [f"{agent_loop.metrics.generate_sequences:.2f}s" for agent_loop in agent_loop_output_list],
+            "len": [len(agent_loop.response_ids) for agent_loop in agent_loop_output_list],
+        }
+        if is_cancel:
+            rollout_data["cancel"] = [agent_loop.is_cancel for agent_loop in agent_loop_output_list]
+        formatted_data = pformat(rollout_data, width=200, compact=True)
+        print(f"[FullyAsyncRollouter] rollout {rollout_sample.sample_id} {formatted_data}")
+
+        if is_cancel:
             # 放入 cancel 队列中，等待恢复生成
             await self.cancel_queue.put(rollout_sample)
         else:
@@ -392,15 +386,14 @@ async def _process_single_sample_streaming(self, rollout_sample: RolloutSample):
             await self.result_queue.put(rollout_sample)
 
         self.processed_sample_count += 1
-        # 更新最大处理时间统计
-        if processing_time > self.max_processing_time:
-            self.max_processing_time = processing_time
 
     async def _consumer_worker(self):
         """消费者协程，负责从结果队列获取处理结果并放入消息队列"""
         while True:
             # 从结果队列获取 RolloutSample
             rollout_sample = await self.result_queue.get()
+            rollout_sample = merge_rollout_sample(self.config, self.tokenizer, rollout_sample)
+
             # 直接将 RolloutSample 放入消息队列
             success = await self.message_queue_client.put_sample(
                 sample=ray.cloudpickle.dumps(rollout_sample),
@@ -411,8 +404,6 @@ async def _consumer_worker(self):
             else:
                 self.dropped_stale_samples += 1
 
-            # print(f"[FullyAsyncRollouter] submit {rollout_sample.sample_id} {'success' if success else 'error'}")
-
             # 标记结果队列任务完成
             self.result_queue.task_done()
 
@@ -437,7 +428,7 @@ async def _streaming_generation_main(self):
         if self.val_reward_fn is not None and self.config.trainer.get("val_before_train", True):
             val_metrics = self._validate()
             assert val_metrics, f"{val_metrics=}"
-            pprint(f"[FullyAsyncRollouter] Initial validation metrics: {val_metrics}")
+            print(f"[FullyAsyncRollouter] Initial validation metrics: \n {pformat(val_metrics)}")
             self.logger.log(data=val_metrics, step=self.global_steps)
             if self.config.trainer.get("val_only", False):
                 return
@@ -550,8 +541,7 @@ async def _async_monitor_loop(self):
             current_time = time.time()
             if current_time - last_stats_time >= stats_interval:
                 stats = await self.get_statistics()
-                print("[FullyAsyncRollouter][MonitorLoop][Statistics]")
-                pprint(stats)
+                print(f"[FullyAsyncRollouter][MonitorLoop][Statistics] {pformat(stats)}")
                 last_stats_time = current_time
 
             # pause 和 resume 之间，不进行恢复操作
diff --git a/recipe/fully_async_policy/unittest/test_batch_utils.py b/recipe/fully_async_policy/unittest/test_batch_utils.py
index b9351c46c28..363423b589d 100644
--- a/recipe/fully_async_policy/unittest/test_batch_utils.py
+++ b/recipe/fully_async_policy/unittest/test_batch_utils.py
@@ -128,7 +128,7 @@ def create_mock_rollout_sample(self, sample_id: str, param_version: int = 1) ->
 
         return RolloutSample(
             full_batch=mock_gen_data,
-            agent_loop_output=agent_loop_output,
+            agent_loop_output_list=agent_loop_output,
             sample_id=sample_id,
             epoch=0,
             rollout_n_index=0,
diff --git a/tests/special_e2e/run_fully_async_policy.sh b/tests/special_e2e/run_fully_async_policy.sh
index 8e0b82ddefc..ac08af80928 100644
--- a/tests/special_e2e/run_fully_async_policy.sh
+++ b/tests/special_e2e/run_fully_async_policy.sh
@@ -49,15 +49,17 @@ top_k=-1
 val_top_p=0.7
 
 # Fully async specific parameters
-n_gpus_rollout=6
+n_gpus_rollout=4
 n_gpus_training=$((NUM_GPUS - n_gpus_rollout))
 
 train_prompt_bsz=0
 gen_prompt_bsz=1
-n_resp_per_prompt=3
-train_prompt_mini_bsz=32
-total_rollout_steps=50000
-staleness_threshold=10
+n_resp_per_prompt=16
+train_prompt_mini_bsz=4
+staleness_threshold=1
+total_rollout_steps=$(((128*2)))
+trigger_parameter_sync_step=4
+partial_rollout=True
 
 exp_name="$(basename "${MODEL_ID,,}")-fully-async-policy-${ACTOR_STRATEGY}-minimal"
 
@@ -126,14 +128,16 @@ common_params=(
     rollout.total_epochs=2
     # Fully async specific configurations
     async_training.staleness_threshold=${staleness_threshold}
+    async_training.partial_rollout="${partial_rollout}"
+    async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}"
 )
 
 if [ "${ACTOR_STRATEGY}" == "fsdp2" ]; then
     echo "Running fully async training with FSDP2 strategy..."
     # FSDP2 specific parameters
-    gen_tp=2
-    sp_size=2
-    fsdp_size=2
+    gen_tp=1
+    sp_size=1
+    fsdp_size=1
     ref_offload=True
     actor_offload=False
 
diff --git a/verl/experimental/agent_loop/agent_loop.py b/verl/experimental/agent_loop/agent_loop.py
index 83ba95c8662..6899937443f 100644
--- a/verl/experimental/agent_loop/agent_loop.py
+++ b/verl/experimental/agent_loop/agent_loop.py
@@ -18,7 +18,7 @@
 import random
 import time
 from abc import ABC, abstractmethod
-from typing import Any, Optional
+from typing import Any, Optional, List
 
 import hydra
 import numpy as np
@@ -383,13 +383,14 @@ async def generate_sequences(self, batch: DataProto) -> DataProto:
         return output
 
     async def generate_sequences_no_post(
-        self, batch: DataProto, partial_output: Optional[AgentLoopOutput]
+            self,
+            batch: DataProto, partial_output_list: Optional[List[AgentLoopOutput]]
     ) -> list[AgentLoopOutput]:
         """Generate sequences from agent loop.
 
         Args:
             batch (DataProto): Input batch.
-            partial_output: Optional[AgentLoopOutput]: already rollout result.
+            partial_output_list: Optional[List[AgentLoopOutput]]: already rollout result.
 
         Returns:
             list[AgentLoopOutput]: List of agent loop outputs, one per sample in the batch.
@@ -427,8 +428,14 @@ async def generate_sequences_no_post(
         trajectory_info = await get_trajectory_info(
             batch.meta_info.get("global_steps", -1), index, batch.meta_info.get("validate", False)
         )
-
-        for agent_name, messages, trajectory in zip(agent_names, raw_prompts, trajectory_info, strict=True):
+        if not partial_output_list:
+            partial_output_list = [None] * len(batch)
+
+        for agent_name, messages, trajectory, partial_output in zip(agent_names,
+                                                                    raw_prompts,
+                                                                    trajectory_info,
+                                                                    partial_output_list,
+                                                                    strict=True):
             tasks.append(
                 asyncio.create_task(
                     self._run_agent_loop(agent_name, messages.tolist(), sampling_params, trajectory, partial_output)
@@ -602,38 +609,25 @@ def generate_sequences(self, prompts: DataProto) -> DataProto:
         output.meta_info = {"timing": timing}
         return output
 
-    async def generate_single_sample_async(
-        self, sample: DataProto, partial_output: Optional[AgentLoopOutput]
-    ) -> tuple[AgentLoopOutput, float]:
+    async def generate_single_sample_async(self,
+                                           sample: DataProto,
+                                           partial_output_list: Optional[List[AgentLoopOutput]],
+                                           ) -> List[AgentLoopOutput]:
         """
-        异步处理单个样本 - 用于流式推理的核心方法
+        异步处理单个样本, 需要复制n次
 
         Args:
             sample: 单个样本数据
-            partial_output: Optional[AgentLoopOutput]: already rollout result.
+            partial_output_list: Optional[List[AgentLoopOutput]]: already rollout result.
 
         Returns:
             tuple[AgentLoopOutput, float]: 处理结果和处理时间
         """
-        start_time = time.time()
-
         # 使用负载均衡选择 worker
         worker = self._select_best_worker()
-
         # 异步处理单个样本 - 使用无后处理版本获取原始AgentLoopOutput
-        output_future = worker.generate_sequences_no_post.remote(sample, partial_output)
-        outputs = await asyncio.wrap_future(output_future.future())
-
-        processing_time = time.time() - start_time
-
-        # outputs 是 AgentLoopOutput 列表，取第一个（因为是单样本）
-        assert len(outputs) == 1, f"Expected single output for single sample, got {len(outputs)}"
-        output = outputs[0]
-
-        # 添加处理时间到metrics
-        output.metrics.generate_sequences = processing_time
-
-        return output, processing_time
+        output_future = worker.generate_sequences_no_post.remote(sample, partial_output_list)
+        return await asyncio.wrap_future(output_future.future())
 
     def _select_best_worker(self):
         """选择最佳的 worker（简单的轮询负载均衡）"""
diff --git a/verl/experimental/agent_loop/partial_single_turn_agent_loop.py b/verl/experimental/agent_loop/partial_single_turn_agent_loop.py
index fd2a7292e67..899b83f1866 100644
--- a/verl/experimental/agent_loop/partial_single_turn_agent_loop.py
+++ b/verl/experimental/agent_loop/partial_single_turn_agent_loop.py
@@ -35,13 +35,18 @@ def __init__(self, *args, **kwargs):
     async def run(
         self, messages: list[dict[str, Any]], sampling_params: dict[str, Any], output: Optional[AgentLoopOutput]
     ) -> AgentLoopOutput:
+
         if not output:
             prompt_ids = await self.loop.run_in_executor(
                 None, lambda: self.tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=True)
             )
         else:
-            # 恢复暂停的样本，结果直接添加到 prompt_ids 后面
-            prompt_ids = output.prompt_ids + output.response_ids
+            if output.is_cancel:
+                # 恢复暂停的样本，结果直接添加到 prompt_ids 后面
+                prompt_ids = output.prompt_ids + output.response_ids
+            else:
+                # 同一批样本，部分cancel，部分没有cancel， 没有cancel的样本直接返回
+                return output
 
         metrics = {}
         request_id = uuid4().hex

From 7d6505432e2bf640c732a2b85b69d84ea8eee4c3 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Wed, 27 Aug 2025 17:07:35 +0800
Subject: [PATCH 076/182] fix oom

---
 recipe/fully_async_policy/dapo_7b_math_fsdp2_colocate.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/recipe/fully_async_policy/dapo_7b_math_fsdp2_colocate.sh b/recipe/fully_async_policy/dapo_7b_math_fsdp2_colocate.sh
index 951db892651..33f9836e095 100644
--- a/recipe/fully_async_policy/dapo_7b_math_fsdp2_colocate.sh
+++ b/recipe/fully_async_policy/dapo_7b_math_fsdp2_colocate.sh
@@ -55,8 +55,8 @@ use_dynamic_bsz=True
 actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
 infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
 offload=True
-gen_tp=1
-sp_size=1
+gen_tp=2
+sp_size=4
 fsdp_size=2
 
 # reference run wandb: https://wandb.ai/verl-org/DAPO%20Reproduction%20on%20verl/runs/ow47vvon?nw=nwusertongyuxuan361

From 57076bc6c509edfb4bfee2e2b28f914278a7041f Mon Sep 17 00:00:00 2001
From: wangshulin02 <wangshulin02@meituan.com>
Date: Thu, 28 Aug 2025 09:27:19 +0800
Subject: [PATCH 077/182] fix validation bug

---
 recipe/fully_async_policy/fully_async_main.py      |  2 ++
 recipe/fully_async_policy/fully_async_rollouter.py | 12 +++++++-----
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/recipe/fully_async_policy/fully_async_main.py b/recipe/fully_async_policy/fully_async_main.py
index c86ade05301..e662aec23bf 100644
--- a/recipe/fully_async_policy/fully_async_main.py
+++ b/recipe/fully_async_policy/fully_async_main.py
@@ -221,6 +221,8 @@ def _create_rollouter(self, config) -> None:
             resource_pool_manager=create_resource_pool_manager(config, roles=[Role.Rollout]),
             ray_worker_group_cls=self.components["ray_worker_group_cls"],
             processor=self.components["processor"],
+            reward_fn=self.components["reward_fn"],
+            val_reward_fn=self.components["val_reward_fn"],
             device_name=config.trainer.device,
         )
 
diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py
index 262dd622e1b..04b2fe5dc54 100644
--- a/recipe/fully_async_policy/fully_async_rollouter.py
+++ b/recipe/fully_async_policy/fully_async_rollouter.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 import asyncio
 import time
-from pprint import pformat
+from pprint import pformat, pprint
 
 import ray
 from omegaconf import OmegaConf
@@ -210,9 +210,11 @@ async def update_param_version(self, version: int):
             )
             timing_raw = {}
             self.update_param_version_time += 1
+            is_last_step = self.global_steps >= self.total_training_steps
             if (self.val_reward_fn is not None
                     and self.config.trainer.test_freq > 0
-                    and (self.is_last_step or self.global_steps % self.config.trainer.test_freq == 0)):
+                    and ((self.global_steps > 0 and self.global_steps % self.config.trainer.test_freq == 0)
+                         or is_last_step)):
                 with marked_timer("testing", timing_raw, color="green"):
                     val_metrics: dict = self._validate()
                     data = ValidateMetrics(timing_raw=timing_raw, metrics=val_metrics)
@@ -438,12 +440,12 @@ async def _streaming_generation_main(self):
             config=OmegaConf.to_container(self.config, resolve=True),
         )
 
-        # load checkpoint before doing anything
-        self._load_checkpoint()
+        # load checkpoint before doing anything 
+        self._load_checkpoint() # TODO: 检查是否需要
 
         # perform validation before training
         # currently, we only support validation using the reward_function.
-        async with self.lock:
+        async with self.lock:   # TODO: 检查是否需要锁
             if self.val_reward_fn is not None and self.config.trainer.get("val_before_train", True):
                 print("Initial validation metric")
                 val_metrics = self._validate()

From a7133c94f1d084cca76c2f86f059a357d220f0da Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Thu, 28 Aug 2025 13:34:05 +0800
Subject: [PATCH 078/182] fsdp2 8 8

---
 .../dapo_7b_math_fsdp2_4_4.sh                 |   2 +-
 .../dapo_7b_math_fsdp2_8_8.sh                 | 170 ++++++++++++++++++
 recipe/fully_async_policy/runtime_env.yaml    |   2 +
 3 files changed, 173 insertions(+), 1 deletion(-)
 create mode 100644 recipe/fully_async_policy/dapo_7b_math_fsdp2_8_8.sh
 create mode 100644 recipe/fully_async_policy/runtime_env.yaml

diff --git a/recipe/fully_async_policy/dapo_7b_math_fsdp2_4_4.sh b/recipe/fully_async_policy/dapo_7b_math_fsdp2_4_4.sh
index 289a3556871..5fb85a66b6f 100644
--- a/recipe/fully_async_policy/dapo_7b_math_fsdp2_4_4.sh
+++ b/recipe/fully_async_policy/dapo_7b_math_fsdp2_4_4.sh
@@ -155,7 +155,7 @@ $PYTHON_INTERPRETER -m recipe.fully_async_policy.fully_async_main \
     trainer.project_name="${project_name}" \
     trainer.experiment_name="${exp_name}" \
     trainer.val_before_train=True \
-    trainer.test_freq=-1 \
+    trainer.test_freq="${test_freq}" \
     trainer.save_freq=-1 \
     trainer.default_local_dir="${CKPTS_DIR}" \
     trainer.resume_mode=auto \
diff --git a/recipe/fully_async_policy/dapo_7b_math_fsdp2_8_8.sh b/recipe/fully_async_policy/dapo_7b_math_fsdp2_8_8.sh
new file mode 100644
index 00000000000..30086d3fe30
--- /dev/null
+++ b/recipe/fully_async_policy/dapo_7b_math_fsdp2_8_8.sh
@@ -0,0 +1,170 @@
+#!/usr/bin/env bash
+set -xeuo pipefail
+
+project_name='DAPO'
+exp_name='DAPO-Qwen2.5-7b-MATH-0527a1-fsdp2-fully-async-8-8'
+
+# Ray
+# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
+# WORKING_DIR=${WORKING_DIR:-"${PWD}"}
+# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
+# Paths
+RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
+# very important! please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface
+MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen2.5-Math-7B"}
+CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"}
+TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"}
+TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"}
+
+MODEL_PATH=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/modelscope/hub/models/Qwen/Qwen2___5-Math-7B
+CKPTS_DIR=./ckpts/${project_name}/${exp_name}
+TRAIN_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/dapo-math-17k.parquet
+TEST_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/aime-2024.parquet
+
+rollout_mode="async"
+rollout_name="vllm" # sglang or vllm
+if [ "$rollout_mode" = "async" ]; then
+    export VLLM_USE_V1=1
+    return_raw_chat="True"
+fi
+
+# Algorithm parameters
+adv_estimator=grpo
+
+use_kl_in_reward=False
+kl_coef=0.0
+use_kl_loss=False
+kl_loss_coef=0.0
+
+clip_ratio_low=0.2
+clip_ratio_high=0.28
+
+# Response length parameters
+max_prompt_length=$((1024 * 2))
+max_response_length=$((1024 * 8))
+enable_overlong_buffer=True
+overlong_buffer_len=$((1024 * 4))
+overlong_penalty_factor=1.0
+
+# Training parameters
+loss_agg_mode="token-mean"
+
+# Algorithm
+temperature=1.0
+top_p=1.0
+top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
+val_top_p=0.7
+
+# Performance Related Parameter
+use_dynamic_bsz=True
+actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
+infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
+ref_offload=True
+actor_offload=False
+gen_tp=1
+sp_size=1
+fsdp_size=2
+
+NNODES=${NNODES:-2}
+NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
+
+# Fully async specific parameters
+n_gpus_rollout=4
+n_gpus_training=$((NGPUS_PER_NODE - n_gpus_rollout))
+
+train_prompt_bsz=0
+gen_prompt_bsz=1
+n_resp_per_prompt=16
+train_prompt_mini_bsz=4
+total_rollout_steps=$(((512*100)))
+test_freq=10
+staleness_threshold=1
+trigger_parameter_sync_step=16
+partial_rollout=True
+
+PYTHON_INTERPRETER="/home/hadoop-djst-algoplat/miniconda3/bin/python"
+if [ ! -x "$PYTHON_INTERPRETER" ]; then
+    PYTHON_INTERPRETER="python3"
+fi
+
+$PYTHON_INTERPRETER -m recipe.fully_async_policy.fully_async_main \
+    data.train_files="${TRAIN_FILE}" \
+    data.val_files="${TEST_FILE}" \
+    data.prompt_key=prompt \
+    data.truncation='left' \
+    data.max_prompt_length=${max_prompt_length} \
+    data.max_response_length=${max_response_length} \
+    data.train_batch_size=${train_prompt_bsz} \
+    data.gen_batch_size=${gen_prompt_bsz} \
+    data.return_raw_chat=${return_raw_chat} \
+    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
+    algorithm.adv_estimator=${adv_estimator} \
+    algorithm.use_kl_in_reward=${use_kl_in_reward} \
+    algorithm.kl_ctrl.kl_coef=${kl_coef} \
+    actor_rollout_ref.actor.strategy=fsdp2 \
+    critic.strategy=fsdp2 \
+    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
+    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
+    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
+    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
+    actor_rollout_ref.actor.clip_ratio_c=10.0 \
+    actor_rollout_ref.model.use_remove_padding=True \
+    actor_rollout_ref.hybrid_engine=False \
+    +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \
+    actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
+    actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+    actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+    actor_rollout_ref.model.path="${MODEL_PATH}" \
+    actor_rollout_ref.actor.optim.lr=1e-6 \
+    actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
+    actor_rollout_ref.actor.optim.weight_decay=0.1 \
+    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
+    actor_rollout_ref.actor.fsdp_config.param_offload=${actor_offload} \
+    actor_rollout_ref.actor.fsdp_config.optimizer_offload=${actor_offload} \
+    actor_rollout_ref.actor.entropy_coeff=0 \
+    actor_rollout_ref.actor.grad_clip=1.0 \
+    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
+    actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \
+    actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
+    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
+    actor_rollout_ref.rollout.enable_chunked_prefill=True \
+    actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
+    actor_rollout_ref.rollout.temperature=${temperature} \
+    actor_rollout_ref.rollout.top_p=${top_p} \
+    actor_rollout_ref.rollout.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
+    actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
+    actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
+    actor_rollout_ref.rollout.val_kwargs.n=1 \
+    actor_rollout_ref.ref.fsdp_config.param_offload=${ref_offload} \
+    actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \
+    actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \
+    actor_rollout_ref.rollout.name=${rollout_name} \
+    actor_rollout_ref.rollout.mode=${rollout_mode} \
+    reward_model.reward_manager=dapo \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
+    +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
+    trainer.logger=['console','tensorboard'] \
+    trainer.project_name="${project_name}" \
+    trainer.experiment_name="${exp_name}" \
+    trainer.val_before_train=True \
+    trainer.test_freq="${test_freq}" \
+    trainer.save_freq=-1 \
+    trainer.default_local_dir="${CKPTS_DIR}" \
+    trainer.resume_mode=auto \
+    trainer.nnodes="${NNODES}" \
+    trainer.n_gpus_per_node="${n_gpus_training}" \
+    rollout.nnodes="${NNODES}" \
+    rollout.n_gpus_per_node="${n_gpus_rollout}" \
+    rollout.total_rollout_steps="${total_rollout_steps}" \
+    rollout.total_epochs=10 \
+    async_training.staleness_threshold="${staleness_threshold}" \
+    async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \
+    async_training.partial_rollout="${partial_rollout}"
diff --git a/recipe/fully_async_policy/runtime_env.yaml b/recipe/fully_async_policy/runtime_env.yaml
new file mode 100644
index 00000000000..81c7c9f4265
--- /dev/null
+++ b/recipe/fully_async_policy/runtime_env.yaml
@@ -0,0 +1,2 @@
+env_vars:
+  VLLM_USE_V1: "1"
\ No newline at end of file

From d3216d2b2c7e683c70f4b23c5ec4fa800a3bb8a2 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Thu, 28 Aug 2025 13:44:08 +0800
Subject: [PATCH 079/182] fsdp2_8_8

---
 recipe/fully_async_policy/dapo_7b_math_fsdp2_8_8.sh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/recipe/fully_async_policy/dapo_7b_math_fsdp2_8_8.sh b/recipe/fully_async_policy/dapo_7b_math_fsdp2_8_8.sh
index 30086d3fe30..52ee0136d5a 100644
--- a/recipe/fully_async_policy/dapo_7b_math_fsdp2_8_8.sh
+++ b/recipe/fully_async_policy/dapo_7b_math_fsdp2_8_8.sh
@@ -65,12 +65,12 @@ gen_tp=1
 sp_size=1
 fsdp_size=2
 
-NNODES=${NNODES:-2}
+NNODES=${NNODES:-1}
 NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
 
 # Fully async specific parameters
-n_gpus_rollout=4
-n_gpus_training=$((NGPUS_PER_NODE - n_gpus_rollout))
+n_gpus_rollout=8
+n_gpus_training=8
 
 train_prompt_bsz=0
 gen_prompt_bsz=1

From c33e40ea5f85fd2a63947ca791e205ce5b2da256 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Thu, 28 Aug 2025 14:31:59 +0800
Subject: [PATCH 080/182] megatron colocate

---
 .../dapo_7b_math_megatron_colocate.sh         | 142 ++++++++++++++++++
 1 file changed, 142 insertions(+)
 create mode 100644 recipe/fully_async_policy/dapo_7b_math_megatron_colocate.sh

diff --git a/recipe/fully_async_policy/dapo_7b_math_megatron_colocate.sh b/recipe/fully_async_policy/dapo_7b_math_megatron_colocate.sh
new file mode 100644
index 00000000000..d05f5571876
--- /dev/null
+++ b/recipe/fully_async_policy/dapo_7b_math_megatron_colocate.sh
@@ -0,0 +1,142 @@
+#!/usr/bin/env bash
+set -xeuo pipefail
+
+project_name='DAPO'
+exp_name='DAPO-Qwen2.5-7b-MATH-0519a1-megatron-colocate'
+
+adv_estimator=grpo
+
+use_kl_in_reward=False
+kl_coef=0.0
+use_kl_loss=False
+kl_loss_coef=0.0
+
+clip_ratio_low=0.2
+clip_ratio_high=0.28
+
+max_prompt_length=$((1024 * 2))
+max_response_length=$((1024 * 8))
+enable_overlong_buffer=True
+overlong_buffer_len=$((1024 * 4))
+overlong_penalty_factor=1.0
+
+loss_agg_mode="token-mean"
+
+train_prompt_bsz=512
+n_resp_per_prompt=16
+train_prompt_mini_bsz=32
+
+# Ray
+# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
+# WORKING_DIR=${WORKING_DIR:-"${PWD}"}
+# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
+NNODES=${NNODES:-2}
+NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
+# Paths
+RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
+# very important! please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface
+MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen2.5-Math-7B"}
+CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"}
+TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"}
+TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"}
+
+MODEL_PATH=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/modelscope/hub/models/Qwen/Qwen2___5-Math-7B
+CKPTS_DIR=./ckpts/${project_name}/${exp_name}
+TRAIN_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/dapo-math-17k.parquet
+TEST_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/aime-2024.parquet
+
+# Algorithm
+temperature=1.0
+top_p=1.0
+top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
+val_top_p=0.7
+
+# Performance Related Parameter
+use_dynamic_bsz=True
+actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
+infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
+offload=True
+gen_tp=2
+train_tp=2
+train_pp=2
+
+# TODO: support dynamic_bsz for megatron
+# actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
+# actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+# actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+# actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
+# actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+# actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+
+python3 -m verl.trainer.main_ppo \
+    --config-path=config \
+    --config-name='ppo_megatron_trainer.yaml' \
+    data.train_files="${TRAIN_FILE}" \
+    data.val_files="${TEST_FILE}" \
+    data.prompt_key=prompt \
+    data.truncation='left' \
+    data.max_prompt_length=${max_prompt_length} \
+    data.max_response_length=${max_response_length} \
+    data.train_batch_size=${train_prompt_bsz} \
+    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
+    algorithm.adv_estimator=${adv_estimator} \
+    algorithm.use_kl_in_reward=${use_kl_in_reward} \
+    algorithm.kl_ctrl.kl_coef=${kl_coef} \
+    actor_rollout_ref.actor.strategy=megatron \
+    critic.strategy=megatron \
+    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
+    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
+    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
+    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
+    actor_rollout_ref.actor.clip_ratio_c=10.0 \
+    +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \
+    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \
+    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
+    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
+    actor_rollout_ref.model.path="${MODEL_PATH}" \
+    actor_rollout_ref.actor.optim.lr=1e-6 \
+    actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
+    actor_rollout_ref.actor.optim.weight_decay=0.1 \
+    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
+    actor_rollout_ref.actor.megatron.param_offload=${offload} \
+    actor_rollout_ref.actor.megatron.optimizer_offload=${offload} \
+    actor_rollout_ref.actor.megatron.grad_offload=${offload} \
+    actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${train_pp} \
+    actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${train_tp} \
+    actor_rollout_ref.actor.entropy_coeff=0 \
+    actor_rollout_ref.actor.optim.clip_grad=1.0 \
+    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
+    actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
+    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
+    actor_rollout_ref.rollout.enable_chunked_prefill=True \
+    actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
+    actor_rollout_ref.rollout.temperature=${temperature} \
+    actor_rollout_ref.rollout.top_p=${top_p} \
+    actor_rollout_ref.rollout.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
+    actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
+    actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
+    actor_rollout_ref.rollout.val_kwargs.n=1 \
+    actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=${train_pp} \
+    actor_rollout_ref.ref.megatron.tensor_model_parallel_size=${train_tp} \
+    actor_rollout_ref.ref.megatron.param_offload=${offload} \
+    reward_model.reward_manager=dapo \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
+    +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
+    trainer.logger=['console','tensorboard'] \
+    trainer.project_name="${project_name}" \
+    trainer.experiment_name="${exp_name}" \
+    trainer.n_gpus_per_node=8 \
+    trainer.nnodes="${NNODES}" \
+    trainer.val_before_train=True \
+    trainer.test_freq=10 \
+    trainer.save_freq=-1 \
+    trainer.total_epochs=10 \
+    trainer.total_training_steps=100 \
+    trainer.default_local_dir="${CKPTS_DIR}" \
+    trainer.resume_mode=auto \
+    trainer.log_val_generations=10

From d391a0612722d0e9a8347f953054432866303df0 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Fri, 29 Aug 2025 17:22:55 +0800
Subject: [PATCH 081/182] rollout log probs

---
 .../config/fully_async_ppo_trainer.yaml       |  1 +
 recipe/fully_async_policy/detach_utils.py     | 39 +++++++++++++++++++
 .../fully_async_rollouter.py                  |  2 +-
 verl/experimental/agent_loop/agent_loop.py    |  2 +
 .../partial_single_turn_agent_loop.py         |  4 +-
 verl/trainer/ppo/ray_trainer.py               |  5 +++
 .../rollout/vllm_rollout/vllm_async_server.py | 19 ++++++---
 7 files changed, 65 insertions(+), 7 deletions(-)

diff --git a/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml b/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml
index 0714e107ee4..3334ee4f4d5 100644
--- a/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml
+++ b/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml
@@ -13,6 +13,7 @@ async_training:
   staleness_threshold: 1              # 样本新鲜度阈值
   trigger_parameter_sync_step: 4     # >=1 train 每次训练一个batch, 迭代多少次后触发更新
   partial_rollout: True               # 同步参数时，是否中断 rollout
+  use_rollout_log_probs: True
 
 # Rollout配置
 rollout:
diff --git a/recipe/fully_async_policy/detach_utils.py b/recipe/fully_async_policy/detach_utils.py
index 0296945a2ab..986cb468932 100644
--- a/recipe/fully_async_policy/detach_utils.py
+++ b/recipe/fully_async_policy/detach_utils.py
@@ -87,9 +87,48 @@ def prepare_single_generation_data(batch_dict, global_steps, rollout_n) -> DataP
     return full_batch
 
 
+def process_rollout_log_probs(data_proto: DataProto,
+                              rollout_log_probs: list[list[float]]) -> torch.Tensor:
+    """
+    根据 DataProto 中的 mask 逻辑处理 rollout_log_probs
+    # attention_mask: [0,0,0,0,1,1,1,1, | 1,1,1,0,0,0,0,0]
+
+    Args:
+        data_proto: 包含 batch 信息的 DataProto 对象
+        rollout_log_probs: 二维列表，每个子列表包含一个样本的 log_probs
+
+    Returns:
+        torch.Tensor: 处理后的 log_probs tensor，形状为 [bsz, response_length]
+    """
+
+    batch = data_proto.batch
+    response_mask = batch["response_mask"]
+    bsz, response_length = response_mask.shape
+
+    # 初始化结果 tensor
+    rollout_log_probs_tensor = torch.zeros((bsz, response_length), dtype=torch.float32) - 1
+
+
+    for i, log_probs_seq in enumerate(rollout_log_probs):
+        # 获取当前样本的有效长度（mask 中为 1 的位置数量）
+        valid_length = response_mask[i].sum().item()
+
+        # 确保 log_probs_seq 的长度不超过有效长度
+        actual_length = min(len(log_probs_seq), valid_length)
+
+        # 将 log_probs 填入对应位置
+        if actual_length > 0:
+            rollout_log_probs_tensor[i, :actual_length] = torch.tensor(log_probs_seq[:actual_length])
+
+    rollout_log_probs_tensor = rollout_log_probs_tensor.to(torch.float32)
+    return rollout_log_probs_tensor
+
 def merge_rollout_sample(config, tokenizer, rs: RolloutSample):
     # 第一步：从 AgentLoopOutput 创建生成结果的 DataProto
     gen_batch_output = postprocess_agent_loop_outputs(rs.agent_loop_output_list, tokenizer, config)
+    rollout_log_probs = [x.log_probs for x in rs.agent_loop_output_list]
+    rollout_log_probs = process_rollout_log_probs(gen_batch_output, rollout_log_probs)
+    gen_batch_output.batch['rollout_log_probs'] = rollout_log_probs.to(torch.float32)
 
     # 第二步：添加 uid
     rs.full_batch.non_tensor_batch["uid"] = np.array([f"uid_{rs.sample_id}"] * len(rs.full_batch), dtype=object)
diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py
index 04b2fe5dc54..b2bcebede58 100644
--- a/recipe/fully_async_policy/fully_async_rollouter.py
+++ b/recipe/fully_async_policy/fully_async_rollouter.py
@@ -226,7 +226,7 @@ def _validate_config(self):
         # Validate asynchronous training configuration
         if not hasattr(self.config, "async_training"):
             raise ValueError("[FullyAsyncRollouter] Missing async_training configuration")
-
+        assert self.config.actor_rollout_ref.rollout.calculate_log_probs == True, "must rollout calculate log_probs"
         super()._validate_config()
 
     def _create_actor_rollout_classes(self):
diff --git a/verl/experimental/agent_loop/agent_loop.py b/verl/experimental/agent_loop/agent_loop.py
index e9383b109e5..743a4927b7e 100644
--- a/verl/experimental/agent_loop/agent_loop.py
+++ b/verl/experimental/agent_loop/agent_loop.py
@@ -136,6 +136,8 @@ class AgentLoopOutput(BaseModel):
     """Auxiliary performance metrics"""
     is_cancel: bool = False
     """Indicates whether the request was interrupted"""
+    log_probs: list[float] = None
+    """Response token log probs including LLM generated token, tool response token."""
 
 
 # make hydra.utils.instantiate happy
diff --git a/verl/experimental/agent_loop/partial_single_turn_agent_loop.py b/verl/experimental/agent_loop/partial_single_turn_agent_loop.py
index 899b83f1866..c94788cd61d 100644
--- a/verl/experimental/agent_loop/partial_single_turn_agent_loop.py
+++ b/verl/experimental/agent_loop/partial_single_turn_agent_loop.py
@@ -51,7 +51,7 @@ async def run(
         metrics = {}
         request_id = uuid4().hex
         with simple_timer("generate_sequences", metrics):
-            response_ids, is_cancel = await self.server_manager.generate_for_partial(
+            response_ids, log_probs, is_cancel = await self.server_manager.generate_for_partial(
                 request_id=request_id, prompt_ids=prompt_ids, sampling_params=sampling_params
             )
 
@@ -60,6 +60,7 @@ async def run(
         # 暂停待恢复样本, 把输出结果加到 response_ids 后，并重置 response_mask
         else:
             prompt_ids = output.prompt_ids
+            log_probs = output.log_probs + log_probs
             response_ids = output.response_ids + response_ids
             response_mask = [1] * len(response_ids)
 
@@ -70,4 +71,5 @@ async def run(
             num_turns=2,
             metrics=metrics,
             is_cancel=is_cancel,
+            log_probs=log_probs
         )
diff --git a/verl/trainer/ppo/ray_trainer.py b/verl/trainer/ppo/ray_trainer.py
index 60621021b30..4aa7102977f 100644
--- a/verl/trainer/ppo/ray_trainer.py
+++ b/verl/trainer/ppo/ray_trainer.py
@@ -1280,6 +1280,11 @@ def _process_batch_common(self, batch, metrics, timing_raw):
                         "training/rollout_probs_diff_std": rollout_probs_diff_std.detach().item(),
                     }
                 )
+                if self.config.async_training and self.config.async_training.use_rollout_log_probs:
+                    print("use_rollout_log_probs")
+                    batch.batch["old_log_probs"] = batch.batch["rollout_log_probs"]
+                    del actor_old_log_probs
+
         if self.use_reference_policy:
             # compute reference log_prob
             with marked_timer("ref", timing_raw, color="olive"):
diff --git a/verl/workers/rollout/vllm_rollout/vllm_async_server.py b/verl/workers/rollout/vllm_rollout/vllm_async_server.py
index 7ce640e33cb..970c309f84a 100644
--- a/verl/workers/rollout/vllm_rollout/vllm_async_server.py
+++ b/verl/workers/rollout/vllm_rollout/vllm_async_server.py
@@ -337,7 +337,7 @@ async def generate(self, prompt_ids: list[int], sampling_params: dict[str, Any],
 
     async def _generate_step(self, prompt_ids: list[int], sampling_params: dict[str, Any], request_id: str):
         max_tokens = self.max_model_len - len(prompt_ids)
-        sampling_params = SamplingParams(max_tokens=max_tokens, **sampling_params)
+        sampling_params = SamplingParams(max_tokens=max_tokens, logprobs=1, **sampling_params)
         prompt = TokensPrompt(prompt_token_ids=prompt_ids)
         generator = self.engine.generate(prompt=prompt, sampling_params=sampling_params, request_id=request_id)
 
@@ -348,13 +348,13 @@ async def _generate_step(self, prompt_ids: list[int], sampling_params: dict[str,
         assert self.req_output[request_id] is not None
 
     async def generate_for_partial(
-        self, prompt_ids: list[int], sampling_params: dict[str, Any], request_id: str
-    ) -> tuple[Sequence[int], bool] | tuple[str, bool]:
+            self, prompt_ids: list[int], sampling_params: dict[str, Any], request_id: str
+    ) -> tuple[list[Any], list[Any], bool] | tuple[Sequence[int], list[float], Any]:
         # 设置中断标志
         async with self.lock:
             if self.paused:
                 # cancel 后， 所有任务直接返回，等待下次提交
-                return [], True
+                return [], [], True
             self.cancel_event[request_id] = asyncio.Event()
             cancel_handle = asyncio.create_task(self.cancel_event[request_id].wait())
             generation_handle = asyncio.create_task(self._generate_step(prompt_ids, sampling_params, request_id))
@@ -368,11 +368,20 @@ async def generate_for_partial(
             task.cancel()
 
         async with self.lock:
+            print(f"token_ids size: {len(self.req_output[request_id].outputs[0].token_ids)}")
+            print(f"log_probs size: {len(self.req_output[request_id].outputs[0].logprobs)}")
             token_ids = self.req_output[request_id].outputs[0].token_ids
+            log_probs: list[float] = []
+            for i, x in enumerate(self.req_output[request_id].outputs[0].logprobs):
+                # sampling_params 中 logprobs 设置为1，只返回1个
+                token_id = self.req_output[request_id].outputs[0].token_ids[i]
+                log_probs.append(x[token_id].logprob)
+
             is_cancel = generation_handle not in done
             self.cancel_event.pop(request_id, None)
             self.req_output.pop(request_id, None)
-        return token_ids, is_cancel
+
+        return token_ids, log_probs, is_cancel
 
     async def cancel(self):
         async with self.lock:

From f27b916fcea413f2faa8e8d6f93822e53144e4ab Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Fri, 29 Aug 2025 18:29:23 +0800
Subject: [PATCH 082/182] tensorboard

step size

refactor code

fix message_queue

total_train_steps

int max_queue_size

await

self.max_steps_duration

refactor print
---
 ...fsdp2_4_4.sh => dapo_7b_math_fsdp2_2_6.sh} |   9 +-
 recipe/fully_async_policy/detach_utils.py     |  19 ++-
 recipe/fully_async_policy/fully_async_main.py |  14 +-
 .../fully_async_rollouter.py                  | 157 ++++++++----------
 .../fully_async_policy/fully_async_trainer.py |  47 +++---
 recipe/fully_async_policy/message_queue.py    |   9 +-
 tests/special_e2e/run_fully_async_policy.sh   |   3 +-
 verl/experimental/agent_loop/agent_loop.py    |  24 ++-
 .../partial_single_turn_agent_loop.py         |   3 +-
 .../agent_loop/single_turn_agent_loop.py      |   6 +-
 .../agent_loop/tool_agent_loop.py             |   7 +-
 verl/trainer/main_ppo.py                      |   1 +
 verl/trainer/ppo/ray_trainer.py               |   1 -
 .../rollout/vllm_rollout/vllm_async_server.py |  14 +-
 14 files changed, 155 insertions(+), 159 deletions(-)
 rename recipe/fully_async_policy/{dapo_7b_math_fsdp2_4_4.sh => dapo_7b_math_fsdp2_2_6.sh} (97%)

diff --git a/recipe/fully_async_policy/dapo_7b_math_fsdp2_4_4.sh b/recipe/fully_async_policy/dapo_7b_math_fsdp2_2_6.sh
similarity index 97%
rename from recipe/fully_async_policy/dapo_7b_math_fsdp2_4_4.sh
rename to recipe/fully_async_policy/dapo_7b_math_fsdp2_2_6.sh
index 5fb85a66b6f..58017f0123b 100644
--- a/recipe/fully_async_policy/dapo_7b_math_fsdp2_4_4.sh
+++ b/recipe/fully_async_policy/dapo_7b_math_fsdp2_2_6.sh
@@ -69,15 +69,15 @@ NNODES=${NNODES:-1}
 NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
 
 # Fully async specific parameters
-n_gpus_rollout=4
+n_gpus_rollout=2
 n_gpus_training=$((NGPUS_PER_NODE - n_gpus_rollout))
 
 train_prompt_bsz=0
 gen_prompt_bsz=1
 n_resp_per_prompt=16
-train_prompt_mini_bsz=4
-total_rollout_steps=$(((512*10)))
-test_freq=-1
+train_prompt_mini_bsz=64
+total_rollout_steps=$(((512*100)))
+test_freq=5
 staleness_threshold=1
 trigger_parameter_sync_step=16
 partial_rollout=True
@@ -140,6 +140,7 @@ $PYTHON_INTERPRETER -m recipe.fully_async_policy.fully_async_main \
     actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
     actor_rollout_ref.rollout.val_kwargs.do_sample=True \
     actor_rollout_ref.rollout.val_kwargs.n=1 \
+    actor_rollout_ref.rollout.calculate_log_probs=True \
     actor_rollout_ref.ref.fsdp_config.param_offload=${ref_offload} \
     actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \
     actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \
diff --git a/recipe/fully_async_policy/detach_utils.py b/recipe/fully_async_policy/detach_utils.py
index 986cb468932..af8dfe16857 100644
--- a/recipe/fully_async_policy/detach_utils.py
+++ b/recipe/fully_async_policy/detach_utils.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 import time
 from dataclasses import dataclass
-from typing import Any, Dict, List
+from typing import Any
 
 import numpy as np
 import torch
@@ -36,20 +36,22 @@ class RolloutSample:
     full_batch: Any
 
     # AgentLoopOutput from generation
-    agent_loop_output_list: List[Any]  # AgentLoopOutput
+    agent_loop_output_list: list[Any]  # AgentLoopOutput
 
     # Metadata
     sample_id: str
     epoch: int
 
     # Processing metadata
-    processing_times: List[float]
+    processing_times: list[float]
     param_version: int
 
+
 @dataclass
 class ValidateMetrics:
-    timing_raw: Dict[str, Any]
-    metrics: Dict[str, Any]
+    timing_raw: dict[str, Any]
+    metrics: dict[str, Any]
+
 
 def prepare_single_generation_data(batch_dict, global_steps, rollout_n) -> DataProto:
     """
@@ -87,8 +89,7 @@ def prepare_single_generation_data(batch_dict, global_steps, rollout_n) -> DataP
     return full_batch
 
 
-def process_rollout_log_probs(data_proto: DataProto,
-                              rollout_log_probs: list[list[float]]) -> torch.Tensor:
+def process_rollout_log_probs(data_proto: DataProto, rollout_log_probs: list[list[float]]) -> torch.Tensor:
     """
     根据 DataProto 中的 mask 逻辑处理 rollout_log_probs
     # attention_mask: [0,0,0,0,1,1,1,1, | 1,1,1,0,0,0,0,0]
@@ -108,7 +109,6 @@ def process_rollout_log_probs(data_proto: DataProto,
     # 初始化结果 tensor
     rollout_log_probs_tensor = torch.zeros((bsz, response_length), dtype=torch.float32) - 1
 
-
     for i, log_probs_seq in enumerate(rollout_log_probs):
         # 获取当前样本的有效长度（mask 中为 1 的位置数量）
         valid_length = response_mask[i].sum().item()
@@ -123,12 +123,13 @@ def process_rollout_log_probs(data_proto: DataProto,
     rollout_log_probs_tensor = rollout_log_probs_tensor.to(torch.float32)
     return rollout_log_probs_tensor
 
+
 def merge_rollout_sample(config, tokenizer, rs: RolloutSample):
     # 第一步：从 AgentLoopOutput 创建生成结果的 DataProto
     gen_batch_output = postprocess_agent_loop_outputs(rs.agent_loop_output_list, tokenizer, config)
     rollout_log_probs = [x.log_probs for x in rs.agent_loop_output_list]
     rollout_log_probs = process_rollout_log_probs(gen_batch_output, rollout_log_probs)
-    gen_batch_output.batch['rollout_log_probs'] = rollout_log_probs.to(torch.float32)
+    gen_batch_output.batch["rollout_log_probs"] = rollout_log_probs.to(torch.float32)
 
     # 第二步：添加 uid
     rs.full_batch.non_tensor_batch["uid"] = np.array([f"uid_{rs.sample_id}"] * len(rs.full_batch), dtype=object)
diff --git a/recipe/fully_async_policy/fully_async_main.py b/recipe/fully_async_policy/fully_async_main.py
index e662aec23bf..2b5663bd5ea 100644
--- a/recipe/fully_async_policy/fully_async_main.py
+++ b/recipe/fully_async_policy/fully_async_main.py
@@ -185,8 +185,18 @@ def _initialize_components(self, config) -> None:
         print("[ASYNC MAIN] Creating FullyAsyncTrainer...")
         self._create_trainer(config)
 
-        print("[ASYNC MAIN] Creating MessageQueue...")
+        # 同步require samples
+        required_samples = ray.get(self.components["trainer"].get_required_samples.remote())
+        ray.get(self.components["rollouter"].set_required_samples.remote(required_samples))
+
+        # 同步total_train_steps
+        total_train_steps = ray.get(self.components["rollouter"].get_total_train_steps.remote())
+        print(f"total_train_steps {total_train_steps}")
+        ray.get(self.components["trainer"].set_total_train_steps.remote(total_train_steps))
+
+        # 获取 max_queue_size (使用同步方法避免异步返回值问题)
         max_queue_size = ray.get(self.components["rollouter"].get_max_queue_size.remote())
+        print(f"[ASYNC MAIN] Creating MessageQueue... max_queue_size {max_queue_size}")
         message_queue = MessageQueue.remote(config, max_queue_size)
         message_queue_client = MessageQueueClient(message_queue)
         self.components["message_queue"] = message_queue
@@ -204,9 +214,7 @@ def _initialize_components(self, config) -> None:
             rollouter=self.components["rollouter"],
             mq=self.components["message_queue_client"],
         )
-
         ray.get(self.components["trainer"].set_parameter_synchronizer.remote(param_synchronizer))
-        ray.get(self.components["rollouter"].set_parameter_synchronizer.remote(param_synchronizer))
 
         ray.get(param_synchronizer.sync_weights.remote(0))
 
diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py
index b2bcebede58..83bc2c0ce8a 100644
--- a/recipe/fully_async_policy/fully_async_rollouter.py
+++ b/recipe/fully_async_policy/fully_async_rollouter.py
@@ -17,14 +17,12 @@
 
 import ray
 from omegaconf import OmegaConf
-from tqdm import tqdm
 
 from recipe.fully_async_policy.detach_utils import (
     RolloutSample,
-    calculate_one_step_size,
     ValidateMetrics,
-    prepare_single_generation_data,
     merge_rollout_sample,
+    prepare_single_generation_data,
 )
 from recipe.fully_async_policy.message_queue import MessageQueueClient
 from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup
@@ -102,93 +100,81 @@ def __init__(
         if self.config.rollout.total_rollout_steps is not None:
             self.total_rollout_steps = min(self.config.rollout.total_rollout_steps, self.total_rollout_steps)
         print(f"[FullyAsyncRollouter] Total rollout steps: {self.total_rollout_steps}")
+        self.total_train_steps = None
+
+        # ==================== fully async config ====================
 
         # Rollouter parameter configuration
         self.message_queue_client = None
 
-        self.current_param_version = 0
+        # Worker groups: rollout_wg is same to actor_rollout_wg
+        self.rollout_wg = None
+        self.actor_rollout_wg = None
+        self.async_rollout_manager = None
 
-        # Freshness control - improved configuration management
-        async_config = config.async_training
-        self.staleness_threshold = async_config.get("staleness_threshold", 3)
+        # Config
+        self.staleness_threshold: int = config.async_training.get("staleness_threshold", 1)
+        self.required_samples = None
+        self.max_required_samples = None
+        # 单次最多扔一次更新需要的样本
+        self.max_concurrent_samples = None
+        # queue size
+        self.max_queue_size = None
 
         # Statistics
+        self.current_param_version = 0
         self.total_generated_samples = 0
         self.staleness_samples = 0
         self.dropped_stale_samples = 0
-
-        # Worker groups
-        self.rollout_wg = None
-        self.message_queue_client = None
+        self.processed_sample_count = 0  # 已处理的样本计数
+        self.global_steps = 0
 
         # Concurrency control
         self.paused = False
         self.running = True
+        # 通过 pause 和 resume 控制 monitor_loop 中，是否进行 尝试恢复 操作
+        self.monitor_loop_trigger = True
 
         # Initialize async locks directly
         self.lock = asyncio.Lock()
         self.condition = asyncio.Condition(self.lock)
 
-        # Pause/resume statistics
-        self.total_pause_time = 0.0
-        self.last_pause_time = None
-
-        # Parameter synchronization related
-        self.param_synchronizer = None
-
-        self.async_rollout_manager = None
-
-        # Calculate the samples needed for a train, used to calculate staleness and interrupt rollout
-        self.required_samples = calculate_one_step_size(
-            self.minimal_bsz, config.actor_rollout_ref.actor.ppo_mini_batch_size
-        )
-        self.max_required_samples = (
-            self.required_samples * (self.staleness_threshold + 1) * config.async_training.trigger_parameter_sync_step
-        )
-        print(
-            f"[FullyAsyncRollouter] required_samples : {self.required_samples} "
-            f"max_required_samples: {self.max_required_samples}"
-        )
-
-        # 单次最多扔一次更新需要的样本
-        self.max_concurrent_samples = self.required_samples
-
-        # 流式处理统计
-        self.processed_sample_count = 0  # 已处理的样本计数
-        self.active_sample_count = 0  # 当前正在处理的样本数
-        self.queue_full_pause_count = 0  # 队列满导致的暂停次数
-
-        # queue size
-        self.max_queue_size = self.max_required_samples * 10  # x 10 avoid deadlock
-        print(f"[FullyAsyncRollouter] {self.max_queue_size}")
-
         # 初始化异步队列
-        self.pending_queue = asyncio.Queue(maxsize=100)
+        self.pending_queue = asyncio.Queue(maxsize=128)
         self.active_tasks = set()
         self.result_queue = asyncio.Queue()
         self.cancel_queue = asyncio.Queue()
 
-        # 通过 pause 和 resume 控制 monitor_loop 中，是否进行 尝试恢复 操作
-        self.monitor_loop_trigger = True
-
-        self.update_param_version_time = 0
-        self.global_steps = 0
-
-        self.progress_bar = tqdm(
-            total=self.total_rollout_steps / (
-                    self.required_samples * self.config.async_training.trigger_parameter_sync_step),
-            initial=self.global_steps, desc="Training Progress"
-        )
-
     async def set_message_queue_client(self, message_queue_client: MessageQueueClient):
         """Set message queue client"""
         async with self.lock:
             self.message_queue_client = message_queue_client
 
-    async def set_parameter_synchronizer(self, param_synchronizer):
-        """Set parameter synchronizer"""
+    async def set_required_samples(self, required_samples: int):
         async with self.lock:
-            self.param_synchronizer = param_synchronizer
+            self.required_samples = int(required_samples)
+            self.max_required_samples = (
+                self.required_samples
+                * (self.staleness_threshold + 1)
+                * self.config.async_training.trigger_parameter_sync_step
+            )
+            self.total_train_steps = int(
+                self.total_rollout_steps
+                / (self.required_samples * self.config.async_training.trigger_parameter_sync_step)
+            )
+
+            # 单次最多扔一次更新需要的样本
+            self.max_concurrent_samples = self.required_samples
+            self.max_queue_size = self.max_required_samples
+
+            print(
+                f"[FullyAsyncRollouter] required_samples : {self.required_samples} "
+                f"max_required_samples: {self.max_required_samples} "
+                f"max_queue_size: {self.max_queue_size} "
+                f"total_train_steps: {self.total_train_steps} "
+                f"total_rollout_steps: {self.total_rollout_steps} "
+                f"max_concurrent_samples: {self.max_concurrent_samples} "
+            )
 
     def get_rollout_wg(self):
         """Get rollout worker group"""
@@ -197,6 +183,9 @@ def get_rollout_wg(self):
     def get_max_queue_size(self):
         return self.max_queue_size
 
+    def get_total_train_steps(self):
+        return self.total_train_steps
+
     async def update_param_version(self, version: int):
         """Update current parameter version"""
         async with self.lock:
@@ -209,24 +198,22 @@ async def update_param_version(self, version: int):
                 f"Parameter version updated from {old_version} to {version}"
             )
             timing_raw = {}
-            self.update_param_version_time += 1
             is_last_step = self.global_steps >= self.total_training_steps
-            if (self.val_reward_fn is not None
-                    and self.config.trainer.test_freq > 0
-                    and ((self.global_steps > 0 and self.global_steps % self.config.trainer.test_freq == 0)
-                         or is_last_step)):
+            if (
+                self.val_reward_fn is not None
+                and self.config.trainer.test_freq > 0
+                and ((self.global_steps > 0 and self.global_steps % self.config.trainer.test_freq == 0) or is_last_step)
+            ):
                 with marked_timer("testing", timing_raw, color="green"):
                     val_metrics: dict = self._validate()
                     data = ValidateMetrics(timing_raw=timing_raw, metrics=val_metrics)
-                    self.message_queue_client.put_validate(ray.cloudpickle.dumps(data))
-            if version > 0:
-                self.progress_bar.update(1)
+                    await self.message_queue_client.put_validate(ray.cloudpickle.dumps(data))
 
     def _validate_config(self):
         # Validate asynchronous training configuration
         if not hasattr(self.config, "async_training"):
             raise ValueError("[FullyAsyncRollouter] Missing async_training configuration")
-        assert self.config.actor_rollout_ref.rollout.calculate_log_probs == True, "must rollout calculate log_probs"
+        assert self.config.actor_rollout_ref.rollout.calculate_log_probs, "must rollout calculate log_probs"
         super()._validate_config()
 
     def _create_actor_rollout_classes(self):
@@ -388,17 +375,17 @@ async def _process_single_sample_streaming(self, rollout_sample: RolloutSample):
         is_cancel = False
         # 收集所有信息
         for agent_loop in agent_loop_output_list:
-            if is_cancel == False and agent_loop.is_cancel:
+            if not is_cancel and agent_loop.is_cancel:
                 is_cancel = True
 
-        rollout_data = {
-            "cost": [f"{agent_loop.metrics.generate_sequences:.2f}s" for agent_loop in agent_loop_output_list],
-            "len": [len(agent_loop.response_ids) for agent_loop in agent_loop_output_list],
-        }
-        if is_cancel:
-            rollout_data["cancel"] = [agent_loop.is_cancel for agent_loop in agent_loop_output_list]
-        formatted_data = pformat(rollout_data, width=200, compact=True)
-        print(f"[FullyAsyncRollouter] rollout {rollout_sample.sample_id} {formatted_data}")
+        # rollout_data = {
+        #     "cost": [f"{agent_loop.metrics.generate_sequences:.2f}s" for agent_loop in agent_loop_output_list],
+        #     "len": [len(agent_loop.response_ids) for agent_loop in agent_loop_output_list],
+        # }
+        # if is_cancel:
+        #     rollout_data["cancel"] = [agent_loop.is_cancel for agent_loop in agent_loop_output_list]
+        # formatted_data = pformat(rollout_data, width=200, compact=True)
+        # print(f"[FullyAsyncRollouter] rollout {rollout_sample.sample_id} {formatted_data}")
 
         if is_cancel:
             # 放入 cancel 队列中，等待恢复生成
@@ -440,12 +427,12 @@ async def _streaming_generation_main(self):
             config=OmegaConf.to_container(self.config, resolve=True),
         )
 
-        # load checkpoint before doing anything 
-        self._load_checkpoint() # TODO: 检查是否需要
+        # load checkpoint before doing anything
+        self._load_checkpoint()  # TODO: 检查是否需要
 
         # perform validation before training
         # currently, we only support validation using the reward_function.
-        async with self.lock:   # TODO: 检查是否需要锁
+        async with self.lock:  # TODO: 检查是否需要锁
             if self.val_reward_fn is not None and self.config.trainer.get("val_before_train", True):
                 print("Initial validation metric")
                 val_metrics = self._validate()
@@ -514,8 +501,6 @@ async def fit(self):
 
         if self.message_queue_client is None:
             raise ValueError("MessageQueue client not set. Call set_message_queue_client() first.")
-        if self.param_synchronizer is None:
-            raise ValueError("param_synchronizer client not set. Call set_parameter_synchronizer() first.")
 
         # 设置运行状态
         async with self.lock:
@@ -550,8 +535,8 @@ async def _async_monitor_loop(self):
         Function 2: Trigger rollout recovery
         """
         last_stats_time = time.time()
-        stats_interval = 30.0
-        check_interval = 5.0
+        stats_interval = 60.0
+        check_interval = 10.0
 
         while True:
             async with self.lock:
@@ -563,6 +548,8 @@ async def _async_monitor_loop(self):
             if current_time - last_stats_time >= stats_interval:
                 stats = await self.get_statistics()
                 print(f"[FullyAsyncRollouter][MonitorLoop][Statistics] {pformat(stats)}")
+                data = ValidateMetrics(timing_raw={}, metrics=stats)
+                await self.message_queue_client.put_validate(ray.cloudpickle.dumps(data))
                 last_stats_time = current_time
 
             # pause 和 resume 之间，不进行恢复操作
diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py
index d6d44babb2a..d6f22ba312a 100644
--- a/recipe/fully_async_policy/fully_async_trainer.py
+++ b/recipe/fully_async_policy/fully_async_trainer.py
@@ -12,19 +12,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import logging
 import time
 import warnings
 from datetime import datetime
-from pprint import pprint
 from typing import Any
 
 import ray
 from omegaconf import OmegaConf
+from tqdm import tqdm
 
 from recipe.fully_async_policy.detach_utils import (
+    ValidateMetrics,
     assemble_batch_from_rollout_samples,
-    calculate_one_step_size, ValidateMetrics,
 )
 from recipe.fully_async_policy.message_queue import MessageQueueClient
 from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup
@@ -38,8 +37,6 @@
 )
 from verl.utils.debug import marked_timer
 
-logger = logging.getLogger(__name__)
-
 
 @ray.remote(num_cpus=10)
 class FullyAsyncTrainer(RayPPOTrainer):
@@ -103,15 +100,25 @@ def __init__(
         self.param_synchronizer = None
 
         # Statistics
+        # we start from step 1
+        self.global_steps = 1
+        self.local_trigger_step = 1
         self.processed_samples = 0
         self.stale_samples_processed = 0
         self.current_param_version = 0
-
-        self.local_trigger_step = 1
+        self.total_train_steps = None
+        self.progress_bar = None
         self.trigger_parameter_sync_step = config.async_training.trigger_parameter_sync_step
 
-        self.required_samples = calculate_one_step_size(
-            self.minimal_bsz, config.actor_rollout_ref.actor.ppo_mini_batch_size
+        # calculate required_samples
+        ppo_mini_batch_size = config.actor_rollout_ref.actor.ppo_mini_batch_size
+        rollout_n = config.actor_rollout_ref.rollout.n
+        if ppo_mini_batch_size % rollout_n != 0:
+            raise ValueError(
+                f"PPO mini batch size ({ppo_mini_batch_size}) must be divisible by rollout n ({rollout_n})"
+            )
+        self.required_samples = int(
+            self.minimal_bsz * config.actor_rollout_ref.actor.ppo_mini_batch_size / config.actor_rollout_ref.rollout.n
         )
 
     def set_message_queue_client(self, message_queue_client: MessageQueueClient):
@@ -122,10 +129,17 @@ def set_parameter_synchronizer(self, param_synchronizer):
         """Set parameter synchronizer"""
         self.param_synchronizer = param_synchronizer
 
+    def set_total_train_steps(self, total_train_steps):
+        self.total_train_steps = total_train_steps
+        self.progress_bar = tqdm(total=self.total_train_steps, initial=0, desc="Training Progress")
+
     def get_actor_wg(self):
         """Get actor worker group"""
         return self.actor_wg
 
+    def get_required_samples(self):
+        return self.required_samples
+
     def _get_samples_from_queue(self) -> tuple[None, None] | tuple[int, Any]:
         """
         Get samples from message queue and compose gen_batch_output
@@ -166,7 +180,7 @@ def _get_samples_from_queue(self) -> tuple[None, None] | tuple[int, Any]:
         consumer_end = time.time()
 
         if not queue_samples or len(queue_samples) < self.required_samples:
-            logger.warning("not enough samples collected after loop")
+            print("[FullyAsyncTrainer] not enough samples collected after loop")
             return None, None
 
         print(
@@ -230,22 +244,16 @@ def fit(self):
 
         from verl.utils.tracking import Tracking
 
-        self.logger = Tracking(
+        logger = Tracking(
             project_name=self.config.trainer.project_name,
             experiment_name=self.config.trainer.experiment_name,
             default_backend=self.config.trainer.logger,
             config=OmegaConf.to_container(self.config, resolve=True),
         )
 
-        self.global_steps = 0
-
         # load checkpoint before doing anything
         self._load_checkpoint()
-
-        # we start from step 1
-        self.global_steps += 1
         self.max_steps_duration = 0
-
         # Use queue mode, no need for traditional dataloader iterator
         # Initialize to get the first batch of data
         while True:
@@ -293,11 +301,9 @@ def fit(self):
                 self._check_save_checkpoint(False, timing_raw)
 
             self._collect_metrics(batch, 0, metrics, timing_raw)
-            pprint(metrics)
+            logger.log(data=metrics, step=self.global_steps)
             # Trigger parameter synchronization after training step
-
             time_str = datetime.now().strftime("%H:%M:%S.%f")[:-3]
-
             print(
                 f"[FullyAsyncTrainer] global_steps: {self.global_steps} "
                 f"local_trigger_step: {self.local_trigger_step} "
@@ -316,6 +322,7 @@ def _trigger_parameter_sync_after_step(self):
             self.local_trigger_step = 1
             self.current_param_version = self.current_param_version + 1
             ray.get(self.param_synchronizer.sync_weights.remote(self.current_param_version))
+            self.progress_bar.update(1)
             return
         else:
             self.local_trigger_step += 1
diff --git a/recipe/fully_async_policy/message_queue.py b/recipe/fully_async_policy/message_queue.py
index 6a425c50478..13e1a3e21e4 100644
--- a/recipe/fully_async_policy/message_queue.py
+++ b/recipe/fully_async_policy/message_queue.py
@@ -32,13 +32,15 @@ class MessageQueue:
 
     def __init__(self, config: DictConfig, max_queue_size: int = 1000):
         self.config = config
-        self.max_queue_size = max_queue_size
-        self.queue = deque(maxlen=max_queue_size)
+        # 确保 max_queue_size 不为 None
+        if max_queue_size is None:
+            raise ValueError(f"max_queue_size cannot be None, got: {max_queue_size}")
+        self.max_queue_size = int(max_queue_size)
+        self.queue = deque(maxlen=self.max_queue_size)
         self.current_param_version = 0
 
         self.val_queue = deque()
 
-
         try:
             if hasattr(config, "async_training") and config.async_training is not None:
                 self.staleness_threshold = getattr(config.async_training, "staleness_threshold", 3)
@@ -203,7 +205,6 @@ async def get_validate(self):
                 return None
 
 
-
 class MessageQueueClient:
     """Asyncio-compatible MessageQueue client for communicating with MessageQueue Actor"""
 
diff --git a/tests/special_e2e/run_fully_async_policy.sh b/tests/special_e2e/run_fully_async_policy.sh
index 1a47b0fd06e..64f9fa82825 100644
--- a/tests/special_e2e/run_fully_async_policy.sh
+++ b/tests/special_e2e/run_fully_async_policy.sh
@@ -55,7 +55,7 @@ n_gpus_training=$((NUM_GPUS - n_gpus_rollout))
 train_prompt_bsz=0
 gen_prompt_bsz=1
 n_resp_per_prompt=16
-train_prompt_mini_bsz=4
+train_prompt_mini_bsz=32
 total_rollout_steps=$(((128*2)))
 test_freq=2
 staleness_threshold=1
@@ -79,6 +79,7 @@ common_params=(
     data.gen_batch_size=${gen_prompt_bsz}
     data.return_raw_chat=${return_raw_chat}
     actor_rollout_ref.rollout.n=${n_resp_per_prompt}
+    actor_rollout_ref.rollout.calculate_log_probs=True
     algorithm.adv_estimator=${adv_estimator}
     algorithm.use_kl_in_reward=${use_kl_in_reward}
     algorithm.kl_ctrl.kl_coef=${kl_coef}
diff --git a/verl/experimental/agent_loop/agent_loop.py b/verl/experimental/agent_loop/agent_loop.py
index 743a4927b7e..8c49390f456 100644
--- a/verl/experimental/agent_loop/agent_loop.py
+++ b/verl/experimental/agent_loop/agent_loop.py
@@ -16,9 +16,8 @@
 import logging
 import os
 import random
-import time
 from abc import ABC, abstractmethod
-from typing import Any, Optional, List
+from typing import Any, Optional
 
 import hydra
 import numpy as np
@@ -105,6 +104,7 @@ async def generate(
         return output
 
     async def generate_for_partial(self, request_id, prompt_ids, sampling_params):
+        """Generate tokens from prompt ids. with partial rollout function"""
         server = self._choose_server(request_id)
         output = await server.generate_for_partial.remote(
             request_id=request_id,
@@ -385,8 +385,7 @@ async def generate_sequences(self, batch: DataProto) -> DataProto:
         return output
 
     async def generate_sequences_no_post(
-            self,
-            batch: DataProto, partial_output_list: Optional[List[AgentLoopOutput]]
+        self, batch: DataProto, partial_output_list: Optional[list[AgentLoopOutput]]
     ) -> list[AgentLoopOutput]:
         """Generate sequences from agent loop.
 
@@ -433,11 +432,9 @@ async def generate_sequences_no_post(
         if not partial_output_list:
             partial_output_list = [None] * len(batch)
 
-        for agent_name, messages, trajectory, partial_output in zip(agent_names,
-                                                                    raw_prompts,
-                                                                    trajectory_info,
-                                                                    partial_output_list,
-                                                                    strict=True):
+        for agent_name, messages, trajectory, partial_output in zip(
+            agent_names, raw_prompts, trajectory_info, partial_output_list, strict=True
+        ):
             tasks.append(
                 asyncio.create_task(
                     self._run_agent_loop(agent_name, messages.tolist(), sampling_params, trajectory, partial_output)
@@ -610,10 +607,11 @@ def generate_sequences(self, prompts: DataProto) -> DataProto:
         output.meta_info = {"timing": timing}
         return output
 
-    async def generate_single_sample_async(self,
-                                           sample: DataProto,
-                                           partial_output_list: Optional[List[AgentLoopOutput]],
-                                           ) -> List[AgentLoopOutput]:
+    async def generate_single_sample_async(
+        self,
+        sample: DataProto,
+        partial_output_list: Optional[list[AgentLoopOutput]],
+    ) -> list[AgentLoopOutput]:
         """
         异步处理单个样本, 需要复制n次
 
diff --git a/verl/experimental/agent_loop/partial_single_turn_agent_loop.py b/verl/experimental/agent_loop/partial_single_turn_agent_loop.py
index c94788cd61d..df4a4f3350a 100644
--- a/verl/experimental/agent_loop/partial_single_turn_agent_loop.py
+++ b/verl/experimental/agent_loop/partial_single_turn_agent_loop.py
@@ -35,7 +35,6 @@ def __init__(self, *args, **kwargs):
     async def run(
         self, messages: list[dict[str, Any]], sampling_params: dict[str, Any], output: Optional[AgentLoopOutput]
     ) -> AgentLoopOutput:
-
         if not output:
             prompt_ids = await self.loop.run_in_executor(
                 None, lambda: self.tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=True)
@@ -71,5 +70,5 @@ async def run(
             num_turns=2,
             metrics=metrics,
             is_cancel=is_cancel,
-            log_probs=log_probs
+            log_probs=log_probs,
         )
diff --git a/verl/experimental/agent_loop/single_turn_agent_loop.py b/verl/experimental/agent_loop/single_turn_agent_loop.py
index 492c1894cc5..df6e1991888 100644
--- a/verl/experimental/agent_loop/single_turn_agent_loop.py
+++ b/verl/experimental/agent_loop/single_turn_agent_loop.py
@@ -32,9 +32,9 @@ def __init__(self, *args, **kwargs):
         self.prompt_length = self.config.actor_rollout_ref.rollout.prompt_length
         self.response_length = self.config.actor_rollout_ref.rollout.response_length
 
-    async def run(self, messages: list[dict[str, Any]],
-                  sampling_params: dict[str, Any],
-                  output: Optional[AgentLoopOutput]) -> AgentLoopOutput:
+    async def run(
+        self, messages: list[dict[str, Any]], sampling_params: dict[str, Any], output: Optional[AgentLoopOutput]
+    ) -> AgentLoopOutput:
         metrics = {}
         request_id = uuid4().hex
         prompt_ids = await self.loop.run_in_executor(
diff --git a/verl/experimental/agent_loop/tool_agent_loop.py b/verl/experimental/agent_loop/tool_agent_loop.py
index a0642048dc7..7c945b7d4c9 100644
--- a/verl/experimental/agent_loop/tool_agent_loop.py
+++ b/verl/experimental/agent_loop/tool_agent_loop.py
@@ -56,10 +56,9 @@ def init_class(cls, config, tokenizer, **kwargs):
         cls.system_prompt = tokenizer.apply_chat_template([{}], add_generation_prompt=False, tokenize=True)
 
     @rollout_trace_op
-    async def run(self,
-                  messages: list[dict[str, Any]],
-                  sampling_params: dict[str, Any],
-                  output: Optional[AgentLoopOutput]) -> AgentLoopOutput:
+    async def run(
+        self, messages: list[dict[str, Any]], sampling_params: dict[str, Any], output: Optional[AgentLoopOutput]
+    ) -> AgentLoopOutput:
         metrics = {}
         request_id = uuid4().hex
         prompt_ids = await self.loop.run_in_executor(
diff --git a/verl/trainer/main_ppo.py b/verl/trainer/main_ppo.py
index 7b34cbfaf23..4b240c6ffbf 100644
--- a/verl/trainer/main_ppo.py
+++ b/verl/trainer/main_ppo.py
@@ -38,6 +38,7 @@ def main(config):
         config_dict: Hydra configuration dictionary containing training parameters.
     """
     from time import time
+
     start_time = time()
     run_ppo(config)
     print(f"total time: {time() - start_time:.2f} seconds")
diff --git a/verl/trainer/ppo/ray_trainer.py b/verl/trainer/ppo/ray_trainer.py
index 4aa7102977f..e61b1dc5fe0 100644
--- a/verl/trainer/ppo/ray_trainer.py
+++ b/verl/trainer/ppo/ray_trainer.py
@@ -1281,7 +1281,6 @@ def _process_batch_common(self, batch, metrics, timing_raw):
                     }
                 )
                 if self.config.async_training and self.config.async_training.use_rollout_log_probs:
-                    print("use_rollout_log_probs")
                     batch.batch["old_log_probs"] = batch.batch["rollout_log_probs"]
                     del actor_old_log_probs
 
diff --git a/verl/workers/rollout/vllm_rollout/vllm_async_server.py b/verl/workers/rollout/vllm_rollout/vllm_async_server.py
index 970c309f84a..3b3e9542252 100644
--- a/verl/workers/rollout/vllm_rollout/vllm_async_server.py
+++ b/verl/workers/rollout/vllm_rollout/vllm_async_server.py
@@ -15,16 +15,14 @@
 import logging
 import os
 import pickle
-from contextlib import ExitStack
-from typing import Any, Callable, Optional, Coroutine, Sequence
+from typing import Any, Callable, Optional, Sequence
 
 import ray
 import zmq
-from omegaconf import DictConfig, ListConfig
+from omegaconf import DictConfig
 from starlette.requests import Request
 from starlette.responses import JSONResponse, StreamingResponse
-from vllm import SamplingParams, RequestOutput
-from vllm.config import CompilationConfig, CompilationLevel
+from vllm import SamplingParams
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.entrypoints.logger import RequestLogger
 from vllm.entrypoints.openai.protocol import ChatCompletionRequest, ChatCompletionResponse, ErrorResponse
@@ -348,7 +346,7 @@ async def _generate_step(self, prompt_ids: list[int], sampling_params: dict[str,
         assert self.req_output[request_id] is not None
 
     async def generate_for_partial(
-            self, prompt_ids: list[int], sampling_params: dict[str, Any], request_id: str
+        self, prompt_ids: list[int], sampling_params: dict[str, Any], request_id: str
     ) -> tuple[list[Any], list[Any], bool] | tuple[Sequence[int], list[float], Any]:
         # 设置中断标志
         async with self.lock:
@@ -368,19 +366,15 @@ async def generate_for_partial(
             task.cancel()
 
         async with self.lock:
-            print(f"token_ids size: {len(self.req_output[request_id].outputs[0].token_ids)}")
-            print(f"log_probs size: {len(self.req_output[request_id].outputs[0].logprobs)}")
             token_ids = self.req_output[request_id].outputs[0].token_ids
             log_probs: list[float] = []
             for i, x in enumerate(self.req_output[request_id].outputs[0].logprobs):
                 # sampling_params 中 logprobs 设置为1，只返回1个
                 token_id = self.req_output[request_id].outputs[0].token_ids[i]
                 log_probs.append(x[token_id].logprob)
-
             is_cancel = generation_handle not in done
             self.cancel_event.pop(request_id, None)
             self.req_output.pop(request_id, None)
-
         return token_ids, log_probs, is_cancel
 
     async def cancel(self):

From 2f8971315c6b50811f375ec6f835910143d85d90 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Sun, 31 Aug 2025 20:46:06 +0800
Subject: [PATCH 083/182] update metrics

---
 recipe/fully_async_policy/detach_utils.py          |  6 ++++++
 recipe/fully_async_policy/fully_async_rollouter.py |  5 +++--
 recipe/fully_async_policy/fully_async_trainer.py   | 13 +++----------
 3 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/recipe/fully_async_policy/detach_utils.py b/recipe/fully_async_policy/detach_utils.py
index af8dfe16857..79df36652f7 100644
--- a/recipe/fully_async_policy/detach_utils.py
+++ b/recipe/fully_async_policy/detach_utils.py
@@ -45,6 +45,7 @@ class RolloutSample:
     # Processing metadata
     processing_times: list[float]
     param_version: int
+    rollout_status: dict[str, Any]
 
 
 @dataclass
@@ -180,6 +181,9 @@ def assemble_batch_from_rollout_samples(
 
     rollout_samples_batch = []
     processing_times = []
+    rollout_status = rollout_samples[0].rollout_status
+    # 为 rollout_status 的所有 key 添加前缀
+    rollout_status = {f"fully_async/{key}": value for key, value in rollout_status.items()}
 
     for rs in rollout_samples:
         rollout_samples_batch.append(rs.full_batch)
@@ -208,6 +212,7 @@ def assemble_batch_from_rollout_samples(
         "tp99_processing_time": np.percentile(processing_times, 99),  # 99百分位
         "tp95_processing_time": np.percentile(processing_times, 95),  # 95百分位也很有用
     }
+    processing_time_stats = {f"fully_async/{key}": value for key, value in processing_time_stats.items()}
 
     # 创建 meta_info
     final_batch.meta_info.update(
@@ -215,6 +220,7 @@ def assemble_batch_from_rollout_samples(
             "rollout_param_versions": param_versions,
             "param_version_diversity": len(set(param_versions)) if param_versions else 0,
             **processing_time_stats,
+            **rollout_status,
         }
     )
 
diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py
index 83bc2c0ce8a..2e22dae6d7b 100644
--- a/recipe/fully_async_policy/fully_async_rollouter.py
+++ b/recipe/fully_async_policy/fully_async_rollouter.py
@@ -371,6 +371,7 @@ async def _process_single_sample_streaming(self, rollout_sample: RolloutSample):
         # 直接更新 RolloutSample 对象，填充剩余字段
         rollout_sample.agent_loop_output_list = agent_loop_output_list
         rollout_sample.param_version = self.current_param_version
+        rollout_sample.rollout_status = await self.get_statistics()
 
         is_cancel = False
         # 收集所有信息
@@ -438,6 +439,8 @@ async def _streaming_generation_main(self):
                 val_metrics = self._validate()
                 assert val_metrics, f"{val_metrics=}"
                 pprint(f"[FullyAsyncRollouter] Initial validation metrics: {val_metrics}")
+                data = ValidateMetrics(timing_raw={}, metrics=val_metrics)
+                await self.message_queue_client.put_validate(ray.cloudpickle.dumps(data))
                 if self.config.trainer.get("val_only", False):
                     return
 
@@ -548,8 +551,6 @@ async def _async_monitor_loop(self):
             if current_time - last_stats_time >= stats_interval:
                 stats = await self.get_statistics()
                 print(f"[FullyAsyncRollouter][MonitorLoop][Statistics] {pformat(stats)}")
-                data = ValidateMetrics(timing_raw={}, metrics=stats)
-                await self.message_queue_client.put_validate(ray.cloudpickle.dumps(data))
                 last_stats_time = current_time
 
             # pause 和 resume 之间，不进行恢复操作
diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py
index d6f22ba312a..af406623145 100644
--- a/recipe/fully_async_policy/fully_async_trainer.py
+++ b/recipe/fully_async_policy/fully_async_trainer.py
@@ -285,16 +285,9 @@ def fit(self):
                                 "fully_async/current_param_version": self.current_param_version,
                             }
                         )
-                        for metric in [
-                            "avg_processing_time",
-                            "max_processing_time",
-                            "min_processing_time",
-                            "tp50_processing_time",
-                            "tp99_processing_time",
-                            "tp95_processing_time",
-                            "param_version_diversity",
-                        ]:
-                            metrics[f"fully_async/{metric}"] = batch.meta_info.get(metric, 0)
+                        for key, value in batch.meta_info:
+                            if key.startswith("fully_async"):
+                                metrics[key] = value
 
                 batch, reward_extra_infos_dict = self._process_batch_common(batch, metrics, timing_raw)
                 self._log_rollout(batch, reward_extra_infos_dict, timing_raw)

From 1c3b32b822a89e96b1afca6518c6f5c1717ba1d7 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Sun, 31 Aug 2025 21:01:09 +0800
Subject: [PATCH 084/182] update metrics

---
 recipe/fully_async_policy/fully_async_rollouter.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py
index 2e22dae6d7b..44768cacb29 100644
--- a/recipe/fully_async_policy/fully_async_rollouter.py
+++ b/recipe/fully_async_policy/fully_async_rollouter.py
@@ -271,6 +271,7 @@ async def _feed_samples(self):
                 epoch=epoch,
                 param_version=0,  # 待处理后填充
                 processing_times=[],
+                rollout_status={},
             )
 
             await self.pending_queue.put(rollout_sample)

From 1bea47c16f5a220f39b8d9cff07a5e9a07684453 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Fri, 29 Aug 2025 17:22:55 +0800
Subject: [PATCH 085/182] rollout log probs

tensorboard

step size

refactor code

fix message_queue

total_train_steps

int max_queue_size

await

self.max_steps_duration

refactor print

update metrics

update metrics

update metrics
---
 .../config/fully_async_ppo_trainer.yaml       |   1 +
 ...fsdp2_4_4.sh => dapo_7b_math_fsdp2_2_6.sh} |   9 +-
 recipe/fully_async_policy/detach_utils.py     |  56 +++++-
 recipe/fully_async_policy/fully_async_main.py |  14 +-
 .../fully_async_rollouter.py                  | 159 ++++++++----------
 .../fully_async_policy/fully_async_trainer.py |  60 +++----
 recipe/fully_async_policy/message_queue.py    |   9 +-
 tests/special_e2e/run_fully_async_policy.sh   |   3 +-
 verl/experimental/agent_loop/agent_loop.py    |  26 +--
 .../partial_single_turn_agent_loop.py         |   5 +-
 .../agent_loop/single_turn_agent_loop.py      |   6 +-
 .../agent_loop/tool_agent_loop.py             |   7 +-
 verl/trainer/main_ppo.py                      |   1 +
 verl/trainer/ppo/ray_trainer.py               |   4 +
 .../rollout/vllm_rollout/vllm_async_server.py |  21 ++-
 15 files changed, 218 insertions(+), 163 deletions(-)
 rename recipe/fully_async_policy/{dapo_7b_math_fsdp2_4_4.sh => dapo_7b_math_fsdp2_2_6.sh} (97%)

diff --git a/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml b/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml
index 0714e107ee4..3334ee4f4d5 100644
--- a/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml
+++ b/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml
@@ -13,6 +13,7 @@ async_training:
   staleness_threshold: 1              # 样本新鲜度阈值
   trigger_parameter_sync_step: 4     # >=1 train 每次训练一个batch, 迭代多少次后触发更新
   partial_rollout: True               # 同步参数时，是否中断 rollout
+  use_rollout_log_probs: True
 
 # Rollout配置
 rollout:
diff --git a/recipe/fully_async_policy/dapo_7b_math_fsdp2_4_4.sh b/recipe/fully_async_policy/dapo_7b_math_fsdp2_2_6.sh
similarity index 97%
rename from recipe/fully_async_policy/dapo_7b_math_fsdp2_4_4.sh
rename to recipe/fully_async_policy/dapo_7b_math_fsdp2_2_6.sh
index 5fb85a66b6f..58017f0123b 100644
--- a/recipe/fully_async_policy/dapo_7b_math_fsdp2_4_4.sh
+++ b/recipe/fully_async_policy/dapo_7b_math_fsdp2_2_6.sh
@@ -69,15 +69,15 @@ NNODES=${NNODES:-1}
 NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
 
 # Fully async specific parameters
-n_gpus_rollout=4
+n_gpus_rollout=2
 n_gpus_training=$((NGPUS_PER_NODE - n_gpus_rollout))
 
 train_prompt_bsz=0
 gen_prompt_bsz=1
 n_resp_per_prompt=16
-train_prompt_mini_bsz=4
-total_rollout_steps=$(((512*10)))
-test_freq=-1
+train_prompt_mini_bsz=64
+total_rollout_steps=$(((512*100)))
+test_freq=5
 staleness_threshold=1
 trigger_parameter_sync_step=16
 partial_rollout=True
@@ -140,6 +140,7 @@ $PYTHON_INTERPRETER -m recipe.fully_async_policy.fully_async_main \
     actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
     actor_rollout_ref.rollout.val_kwargs.do_sample=True \
     actor_rollout_ref.rollout.val_kwargs.n=1 \
+    actor_rollout_ref.rollout.calculate_log_probs=True \
     actor_rollout_ref.ref.fsdp_config.param_offload=${ref_offload} \
     actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \
     actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \
diff --git a/recipe/fully_async_policy/detach_utils.py b/recipe/fully_async_policy/detach_utils.py
index 0296945a2ab..79df36652f7 100644
--- a/recipe/fully_async_policy/detach_utils.py
+++ b/recipe/fully_async_policy/detach_utils.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 import time
 from dataclasses import dataclass
-from typing import Any, Dict, List
+from typing import Any
 
 import numpy as np
 import torch
@@ -36,20 +36,23 @@ class RolloutSample:
     full_batch: Any
 
     # AgentLoopOutput from generation
-    agent_loop_output_list: List[Any]  # AgentLoopOutput
+    agent_loop_output_list: list[Any]  # AgentLoopOutput
 
     # Metadata
     sample_id: str
     epoch: int
 
     # Processing metadata
-    processing_times: List[float]
+    processing_times: list[float]
     param_version: int
+    rollout_status: dict[str, Any]
+
 
 @dataclass
 class ValidateMetrics:
-    timing_raw: Dict[str, Any]
-    metrics: Dict[str, Any]
+    timing_raw: dict[str, Any]
+    metrics: dict[str, Any]
+
 
 def prepare_single_generation_data(batch_dict, global_steps, rollout_n) -> DataProto:
     """
@@ -87,9 +90,47 @@ def prepare_single_generation_data(batch_dict, global_steps, rollout_n) -> DataP
     return full_batch
 
 
+def process_rollout_log_probs(data_proto: DataProto, rollout_log_probs: list[list[float]]) -> torch.Tensor:
+    """
+    根据 DataProto 中的 mask 逻辑处理 rollout_log_probs
+    # attention_mask: [0,0,0,0,1,1,1,1, | 1,1,1,0,0,0,0,0]
+
+    Args:
+        data_proto: 包含 batch 信息的 DataProto 对象
+        rollout_log_probs: 二维列表，每个子列表包含一个样本的 log_probs
+
+    Returns:
+        torch.Tensor: 处理后的 log_probs tensor，形状为 [bsz, response_length]
+    """
+
+    batch = data_proto.batch
+    response_mask = batch["response_mask"]
+    bsz, response_length = response_mask.shape
+
+    # 初始化结果 tensor
+    rollout_log_probs_tensor = torch.zeros((bsz, response_length), dtype=torch.float32) - 1
+
+    for i, log_probs_seq in enumerate(rollout_log_probs):
+        # 获取当前样本的有效长度（mask 中为 1 的位置数量）
+        valid_length = response_mask[i].sum().item()
+
+        # 确保 log_probs_seq 的长度不超过有效长度
+        actual_length = min(len(log_probs_seq), valid_length)
+
+        # 将 log_probs 填入对应位置
+        if actual_length > 0:
+            rollout_log_probs_tensor[i, :actual_length] = torch.tensor(log_probs_seq[:actual_length])
+
+    rollout_log_probs_tensor = rollout_log_probs_tensor.to(torch.float32)
+    return rollout_log_probs_tensor
+
+
 def merge_rollout_sample(config, tokenizer, rs: RolloutSample):
     # 第一步：从 AgentLoopOutput 创建生成结果的 DataProto
     gen_batch_output = postprocess_agent_loop_outputs(rs.agent_loop_output_list, tokenizer, config)
+    rollout_log_probs = [x.log_probs for x in rs.agent_loop_output_list]
+    rollout_log_probs = process_rollout_log_probs(gen_batch_output, rollout_log_probs)
+    gen_batch_output.batch["rollout_log_probs"] = rollout_log_probs.to(torch.float32)
 
     # 第二步：添加 uid
     rs.full_batch.non_tensor_batch["uid"] = np.array([f"uid_{rs.sample_id}"] * len(rs.full_batch), dtype=object)
@@ -140,6 +181,9 @@ def assemble_batch_from_rollout_samples(
 
     rollout_samples_batch = []
     processing_times = []
+    rollout_status = rollout_samples[0].rollout_status
+    # 为 rollout_status 的所有 key 添加前缀
+    rollout_status = {f"fully_async/{key}": value for key, value in rollout_status.items()}
 
     for rs in rollout_samples:
         rollout_samples_batch.append(rs.full_batch)
@@ -168,6 +212,7 @@ def assemble_batch_from_rollout_samples(
         "tp99_processing_time": np.percentile(processing_times, 99),  # 99百分位
         "tp95_processing_time": np.percentile(processing_times, 95),  # 95百分位也很有用
     }
+    processing_time_stats = {f"fully_async/{key}": value for key, value in processing_time_stats.items()}
 
     # 创建 meta_info
     final_batch.meta_info.update(
@@ -175,6 +220,7 @@ def assemble_batch_from_rollout_samples(
             "rollout_param_versions": param_versions,
             "param_version_diversity": len(set(param_versions)) if param_versions else 0,
             **processing_time_stats,
+            **rollout_status,
         }
     )
 
diff --git a/recipe/fully_async_policy/fully_async_main.py b/recipe/fully_async_policy/fully_async_main.py
index e662aec23bf..2b5663bd5ea 100644
--- a/recipe/fully_async_policy/fully_async_main.py
+++ b/recipe/fully_async_policy/fully_async_main.py
@@ -185,8 +185,18 @@ def _initialize_components(self, config) -> None:
         print("[ASYNC MAIN] Creating FullyAsyncTrainer...")
         self._create_trainer(config)
 
-        print("[ASYNC MAIN] Creating MessageQueue...")
+        # 同步require samples
+        required_samples = ray.get(self.components["trainer"].get_required_samples.remote())
+        ray.get(self.components["rollouter"].set_required_samples.remote(required_samples))
+
+        # 同步total_train_steps
+        total_train_steps = ray.get(self.components["rollouter"].get_total_train_steps.remote())
+        print(f"total_train_steps {total_train_steps}")
+        ray.get(self.components["trainer"].set_total_train_steps.remote(total_train_steps))
+
+        # 获取 max_queue_size (使用同步方法避免异步返回值问题)
         max_queue_size = ray.get(self.components["rollouter"].get_max_queue_size.remote())
+        print(f"[ASYNC MAIN] Creating MessageQueue... max_queue_size {max_queue_size}")
         message_queue = MessageQueue.remote(config, max_queue_size)
         message_queue_client = MessageQueueClient(message_queue)
         self.components["message_queue"] = message_queue
@@ -204,9 +214,7 @@ def _initialize_components(self, config) -> None:
             rollouter=self.components["rollouter"],
             mq=self.components["message_queue_client"],
         )
-
         ray.get(self.components["trainer"].set_parameter_synchronizer.remote(param_synchronizer))
-        ray.get(self.components["rollouter"].set_parameter_synchronizer.remote(param_synchronizer))
 
         ray.get(param_synchronizer.sync_weights.remote(0))
 
diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py
index 04b2fe5dc54..44768cacb29 100644
--- a/recipe/fully_async_policy/fully_async_rollouter.py
+++ b/recipe/fully_async_policy/fully_async_rollouter.py
@@ -17,14 +17,12 @@
 
 import ray
 from omegaconf import OmegaConf
-from tqdm import tqdm
 
 from recipe.fully_async_policy.detach_utils import (
     RolloutSample,
-    calculate_one_step_size,
     ValidateMetrics,
-    prepare_single_generation_data,
     merge_rollout_sample,
+    prepare_single_generation_data,
 )
 from recipe.fully_async_policy.message_queue import MessageQueueClient
 from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup
@@ -102,93 +100,81 @@ def __init__(
         if self.config.rollout.total_rollout_steps is not None:
             self.total_rollout_steps = min(self.config.rollout.total_rollout_steps, self.total_rollout_steps)
         print(f"[FullyAsyncRollouter] Total rollout steps: {self.total_rollout_steps}")
+        self.total_train_steps = None
+
+        # ==================== fully async config ====================
 
         # Rollouter parameter configuration
         self.message_queue_client = None
 
-        self.current_param_version = 0
+        # Worker groups: rollout_wg is same to actor_rollout_wg
+        self.rollout_wg = None
+        self.actor_rollout_wg = None
+        self.async_rollout_manager = None
 
-        # Freshness control - improved configuration management
-        async_config = config.async_training
-        self.staleness_threshold = async_config.get("staleness_threshold", 3)
+        # Config
+        self.staleness_threshold: int = config.async_training.get("staleness_threshold", 1)
+        self.required_samples = None
+        self.max_required_samples = None
+        # 单次最多扔一次更新需要的样本
+        self.max_concurrent_samples = None
+        # queue size
+        self.max_queue_size = None
 
         # Statistics
+        self.current_param_version = 0
         self.total_generated_samples = 0
         self.staleness_samples = 0
         self.dropped_stale_samples = 0
-
-        # Worker groups
-        self.rollout_wg = None
-        self.message_queue_client = None
+        self.processed_sample_count = 0  # 已处理的样本计数
+        self.global_steps = 0
 
         # Concurrency control
         self.paused = False
         self.running = True
+        # 通过 pause 和 resume 控制 monitor_loop 中，是否进行 尝试恢复 操作
+        self.monitor_loop_trigger = True
 
         # Initialize async locks directly
         self.lock = asyncio.Lock()
         self.condition = asyncio.Condition(self.lock)
 
-        # Pause/resume statistics
-        self.total_pause_time = 0.0
-        self.last_pause_time = None
-
-        # Parameter synchronization related
-        self.param_synchronizer = None
-
-        self.async_rollout_manager = None
-
-        # Calculate the samples needed for a train, used to calculate staleness and interrupt rollout
-        self.required_samples = calculate_one_step_size(
-            self.minimal_bsz, config.actor_rollout_ref.actor.ppo_mini_batch_size
-        )
-        self.max_required_samples = (
-            self.required_samples * (self.staleness_threshold + 1) * config.async_training.trigger_parameter_sync_step
-        )
-        print(
-            f"[FullyAsyncRollouter] required_samples : {self.required_samples} "
-            f"max_required_samples: {self.max_required_samples}"
-        )
-
-        # 单次最多扔一次更新需要的样本
-        self.max_concurrent_samples = self.required_samples
-
-        # 流式处理统计
-        self.processed_sample_count = 0  # 已处理的样本计数
-        self.active_sample_count = 0  # 当前正在处理的样本数
-        self.queue_full_pause_count = 0  # 队列满导致的暂停次数
-
-        # queue size
-        self.max_queue_size = self.max_required_samples * 10  # x 10 avoid deadlock
-        print(f"[FullyAsyncRollouter] {self.max_queue_size}")
-
         # 初始化异步队列
-        self.pending_queue = asyncio.Queue(maxsize=100)
+        self.pending_queue = asyncio.Queue(maxsize=128)
         self.active_tasks = set()
         self.result_queue = asyncio.Queue()
         self.cancel_queue = asyncio.Queue()
 
-        # 通过 pause 和 resume 控制 monitor_loop 中，是否进行 尝试恢复 操作
-        self.monitor_loop_trigger = True
-
-        self.update_param_version_time = 0
-        self.global_steps = 0
-
-        self.progress_bar = tqdm(
-            total=self.total_rollout_steps / (
-                    self.required_samples * self.config.async_training.trigger_parameter_sync_step),
-            initial=self.global_steps, desc="Training Progress"
-        )
-
     async def set_message_queue_client(self, message_queue_client: MessageQueueClient):
         """Set message queue client"""
         async with self.lock:
             self.message_queue_client = message_queue_client
 
-    async def set_parameter_synchronizer(self, param_synchronizer):
-        """Set parameter synchronizer"""
+    async def set_required_samples(self, required_samples: int):
         async with self.lock:
-            self.param_synchronizer = param_synchronizer
+            self.required_samples = int(required_samples)
+            self.max_required_samples = (
+                self.required_samples
+                * (self.staleness_threshold + 1)
+                * self.config.async_training.trigger_parameter_sync_step
+            )
+            self.total_train_steps = int(
+                self.total_rollout_steps
+                / (self.required_samples * self.config.async_training.trigger_parameter_sync_step)
+            )
+
+            # 单次最多扔一次更新需要的样本
+            self.max_concurrent_samples = self.required_samples
+            self.max_queue_size = self.max_required_samples
+
+            print(
+                f"[FullyAsyncRollouter] required_samples : {self.required_samples} "
+                f"max_required_samples: {self.max_required_samples} "
+                f"max_queue_size: {self.max_queue_size} "
+                f"total_train_steps: {self.total_train_steps} "
+                f"total_rollout_steps: {self.total_rollout_steps} "
+                f"max_concurrent_samples: {self.max_concurrent_samples} "
+            )
 
     def get_rollout_wg(self):
         """Get rollout worker group"""
@@ -197,6 +183,9 @@ def get_rollout_wg(self):
     def get_max_queue_size(self):
         return self.max_queue_size
 
+    def get_total_train_steps(self):
+        return self.total_train_steps
+
     async def update_param_version(self, version: int):
         """Update current parameter version"""
         async with self.lock:
@@ -209,24 +198,22 @@ async def update_param_version(self, version: int):
                 f"Parameter version updated from {old_version} to {version}"
             )
             timing_raw = {}
-            self.update_param_version_time += 1
             is_last_step = self.global_steps >= self.total_training_steps
-            if (self.val_reward_fn is not None
-                    and self.config.trainer.test_freq > 0
-                    and ((self.global_steps > 0 and self.global_steps % self.config.trainer.test_freq == 0)
-                         or is_last_step)):
+            if (
+                self.val_reward_fn is not None
+                and self.config.trainer.test_freq > 0
+                and ((self.global_steps > 0 and self.global_steps % self.config.trainer.test_freq == 0) or is_last_step)
+            ):
                 with marked_timer("testing", timing_raw, color="green"):
                     val_metrics: dict = self._validate()
                     data = ValidateMetrics(timing_raw=timing_raw, metrics=val_metrics)
-                    self.message_queue_client.put_validate(ray.cloudpickle.dumps(data))
-            if version > 0:
-                self.progress_bar.update(1)
+                    await self.message_queue_client.put_validate(ray.cloudpickle.dumps(data))
 
     def _validate_config(self):
         # Validate asynchronous training configuration
         if not hasattr(self.config, "async_training"):
             raise ValueError("[FullyAsyncRollouter] Missing async_training configuration")
-
+        assert self.config.actor_rollout_ref.rollout.calculate_log_probs, "must rollout calculate log_probs"
         super()._validate_config()
 
     def _create_actor_rollout_classes(self):
@@ -284,6 +271,7 @@ async def _feed_samples(self):
                 epoch=epoch,
                 param_version=0,  # 待处理后填充
                 processing_times=[],
+                rollout_status={},
             )
 
             await self.pending_queue.put(rollout_sample)
@@ -384,21 +372,22 @@ async def _process_single_sample_streaming(self, rollout_sample: RolloutSample):
         # 直接更新 RolloutSample 对象，填充剩余字段
         rollout_sample.agent_loop_output_list = agent_loop_output_list
         rollout_sample.param_version = self.current_param_version
+        rollout_sample.rollout_status = await self.get_statistics()
 
         is_cancel = False
         # 收集所有信息
         for agent_loop in agent_loop_output_list:
-            if is_cancel == False and agent_loop.is_cancel:
+            if not is_cancel and agent_loop.is_cancel:
                 is_cancel = True
 
-        rollout_data = {
-            "cost": [f"{agent_loop.metrics.generate_sequences:.2f}s" for agent_loop in agent_loop_output_list],
-            "len": [len(agent_loop.response_ids) for agent_loop in agent_loop_output_list],
-        }
-        if is_cancel:
-            rollout_data["cancel"] = [agent_loop.is_cancel for agent_loop in agent_loop_output_list]
-        formatted_data = pformat(rollout_data, width=200, compact=True)
-        print(f"[FullyAsyncRollouter] rollout {rollout_sample.sample_id} {formatted_data}")
+        # rollout_data = {
+        #     "cost": [f"{agent_loop.metrics.generate_sequences:.2f}s" for agent_loop in agent_loop_output_list],
+        #     "len": [len(agent_loop.response_ids) for agent_loop in agent_loop_output_list],
+        # }
+        # if is_cancel:
+        #     rollout_data["cancel"] = [agent_loop.is_cancel for agent_loop in agent_loop_output_list]
+        # formatted_data = pformat(rollout_data, width=200, compact=True)
+        # print(f"[FullyAsyncRollouter] rollout {rollout_sample.sample_id} {formatted_data}")
 
         if is_cancel:
             # 放入 cancel 队列中，等待恢复生成
@@ -440,17 +429,19 @@ async def _streaming_generation_main(self):
             config=OmegaConf.to_container(self.config, resolve=True),
         )
 
-        # load checkpoint before doing anything 
-        self._load_checkpoint() # TODO: 检查是否需要
+        # load checkpoint before doing anything
+        self._load_checkpoint()  # TODO: 检查是否需要
 
         # perform validation before training
         # currently, we only support validation using the reward_function.
-        async with self.lock:   # TODO: 检查是否需要锁
+        async with self.lock:  # TODO: 检查是否需要锁
             if self.val_reward_fn is not None and self.config.trainer.get("val_before_train", True):
                 print("Initial validation metric")
                 val_metrics = self._validate()
                 assert val_metrics, f"{val_metrics=}"
                 pprint(f"[FullyAsyncRollouter] Initial validation metrics: {val_metrics}")
+                data = ValidateMetrics(timing_raw={}, metrics=val_metrics)
+                await self.message_queue_client.put_validate(ray.cloudpickle.dumps(data))
                 if self.config.trainer.get("val_only", False):
                     return
 
@@ -514,8 +505,6 @@ async def fit(self):
 
         if self.message_queue_client is None:
             raise ValueError("MessageQueue client not set. Call set_message_queue_client() first.")
-        if self.param_synchronizer is None:
-            raise ValueError("param_synchronizer client not set. Call set_parameter_synchronizer() first.")
 
         # 设置运行状态
         async with self.lock:
@@ -550,8 +539,8 @@ async def _async_monitor_loop(self):
         Function 2: Trigger rollout recovery
         """
         last_stats_time = time.time()
-        stats_interval = 30.0
-        check_interval = 5.0
+        stats_interval = 60.0
+        check_interval = 10.0
 
         while True:
             async with self.lock:
diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py
index d6d44babb2a..6d74ee215f8 100644
--- a/recipe/fully_async_policy/fully_async_trainer.py
+++ b/recipe/fully_async_policy/fully_async_trainer.py
@@ -12,19 +12,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import logging
 import time
 import warnings
 from datetime import datetime
-from pprint import pprint
 from typing import Any
 
 import ray
 from omegaconf import OmegaConf
+from tqdm import tqdm
 
 from recipe.fully_async_policy.detach_utils import (
+    ValidateMetrics,
     assemble_batch_from_rollout_samples,
-    calculate_one_step_size, ValidateMetrics,
 )
 from recipe.fully_async_policy.message_queue import MessageQueueClient
 from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup
@@ -38,8 +37,6 @@
 )
 from verl.utils.debug import marked_timer
 
-logger = logging.getLogger(__name__)
-
 
 @ray.remote(num_cpus=10)
 class FullyAsyncTrainer(RayPPOTrainer):
@@ -103,15 +100,25 @@ def __init__(
         self.param_synchronizer = None
 
         # Statistics
+        # we start from step 1
+        self.global_steps = 1
+        self.local_trigger_step = 1
         self.processed_samples = 0
         self.stale_samples_processed = 0
         self.current_param_version = 0
-
-        self.local_trigger_step = 1
+        self.total_train_steps = None
+        self.progress_bar = None
         self.trigger_parameter_sync_step = config.async_training.trigger_parameter_sync_step
 
-        self.required_samples = calculate_one_step_size(
-            self.minimal_bsz, config.actor_rollout_ref.actor.ppo_mini_batch_size
+        # calculate required_samples
+        ppo_mini_batch_size = config.actor_rollout_ref.actor.ppo_mini_batch_size
+        rollout_n = config.actor_rollout_ref.rollout.n
+        if ppo_mini_batch_size % rollout_n != 0:
+            raise ValueError(
+                f"PPO mini batch size ({ppo_mini_batch_size}) must be divisible by rollout n ({rollout_n})"
+            )
+        self.required_samples = int(
+            self.minimal_bsz * config.actor_rollout_ref.actor.ppo_mini_batch_size / config.actor_rollout_ref.rollout.n
         )
 
     def set_message_queue_client(self, message_queue_client: MessageQueueClient):
@@ -122,10 +129,17 @@ def set_parameter_synchronizer(self, param_synchronizer):
         """Set parameter synchronizer"""
         self.param_synchronizer = param_synchronizer
 
+    def set_total_train_steps(self, total_train_steps):
+        self.total_train_steps = total_train_steps
+        self.progress_bar = tqdm(total=self.total_train_steps, initial=0, desc="Training Progress")
+
     def get_actor_wg(self):
         """Get actor worker group"""
         return self.actor_wg
 
+    def get_required_samples(self):
+        return self.required_samples
+
     def _get_samples_from_queue(self) -> tuple[None, None] | tuple[int, Any]:
         """
         Get samples from message queue and compose gen_batch_output
@@ -166,7 +180,7 @@ def _get_samples_from_queue(self) -> tuple[None, None] | tuple[int, Any]:
         consumer_end = time.time()
 
         if not queue_samples or len(queue_samples) < self.required_samples:
-            logger.warning("not enough samples collected after loop")
+            print("[FullyAsyncTrainer] not enough samples collected after loop")
             return None, None
 
         print(
@@ -230,22 +244,16 @@ def fit(self):
 
         from verl.utils.tracking import Tracking
 
-        self.logger = Tracking(
+        logger = Tracking(
             project_name=self.config.trainer.project_name,
             experiment_name=self.config.trainer.experiment_name,
             default_backend=self.config.trainer.logger,
             config=OmegaConf.to_container(self.config, resolve=True),
         )
 
-        self.global_steps = 0
-
         # load checkpoint before doing anything
         self._load_checkpoint()
-
-        # we start from step 1
-        self.global_steps += 1
         self.max_steps_duration = 0
-
         # Use queue mode, no need for traditional dataloader iterator
         # Initialize to get the first batch of data
         while True:
@@ -277,27 +285,18 @@ def fit(self):
                                 "fully_async/current_param_version": self.current_param_version,
                             }
                         )
-                        for metric in [
-                            "avg_processing_time",
-                            "max_processing_time",
-                            "min_processing_time",
-                            "tp50_processing_time",
-                            "tp99_processing_time",
-                            "tp95_processing_time",
-                            "param_version_diversity",
-                        ]:
-                            metrics[f"fully_async/{metric}"] = batch.meta_info.get(metric, 0)
+                        for key, value in batch.meta_info.items():
+                            if key.startswith("fully_async"):
+                                metrics[key] = value
 
                 batch, reward_extra_infos_dict = self._process_batch_common(batch, metrics, timing_raw)
                 self._log_rollout(batch, reward_extra_infos_dict, timing_raw)
                 self._check_save_checkpoint(False, timing_raw)
 
             self._collect_metrics(batch, 0, metrics, timing_raw)
-            pprint(metrics)
+            logger.log(data=metrics, step=self.global_steps)
             # Trigger parameter synchronization after training step
-
             time_str = datetime.now().strftime("%H:%M:%S.%f")[:-3]
-
             print(
                 f"[FullyAsyncTrainer] global_steps: {self.global_steps} "
                 f"local_trigger_step: {self.local_trigger_step} "
@@ -316,6 +315,7 @@ def _trigger_parameter_sync_after_step(self):
             self.local_trigger_step = 1
             self.current_param_version = self.current_param_version + 1
             ray.get(self.param_synchronizer.sync_weights.remote(self.current_param_version))
+            self.progress_bar.update(1)
             return
         else:
             self.local_trigger_step += 1
diff --git a/recipe/fully_async_policy/message_queue.py b/recipe/fully_async_policy/message_queue.py
index 6a425c50478..13e1a3e21e4 100644
--- a/recipe/fully_async_policy/message_queue.py
+++ b/recipe/fully_async_policy/message_queue.py
@@ -32,13 +32,15 @@ class MessageQueue:
 
     def __init__(self, config: DictConfig, max_queue_size: int = 1000):
         self.config = config
-        self.max_queue_size = max_queue_size
-        self.queue = deque(maxlen=max_queue_size)
+        # 确保 max_queue_size 不为 None
+        if max_queue_size is None:
+            raise ValueError(f"max_queue_size cannot be None, got: {max_queue_size}")
+        self.max_queue_size = int(max_queue_size)
+        self.queue = deque(maxlen=self.max_queue_size)
         self.current_param_version = 0
 
         self.val_queue = deque()
 
-
         try:
             if hasattr(config, "async_training") and config.async_training is not None:
                 self.staleness_threshold = getattr(config.async_training, "staleness_threshold", 3)
@@ -203,7 +205,6 @@ async def get_validate(self):
                 return None
 
 
-
 class MessageQueueClient:
     """Asyncio-compatible MessageQueue client for communicating with MessageQueue Actor"""
 
diff --git a/tests/special_e2e/run_fully_async_policy.sh b/tests/special_e2e/run_fully_async_policy.sh
index 1a47b0fd06e..64f9fa82825 100644
--- a/tests/special_e2e/run_fully_async_policy.sh
+++ b/tests/special_e2e/run_fully_async_policy.sh
@@ -55,7 +55,7 @@ n_gpus_training=$((NUM_GPUS - n_gpus_rollout))
 train_prompt_bsz=0
 gen_prompt_bsz=1
 n_resp_per_prompt=16
-train_prompt_mini_bsz=4
+train_prompt_mini_bsz=32
 total_rollout_steps=$(((128*2)))
 test_freq=2
 staleness_threshold=1
@@ -79,6 +79,7 @@ common_params=(
     data.gen_batch_size=${gen_prompt_bsz}
     data.return_raw_chat=${return_raw_chat}
     actor_rollout_ref.rollout.n=${n_resp_per_prompt}
+    actor_rollout_ref.rollout.calculate_log_probs=True
     algorithm.adv_estimator=${adv_estimator}
     algorithm.use_kl_in_reward=${use_kl_in_reward}
     algorithm.kl_ctrl.kl_coef=${kl_coef}
diff --git a/verl/experimental/agent_loop/agent_loop.py b/verl/experimental/agent_loop/agent_loop.py
index e9383b109e5..8c49390f456 100644
--- a/verl/experimental/agent_loop/agent_loop.py
+++ b/verl/experimental/agent_loop/agent_loop.py
@@ -16,9 +16,8 @@
 import logging
 import os
 import random
-import time
 from abc import ABC, abstractmethod
-from typing import Any, Optional, List
+from typing import Any, Optional
 
 import hydra
 import numpy as np
@@ -105,6 +104,7 @@ async def generate(
         return output
 
     async def generate_for_partial(self, request_id, prompt_ids, sampling_params):
+        """Generate tokens from prompt ids. with partial rollout function"""
         server = self._choose_server(request_id)
         output = await server.generate_for_partial.remote(
             request_id=request_id,
@@ -136,6 +136,8 @@ class AgentLoopOutput(BaseModel):
     """Auxiliary performance metrics"""
     is_cancel: bool = False
     """Indicates whether the request was interrupted"""
+    log_probs: list[float] = None
+    """Response token log probs including LLM generated token, tool response token."""
 
 
 # make hydra.utils.instantiate happy
@@ -383,8 +385,7 @@ async def generate_sequences(self, batch: DataProto) -> DataProto:
         return output
 
     async def generate_sequences_no_post(
-            self,
-            batch: DataProto, partial_output_list: Optional[List[AgentLoopOutput]]
+        self, batch: DataProto, partial_output_list: Optional[list[AgentLoopOutput]]
     ) -> list[AgentLoopOutput]:
         """Generate sequences from agent loop.
 
@@ -431,11 +432,9 @@ async def generate_sequences_no_post(
         if not partial_output_list:
             partial_output_list = [None] * len(batch)
 
-        for agent_name, messages, trajectory, partial_output in zip(agent_names,
-                                                                    raw_prompts,
-                                                                    trajectory_info,
-                                                                    partial_output_list,
-                                                                    strict=True):
+        for agent_name, messages, trajectory, partial_output in zip(
+            agent_names, raw_prompts, trajectory_info, partial_output_list, strict=True
+        ):
             tasks.append(
                 asyncio.create_task(
                     self._run_agent_loop(agent_name, messages.tolist(), sampling_params, trajectory, partial_output)
@@ -608,10 +607,11 @@ def generate_sequences(self, prompts: DataProto) -> DataProto:
         output.meta_info = {"timing": timing}
         return output
 
-    async def generate_single_sample_async(self,
-                                           sample: DataProto,
-                                           partial_output_list: Optional[List[AgentLoopOutput]],
-                                           ) -> List[AgentLoopOutput]:
+    async def generate_single_sample_async(
+        self,
+        sample: DataProto,
+        partial_output_list: Optional[list[AgentLoopOutput]],
+    ) -> list[AgentLoopOutput]:
         """
         异步处理单个样本, 需要复制n次
 
diff --git a/verl/experimental/agent_loop/partial_single_turn_agent_loop.py b/verl/experimental/agent_loop/partial_single_turn_agent_loop.py
index 899b83f1866..df4a4f3350a 100644
--- a/verl/experimental/agent_loop/partial_single_turn_agent_loop.py
+++ b/verl/experimental/agent_loop/partial_single_turn_agent_loop.py
@@ -35,7 +35,6 @@ def __init__(self, *args, **kwargs):
     async def run(
         self, messages: list[dict[str, Any]], sampling_params: dict[str, Any], output: Optional[AgentLoopOutput]
     ) -> AgentLoopOutput:
-
         if not output:
             prompt_ids = await self.loop.run_in_executor(
                 None, lambda: self.tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=True)
@@ -51,7 +50,7 @@ async def run(
         metrics = {}
         request_id = uuid4().hex
         with simple_timer("generate_sequences", metrics):
-            response_ids, is_cancel = await self.server_manager.generate_for_partial(
+            response_ids, log_probs, is_cancel = await self.server_manager.generate_for_partial(
                 request_id=request_id, prompt_ids=prompt_ids, sampling_params=sampling_params
             )
 
@@ -60,6 +59,7 @@ async def run(
         # 暂停待恢复样本, 把输出结果加到 response_ids 后，并重置 response_mask
         else:
             prompt_ids = output.prompt_ids
+            log_probs = output.log_probs + log_probs
             response_ids = output.response_ids + response_ids
             response_mask = [1] * len(response_ids)
 
@@ -70,4 +70,5 @@ async def run(
             num_turns=2,
             metrics=metrics,
             is_cancel=is_cancel,
+            log_probs=log_probs,
         )
diff --git a/verl/experimental/agent_loop/single_turn_agent_loop.py b/verl/experimental/agent_loop/single_turn_agent_loop.py
index 492c1894cc5..df6e1991888 100644
--- a/verl/experimental/agent_loop/single_turn_agent_loop.py
+++ b/verl/experimental/agent_loop/single_turn_agent_loop.py
@@ -32,9 +32,9 @@ def __init__(self, *args, **kwargs):
         self.prompt_length = self.config.actor_rollout_ref.rollout.prompt_length
         self.response_length = self.config.actor_rollout_ref.rollout.response_length
 
-    async def run(self, messages: list[dict[str, Any]],
-                  sampling_params: dict[str, Any],
-                  output: Optional[AgentLoopOutput]) -> AgentLoopOutput:
+    async def run(
+        self, messages: list[dict[str, Any]], sampling_params: dict[str, Any], output: Optional[AgentLoopOutput]
+    ) -> AgentLoopOutput:
         metrics = {}
         request_id = uuid4().hex
         prompt_ids = await self.loop.run_in_executor(
diff --git a/verl/experimental/agent_loop/tool_agent_loop.py b/verl/experimental/agent_loop/tool_agent_loop.py
index a0642048dc7..7c945b7d4c9 100644
--- a/verl/experimental/agent_loop/tool_agent_loop.py
+++ b/verl/experimental/agent_loop/tool_agent_loop.py
@@ -56,10 +56,9 @@ def init_class(cls, config, tokenizer, **kwargs):
         cls.system_prompt = tokenizer.apply_chat_template([{}], add_generation_prompt=False, tokenize=True)
 
     @rollout_trace_op
-    async def run(self,
-                  messages: list[dict[str, Any]],
-                  sampling_params: dict[str, Any],
-                  output: Optional[AgentLoopOutput]) -> AgentLoopOutput:
+    async def run(
+        self, messages: list[dict[str, Any]], sampling_params: dict[str, Any], output: Optional[AgentLoopOutput]
+    ) -> AgentLoopOutput:
         metrics = {}
         request_id = uuid4().hex
         prompt_ids = await self.loop.run_in_executor(
diff --git a/verl/trainer/main_ppo.py b/verl/trainer/main_ppo.py
index 7b34cbfaf23..4b240c6ffbf 100644
--- a/verl/trainer/main_ppo.py
+++ b/verl/trainer/main_ppo.py
@@ -38,6 +38,7 @@ def main(config):
         config_dict: Hydra configuration dictionary containing training parameters.
     """
     from time import time
+
     start_time = time()
     run_ppo(config)
     print(f"total time: {time() - start_time:.2f} seconds")
diff --git a/verl/trainer/ppo/ray_trainer.py b/verl/trainer/ppo/ray_trainer.py
index 60621021b30..e61b1dc5fe0 100644
--- a/verl/trainer/ppo/ray_trainer.py
+++ b/verl/trainer/ppo/ray_trainer.py
@@ -1280,6 +1280,10 @@ def _process_batch_common(self, batch, metrics, timing_raw):
                         "training/rollout_probs_diff_std": rollout_probs_diff_std.detach().item(),
                     }
                 )
+                if self.config.async_training and self.config.async_training.use_rollout_log_probs:
+                    batch.batch["old_log_probs"] = batch.batch["rollout_log_probs"]
+                    del actor_old_log_probs
+
         if self.use_reference_policy:
             # compute reference log_prob
             with marked_timer("ref", timing_raw, color="olive"):
diff --git a/verl/workers/rollout/vllm_rollout/vllm_async_server.py b/verl/workers/rollout/vllm_rollout/vllm_async_server.py
index 7ce640e33cb..3b3e9542252 100644
--- a/verl/workers/rollout/vllm_rollout/vllm_async_server.py
+++ b/verl/workers/rollout/vllm_rollout/vllm_async_server.py
@@ -15,16 +15,14 @@
 import logging
 import os
 import pickle
-from contextlib import ExitStack
-from typing import Any, Callable, Optional, Coroutine, Sequence
+from typing import Any, Callable, Optional, Sequence
 
 import ray
 import zmq
-from omegaconf import DictConfig, ListConfig
+from omegaconf import DictConfig
 from starlette.requests import Request
 from starlette.responses import JSONResponse, StreamingResponse
-from vllm import SamplingParams, RequestOutput
-from vllm.config import CompilationConfig, CompilationLevel
+from vllm import SamplingParams
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.entrypoints.logger import RequestLogger
 from vllm.entrypoints.openai.protocol import ChatCompletionRequest, ChatCompletionResponse, ErrorResponse
@@ -337,7 +335,7 @@ async def generate(self, prompt_ids: list[int], sampling_params: dict[str, Any],
 
     async def _generate_step(self, prompt_ids: list[int], sampling_params: dict[str, Any], request_id: str):
         max_tokens = self.max_model_len - len(prompt_ids)
-        sampling_params = SamplingParams(max_tokens=max_tokens, **sampling_params)
+        sampling_params = SamplingParams(max_tokens=max_tokens, logprobs=1, **sampling_params)
         prompt = TokensPrompt(prompt_token_ids=prompt_ids)
         generator = self.engine.generate(prompt=prompt, sampling_params=sampling_params, request_id=request_id)
 
@@ -349,12 +347,12 @@ async def _generate_step(self, prompt_ids: list[int], sampling_params: dict[str,
 
     async def generate_for_partial(
         self, prompt_ids: list[int], sampling_params: dict[str, Any], request_id: str
-    ) -> tuple[Sequence[int], bool] | tuple[str, bool]:
+    ) -> tuple[list[Any], list[Any], bool] | tuple[Sequence[int], list[float], Any]:
         # 设置中断标志
         async with self.lock:
             if self.paused:
                 # cancel 后， 所有任务直接返回，等待下次提交
-                return [], True
+                return [], [], True
             self.cancel_event[request_id] = asyncio.Event()
             cancel_handle = asyncio.create_task(self.cancel_event[request_id].wait())
             generation_handle = asyncio.create_task(self._generate_step(prompt_ids, sampling_params, request_id))
@@ -369,10 +367,15 @@ async def generate_for_partial(
 
         async with self.lock:
             token_ids = self.req_output[request_id].outputs[0].token_ids
+            log_probs: list[float] = []
+            for i, x in enumerate(self.req_output[request_id].outputs[0].logprobs):
+                # sampling_params 中 logprobs 设置为1，只返回1个
+                token_id = self.req_output[request_id].outputs[0].token_ids[i]
+                log_probs.append(x[token_id].logprob)
             is_cancel = generation_handle not in done
             self.cancel_event.pop(request_id, None)
             self.req_output.pop(request_id, None)
-        return token_ids, is_cancel
+        return token_ids, log_probs, is_cancel
 
     async def cancel(self):
         async with self.lock:

From fdd8af0fc9bc041044bb37e83787947b5c14c694 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Sun, 31 Aug 2025 23:59:54 +0800
Subject: [PATCH 086/182] batch.meta_info.items()

---
 recipe/fully_async_policy/fully_async_trainer.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py
index af406623145..d9258a1c935 100644
--- a/recipe/fully_async_policy/fully_async_trainer.py
+++ b/recipe/fully_async_policy/fully_async_trainer.py
@@ -285,7 +285,7 @@ def fit(self):
                                 "fully_async/current_param_version": self.current_param_version,
                             }
                         )
-                        for key, value in batch.meta_info:
+                        for key, value in batch.meta_info.items():
                             if key.startswith("fully_async"):
                                 metrics[key] = value
 
@@ -320,3 +320,5 @@ def _trigger_parameter_sync_after_step(self):
         else:
             self.local_trigger_step += 1
             return
+
+

From 9444d19e3cc023b404701345ab08705fe7b8cc6f Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Mon, 1 Sep 2025 00:40:12 +0800
Subject: [PATCH 087/182] total wait time

---
 recipe/fully_async_policy/fully_async_trainer.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py
index 6d74ee215f8..8340b22f6c6 100644
--- a/recipe/fully_async_policy/fully_async_trainer.py
+++ b/recipe/fully_async_policy/fully_async_trainer.py
@@ -182,20 +182,21 @@ def _get_samples_from_queue(self) -> tuple[None, None] | tuple[int, Any]:
         if not queue_samples or len(queue_samples) < self.required_samples:
             print("[FullyAsyncTrainer] not enough samples collected after loop")
             return None, None
+        total_wait_time = consumer_end - consumer_start
 
         print(
             f"[FullyAsyncTrainer] Loop collection completed: {len(queue_samples)}/{self.required_samples} samples, "
-            f"total wait time: {consumer_end - consumer_start:.2f} seconds"
+            f"total wait time: {total_wait_time:.2f} seconds"
         )
 
         queue_samples = [ray.cloudpickle.loads(x) for x in queue_samples]
-        # print(queue_samples)
         # Assemble batch - now working directly with RolloutSample objects
         if self.config.trainer.balance_batch:
             batch = assemble_batch_from_rollout_samples(queue_samples, self.tokenizer, self.config, self._balance_batch)
         else:
             batch = assemble_batch_from_rollout_samples(queue_samples, self.tokenizer, self.config, None)
-        # print(f" _assemble_gen_batch_output_from_queue_samples {batch}")
+
+        batch.meta_info["fully_async/total_wait_time"] = total_wait_time
         return 0, batch
 
     def _create_actor_rollout_classes(self):

From 69c2427a04341158b5a2a22d15a4084092c18811 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Mon, 1 Sep 2025 17:03:31 +0800
Subject: [PATCH 088/182]         from .detach_sharding_manager import
 DetachShardingManager

---
 recipe/one_step_off_policy/megatron_workers.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/recipe/one_step_off_policy/megatron_workers.py b/recipe/one_step_off_policy/megatron_workers.py
index 5b338c5be42..a9318b8f7b3 100644
--- a/recipe/one_step_off_policy/megatron_workers.py
+++ b/recipe/one_step_off_policy/megatron_workers.py
@@ -168,7 +168,7 @@ def init_model(self):
         )
         log_gpu_memory_usage("After building vllm rollout", logger=logger)
 
-        from sharding_manager import DetachShardingManager
+        from .detach_sharding_manager import DetachShardingManager
 
         rollout_sharding_manager = DetachShardingManager(
             inference_engine=rollout.inference_engine, device_mesh=rollout_device_mesh

From 237d766739fa8f5736319132334eab30b8b54530 Mon Sep 17 00:00:00 2001
From: wangshulin02 <wangshulin02@meituan.com>
Date: Mon, 1 Sep 2025 21:36:43 +0800
Subject: [PATCH 089/182] fix validate frequent bug & add final validate

---
 .../config/fully_async_ppo_trainer.yaml       |  1 +
 .../dapo_7b_math_fsdp2_2_6.sh                 |  4 +--
 .../dapo_7b_math_fsdp2_8_8.sh                 |  2 +-
 recipe/fully_async_policy/fully_async_main.py |  4 ++-
 .../fully_async_rollouter.py                  | 22 ++++++------
 .../fully_async_policy/fully_async_trainer.py | 36 +++++++++++++------
 recipe/fully_async_policy/message_queue.py    | 20 +++++++++++
 recipe/fully_async_policy/param_sync.py       | 14 ++++----
 tests/special_e2e/run_fully_async_policy.sh   |  2 +-
 9 files changed, 72 insertions(+), 33 deletions(-)

diff --git a/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml b/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml
index 3334ee4f4d5..c1f94b56b6b 100644
--- a/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml
+++ b/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml
@@ -22,6 +22,7 @@ rollout:
   n: 4                               # 每个prompt生成的响应数量
   total_rollout_steps: 100
   total_epochs: 10
+  test_freq: 1                       # 测试频率, 每多少次参数更新后进行一次测试
 
 data:
   gen_batch_size: 32
diff --git a/recipe/fully_async_policy/dapo_7b_math_fsdp2_2_6.sh b/recipe/fully_async_policy/dapo_7b_math_fsdp2_2_6.sh
index 58017f0123b..5f654227d15 100644
--- a/recipe/fully_async_policy/dapo_7b_math_fsdp2_2_6.sh
+++ b/recipe/fully_async_policy/dapo_7b_math_fsdp2_2_6.sh
@@ -77,7 +77,7 @@ gen_prompt_bsz=1
 n_resp_per_prompt=16
 train_prompt_mini_bsz=64
 total_rollout_steps=$(((512*100)))
-test_freq=5
+test_freq=2
 staleness_threshold=1
 trigger_parameter_sync_step=16
 partial_rollout=True
@@ -156,7 +156,6 @@ $PYTHON_INTERPRETER -m recipe.fully_async_policy.fully_async_main \
     trainer.project_name="${project_name}" \
     trainer.experiment_name="${exp_name}" \
     trainer.val_before_train=True \
-    trainer.test_freq="${test_freq}" \
     trainer.save_freq=-1 \
     trainer.default_local_dir="${CKPTS_DIR}" \
     trainer.resume_mode=auto \
@@ -166,6 +165,7 @@ $PYTHON_INTERPRETER -m recipe.fully_async_policy.fully_async_main \
     rollout.n_gpus_per_node="${n_gpus_rollout}" \
     rollout.total_rollout_steps="${total_rollout_steps}" \
     rollout.total_epochs=10 \
+    rollout.test_freq="${test_freq}" \
     async_training.staleness_threshold="${staleness_threshold}" \
     async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \
     async_training.partial_rollout="${partial_rollout}"
diff --git a/recipe/fully_async_policy/dapo_7b_math_fsdp2_8_8.sh b/recipe/fully_async_policy/dapo_7b_math_fsdp2_8_8.sh
index 52ee0136d5a..c65080ba548 100644
--- a/recipe/fully_async_policy/dapo_7b_math_fsdp2_8_8.sh
+++ b/recipe/fully_async_policy/dapo_7b_math_fsdp2_8_8.sh
@@ -155,7 +155,6 @@ $PYTHON_INTERPRETER -m recipe.fully_async_policy.fully_async_main \
     trainer.project_name="${project_name}" \
     trainer.experiment_name="${exp_name}" \
     trainer.val_before_train=True \
-    trainer.test_freq="${test_freq}" \
     trainer.save_freq=-1 \
     trainer.default_local_dir="${CKPTS_DIR}" \
     trainer.resume_mode=auto \
@@ -165,6 +164,7 @@ $PYTHON_INTERPRETER -m recipe.fully_async_policy.fully_async_main \
     rollout.n_gpus_per_node="${n_gpus_rollout}" \
     rollout.total_rollout_steps="${total_rollout_steps}" \
     rollout.total_epochs=10 \
+    rollout.test_freq="${test_freq}" \
     async_training.staleness_threshold="${staleness_threshold}" \
     async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \
     async_training.partial_rollout="${partial_rollout}"
diff --git a/recipe/fully_async_policy/fully_async_main.py b/recipe/fully_async_policy/fully_async_main.py
index 2b5663bd5ea..532a425d126 100644
--- a/recipe/fully_async_policy/fully_async_main.py
+++ b/recipe/fully_async_policy/fully_async_main.py
@@ -216,7 +216,9 @@ def _initialize_components(self, config) -> None:
         )
         ray.get(self.components["trainer"].set_parameter_synchronizer.remote(param_synchronizer))
 
-        ray.get(param_synchronizer.sync_weights.remote(0))
+        # load checkpoint and sync parameter before doing anything
+        ray.get(self.components["trainer"].load_checkpoint.remote()) 
+        ray.get(param_synchronizer.sync_weights.remote(version=0))
 
         self.components["param_synchronizer"] = param_synchronizer
         print("[ASYNC MAIN] All components initialized successfully")
diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py
index 44768cacb29..426a5cad430 100644
--- a/recipe/fully_async_policy/fully_async_rollouter.py
+++ b/recipe/fully_async_policy/fully_async_rollouter.py
@@ -186,7 +186,7 @@ def get_max_queue_size(self):
     def get_total_train_steps(self):
         return self.total_train_steps
 
-    async def update_param_version(self, version: int):
+    async def update_param_version(self, version: int, last_sync: bool = False):
         """Update current parameter version"""
         async with self.lock:
             old_version = self.current_param_version
@@ -198,11 +198,13 @@ async def update_param_version(self, version: int):
                 f"Parameter version updated from {old_version} to {version}"
             )
             timing_raw = {}
-            is_last_step = self.global_steps >= self.total_training_steps
             if (
                 self.val_reward_fn is not None
-                and self.config.trainer.test_freq > 0
-                and ((self.global_steps > 0 and self.global_steps % self.config.trainer.test_freq == 0) or is_last_step)
+                and self.config.rollout.test_freq > 0
+                and self.current_param_version % self.config.rollout.test_freq == 0 # test_freq 表示每多少步参数更新测试一次
+                and self.current_param_version > 0 # don't test here in the initial parameter sync
+            ) or (
+                last_sync and self.val_reward_fn is not None
             ):
                 with marked_timer("testing", timing_raw, color="green"):
                     val_metrics: dict = self._validate()
@@ -429,20 +431,18 @@ async def _streaming_generation_main(self):
             config=OmegaConf.to_container(self.config, resolve=True),
         )
 
-        # load checkpoint before doing anything
-        self._load_checkpoint()  # TODO: 检查是否需要
-
         # perform validation before training
         # currently, we only support validation using the reward_function.
-        async with self.lock:  # TODO: 检查是否需要锁
+        async with self.lock:
             if self.val_reward_fn is not None and self.config.trainer.get("val_before_train", True):
-                print("Initial validation metric")
+                print("[FullyAsyncRollouter] Initial validating before training...")
                 val_metrics = self._validate()
                 assert val_metrics, f"{val_metrics=}"
                 pprint(f"[FullyAsyncRollouter] Initial validation metrics: {val_metrics}")
                 data = ValidateMetrics(timing_raw={}, metrics=val_metrics)
                 await self.message_queue_client.put_validate(ray.cloudpickle.dumps(data))
-                if self.config.trainer.get("val_only", False):
+                if self.config.trainer.get("val_only", False): # TODO: 是否需要保留此功能
+
                     return
 
         # we start from step 1
@@ -544,7 +544,7 @@ async def _async_monitor_loop(self):
 
         while True:
             async with self.lock:
-                if not self.running:
+                if not self.running and self.message_queue_client.is_training_ended():
                     break
             await asyncio.sleep(check_interval)
             # 定期打印统计信息
diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py
index b43b3d87297..5accc435422 100644
--- a/recipe/fully_async_policy/fully_async_trainer.py
+++ b/recipe/fully_async_policy/fully_async_trainer.py
@@ -252,8 +252,6 @@ def fit(self):
             config=OmegaConf.to_container(self.config, resolve=True),
         )
 
-        # load checkpoint before doing anything
-        self._load_checkpoint()
         self.max_steps_duration = 0
         # Use queue mode, no need for traditional dataloader iterator
         # Initialize to get the first batch of data
@@ -307,19 +305,35 @@ def fit(self):
             self._trigger_parameter_sync_after_step()
             self.global_steps += 1
 
-    def _trigger_parameter_sync_after_step(self):
+        # final parameter sync and validate
+        self._trigger_parameter_sync_after_step(last_sync=True)
+        val_data = self.message_queue_client.get_validate_sync()
+
+        if val_data:
+            val_data: ValidateMetrics = ray.cloudpickle.loads(val_data)
+            from pprint import pprint
+            pprint(f"[FullyAsyncTrainer] Final validation metrics: {val_data.metrics}")
+            # TODO: 是否需要计入log
+
+        print("[FullyAsyncTrainer] Training completed, sending end signal...,sleeping")
+        time.sleep(10)
+        self.message_queue_client.set_training_end()
+        print("[FullyAsyncTrainer] End signal sent")
+
+        self._check_save_checkpoint(True, timing_raw) # TODO: 检查checkpoint
+
+    def load_checkpoint(self):
+        return self._load_checkpoint()
+
+    def _trigger_parameter_sync_after_step(self, last_sync: bool = False):
         """
         Trigger parameter synchronization after training step
         This ensures rollouter always uses the latest trained parameters
         """
-        if self.local_trigger_step >= self.trigger_parameter_sync_step:
-            self.local_trigger_step = 1
-            self.current_param_version = self.current_param_version + 1
-            ray.get(self.param_synchronizer.sync_weights.remote(self.current_param_version))
-            self.progress_bar.update(1)
-            return
-        else:
+        if self.local_trigger_step < self.trigger_parameter_sync_step  and not last_sync:
             self.local_trigger_step += 1
             return
 
-
+        self.current_param_version += 1 
+        self.local_trigger_step = 1
+        ray.get(self.param_synchronizer.sync_weights.remote(self.current_param_version, last_sync=last_sync))
\ No newline at end of file
diff --git a/recipe/fully_async_policy/message_queue.py b/recipe/fully_async_policy/message_queue.py
index 13e1a3e21e4..5aecba389f7 100644
--- a/recipe/fully_async_policy/message_queue.py
+++ b/recipe/fully_async_policy/message_queue.py
@@ -51,6 +51,9 @@ def __init__(self, config: DictConfig, max_queue_size: int = 1000):
 
         # Asyncio for message handling
         self.running = True
+        
+        # trainer end signal
+        self.training_ended = False
 
         # async safe - 在第一次使用时初始化
         self._lock = asyncio.Lock()
@@ -204,6 +207,15 @@ async def get_validate(self):
             else:
                 return None
 
+    async def set_training_end(self):
+        """set training end signal"""
+        async with self._lock:
+            self.training_ended = True
+    
+    async def is_training_ended(self):
+        """check training end signal"""
+        async with self._lock:
+            return self.training_ended
 
 class MessageQueueClient:
     """Asyncio-compatible MessageQueue client for communicating with MessageQueue Actor"""
@@ -269,3 +281,11 @@ def get_statistics_sync(self) -> dict[str, Any]:
     def update_param_version_sync(self, version: int):
         """Update parameter version (async)"""
         return ray.get(self.queue_actor.update_param_version.remote(version))
+
+    def set_training_end(self):
+        """Notify the end of training"""
+        return ray.get(self.queue_actor.set_training_end.remote())
+    
+    def is_training_ended(self):
+        """Check if training is finished"""
+        return ray.get(self.queue_actor.is_training_ended.remote())
\ No newline at end of file
diff --git a/recipe/fully_async_policy/param_sync.py b/recipe/fully_async_policy/param_sync.py
index 7e75865ebd5..ad8dfed56b5 100644
--- a/recipe/fully_async_policy/param_sync.py
+++ b/recipe/fully_async_policy/param_sync.py
@@ -71,7 +71,7 @@ def _init_sync_group(self):
             group_name=self.sync_group_name,
         )
 
-    def sync_weights(self, version):
+    def sync_weights(self, version, last_sync = False):
         start_time = time.time()
 
         self.current_version = version
@@ -85,11 +85,13 @@ def sync_weights(self, version):
         # sync weights
         self.actor_wg.sync_rollout_weights()
         ray.get(self.rollout_wg.sync_rollout_weights())
+        end_time = time.time()
+        print(f"[ParameterSynchronizer] sync_weights success. cost {end_time - start_time:.2f} seconds")
 
-        # Async Update rollout version
-        self.rollouter.update_param_version.remote(version)
-
+        # Async Update rollout version & validation
+        self.rollouter.update_param_version.remote(version, last_sync)
         ray.get(self.rollouter.resume.remote())
-        end_time = time.time()
 
-        print(f"[ParameterSynchronizer] sync_weights success. cost {end_time - start_time:.2f} seconds")
+        print(f"[ParameterSynchronizer] Update rollout version & validation done. cost {time.time() - end_time:.2f} seconds")
+
+
diff --git a/tests/special_e2e/run_fully_async_policy.sh b/tests/special_e2e/run_fully_async_policy.sh
index 64f9fa82825..51691a8800f 100644
--- a/tests/special_e2e/run_fully_async_policy.sh
+++ b/tests/special_e2e/run_fully_async_policy.sh
@@ -119,7 +119,6 @@ common_params=(
     trainer.project_name='verl-test-fully-async'
     trainer.experiment_name="${exp_name}"
     trainer.val_before_train=True
-    trainer.test_freq=-1
     trainer.save_freq=-1
     trainer.resume_mode=disable
     trainer.nnodes=1
@@ -128,6 +127,7 @@ common_params=(
     rollout.n_gpus_per_node=${n_gpus_rollout}
     rollout.total_rollout_steps=${total_rollout_steps}
     rollout.total_epochs=2
+    rollout.test_freq=${test_freq}
     # Fully async specific configurations
     async_training.staleness_threshold=${staleness_threshold}
     async_training.partial_rollout="${partial_rollout}"

From 66cc990edaff02c0e5bf422e38b757ec5cb9f450 Mon Sep 17 00:00:00 2001
From: wangshulin02 <wangshulin02@meituan.com>
Date: Tue, 2 Sep 2025 15:29:13 +0800
Subject: [PATCH 090/182] remove unnecessary code, fix validate logic

---
 recipe/fully_async_policy/detach_utils.py     |  4 +-
 recipe/fully_async_policy/fully_async_main.py |  4 +-
 .../fully_async_rollouter.py                  | 34 +++---------
 .../fully_async_policy/fully_async_trainer.py | 53 ++++++++++---------
 recipe/fully_async_policy/message_queue.py    | 22 +-------
 recipe/fully_async_policy/param_sync.py       |  7 +--
 6 files changed, 47 insertions(+), 77 deletions(-)

diff --git a/recipe/fully_async_policy/detach_utils.py b/recipe/fully_async_policy/detach_utils.py
index 79df36652f7..c28cbf9e631 100644
--- a/recipe/fully_async_policy/detach_utils.py
+++ b/recipe/fully_async_policy/detach_utils.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 import time
 from dataclasses import dataclass
-from typing import Any
+from typing import Any, Optional
 
 import numpy as np
 import torch
@@ -52,6 +52,8 @@ class RolloutSample:
 class ValidateMetrics:
     timing_raw: dict[str, Any]
     metrics: dict[str, Any]
+    global_steps: Optional[int] = None
+    param_version: Optional[int] = None
 
 
 def prepare_single_generation_data(batch_dict, global_steps, rollout_n) -> DataProto:
diff --git a/recipe/fully_async_policy/fully_async_main.py b/recipe/fully_async_policy/fully_async_main.py
index 532a425d126..09961c85391 100644
--- a/recipe/fully_async_policy/fully_async_main.py
+++ b/recipe/fully_async_policy/fully_async_main.py
@@ -217,8 +217,10 @@ def _initialize_components(self, config) -> None:
         ray.get(self.components["trainer"].set_parameter_synchronizer.remote(param_synchronizer))
 
         # load checkpoint and sync parameter before doing anything
+        val_before_train = val_reward_fn is not None and config.trainer.get("val_before_train", True)
         ray.get(self.components["trainer"].load_checkpoint.remote()) 
-        ray.get(param_synchronizer.sync_weights.remote(version=0))
+        ray.get(param_synchronizer.sync_weights.remote(version=0,
+                                                       validate=val_before_train))
 
         self.components["param_synchronizer"] = param_synchronizer
         print("[ASYNC MAIN] All components initialized successfully")
diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py
index 426a5cad430..4e7f911fb1e 100644
--- a/recipe/fully_async_policy/fully_async_rollouter.py
+++ b/recipe/fully_async_policy/fully_async_rollouter.py
@@ -186,7 +186,7 @@ def get_max_queue_size(self):
     def get_total_train_steps(self):
         return self.total_train_steps
 
-    async def update_param_version(self, version: int, last_sync: bool = False):
+    async def update_param_version(self, version: int, validate: bool = False, global_steps: int = 0):
         """Update current parameter version"""
         async with self.lock:
             old_version = self.current_param_version
@@ -201,14 +201,16 @@ async def update_param_version(self, version: int, last_sync: bool = False):
             if (
                 self.val_reward_fn is not None
                 and self.config.rollout.test_freq > 0
-                and self.current_param_version % self.config.rollout.test_freq == 0 # test_freq 表示每多少步参数更新测试一次
+                and self.current_param_version % self.config.rollout.test_freq == 0
                 and self.current_param_version > 0 # don't test here in the initial parameter sync
             ) or (
-                last_sync and self.val_reward_fn is not None
+                validate and self.val_reward_fn is not None
             ):
                 with marked_timer("testing", timing_raw, color="green"):
                     val_metrics: dict = self._validate()
-                    data = ValidateMetrics(timing_raw=timing_raw, metrics=val_metrics)
+                    data = ValidateMetrics(timing_raw=timing_raw, 
+                                           metrics=val_metrics, 
+                                           global_steps=global_steps)
                     await self.message_queue_client.put_validate(ray.cloudpickle.dumps(data))
 
     def _validate_config(self):
@@ -422,28 +424,6 @@ async def _consumer_worker(self):
 
     async def _streaming_generation_main(self):
         """流式处理的主入口方法，包含初始化和验证逻辑"""
-        from verl.utils.tracking import Tracking
-
-        self.logger = Tracking(
-            project_name=self.config.trainer.project_name,
-            experiment_name=self.config.trainer.experiment_name,
-            default_backend=self.config.trainer.logger,
-            config=OmegaConf.to_container(self.config, resolve=True),
-        )
-
-        # perform validation before training
-        # currently, we only support validation using the reward_function.
-        async with self.lock:
-            if self.val_reward_fn is not None and self.config.trainer.get("val_before_train", True):
-                print("[FullyAsyncRollouter] Initial validating before training...")
-                val_metrics = self._validate()
-                assert val_metrics, f"{val_metrics=}"
-                pprint(f"[FullyAsyncRollouter] Initial validation metrics: {val_metrics}")
-                data = ValidateMetrics(timing_raw={}, metrics=val_metrics)
-                await self.message_queue_client.put_validate(ray.cloudpickle.dumps(data))
-                if self.config.trainer.get("val_only", False): # TODO: 是否需要保留此功能
-
-                    return
 
         # we start from step 1
         self.global_steps += 1
@@ -544,7 +524,7 @@ async def _async_monitor_loop(self):
 
         while True:
             async with self.lock:
-                if not self.running and self.message_queue_client.is_training_ended():
+                if not self.running:
                     break
             await asyncio.sleep(check_interval)
             # 定期打印统计信息
diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py
index 5accc435422..17415f496d7 100644
--- a/recipe/fully_async_policy/fully_async_trainer.py
+++ b/recipe/fully_async_policy/fully_async_trainer.py
@@ -36,7 +36,7 @@
     WorkerType,
 )
 from verl.utils.debug import marked_timer
-
+from pprint import pprint
 
 @ray.remote(num_cpus=10)
 class FullyAsyncTrainer(RayPPOTrainer):
@@ -253,18 +253,21 @@ def fit(self):
         )
 
         self.max_steps_duration = 0
+
+        # get validate data before training
+        val_data = self.message_queue_client.get_validate_sync()
+        if val_data:
+            val_data: ValidateMetrics = ray.cloudpickle.loads(val_data)
+            logger.log(data=val_data.metrics, step=val_data.global_steps)
+            logger.log(data=val_data.timing_raw, step=val_data.global_steps)
+            pprint(f"[FullyAsyncTrainer] Initial validation metrics: {val_data.metrics}")
+
         # Use queue mode, no need for traditional dataloader iterator
         # Initialize to get the first batch of data
         while True:
             metrics = {}
             timing_raw = {}
 
-            val_data = self.message_queue_client.get_validate_sync()
-            if val_data:
-                val_data: ValidateMetrics = ray.cloudpickle.loads(val_data)
-                metrics.update(val_data.metrics)
-                timing_raw.update(val_data.timing_raw)
-
             with marked_timer("step", timing_raw):
                 with marked_timer("gen", timing_raw, color="red"):
                     epoch, batch = self._get_samples_from_queue()
@@ -302,38 +305,40 @@ def fit(self):
                 f"trigger_parameter_sync_step: {self.trigger_parameter_sync_step} "
                 f"{time_str}"
             )
-            self._trigger_parameter_sync_after_step()
+            self._trigger_parameter_sync_after_step(global_steps=self.global_steps)
+            val_data = self.message_queue_client.get_validate_sync()
+            if val_data:
+                val_data: ValidateMetrics = ray.cloudpickle.loads(val_data)
+                logger.log(data=val_data.metrics, step=val_data.global_steps)
+                logger.log(data=val_data.timing_raw, step=val_data.global_steps)
             self.global_steps += 1
 
         # final parameter sync and validate
-        self._trigger_parameter_sync_after_step(last_sync=True)
-        val_data = self.message_queue_client.get_validate_sync()
-
-        if val_data:
-            val_data: ValidateMetrics = ray.cloudpickle.loads(val_data)
-            from pprint import pprint
-            pprint(f"[FullyAsyncTrainer] Final validation metrics: {val_data.metrics}")
-            # TODO: 是否需要计入log
-
-        print("[FullyAsyncTrainer] Training completed, sending end signal...,sleeping")
-        time.sleep(10)
-        self.message_queue_client.set_training_end()
-        print("[FullyAsyncTrainer] End signal sent")
+        if val_data is None:
+            self._trigger_parameter_sync_after_step(validate=True, global_steps=self.global_steps-1)
+            val_data = self.message_queue_client.get_validate_sync()
+            if val_data:
+                val_data: ValidateMetrics = ray.cloudpickle.loads(val_data)
+                logger.log(data=val_data.metrics, step=val_data.global_steps)
+                logger.log(data=val_data.timing_raw, step=val_data.global_steps)     
+        pprint(f"[FullyAsyncTrainer] Final validation metrics: {val_data.metrics}")
 
         self._check_save_checkpoint(True, timing_raw) # TODO: 检查checkpoint
 
     def load_checkpoint(self):
         return self._load_checkpoint()
 
-    def _trigger_parameter_sync_after_step(self, last_sync: bool = False):
+    def _trigger_parameter_sync_after_step(self, validate: bool = False, global_steps: int = None):
         """
         Trigger parameter synchronization after training step
         This ensures rollouter always uses the latest trained parameters
         """
-        if self.local_trigger_step < self.trigger_parameter_sync_step  and not last_sync:
+        if self.local_trigger_step < self.trigger_parameter_sync_step  and not validate:
             self.local_trigger_step += 1
             return
 
         self.current_param_version += 1 
         self.local_trigger_step = 1
-        ray.get(self.param_synchronizer.sync_weights.remote(self.current_param_version, last_sync=last_sync))
\ No newline at end of file
+        ray.get(self.param_synchronizer.sync_weights.remote(self.current_param_version, 
+                                                            validate=validate,
+                                                            global_steps=global_steps))
\ No newline at end of file
diff --git a/recipe/fully_async_policy/message_queue.py b/recipe/fully_async_policy/message_queue.py
index 5aecba389f7..0520ec98034 100644
--- a/recipe/fully_async_policy/message_queue.py
+++ b/recipe/fully_async_policy/message_queue.py
@@ -51,9 +51,6 @@ def __init__(self, config: DictConfig, max_queue_size: int = 1000):
 
         # Asyncio for message handling
         self.running = True
-        
-        # trainer end signal
-        self.training_ended = False
 
         # async safe - 在第一次使用时初始化
         self._lock = asyncio.Lock()
@@ -207,15 +204,6 @@ async def get_validate(self):
             else:
                 return None
 
-    async def set_training_end(self):
-        """set training end signal"""
-        async with self._lock:
-            self.training_ended = True
-    
-    async def is_training_ended(self):
-        """check training end signal"""
-        async with self._lock:
-            return self.training_ended
 
 class MessageQueueClient:
     """Asyncio-compatible MessageQueue client for communicating with MessageQueue Actor"""
@@ -280,12 +268,4 @@ def get_statistics_sync(self) -> dict[str, Any]:
 
     def update_param_version_sync(self, version: int):
         """Update parameter version (async)"""
-        return ray.get(self.queue_actor.update_param_version.remote(version))
-
-    def set_training_end(self):
-        """Notify the end of training"""
-        return ray.get(self.queue_actor.set_training_end.remote())
-    
-    def is_training_ended(self):
-        """Check if training is finished"""
-        return ray.get(self.queue_actor.is_training_ended.remote())
\ No newline at end of file
+        return ray.get(self.queue_actor.update_param_version.remote(version))
\ No newline at end of file
diff --git a/recipe/fully_async_policy/param_sync.py b/recipe/fully_async_policy/param_sync.py
index ad8dfed56b5..4cf39e5355b 100644
--- a/recipe/fully_async_policy/param_sync.py
+++ b/recipe/fully_async_policy/param_sync.py
@@ -71,7 +71,7 @@ def _init_sync_group(self):
             group_name=self.sync_group_name,
         )
 
-    def sync_weights(self, version, last_sync = False):
+    def sync_weights(self, version, validate = False, global_steps = 0):
         start_time = time.time()
 
         self.current_version = version
@@ -89,9 +89,10 @@ def sync_weights(self, version, last_sync = False):
         print(f"[ParameterSynchronizer] sync_weights success. cost {end_time - start_time:.2f} seconds")
 
         # Async Update rollout version & validation
-        self.rollouter.update_param_version.remote(version, last_sync)
+        self.rollouter.update_param_version.remote(version, validate, global_steps)
         ray.get(self.rollouter.resume.remote())
 
-        print(f"[ParameterSynchronizer] Update rollout version & validation done. cost {time.time() - end_time:.2f} seconds")
+        print(f"[ParameterSynchronizer] Update rollout version & validation done. \
+              cost {time.time() - end_time:.2f} seconds")
 
 

From b405c6646438831de9824f3857bef80eae192d79 Mon Sep 17 00:00:00 2001
From: arron <arron@MBP-2G17FXQ05P-2332.local>
Date: Tue, 2 Sep 2025 18:11:53 +0800
Subject: [PATCH 091/182] 8_8

---
 recipe/fully_async_policy/dapo_7b_math_fsdp2_8_8.sh | 3 ++-
 recipe/fully_async_policy/runtime_env.yaml          | 5 ++++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/recipe/fully_async_policy/dapo_7b_math_fsdp2_8_8.sh b/recipe/fully_async_policy/dapo_7b_math_fsdp2_8_8.sh
index 52ee0136d5a..61ec5d3c1e3 100644
--- a/recipe/fully_async_policy/dapo_7b_math_fsdp2_8_8.sh
+++ b/recipe/fully_async_policy/dapo_7b_math_fsdp2_8_8.sh
@@ -75,7 +75,7 @@ n_gpus_training=8
 train_prompt_bsz=0
 gen_prompt_bsz=1
 n_resp_per_prompt=16
-train_prompt_mini_bsz=4
+train_prompt_mini_bsz=64
 total_rollout_steps=$(((512*100)))
 test_freq=10
 staleness_threshold=1
@@ -145,6 +145,7 @@ $PYTHON_INTERPRETER -m recipe.fully_async_policy.fully_async_main \
     actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \
     actor_rollout_ref.rollout.name=${rollout_name} \
     actor_rollout_ref.rollout.mode=${rollout_mode} \
+    actor_rollout_ref.rollout.calculate_log_probs=True \
     reward_model.reward_manager=dapo \
     +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
     +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
diff --git a/recipe/fully_async_policy/runtime_env.yaml b/recipe/fully_async_policy/runtime_env.yaml
index 81c7c9f4265..5dcf269faa8 100644
--- a/recipe/fully_async_policy/runtime_env.yaml
+++ b/recipe/fully_async_policy/runtime_env.yaml
@@ -1,2 +1,5 @@
 env_vars:
-  VLLM_USE_V1: "1"
\ No newline at end of file
+  VLLM_USE_V1: "1"
+  TENSORBOARD_DIR: "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/tensorboard"
+  NCCL_DEBUG: "INFO"
+  HYDRA_FULL_ERROR: "1"
\ No newline at end of file

From 5919f05ace7357b2fdc8c6a36f0ee5530552dc63 Mon Sep 17 00:00:00 2001
From: wangshulin02 <953550366@qq.com>
Date: Tue, 2 Sep 2025 19:21:04 +0800
Subject: [PATCH 092/182] fix trainer and rollouter validation asynchrony

---
 recipe/fully_async_policy/fully_async_rollouter.py |  8 ++++----
 recipe/fully_async_policy/fully_async_trainer.py   |  1 +
 recipe/fully_async_policy/param_sync.py            | 14 +++++++++-----
 tests/special_e2e/run_fully_async_policy.sh        |  2 +-
 4 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py
index 4e7f911fb1e..0bc871b6d7f 100644
--- a/recipe/fully_async_policy/fully_async_rollouter.py
+++ b/recipe/fully_async_policy/fully_async_rollouter.py
@@ -208,10 +208,10 @@ async def update_param_version(self, version: int, validate: bool = False, globa
             ):
                 with marked_timer("testing", timing_raw, color="green"):
                     val_metrics: dict = self._validate()
-                    data = ValidateMetrics(timing_raw=timing_raw, 
-                                           metrics=val_metrics, 
-                                           global_steps=global_steps)
-                    await self.message_queue_client.put_validate(ray.cloudpickle.dumps(data))
+                data = ValidateMetrics(timing_raw=timing_raw,
+                                       metrics=val_metrics,
+                                       global_steps=global_steps)
+                await self.message_queue_client.put_validate(ray.cloudpickle.dumps(data))
 
     def _validate_config(self):
         # Validate asynchronous training configuration
diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py
index 17415f496d7..cca95efab66 100644
--- a/recipe/fully_async_policy/fully_async_trainer.py
+++ b/recipe/fully_async_policy/fully_async_trainer.py
@@ -339,6 +339,7 @@ def _trigger_parameter_sync_after_step(self, validate: bool = False, global_step
 
         self.current_param_version += 1 
         self.local_trigger_step = 1
+        ray.get(self.param_synchronizer.wait_last_sync.remote())
         ray.get(self.param_synchronizer.sync_weights.remote(self.current_param_version, 
                                                             validate=validate,
                                                             global_steps=global_steps))
\ No newline at end of file
diff --git a/recipe/fully_async_policy/param_sync.py b/recipe/fully_async_policy/param_sync.py
index 4cf39e5355b..34fbca1c3e3 100644
--- a/recipe/fully_async_policy/param_sync.py
+++ b/recipe/fully_async_policy/param_sync.py
@@ -41,6 +41,7 @@ def __init__(self, config, trainer, rollouter, mq):
         self.weights_info = None
         self.sync_group_initialized = False
         self.sync_group_name = "actor_rollout"
+        self.wait_last = None
 
         # Statistics
         self.current_version = 0
@@ -90,9 +91,12 @@ def sync_weights(self, version, validate = False, global_steps = 0):
 
         # Async Update rollout version & validation
         self.rollouter.update_param_version.remote(version, validate, global_steps)
-        ray.get(self.rollouter.resume.remote())
-
-        print(f"[ParameterSynchronizer] Update rollout version & validation done. \
-              cost {time.time() - end_time:.2f} seconds")
-
+        self.wait_last = self.rollouter.resume.remote()
+
+    def wait_last_sync(self):
+        print(f"[ParameterSynchronizer] waiting last parameter sync and validate...")
+        start_time =  time.time()
+        if self.wait_last:
+            ray.get(self.wait_last)
+        print(f"[ParameterSynchronizer], cost: {time.time() - start_time:.2f} seconds")
 
diff --git a/tests/special_e2e/run_fully_async_policy.sh b/tests/special_e2e/run_fully_async_policy.sh
index 51691a8800f..142ee3e8806 100644
--- a/tests/special_e2e/run_fully_async_policy.sh
+++ b/tests/special_e2e/run_fully_async_policy.sh
@@ -57,7 +57,7 @@ gen_prompt_bsz=1
 n_resp_per_prompt=16
 train_prompt_mini_bsz=32
 total_rollout_steps=$(((128*2)))
-test_freq=2
+test_freq=10
 staleness_threshold=1
 trigger_parameter_sync_step=1
 partial_rollout=True

From cfa324919c4badbe580d6927e00fb66ac849c290 Mon Sep 17 00:00:00 2001
From: arron <arron@MBP-2G17FXQ05P-2332.local>
Date: Tue, 2 Sep 2025 22:14:36 +0800
Subject: [PATCH 093/182] TENSORBOARD_DIR

---
 .../dapo_7b_math_fsdp2_4_12.sh                | 171 ++++++++++++++++++
 recipe/fully_async_policy/runtime_env.yaml    |   2 +-
 2 files changed, 172 insertions(+), 1 deletion(-)
 create mode 100644 recipe/fully_async_policy/dapo_7b_math_fsdp2_4_12.sh

diff --git a/recipe/fully_async_policy/dapo_7b_math_fsdp2_4_12.sh b/recipe/fully_async_policy/dapo_7b_math_fsdp2_4_12.sh
new file mode 100644
index 00000000000..2b4bf9c31fe
--- /dev/null
+++ b/recipe/fully_async_policy/dapo_7b_math_fsdp2_4_12.sh
@@ -0,0 +1,171 @@
+#!/usr/bin/env bash
+set -xeuo pipefail
+
+project_name='DAPO'
+exp_name='DAPO-Qwen2.5-7b-MATH-0527a1-fsdp2-fully-async-8-8'
+
+# Ray
+# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
+# WORKING_DIR=${WORKING_DIR:-"${PWD}"}
+# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
+# Paths
+RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
+# very important! please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface
+MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen2.5-Math-7B"}
+CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"}
+TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"}
+TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"}
+
+MODEL_PATH=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/modelscope/hub/models/Qwen/Qwen2___5-Math-7B
+CKPTS_DIR=./ckpts/${project_name}/${exp_name}
+TRAIN_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/dapo-math-17k.parquet
+TEST_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/aime-2024.parquet
+
+rollout_mode="async"
+rollout_name="vllm" # sglang or vllm
+if [ "$rollout_mode" = "async" ]; then
+    export VLLM_USE_V1=1
+    return_raw_chat="True"
+fi
+
+# Algorithm parameters
+adv_estimator=grpo
+
+use_kl_in_reward=False
+kl_coef=0.0
+use_kl_loss=False
+kl_loss_coef=0.0
+
+clip_ratio_low=0.2
+clip_ratio_high=0.28
+
+# Response length parameters
+max_prompt_length=$((1024 * 2))
+max_response_length=$((1024 * 8))
+enable_overlong_buffer=True
+overlong_buffer_len=$((1024 * 4))
+overlong_penalty_factor=1.0
+
+# Training parameters
+loss_agg_mode="token-mean"
+
+# Algorithm
+temperature=1.0
+top_p=1.0
+top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
+val_top_p=0.7
+
+# Performance Related Parameter
+use_dynamic_bsz=True
+actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
+infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
+ref_offload=True
+actor_offload=False
+gen_tp=1
+sp_size=1
+fsdp_size=2
+
+# Fully async specific parameters
+NNODES=${NNODES:-2}
+NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
+
+n_gpus_rollout=2
+n_gpus_training=$((NGPUS_PER_NODE - n_gpus_rollout))
+
+train_prompt_bsz=0
+gen_prompt_bsz=1
+n_resp_per_prompt=16
+train_prompt_mini_bsz=64
+total_rollout_steps=$(((512*100)))
+test_freq=10
+staleness_threshold=1
+trigger_parameter_sync_step=16
+partial_rollout=True
+
+PYTHON_INTERPRETER="/home/hadoop-djst-algoplat/miniconda3/bin/python"
+if [ ! -x "$PYTHON_INTERPRETER" ]; then
+    PYTHON_INTERPRETER="python3"
+fi
+
+$PYTHON_INTERPRETER -m recipe.fully_async_policy.fully_async_main \
+    data.train_files="${TRAIN_FILE}" \
+    data.val_files="${TEST_FILE}" \
+    data.prompt_key=prompt \
+    data.truncation='left' \
+    data.max_prompt_length=${max_prompt_length} \
+    data.max_response_length=${max_response_length} \
+    data.train_batch_size=${train_prompt_bsz} \
+    data.gen_batch_size=${gen_prompt_bsz} \
+    data.return_raw_chat=${return_raw_chat} \
+    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
+    algorithm.adv_estimator=${adv_estimator} \
+    algorithm.use_kl_in_reward=${use_kl_in_reward} \
+    algorithm.kl_ctrl.kl_coef=${kl_coef} \
+    actor_rollout_ref.actor.strategy=fsdp2 \
+    critic.strategy=fsdp2 \
+    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
+    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
+    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
+    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
+    actor_rollout_ref.actor.clip_ratio_c=10.0 \
+    actor_rollout_ref.model.use_remove_padding=True \
+    actor_rollout_ref.hybrid_engine=False \
+    +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \
+    actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
+    actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+    actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+    actor_rollout_ref.model.path="${MODEL_PATH}" \
+    actor_rollout_ref.actor.optim.lr=1e-6 \
+    actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
+    actor_rollout_ref.actor.optim.weight_decay=0.1 \
+    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
+    actor_rollout_ref.actor.fsdp_config.param_offload=${actor_offload} \
+    actor_rollout_ref.actor.fsdp_config.optimizer_offload=${actor_offload} \
+    actor_rollout_ref.actor.entropy_coeff=0 \
+    actor_rollout_ref.actor.grad_clip=1.0 \
+    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
+    actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \
+    actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
+    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
+    actor_rollout_ref.rollout.enable_chunked_prefill=True \
+    actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
+    actor_rollout_ref.rollout.temperature=${temperature} \
+    actor_rollout_ref.rollout.top_p=${top_p} \
+    actor_rollout_ref.rollout.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
+    actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
+    actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
+    actor_rollout_ref.rollout.val_kwargs.n=1 \
+    actor_rollout_ref.ref.fsdp_config.param_offload=${ref_offload} \
+    actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \
+    actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \
+    actor_rollout_ref.rollout.name=${rollout_name} \
+    actor_rollout_ref.rollout.mode=${rollout_mode} \
+    actor_rollout_ref.rollout.calculate_log_probs=True \
+    reward_model.reward_manager=dapo \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
+    +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
+    trainer.logger=['console','tensorboard'] \
+    trainer.project_name="${project_name}" \
+    trainer.experiment_name="${exp_name}" \
+    trainer.val_before_train=True \
+    trainer.test_freq="${test_freq}" \
+    trainer.save_freq=-1 \
+    trainer.default_local_dir="${CKPTS_DIR}" \
+    trainer.resume_mode=auto \
+    trainer.nnodes="${NNODES}" \
+    trainer.n_gpus_per_node="${n_gpus_training}" \
+    rollout.nnodes="${NNODES}" \
+    rollout.n_gpus_per_node="${n_gpus_rollout}" \
+    rollout.total_rollout_steps="${total_rollout_steps}" \
+    rollout.total_epochs=10 \
+    async_training.staleness_threshold="${staleness_threshold}" \
+    async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \
+    async_training.partial_rollout="${partial_rollout}"
diff --git a/recipe/fully_async_policy/runtime_env.yaml b/recipe/fully_async_policy/runtime_env.yaml
index 5dcf269faa8..dcca08e67f7 100644
--- a/recipe/fully_async_policy/runtime_env.yaml
+++ b/recipe/fully_async_policy/runtime_env.yaml
@@ -1,5 +1,5 @@
 env_vars:
   VLLM_USE_V1: "1"
-  TENSORBOARD_DIR: "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/tensorboard"
+  TENSORBOARD_DIR: "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/tensorboard/dapo_7b_math_fsdp2_4_12"
   NCCL_DEBUG: "INFO"
   HYDRA_FULL_ERROR: "1"
\ No newline at end of file

From 54448199b22159f8823b3000d2517efb5241c369 Mon Sep 17 00:00:00 2001
From: wangshulin02 <953550366@qq.com>
Date: Wed, 3 Sep 2025 00:12:42 +0800
Subject: [PATCH 094/182] simple implementation of Metrics Aggregator

---
 recipe/fully_async_policy/detach_utils.py     | 216 +++++++++++++++++-
 .../fully_async_rollouter.py                  |   3 +-
 .../fully_async_policy/fully_async_trainer.py |  34 ++-
 recipe/fully_async_policy/runtime_env.yaml    |   5 +-
 4 files changed, 246 insertions(+), 12 deletions(-)

diff --git a/recipe/fully_async_policy/detach_utils.py b/recipe/fully_async_policy/detach_utils.py
index c28cbf9e631..48a41443612 100644
--- a/recipe/fully_async_policy/detach_utils.py
+++ b/recipe/fully_async_policy/detach_utils.py
@@ -13,7 +13,8 @@
 # limitations under the License.
 import time
 from dataclasses import dataclass
-from typing import Any, Optional
+from typing import Any, Optional, Dict, List
+from collections import defaultdict
 
 import numpy as np
 import torch
@@ -229,3 +230,216 @@ def assemble_batch_from_rollout_samples(
     print(f"[BatchUtils] Batch assembly completed in {time.time() - start_time:.2f}s")
 
     return final_batch
+
+class MetricsAggregator:
+    """Metrics aggregator, used to combine metrics from multiple training steps"""
+    
+    def __init__(self):
+        # Store all values ​​for each metric
+        self.metric_values: Dict[str, List[float]] = defaultdict(list)
+        # Store the number of samples at each step for weighted averaging
+        self.sample_counts: List[int] = []
+        # Store the timestamp of each step for time-related calculations
+        self.timestamps: List[float] = []
+        # Step Count
+        self.step_count = 0
+        
+        # Metric aggregation rule configuration
+        self.aggregation_rules = self._init_aggregation_rules()
+    
+    def _init_aggregation_rules(self) -> Dict[str, Dict[str, List[str]]]:
+        """Initialize metrics aggregation rules"""
+        return {
+            # # Cumulative metrics - take the last value
+            # 'last': [
+            #     'fully_async/stale_samples_processed',
+            #     'fully_async/current_param_version',
+            #     'global_steps',
+            #     'epoch',
+            # ],
+            
+            # # Weighted average metrics - weighted by sample size
+            # 'weighted_avg': [
+            #     'fully_async/stale_samples_ratio',
+            #     'policy_loss',
+            #     'value_loss',
+            #     'entropy_loss',
+            #     'kl_divergence',
+            #     'advantage_mean',
+            #     'advantage_std',
+            #     'learning_rate',
+            # ],
+            
+            # # Summation type metrics - direct accumulation
+            # 'sum': [
+            #     'fully_async/total_wait_time',
+            #     'processed_samples',
+            #     'total_tokens',
+            # ],
+            
+            # Average metrics - Simple Average
+            # 'avg': [
+            #     'perf/throughput',
+            #     'fully_async/avg_processing_time',
+            #     'fully_async/tp50_processing_time',
+            #     'fully_async/tp95_processing_time',
+            #     'fully_async/tp99_processing_time',
+            #     'grad_norm',
+            # ],
+            
+            # # Maximum value metrics
+            # 'max': [
+            #     'fully_async/max_processing_time',
+            #     'max_grad_norm',
+            #     'peak_memory_usage',
+            # ],
+            
+            # # Minimum value metrics
+            # 'min': [
+            #     'fully_async/min_processing_time',
+            #     'min_learning_rate',
+            # ],
+            
+            # Time-Based metrics - Special Treatment
+            'time_sum': [
+                'timing_s/adv',
+                'timing_s/gen',
+                'timing_s/old_log_prob',
+                'timing_s/reward',
+                'timing_s/step',
+                'timing_s/update_actor',
+            ],
+        }
+    
+    def add_step_metrics(self, metrics: Dict[str, Any], sample_count: int, timestamp: float = None):
+        """Adding a single-step metrics"""
+        if timestamp is None:
+            timestamp = time.time()
+            
+        self.sample_counts.append(sample_count)
+        self.timestamps.append(timestamp)
+        self.step_count += 1
+        
+        # Store all metrics values
+        for key, value in metrics.items():
+            if isinstance(value, (int, float, np.number)):
+                self.metric_values[key].append(float(value))
+            elif isinstance(value, torch.Tensor):
+                self.metric_values[key].append(float(value.item()))
+    
+    def _get_aggregation_type(self, metric_name: str) -> str:
+        """Determine the aggregation type based on the metric name"""
+        for agg_type, metric_list in self.aggregation_rules.items():
+            if metric_name in metric_list:
+                return agg_type
+        import warnings
+        warnings.warn(f"No aggregation rule is matched in init_aggregation_rules. \
+                      For metric {metric_name}, the 'last' method is used")
+        return 'last'
+
+        # raise ValueError(f"No aggregation rule is matched in init_aggregation_rules. \
+        #                 Metric name: {metric_name}")    # TODO: 删除
+
+        
+        # Aggregation rules based on naming patterns
+        if metric_name.startswith('time/'):
+            aggregation_type = 'time_sum'
+        elif metric_name.endswith('_ratio') or metric_name.endswith('_rate'):
+            aggregation_type = 'weighted_avg'
+        elif metric_name.endswith('_count') or metric_name.endswith('_total'):
+            aggregation_type = 'sum'
+        elif metric_name.startswith('max_') or metric_name.endswith('_max'):
+            aggregation_type = 'max'
+        elif metric_name.startswith('min_') or metric_name.endswith('_min'):
+            aggregation_type = 'min'
+        else:
+            # The default is weighted average.
+            aggregation_type = 'weighted_avg'
+        import warnings
+        warnings.simplefilter("always", DeprecationWarning)
+        warnings.warn("No aggregation rule is matched in init_aggregation_rules. \
+                      Aggregation rule is matched based on name prefix:", aggregation_type)
+        return aggregation_type
+    
+    def _aggregate_single_metric(self, metric_name: str, values: List[float]) -> float:
+        """Aggregating a single metric"""
+        if not values:
+            return 0.0
+            
+        agg_type = self._get_aggregation_type(metric_name)
+        
+        if agg_type == 'last':
+            return values[-1]
+        
+        elif agg_type == 'weighted_avg':
+            # Weighted average
+            if len(values) != len(self.sample_counts):
+                # If the lengths do not match, use a simple average
+                return sum(values) / len(values)
+            
+            total_samples = sum(self.sample_counts)
+            if total_samples == 0:
+                return sum(values) / len(values)
+            
+            weighted_sum = sum(v * c for v, c in zip(values, self.sample_counts))
+            return weighted_sum / total_samples
+        
+        elif agg_type == 'sum' or agg_type == 'time_sum':
+            return sum(values)
+        
+        elif agg_type == 'avg':
+            return sum(values) / len(values)
+        
+        elif agg_type == 'max':
+            return max(values)
+        
+        elif agg_type == 'min':
+            return min(values)
+        
+        else:
+            # Default average
+            return sum(values) / len(values)
+    
+    def get_aggregated_metrics(self) -> Dict[str, Any]:
+        """aggregated metrics"""
+        if self.step_count == 0:
+            return {}
+        
+        aggregated = {}
+        
+        # Aggregate all metrics
+        for metric_name, values in self.metric_values.items():
+            aggregated[metric_name] = self._aggregate_single_metric(metric_name, values)
+        
+        # # Adding aggregate statistics
+        # aggregated.update({
+        #     'aggregation/step_count': self.step_count,
+        #     'aggregation/total_samples': sum(self.sample_counts),
+        #     'aggregation/avg_samples_per_step': sum(self.sample_counts) / self.step_count,
+        #     'aggregation/time_span': self.timestamps[-1] - self.timestamps[0] if len(self.timestamps) > 1 else 0,
+        # })
+        
+        # # Add statistics on sample size
+        # if self.sample_counts:
+        #     aggregated.update({
+        #         'aggregation/min_samples_per_step': min(self.sample_counts),
+        #         'aggregation/max_samples_per_step': max(self.sample_counts),
+        #     })
+        
+        return aggregated
+    
+    def reset(self):
+        """Reset Aggregator"""
+        self.metric_values.clear()
+        self.sample_counts.clear()
+        self.timestamps.clear()
+        self.step_count = 0
+    
+    def get_current_stats(self) -> Dict[str, Any]:
+        """Get statistics about the current aggregation state (for debugging)"""
+        return {
+            'step_count': self.step_count,
+            'metric_count': len(self.metric_values),
+            'total_samples': sum(self.sample_counts),
+            'metric_names': list(self.metric_values.keys()),
+        }
diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py
index 0bc871b6d7f..bb7fcfa1889 100644
--- a/recipe/fully_async_policy/fully_async_rollouter.py
+++ b/recipe/fully_async_policy/fully_async_rollouter.py
@@ -210,7 +210,8 @@ async def update_param_version(self, version: int, validate: bool = False, globa
                     val_metrics: dict = self._validate()
                 data = ValidateMetrics(timing_raw=timing_raw,
                                        metrics=val_metrics,
-                                       global_steps=global_steps)
+                                       global_steps=global_steps,
+                                       param_version=version)
                 await self.message_queue_client.put_validate(ray.cloudpickle.dumps(data))
 
     def _validate_config(self):
diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py
index cca95efab66..4d800af70a9 100644
--- a/recipe/fully_async_policy/fully_async_trainer.py
+++ b/recipe/fully_async_policy/fully_async_trainer.py
@@ -24,6 +24,7 @@
 from recipe.fully_async_policy.detach_utils import (
     ValidateMetrics,
     assemble_batch_from_rollout_samples,
+    MetricsAggregator,
 )
 from recipe.fully_async_policy.message_queue import MessageQueueClient
 from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup
@@ -120,6 +121,7 @@ def __init__(
         self.required_samples = int(
             self.minimal_bsz * config.actor_rollout_ref.actor.ppo_mini_batch_size / config.actor_rollout_ref.rollout.n
         )
+        self.metrics_aggregator = MetricsAggregator()
 
     def set_message_queue_client(self, message_queue_client: MessageQueueClient):
         """Set message queue client"""
@@ -245,7 +247,7 @@ def fit(self):
 
         from verl.utils.tracking import Tracking
 
-        logger = Tracking(
+        self.logger = Tracking(
             project_name=self.config.trainer.project_name,
             experiment_name=self.config.trainer.experiment_name,
             default_backend=self.config.trainer.logger,
@@ -255,11 +257,13 @@ def fit(self):
         self.max_steps_duration = 0
 
         # get validate data before training
+        if self.config.trainer.val_before_train and self.reward_fn is not None:
+            ray.get(self.param_synchronizer.wait_last_sync.remote())
         val_data = self.message_queue_client.get_validate_sync()
         if val_data:
             val_data: ValidateMetrics = ray.cloudpickle.loads(val_data)
-            logger.log(data=val_data.metrics, step=val_data.global_steps)
-            logger.log(data=val_data.timing_raw, step=val_data.global_steps)
+            self.logger.log(data=val_data.metrics, step=val_data.param_version)
+            self.logger.log(data=val_data.timing_raw, step=val_data.param_version)
             pprint(f"[FullyAsyncTrainer] Initial validation metrics: {val_data.metrics}")
 
         # Use queue mode, no need for traditional dataloader iterator
@@ -296,7 +300,11 @@ def fit(self):
                 self._check_save_checkpoint(False, timing_raw)
 
             self._collect_metrics(batch, 0, metrics, timing_raw)
-            logger.log(data=metrics, step=self.global_steps)
+            self.metrics_aggregator.add_step_metrics(
+                metrics=metrics, 
+                sample_count=self.required_samples,
+                timestamp=time.time()
+            )
             # Trigger parameter synchronization after training step
             time_str = datetime.now().strftime("%H:%M:%S.%f")[:-3]
             print(
@@ -309,8 +317,10 @@ def fit(self):
             val_data = self.message_queue_client.get_validate_sync()
             if val_data:
                 val_data: ValidateMetrics = ray.cloudpickle.loads(val_data)
-                logger.log(data=val_data.metrics, step=val_data.global_steps)
-                logger.log(data=val_data.timing_raw, step=val_data.global_steps)
+                self.logger.log(data=val_data.metrics, step=val_data.param_version)
+                self.logger.log(data=val_data.timing_raw, step=val_data.param_version)
+                pprint(f"[FullyAsyncTrainer] parameter version: {val_data.param_version} \
+                      Validation metrics: {val_data.metrics}")
             self.global_steps += 1
 
         # final parameter sync and validate
@@ -319,8 +329,8 @@ def fit(self):
             val_data = self.message_queue_client.get_validate_sync()
             if val_data:
                 val_data: ValidateMetrics = ray.cloudpickle.loads(val_data)
-                logger.log(data=val_data.metrics, step=val_data.global_steps)
-                logger.log(data=val_data.timing_raw, step=val_data.global_steps)     
+                self.logger.log(data=val_data.metrics, step=val_data.param_version)
+                self.logger.log(data=val_data.timing_raw, step=val_data.param_version)     
         pprint(f"[FullyAsyncTrainer] Final validation metrics: {val_data.metrics}")
 
         self._check_save_checkpoint(True, timing_raw) # TODO: 检查checkpoint
@@ -339,7 +349,13 @@ def _trigger_parameter_sync_after_step(self, validate: bool = False, global_step
 
         self.current_param_version += 1 
         self.local_trigger_step = 1
+        self.logger.log(
+            data=self.metrics_aggregator.get_aggregated_metrics(),
+            step=self.current_param_version,
+            )
+        self.metrics_aggregator.reset()
         ray.get(self.param_synchronizer.wait_last_sync.remote())
         ray.get(self.param_synchronizer.sync_weights.remote(self.current_param_version, 
                                                             validate=validate,
-                                                            global_steps=global_steps))
\ No newline at end of file
+                                                            global_steps=global_steps)
+                                                            )
\ No newline at end of file
diff --git a/recipe/fully_async_policy/runtime_env.yaml b/recipe/fully_async_policy/runtime_env.yaml
index 81c7c9f4265..dcca08e67f7 100644
--- a/recipe/fully_async_policy/runtime_env.yaml
+++ b/recipe/fully_async_policy/runtime_env.yaml
@@ -1,2 +1,5 @@
 env_vars:
-  VLLM_USE_V1: "1"
\ No newline at end of file
+  VLLM_USE_V1: "1"
+  TENSORBOARD_DIR: "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/tensorboard/dapo_7b_math_fsdp2_4_12"
+  NCCL_DEBUG: "INFO"
+  HYDRA_FULL_ERROR: "1"
\ No newline at end of file

From 09b0e135d35454ac7a126b225a8c0adcccedc484 Mon Sep 17 00:00:00 2001
From: wangshulin02 <953550366@qq.com>
Date: Wed, 3 Sep 2025 11:06:13 +0800
Subject: [PATCH 095/182] Merge branch 'recipe/async_policy' into
 recipe/fully_async_fix_0

---
 .../dapo_7b_math_fsdp2_4_12.sh                | 171 ++++++++++++++++++
 .../dapo_7b_math_fsdp2_8_8.sh                 |   3 +-
 .../one_step_off_policy/megatron_workers.py   |   2 +-
 3 files changed, 174 insertions(+), 2 deletions(-)
 create mode 100644 recipe/fully_async_policy/dapo_7b_math_fsdp2_4_12.sh

diff --git a/recipe/fully_async_policy/dapo_7b_math_fsdp2_4_12.sh b/recipe/fully_async_policy/dapo_7b_math_fsdp2_4_12.sh
new file mode 100644
index 00000000000..2b4bf9c31fe
--- /dev/null
+++ b/recipe/fully_async_policy/dapo_7b_math_fsdp2_4_12.sh
@@ -0,0 +1,171 @@
+#!/usr/bin/env bash
+set -xeuo pipefail
+
+project_name='DAPO'
+exp_name='DAPO-Qwen2.5-7b-MATH-0527a1-fsdp2-fully-async-8-8'
+
+# Ray
+# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
+# WORKING_DIR=${WORKING_DIR:-"${PWD}"}
+# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
+# Paths
+RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
+# very important! please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface
+MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen2.5-Math-7B"}
+CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"}
+TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"}
+TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"}
+
+MODEL_PATH=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/modelscope/hub/models/Qwen/Qwen2___5-Math-7B
+CKPTS_DIR=./ckpts/${project_name}/${exp_name}
+TRAIN_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/dapo-math-17k.parquet
+TEST_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/aime-2024.parquet
+
+rollout_mode="async"
+rollout_name="vllm" # sglang or vllm
+if [ "$rollout_mode" = "async" ]; then
+    export VLLM_USE_V1=1
+    return_raw_chat="True"
+fi
+
+# Algorithm parameters
+adv_estimator=grpo
+
+use_kl_in_reward=False
+kl_coef=0.0
+use_kl_loss=False
+kl_loss_coef=0.0
+
+clip_ratio_low=0.2
+clip_ratio_high=0.28
+
+# Response length parameters
+max_prompt_length=$((1024 * 2))
+max_response_length=$((1024 * 8))
+enable_overlong_buffer=True
+overlong_buffer_len=$((1024 * 4))
+overlong_penalty_factor=1.0
+
+# Training parameters
+loss_agg_mode="token-mean"
+
+# Algorithm
+temperature=1.0
+top_p=1.0
+top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
+val_top_p=0.7
+
+# Performance Related Parameter
+use_dynamic_bsz=True
+actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
+infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
+ref_offload=True
+actor_offload=False
+gen_tp=1
+sp_size=1
+fsdp_size=2
+
+# Fully async specific parameters
+NNODES=${NNODES:-2}
+NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
+
+n_gpus_rollout=2
+n_gpus_training=$((NGPUS_PER_NODE - n_gpus_rollout))
+
+train_prompt_bsz=0
+gen_prompt_bsz=1
+n_resp_per_prompt=16
+train_prompt_mini_bsz=64
+total_rollout_steps=$(((512*100)))
+test_freq=10
+staleness_threshold=1
+trigger_parameter_sync_step=16
+partial_rollout=True
+
+PYTHON_INTERPRETER="/home/hadoop-djst-algoplat/miniconda3/bin/python"
+if [ ! -x "$PYTHON_INTERPRETER" ]; then
+    PYTHON_INTERPRETER="python3"
+fi
+
+$PYTHON_INTERPRETER -m recipe.fully_async_policy.fully_async_main \
+    data.train_files="${TRAIN_FILE}" \
+    data.val_files="${TEST_FILE}" \
+    data.prompt_key=prompt \
+    data.truncation='left' \
+    data.max_prompt_length=${max_prompt_length} \
+    data.max_response_length=${max_response_length} \
+    data.train_batch_size=${train_prompt_bsz} \
+    data.gen_batch_size=${gen_prompt_bsz} \
+    data.return_raw_chat=${return_raw_chat} \
+    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
+    algorithm.adv_estimator=${adv_estimator} \
+    algorithm.use_kl_in_reward=${use_kl_in_reward} \
+    algorithm.kl_ctrl.kl_coef=${kl_coef} \
+    actor_rollout_ref.actor.strategy=fsdp2 \
+    critic.strategy=fsdp2 \
+    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
+    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
+    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
+    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
+    actor_rollout_ref.actor.clip_ratio_c=10.0 \
+    actor_rollout_ref.model.use_remove_padding=True \
+    actor_rollout_ref.hybrid_engine=False \
+    +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \
+    actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
+    actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+    actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+    actor_rollout_ref.model.path="${MODEL_PATH}" \
+    actor_rollout_ref.actor.optim.lr=1e-6 \
+    actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
+    actor_rollout_ref.actor.optim.weight_decay=0.1 \
+    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
+    actor_rollout_ref.actor.fsdp_config.param_offload=${actor_offload} \
+    actor_rollout_ref.actor.fsdp_config.optimizer_offload=${actor_offload} \
+    actor_rollout_ref.actor.entropy_coeff=0 \
+    actor_rollout_ref.actor.grad_clip=1.0 \
+    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
+    actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \
+    actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
+    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
+    actor_rollout_ref.rollout.enable_chunked_prefill=True \
+    actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
+    actor_rollout_ref.rollout.temperature=${temperature} \
+    actor_rollout_ref.rollout.top_p=${top_p} \
+    actor_rollout_ref.rollout.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
+    actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
+    actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
+    actor_rollout_ref.rollout.val_kwargs.n=1 \
+    actor_rollout_ref.ref.fsdp_config.param_offload=${ref_offload} \
+    actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \
+    actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \
+    actor_rollout_ref.rollout.name=${rollout_name} \
+    actor_rollout_ref.rollout.mode=${rollout_mode} \
+    actor_rollout_ref.rollout.calculate_log_probs=True \
+    reward_model.reward_manager=dapo \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
+    +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
+    trainer.logger=['console','tensorboard'] \
+    trainer.project_name="${project_name}" \
+    trainer.experiment_name="${exp_name}" \
+    trainer.val_before_train=True \
+    trainer.test_freq="${test_freq}" \
+    trainer.save_freq=-1 \
+    trainer.default_local_dir="${CKPTS_DIR}" \
+    trainer.resume_mode=auto \
+    trainer.nnodes="${NNODES}" \
+    trainer.n_gpus_per_node="${n_gpus_training}" \
+    rollout.nnodes="${NNODES}" \
+    rollout.n_gpus_per_node="${n_gpus_rollout}" \
+    rollout.total_rollout_steps="${total_rollout_steps}" \
+    rollout.total_epochs=10 \
+    async_training.staleness_threshold="${staleness_threshold}" \
+    async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \
+    async_training.partial_rollout="${partial_rollout}"
diff --git a/recipe/fully_async_policy/dapo_7b_math_fsdp2_8_8.sh b/recipe/fully_async_policy/dapo_7b_math_fsdp2_8_8.sh
index c65080ba548..688a87fab92 100644
--- a/recipe/fully_async_policy/dapo_7b_math_fsdp2_8_8.sh
+++ b/recipe/fully_async_policy/dapo_7b_math_fsdp2_8_8.sh
@@ -75,7 +75,7 @@ n_gpus_training=8
 train_prompt_bsz=0
 gen_prompt_bsz=1
 n_resp_per_prompt=16
-train_prompt_mini_bsz=4
+train_prompt_mini_bsz=64
 total_rollout_steps=$(((512*100)))
 test_freq=10
 staleness_threshold=1
@@ -145,6 +145,7 @@ $PYTHON_INTERPRETER -m recipe.fully_async_policy.fully_async_main \
     actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \
     actor_rollout_ref.rollout.name=${rollout_name} \
     actor_rollout_ref.rollout.mode=${rollout_mode} \
+    actor_rollout_ref.rollout.calculate_log_probs=True \
     reward_model.reward_manager=dapo \
     +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
     +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
diff --git a/recipe/one_step_off_policy/megatron_workers.py b/recipe/one_step_off_policy/megatron_workers.py
index 5b338c5be42..a9318b8f7b3 100644
--- a/recipe/one_step_off_policy/megatron_workers.py
+++ b/recipe/one_step_off_policy/megatron_workers.py
@@ -168,7 +168,7 @@ def init_model(self):
         )
         log_gpu_memory_usage("After building vllm rollout", logger=logger)
 
-        from sharding_manager import DetachShardingManager
+        from .detach_sharding_manager import DetachShardingManager
 
         rollout_sharding_manager = DetachShardingManager(
             inference_engine=rollout.inference_engine, device_mesh=rollout_device_mesh

From d393d5c0daf6e5a489d84127a75a6aeb73872e7d Mon Sep 17 00:00:00 2001
From: wangshulin02 <953550366@qq.com>
Date: Wed, 3 Sep 2025 14:10:03 +0800
Subject: [PATCH 096/182] fix final param_sync wait

---
 recipe/fully_async_policy/fully_async_trainer.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py
index 4d800af70a9..9276d148b66 100644
--- a/recipe/fully_async_policy/fully_async_trainer.py
+++ b/recipe/fully_async_policy/fully_async_trainer.py
@@ -326,6 +326,7 @@ def fit(self):
         # final parameter sync and validate
         if val_data is None:
             self._trigger_parameter_sync_after_step(validate=True, global_steps=self.global_steps-1)
+            ray.get(self.param_synchronizer.wait_last_sync.remote())
             val_data = self.message_queue_client.get_validate_sync()
             if val_data:
                 val_data: ValidateMetrics = ray.cloudpickle.loads(val_data)

From 362c3f95d280922cc4459ecf8941c54e3eb8a5fa Mon Sep 17 00:00:00 2001
From: wangshulin02 <953550366@qq.com>
Date: Wed, 3 Sep 2025 15:19:19 +0800
Subject: [PATCH 097/182] free kv cache by calling sleep&wake_up

---
 recipe/fully_async_policy/fully_async_rollouter.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py
index bb7fcfa1889..f3a25c2c30c 100644
--- a/recipe/fully_async_policy/fully_async_rollouter.py
+++ b/recipe/fully_async_policy/fully_async_rollouter.py
@@ -593,6 +593,8 @@ async def pause(self):
                 await asyncio.gather(*self.active_tasks, return_exceptions=True)
                 self.active_tasks.clear()
                 print("[FullyAsyncRollouter][Public][Pause] All active tasks completed")
+            self.async_rollout_manager.sleep()
+            self.async_rollout_manager.wake_up()
             self.monitor_loop_trigger = False
 
     async def resume(self):

From 53bfad2c7a995169e7532e3270dd148615dac464 Mon Sep 17 00:00:00 2001
From: hadoop-ai-search <hadoop-ai-search@set-zw04-mlp-codelab-pc1189.mt>
Date: Fri, 5 Sep 2025 15:39:15 +0800
Subject: [PATCH 098/182] reset one step

---
 .../detach_sharding_manager.py                |   0
 recipe/fully_async_policy/fsdp_workers.py     | 268 ++++++++++++
 recipe/fully_async_policy/fully_async_main.py |   4 +-
 recipe/fully_async_policy/megatron_workers.py | 200 +++++++++
 recipe/one_step_off_policy/fsdp_workers.py    |  84 +---
 recipe/one_step_off_policy/main_ppo.py        |  91 ++--
 .../one_step_off_policy/megatron_workers.py   |  89 ++--
 recipe/one_step_off_policy/ray_trainer.py     | 387 +++++++++++++++---
 .../vllm_sharding_manager.py                  |  74 ++++
 tests/special_e2e/run_fully_async_policy.sh   |   2 +-
 10 files changed, 1019 insertions(+), 180 deletions(-)
 rename recipe/{one_step_off_policy => fully_async_policy}/detach_sharding_manager.py (100%)
 create mode 100644 recipe/fully_async_policy/fsdp_workers.py
 create mode 100644 recipe/fully_async_policy/megatron_workers.py
 create mode 100644 recipe/one_step_off_policy/vllm_sharding_manager.py

diff --git a/recipe/one_step_off_policy/detach_sharding_manager.py b/recipe/fully_async_policy/detach_sharding_manager.py
similarity index 100%
rename from recipe/one_step_off_policy/detach_sharding_manager.py
rename to recipe/fully_async_policy/detach_sharding_manager.py
diff --git a/recipe/fully_async_policy/fsdp_workers.py b/recipe/fully_async_policy/fsdp_workers.py
new file mode 100644
index 00000000000..086f109e434
--- /dev/null
+++ b/recipe/fully_async_policy/fsdp_workers.py
@@ -0,0 +1,268 @@
+# Copyright 2025 Bytedance Ltd. and/or its affiliates
+# Copyright 2025 Meituan Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import os
+
+import torch
+import torch.distributed
+from omegaconf import DictConfig, OmegaConf
+from torch.distributed.device_mesh import init_device_mesh
+from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
+from transformers import AutoConfig
+
+from verl.single_controller.base import Worker
+from verl.single_controller.base.decorator import Dispatch, register
+from verl.utils import hf_processor, hf_tokenizer, omega_conf_to_dataclass
+from verl.utils.debug import DistProfiler, DistProfilerExtension, log_gpu_memory_usage
+from verl.utils.device import (
+    get_device_name,
+    get_nccl_backend,
+    get_torch_device,
+)
+from verl.utils.fs import copy_to_local
+from verl.utils.fsdp_utils import (
+    fsdp_version,
+)
+from verl.utils.import_utils import import_external_libs
+from verl.utils.model import get_generation_config, update_model_config
+from verl.utils.vllm_utils import patch_vllm_moe_model_weight_loader
+from verl.workers.fsdp_workers import ActorRolloutRefWorker, AsyncActorRolloutRefWorker, CriticWorker
+
+logger = logging.getLogger(__file__)
+logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN"))
+
+device_name = get_device_name()
+
+__all__ = ["DetachActorWorker", "DetachRolloutWorker", "DetachAsyncRolloutWorker", "CriticWorker"]
+
+
+def get_inference_model(rollout):
+    """
+    根据不同类型的inference_engine获取模型对象
+    Args:
+        rollout: rollout对象，包含inference_engine
+    Returns:
+        model: 模型对象
+    """
+    inference_engine = rollout.inference_engine
+    # 判断inference_engine的类型
+    if hasattr(inference_engine, "llm_engine"):
+        # LLM类型 - vLLMRollout
+        inference_model = inference_engine.llm_engine.model_executor.driver_worker.worker.model_runner.model
+    elif hasattr(inference_engine, "worker"):
+        # WorkerWrapperBase类型 - vLLMAsyncRollout
+        inference_model = inference_engine.worker.model_runner.model
+    else:
+        raise AttributeError(
+            f"Unsupported inference_engine type: {type(inference_engine)}. "
+            f"Expected LLM (with llm_engine attribute) or WorkerWrapperBase (with worker attribute)."
+        )
+    return inference_model
+
+
+class DetachNcclSync(ActorRolloutRefWorker):
+    def _get_actor_params(self):
+        pass
+
+    @register(dispatch_mode=Dispatch.ONE_TO_ALL, blocking=False)
+    def sync_rollout_weights(self):
+        assert (self._is_actor or self._is_rollout) and not self.config.hybrid_engine
+        assert hasattr(self, "_weights_info") and self._weights_info is not None
+
+        params = self._get_actor_params() if self._is_actor else None
+        if self._is_rollout:
+            inference_model = get_inference_model(self.rollout)
+            patch_vllm_moe_model_weight_loader(inference_model)
+        for key, shape, dtype in self._weights_info:
+            tensor = torch.empty(shape, dtype=dtype, device=get_torch_device().current_device())
+            if self._is_actor:
+                assert key in params
+                origin_data = params[key]
+                if hasattr(origin_data, "full_tensor"):
+                    origin_data = origin_data.full_tensor()
+                if torch.distributed.get_rank() == 0:
+                    tensor.copy_(origin_data)
+            from ray.util.collective import collective
+
+            collective.broadcast(tensor, src_rank=0, group_name="actor_rollout")
+            if self._is_rollout:
+                inference_model.load_weights([(key, tensor)])
+
+    @register(dispatch_mode=Dispatch.ONE_TO_ALL)
+    def get_actor_weights_info(self):
+        assert self._is_actor
+        if hasattr(self, "_weights_info"):
+            return self._weights_info
+        if fsdp_version(self.actor_module_fsdp) == 1:
+            from torch.distributed.fsdp.api import ShardedStateDictConfig, StateDictType
+
+            FSDP.set_state_dict_type(
+                self.actor_module_fsdp,
+                state_dict_type=StateDictType.SHARDED_STATE_DICT,
+                state_dict_config=ShardedStateDictConfig(),
+            )
+        params = self._get_actor_params()
+        ret = []
+        for key, tensor in params.items():
+            ret.append((key, tensor.size(), tensor.dtype))
+        self._weights_info = ret
+        return ret
+
+
+class DetachActorWorker(DetachNcclSync):
+    def _get_actor_params(self):
+        assert self._is_actor
+        params = self.actor_module_fsdp.state_dict()
+        from verl.utils.model import convert_weight_keys
+
+        params = convert_weight_keys(
+            params, getattr(self.actor_module_fsdp, "_fsdp_wrapped_module", self.actor_module_fsdp)
+        )
+        return params
+
+
+class DetachRolloutWorker(DetachNcclSync):
+    def __init__(self, config: DictConfig, role: str):
+        Worker.__init__(self)
+        assert role == "rollout"
+        self.config = config
+        import torch.distributed
+
+        if not torch.distributed.is_initialized():
+            rank = int(os.environ.get("RANK", 0))
+            world_size = int(os.environ.get("WORLD_SIZE", 1))
+            torch.distributed.init_process_group(
+                backend=f"cpu:gloo,{get_device_name()}:{get_nccl_backend()}",
+                rank=rank,
+                world_size=world_size,
+                init_method=os.environ.get("DIST_INIT_METHOD", None),
+            )
+        # TODO(haibin.lin):
+        # As of now the type of config is DictConfig, if we assign config.profiler with ProfilerConfig,
+        # it will actually convert the ProfilerConfig dataclass back to a DictConfig.
+        # We can still use ProfilerConfig for testing purpose (tests/utils/test_nvtx_profile.py)
+        # as they provides DictConfig-like interface
+        # The benefit of creating the dataclass config is to perform validation during __post_init__
+        profiler_config = omega_conf_to_dataclass(config.rollout.get("profiler", {}))
+        DistProfilerExtension.__init__(self, DistProfiler(rank=self.rank, config=profiler_config))
+        self._is_rollout = True
+        self._is_actor = False
+
+    @register(dispatch_mode=Dispatch.ONE_TO_ALL)
+    def init_model(self):
+        # This is used to import external_lib into the huggingface systems
+        import_external_libs(self.config.model.get("external_lib", None))
+        override_model_config = OmegaConf.to_container(OmegaConf.create(self.config.model.get("override_config", {})))
+
+        use_shm = self.config.model.get("use_shm", False)
+        local_path = copy_to_local(self.config.model.path, use_shm=use_shm)
+        trust_remote_code = self.config.model.get("trust_remote_code", False)
+
+        self.tokenizer = hf_tokenizer(local_path, trust_remote_code=trust_remote_code)
+        self.processor = hf_processor(local_path, trust_remote_code=trust_remote_code)
+
+        if self.config.model.get("custom_chat_template", None) is not None:
+            if self.processor is not None:
+                self.processor.chat_template = self.config.model.custom_chat_template
+            else:
+                self.tokenizer.chat_template = self.config.model.custom_chat_template
+
+        # override model kwargs
+        actor_model_config = AutoConfig.from_pretrained(
+            local_path, trust_remote_code=trust_remote_code, attn_implementation="flash_attention_2"
+        )
+
+        # patch for kimi-vl
+        if getattr(actor_model_config, "model_type", None) == "kimi_vl":
+            actor_model_config.text_config.topk_method = "greedy"
+
+        self.generation_config = get_generation_config(local_path, trust_remote_code=trust_remote_code)
+
+        override_config_kwargs = {
+            "bos_token_id": self.tokenizer.bos_token_id,
+            "eos_token_id": self.tokenizer.eos_token_id,
+            "pad_token_id": self.tokenizer.pad_token_id,
+        }
+        override_config_kwargs.update(override_model_config)
+        update_model_config(actor_model_config, override_config_kwargs=override_config_kwargs)
+        if self.rank == 0:
+            print(f"Model config after override: {actor_model_config}")
+
+        infer_tp = self.config.rollout.tensor_model_parallel_size
+        dp = self.world_size // infer_tp
+        assert self.world_size % infer_tp == 0, (
+            f"rollout world_size: {self.world_size} is not divisible by infer_tp: {infer_tp}"
+        )
+        rollout_device_mesh = init_device_mesh(
+            device_name, mesh_shape=(dp, infer_tp), mesh_dim_names=["dp", "infer_tp"]
+        )
+        rollout_name = self.config.rollout.name
+        assert rollout_name == "vllm"
+
+        from verl.workers.rollout.vllm_rollout import vLLMRollout
+
+        log_gpu_memory_usage(f"Before building {rollout_name} rollout", logger=logger)
+
+        from verl.workers.rollout.vllm_rollout import vLLMAsyncRollout
+
+        vllm_rollout_cls = vLLMRollout if self.config.rollout.mode == "sync" else vLLMAsyncRollout
+        rollout = vllm_rollout_cls(
+            model_path=local_path,
+            config=self.config.rollout,
+            tokenizer=self.tokenizer,
+            model_hf_config=actor_model_config,
+            device_mesh=rollout_device_mesh,
+            trust_remote_code=trust_remote_code,
+        )
+        log_gpu_memory_usage(f"After building {rollout_name} rollout", logger=logger)
+
+        from .detach_sharding_manager import DetachShardingManager
+
+        sharding_manager = DetachShardingManager(
+            inference_engine=rollout.inference_engine, device_mesh=rollout_device_mesh
+        )
+
+        log_gpu_memory_usage("After building sharding manager", logger=logger)
+
+        self.rollout = rollout
+        self.rollout_sharding_manager = sharding_manager
+
+    @register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO, blocking=False)
+    def async_generate_sequences(self, *args, **kwargs):
+        return super().generate_sequences(*args, **kwargs)
+
+    @register(dispatch_mode=Dispatch.ONE_TO_ALL)
+    def set_actor_weights_info(self, weights_info):
+        assert self._is_rollout
+        self._weights_info = weights_info
+
+
+class DetachAsyncRolloutWorker(AsyncActorRolloutRefWorker, DetachRolloutWorker):
+    def __init__(self, config: DictConfig, role: str):
+        print(f"[DetachAsyncRolloutWorker] {DetachAsyncRolloutWorker.__mro__}")
+        DetachRolloutWorker.__init__(self, config, role)
+
+    @register(dispatch_mode=Dispatch.ONE_TO_ALL)
+    def init_model(self):
+        print("[DetachAsyncRolloutWorker] init_model")
+        DetachRolloutWorker.init_model(self)
+
+        self.vllm_tp_size = self.config.rollout.tensor_model_parallel_size
+        self.vllm_dp_rank = int(os.environ["RANK"]) // self.vllm_tp_size
+        self.vllm_tp_rank = int(os.environ["RANK"]) % self.vllm_tp_size
+
+        # used for sleep/wake_up
+        self.rollout.sharding_manager = self.rollout_sharding_manager
diff --git a/recipe/fully_async_policy/fully_async_main.py b/recipe/fully_async_policy/fully_async_main.py
index 09961c85391..78fc1784b82 100644
--- a/recipe/fully_async_policy/fully_async_main.py
+++ b/recipe/fully_async_policy/fully_async_main.py
@@ -81,7 +81,7 @@ def create_role_worker_mapping(config):
     # Select worker class based on strategy
     if config.actor_rollout_ref.actor.strategy == "fsdp2":
         assert config.actor_rollout_ref.actor.strategy == config.critic.strategy
-        from recipe.one_step_off_policy.fsdp_workers import (
+        from recipe.fully_async_policy.fsdp_workers import (
             CriticWorker,
             DetachActorWorker,
             DetachAsyncRolloutWorker,
@@ -92,7 +92,7 @@ def create_role_worker_mapping(config):
 
     elif config.actor_rollout_ref.actor.strategy == "megatron":
         assert config.actor_rollout_ref.actor.strategy == config.critic.strategy
-        from recipe.one_step_off_policy.megatron_workers import (
+        from recipe.fully_async_policy.megatron_workers import (
             CriticWorker,
             DetachActorWorker,
             DetachAsyncRolloutWorker,
diff --git a/recipe/fully_async_policy/megatron_workers.py b/recipe/fully_async_policy/megatron_workers.py
new file mode 100644
index 00000000000..a9318b8f7b3
--- /dev/null
+++ b/recipe/fully_async_policy/megatron_workers.py
@@ -0,0 +1,200 @@
+# Copyright 2025 Bytedance Ltd. and/or its affiliates
+# Copyright 2025 Meituan Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import os
+
+import torch
+import torch.distributed
+from omegaconf import DictConfig, OmegaConf
+
+from verl.single_controller.base.decorator import Dispatch, register
+from verl.utils.debug import (
+    log_gpu_memory_usage,
+)
+from verl.utils.device import get_device_name, get_torch_device
+from verl.utils.fs import copy_to_local
+from verl.utils.vllm_utils import patch_vllm_moe_model_weight_loader
+from verl.workers.megatron_workers import (
+    ActorRolloutRefWorker,
+    AsyncActorRolloutRefWorker,
+    CriticWorker,
+)
+
+logger = logging.getLogger(__file__)
+logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN"))
+
+__all__ = ["DetachActorWorker", "DetachRolloutWorker", "DetachAsyncRolloutWorker", "CriticWorker"]
+
+
+class DetachNcclSync(ActorRolloutRefWorker):
+    def _get_actor_params_generator(self):
+        pass
+
+    @register(dispatch_mode=Dispatch.ONE_TO_ALL, blocking=False)
+    def sync_rollout_weights(self):
+        assert (self._is_actor or self._is_rollout) and not self.config.hybrid_engine
+        assert hasattr(self, "_weights_info") and self._weights_info is not None
+
+        params_generator = self._get_actor_params_generator() if self._is_actor else None
+        if self._is_rollout:
+            inference_model = (
+                self.rollout.inference_engine.llm_engine.model_executor.driver_worker.worker.model_runner.model
+            )
+            patch_vllm_moe_model_weight_loader(inference_model)
+        for key, shape, dtype in self._weights_info:
+            if self._is_actor:
+                weight_key, weight = next(params_generator)
+                assert key == weight_key
+                assert shape == weight.size()
+                assert dtype == weight.dtype
+
+            tensor = torch.empty(shape, dtype=dtype, device=get_torch_device().current_device())
+            if self._is_actor and torch.distributed.get_rank() == 0:
+                tensor.copy_(weight)
+            from ray.util.collective import collective
+
+            collective.broadcast(tensor, src_rank=0, group_name="actor_rollout")
+            if self._is_rollout:
+                inference_model.load_weights([(key, tensor)])
+
+    @register(dispatch_mode=Dispatch.ONE_TO_ALL)
+    def get_actor_weights_info(self):
+        assert self._is_actor
+        if hasattr(self, "_weights_info"):
+            return self._weights_info
+
+        params_generator = self._get_actor_params_generator()
+        ret = []
+        for key, tensor in params_generator:
+            ret.append((key, tensor.size(), tensor.dtype))
+
+        self._weights_info = ret
+        return ret
+
+
+class DetachActorWorker(DetachNcclSync):
+    def _get_actor_params_generator(self):
+        assert self._is_actor
+        from verl.models.mcore import get_mcore_weight_converter
+        from verl.utils.megatron_utils import per_tensor_generator
+
+        layer_name_mapping = {
+            "qkv_layer_name": "self_attention.linear_qkv.",
+            "gate_proj_layer_name": "linear_fc1.",
+        }
+        weight_converter = get_mcore_weight_converter(self.actor_model_config, self.dtype)
+        generator = per_tensor_generator(
+            self.actor.actor_module,
+            self.actor_model_config,
+            weight_converter,
+            self.tf_config,
+            layer_name_mapping,
+        )
+        return generator
+
+
+class DetachRolloutWorker(DetachNcclSync):
+    @register(dispatch_mode=Dispatch.ONE_TO_ALL)
+    def init_model(self):
+        if self.config.model.get("external_lib", None) is not None:
+            # This is used to import external_lib into the huggingface systems
+            import importlib
+
+            importlib.import_module(self.config.model.external_lib)
+
+        from verl.utils.torch_dtypes import PrecisionType
+
+        override_model_config = OmegaConf.to_container(OmegaConf.create(self.config.model.get("override_config", {})))
+        override_transformer_config = {}
+        self.param_dtype = torch.bfloat16
+        self.dtype = PrecisionType.to_dtype(self.param_dtype)
+        trust_remote_code = self.config.model.get("trust_remote_code", False)
+
+        from verl.utils.model import get_generation_config
+
+        self._init_hf_config_and_tf_config(
+            self.config.model.path,
+            self.config.model.path,
+            self.dtype,
+            override_model_config,
+            override_transformer_config,
+            trust_remote_code,
+        )
+        self.generation_config = get_generation_config(self.local_path)
+
+        from torch.distributed.device_mesh import init_device_mesh
+
+        assert self.config.rollout.name == "vllm"
+
+        from verl.workers.rollout.vllm_rollout import vLLMRollout
+
+        # NOTE(sgm): If the QKV and gate_up projection layer are concate together in actor,
+        # we will reorganize their weight format when resharding from actor to rollout.
+
+        infer_tp = self.config.rollout.tensor_model_parallel_size
+        dp = self.world_size // infer_tp
+        assert self.world_size % infer_tp == 0, (
+            f"rollout world_size: {self.world_size} is not divisible by infer_tp: {infer_tp}"
+        )
+        rollout_device_mesh = init_device_mesh(
+            get_device_name(), mesh_shape=(dp, infer_tp), mesh_dim_names=["dp", "infer_tp"]
+        )
+        log_gpu_memory_usage("Before building vllm rollout", logger=None)
+
+        local_path = copy_to_local(self.config.model.path, use_shm=self.config.model.get("use_shm", False))
+        from verl.workers.rollout.vllm_rollout import vLLMAsyncRollout
+
+        vllm_rollout_cls = vLLMRollout if self.config.rollout.mode == "sync" else vLLMAsyncRollout
+        rollout = vllm_rollout_cls(
+            model_path=local_path,
+            config=self.config.rollout,
+            tokenizer=self.tokenizer,
+            model_hf_config=self.hf_config,
+            device_mesh=rollout_device_mesh,
+            trust_remote_code=trust_remote_code,
+        )
+        log_gpu_memory_usage("After building vllm rollout", logger=logger)
+
+        from .detach_sharding_manager import DetachShardingManager
+
+        rollout_sharding_manager = DetachShardingManager(
+            inference_engine=rollout.inference_engine, device_mesh=rollout_device_mesh
+        )
+
+        log_gpu_memory_usage("After building sharding manager", logger=logger)
+
+        self.rollout = rollout
+        self.sharding_manager = rollout_sharding_manager
+        self.rollout.sharding_manager = rollout_sharding_manager
+
+    @register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO, blocking=False)
+    def async_generate_sequences(self, *args, **kwargs):
+        return super().generate_sequences(*args, **kwargs)
+
+    @register(dispatch_mode=Dispatch.ONE_TO_ALL)
+    def set_actor_weights_info(self, weights_info):
+        assert self._is_rollout
+        self._weights_info = weights_info
+
+
+class DetachAsyncRolloutWorker(AsyncActorRolloutRefWorker, DetachRolloutWorker):
+    def __init__(self, config: DictConfig, role: str):
+        print(DetachAsyncRolloutWorker.__mro__)
+        DetachRolloutWorker.__init__(self, config, role)
+
+    @register(dispatch_mode=Dispatch.ONE_TO_ALL)
+    def init_model(self):
+        DetachRolloutWorker.init_model(self)
diff --git a/recipe/one_step_off_policy/fsdp_workers.py b/recipe/one_step_off_policy/fsdp_workers.py
index 086f109e434..0aa21991708 100644
--- a/recipe/one_step_off_policy/fsdp_workers.py
+++ b/recipe/one_step_off_policy/fsdp_workers.py
@@ -39,43 +39,27 @@
 from verl.utils.import_utils import import_external_libs
 from verl.utils.model import get_generation_config, update_model_config
 from verl.utils.vllm_utils import patch_vllm_moe_model_weight_loader
-from verl.workers.fsdp_workers import ActorRolloutRefWorker, AsyncActorRolloutRefWorker, CriticWorker
+from verl.workers.fsdp_workers import ActorRolloutRefWorker as ARRWorker
+from verl.workers.fsdp_workers import CriticWorker
 
 logger = logging.getLogger(__file__)
 logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN"))
 
 device_name = get_device_name()
 
-__all__ = ["DetachActorWorker", "DetachRolloutWorker", "DetachAsyncRolloutWorker", "CriticWorker"]
-
-
-def get_inference_model(rollout):
-    """
-    根据不同类型的inference_engine获取模型对象
-    Args:
-        rollout: rollout对象，包含inference_engine
-    Returns:
-        model: 模型对象
-    """
-    inference_engine = rollout.inference_engine
-    # 判断inference_engine的类型
-    if hasattr(inference_engine, "llm_engine"):
-        # LLM类型 - vLLMRollout
-        inference_model = inference_engine.llm_engine.model_executor.driver_worker.worker.model_runner.model
-    elif hasattr(inference_engine, "worker"):
-        # WorkerWrapperBase类型 - vLLMAsyncRollout
-        inference_model = inference_engine.worker.model_runner.model
-    else:
-        raise AttributeError(
-            f"Unsupported inference_engine type: {type(inference_engine)}. "
-            f"Expected LLM (with llm_engine attribute) or WorkerWrapperBase (with worker attribute)."
-        )
-    return inference_model
+__all__ = ["ActorRolloutRefWorker", "AsyncActorRolloutRefWorker", "CriticWorker", "RolloutWorker"]
 
 
-class DetachNcclSync(ActorRolloutRefWorker):
+class ActorRolloutRefWorker(ARRWorker):
     def _get_actor_params(self):
-        pass
+        assert self._is_actor
+        params = self.actor_module_fsdp.state_dict()
+        from verl.utils.model import convert_weight_keys
+
+        params = convert_weight_keys(
+            params, getattr(self.actor_module_fsdp, "_fsdp_wrapped_module", self.actor_module_fsdp)
+        )
+        return params
 
     @register(dispatch_mode=Dispatch.ONE_TO_ALL, blocking=False)
     def sync_rollout_weights(self):
@@ -84,7 +68,9 @@ def sync_rollout_weights(self):
 
         params = self._get_actor_params() if self._is_actor else None
         if self._is_rollout:
-            inference_model = get_inference_model(self.rollout)
+            inference_model = (
+                self.rollout.inference_engine.llm_engine.model_executor.driver_worker.worker.model_runner.model
+            )
             patch_vllm_moe_model_weight_loader(inference_model)
         for key, shape, dtype in self._weights_info:
             tensor = torch.empty(shape, dtype=dtype, device=get_torch_device().current_device())
@@ -122,19 +108,7 @@ def get_actor_weights_info(self):
         return ret
 
 
-class DetachActorWorker(DetachNcclSync):
-    def _get_actor_params(self):
-        assert self._is_actor
-        params = self.actor_module_fsdp.state_dict()
-        from verl.utils.model import convert_weight_keys
-
-        params = convert_weight_keys(
-            params, getattr(self.actor_module_fsdp, "_fsdp_wrapped_module", self.actor_module_fsdp)
-        )
-        return params
-
-
-class DetachRolloutWorker(DetachNcclSync):
+class RolloutWorker(ActorRolloutRefWorker):
     def __init__(self, config: DictConfig, role: str):
         Worker.__init__(self)
         assert role == "rollout"
@@ -228,17 +202,16 @@ def init_model(self):
             trust_remote_code=trust_remote_code,
         )
         log_gpu_memory_usage(f"After building {rollout_name} rollout", logger=logger)
+        from .vllm_sharding_manager import VLLMShardingManager
 
-        from .detach_sharding_manager import DetachShardingManager
-
-        sharding_manager = DetachShardingManager(
+        rollout_sharding_manager = VLLMShardingManager(
             inference_engine=rollout.inference_engine, device_mesh=rollout_device_mesh
         )
 
         log_gpu_memory_usage("After building sharding manager", logger=logger)
 
         self.rollout = rollout
-        self.rollout_sharding_manager = sharding_manager
+        self.rollout_sharding_manager = rollout_sharding_manager
 
     @register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO, blocking=False)
     def async_generate_sequences(self, *args, **kwargs):
@@ -250,19 +223,6 @@ def set_actor_weights_info(self, weights_info):
         self._weights_info = weights_info
 
 
-class DetachAsyncRolloutWorker(AsyncActorRolloutRefWorker, DetachRolloutWorker):
-    def __init__(self, config: DictConfig, role: str):
-        print(f"[DetachAsyncRolloutWorker] {DetachAsyncRolloutWorker.__mro__}")
-        DetachRolloutWorker.__init__(self, config, role)
-
-    @register(dispatch_mode=Dispatch.ONE_TO_ALL)
-    def init_model(self):
-        print("[DetachAsyncRolloutWorker] init_model")
-        DetachRolloutWorker.init_model(self)
-
-        self.vllm_tp_size = self.config.rollout.tensor_model_parallel_size
-        self.vllm_dp_rank = int(os.environ["RANK"]) // self.vllm_tp_size
-        self.vllm_tp_rank = int(os.environ["RANK"]) % self.vllm_tp_size
-
-        # used for sleep/wake_up
-        self.rollout.sharding_manager = self.rollout_sharding_manager
+class AsyncActorRolloutRefWorker(ActorRolloutRefWorker):
+    def __init__(self, *args, **kwargs):
+        raise NotImplementedError
diff --git a/recipe/one_step_off_policy/main_ppo.py b/recipe/one_step_off_policy/main_ppo.py
index 0dcdbef3705..44a0f4b8675 100644
--- a/recipe/one_step_off_policy/main_ppo.py
+++ b/recipe/one_step_off_policy/main_ppo.py
@@ -23,18 +23,58 @@
 import ray
 from omegaconf import OmegaConf
 
+from verl.trainer.constants_ppo import get_ppo_ray_runtime_env
 from verl.trainer.main_ppo import create_rl_dataset, create_rl_sampler
 from verl.trainer.ppo.reward import load_reward_manager
 
 from .ray_trainer import OneStepOffRayTrainer
 
 
+@hydra.main(config_path="config", config_name="one_step_off_ppo_trainer", version_base=None)
+def main(config):
+    run_ppo(config)
+
+
+# Define a function to run the PPO-like training process
+def run_ppo(config) -> None:
+    # Check if Ray is not initialized
+    if not ray.is_initialized():
+        # Initialize Ray with a local cluster configuration
+        # Set environment variables in the runtime environment to control tokenizer parallelism,
+        # NCCL debug level, VLLM logging level, and allow runtime LoRA updating
+        # `num_cpus` specifies the number of CPU cores Ray can use, obtained from the configuration
+        ray.init(
+            runtime_env=get_ppo_ray_runtime_env(),
+            num_cpus=config.ray_init.num_cpus,
+        )
+
+    # Create a remote instance of the TaskRunner class, and
+    # Execute the `run` method of the TaskRunner instance remotely and wait for it to complete
+    if (
+        OmegaConf.select(config.trainer, "profile_steps") is not None
+        and len(OmegaConf.select(config.trainer, "profile_steps")) > 0
+    ):
+        nsight_options = OmegaConf.to_container(config.trainer.controller_nsight_options)
+        runner = TaskRunner.options(runtime_env={"nsight": nsight_options}).remote()
+    else:
+        runner = TaskRunner.remote()
+    ray.get(runner.run.remote(config))
+
+    # [Optional] get the path of the timeline trace file from the configuration, default to None
+    # This file is used for performance analysis
+    timeline_json_file = config.ray_init.get("timeline_json_file", None)
+    if timeline_json_file:
+        ray.timeline(filename=timeline_json_file)
+
+
 @ray.remote(num_cpus=1)  # please make sure main_task is not scheduled on head
-class OneStepOffTaskRunner:
+class TaskRunner:
     def run(self, config):
         # Print the initial configuration. `resolve=True` will evaluate symbolic values.
         from pprint import pprint
 
+        from omegaconf import OmegaConf
+
         from verl.utils.fs import copy_to_local
 
         print(f"TaskRunner hostname: {socket.gethostname()}, PID: {os.getpid()}")
@@ -60,26 +100,38 @@ def run(self, config):
         # Define worker classes based on the actor strategy.
         if config.actor_rollout_ref.actor.strategy == "fsdp2":
             assert config.actor_rollout_ref.actor.strategy == config.critic.strategy
-            from recipe.one_step_off_policy.fsdp_workers import (
+            from verl.single_controller.ray import RayWorkerGroup
+
+            from .fsdp_workers import (
+                ActorRolloutRefWorker,
+                AsyncActorRolloutRefWorker,
                 CriticWorker,
-                DetachActorWorker,
-                DetachAsyncRolloutWorker,
-                DetachRolloutWorker,
+                RolloutWorker,
             )
-            from verl.single_controller.ray import RayWorkerGroup
 
+            actor_rollout_cls = (
+                AsyncActorRolloutRefWorker
+                if config.actor_rollout_ref.rollout.mode == "async"
+                else ActorRolloutRefWorker
+            )
             ray_worker_group_cls = RayWorkerGroup
 
         elif config.actor_rollout_ref.actor.strategy == "megatron":
             assert config.actor_rollout_ref.actor.strategy == config.critic.strategy
-            from recipe.one_step_off_policy.megatron_workers import (
+            from verl.single_controller.ray.megatron import NVMegatronRayWorkerGroup
+
+            from .megatron_workers import (
+                ActorRolloutRefWorker,
+                AsyncActorRolloutRefWorker,
                 CriticWorker,
-                DetachActorWorker,
-                DetachAsyncRolloutWorker,
-                DetachRolloutWorker,
+                RolloutWorker,
             )
-            from verl.single_controller.ray.megatron import NVMegatronRayWorkerGroup
 
+            actor_rollout_cls = (
+                AsyncActorRolloutRefWorker
+                if config.actor_rollout_ref.rollout.mode == "async"
+                else ActorRolloutRefWorker
+            )
             ray_worker_group_cls = NVMegatronRayWorkerGroup
 
         else:
@@ -88,10 +140,8 @@ def run(self, config):
         from .ray_trainer import ResourcePoolManager, Role
 
         role_worker_mapping = {
-            Role.Actor: ray.remote(DetachActorWorker),
-            Role.Rollout: ray.remote(
-                DetachAsyncRolloutWorker if config.actor_rollout_ref.rollout.mode == "async" else DetachRolloutWorker
-            ),
+            Role.Actor: ray.remote(actor_rollout_cls),
+            Role.Rollout: ray.remote(RolloutWorker),
             Role.Critic: ray.remote(CriticWorker),
         }
 
@@ -122,7 +172,7 @@ def run(self, config):
         # finally, we combine all the rewards together
         # The reward type depends on the tag of the data
         if config.reward_model.enable:
-            if config.reward_model.strategy == "fsdp2":
+            if config.reward_model.strategy in ["fsdp2"]:
                 from verl.workers.fsdp_workers import RewardModelWorker
             elif config.reward_model.strategy == "megatron":
                 from verl.workers.megatron_workers import RewardModelWorker
@@ -133,7 +183,7 @@ def run(self, config):
 
         # Add a reference policy worker if KL loss or KL reward is used.
         if config.algorithm.use_kl_in_reward or config.actor_rollout_ref.actor.use_kl_loss:
-            role_worker_mapping[Role.RefPolicy] = ray.remote(DetachActorWorker)
+            role_worker_mapping[Role.RefPolicy] = ray.remote(ActorRolloutRefWorker)
             mapping[Role.RefPolicy] = global_pool_id
 
         # Load the reward manager for training and validation.
@@ -174,12 +224,5 @@ def run(self, config):
         trainer.fit()
 
 
-@hydra.main(config_path="config", config_name="one_step_off_ppo_trainer", version_base=None)
-def main(config):
-    from verl.trainer.main_ppo import run_ppo
-
-    run_ppo(config, OneStepOffTaskRunner)
-
-
 if __name__ == "__main__":
     main()
diff --git a/recipe/one_step_off_policy/megatron_workers.py b/recipe/one_step_off_policy/megatron_workers.py
index a9318b8f7b3..f7b58405b4f 100644
--- a/recipe/one_step_off_policy/megatron_workers.py
+++ b/recipe/one_step_off_policy/megatron_workers.py
@@ -27,21 +27,42 @@
 from verl.utils.device import get_device_name, get_torch_device
 from verl.utils.fs import copy_to_local
 from verl.utils.vllm_utils import patch_vllm_moe_model_weight_loader
-from verl.workers.megatron_workers import (
-    ActorRolloutRefWorker,
-    AsyncActorRolloutRefWorker,
-    CriticWorker,
-)
+from verl.workers.megatron_workers import ActorRolloutRefWorker as ARRWorker
+from verl.workers.megatron_workers import CriticWorker, RewardModelWorker
 
 logger = logging.getLogger(__file__)
 logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN"))
 
-__all__ = ["DetachActorWorker", "DetachRolloutWorker", "DetachAsyncRolloutWorker", "CriticWorker"]
+__all__ = ["ActorRolloutRefWorker", "AsyncActorRolloutRefWorker", "CriticWorker", "RewardModelWorker", "RolloutWorker"]
+
 
+class ActorRolloutRefWorker(ARRWorker):
+    def __init__(self, config: DictConfig, role: str):
+        assert role in ["actor", "ref"]
+        tmp_role = "ref" if role == "ref" else "actor_rollout"
+        super().__init__(config, tmp_role)
+        if role == "actor":
+            self._is_rollout = False
+        self.role = role
 
-class DetachNcclSync(ActorRolloutRefWorker):
     def _get_actor_params_generator(self):
-        pass
+        assert self._is_actor
+        from verl.models.mcore import get_mcore_weight_converter
+        from verl.utils.megatron_utils import per_tensor_generator
+
+        layer_name_mapping = {
+            "qkv_layer_name": "self_attention.linear_qkv.",
+            "gate_proj_layer_name": "linear_fc1.",
+        }
+        weight_converter = get_mcore_weight_converter(self.actor_model_config, self.dtype)
+        generator = per_tensor_generator(
+            self.actor.actor_module,
+            self.actor_model_config,
+            weight_converter,
+            self.tf_config,
+            layer_name_mapping,
+        )
+        return generator
 
     @register(dispatch_mode=Dispatch.ONE_TO_ALL, blocking=False)
     def sync_rollout_weights(self):
@@ -85,28 +106,11 @@ def get_actor_weights_info(self):
         return ret
 
 
-class DetachActorWorker(DetachNcclSync):
-    def _get_actor_params_generator(self):
-        assert self._is_actor
-        from verl.models.mcore import get_mcore_weight_converter
-        from verl.utils.megatron_utils import per_tensor_generator
-
-        layer_name_mapping = {
-            "qkv_layer_name": "self_attention.linear_qkv.",
-            "gate_proj_layer_name": "linear_fc1.",
-        }
-        weight_converter = get_mcore_weight_converter(self.actor_model_config, self.dtype)
-        generator = per_tensor_generator(
-            self.actor.actor_module,
-            self.actor_model_config,
-            weight_converter,
-            self.tf_config,
-            layer_name_mapping,
-        )
-        return generator
-
+class RolloutWorker(ActorRolloutRefWorker):
+    def __init__(self, config: DictConfig, role: str):
+        assert role == "rollout"
+        ARRWorker.__init__(self, config, role)
 
-class DetachRolloutWorker(DetachNcclSync):
     @register(dispatch_mode=Dispatch.ONE_TO_ALL)
     def init_model(self):
         if self.config.model.get("external_lib", None) is not None:
@@ -138,9 +142,12 @@ def init_model(self):
         from torch.distributed.device_mesh import init_device_mesh
 
         assert self.config.rollout.name == "vllm"
+        assert self.config.rollout.mode == "sync"
 
         from verl.workers.rollout.vllm_rollout import vLLMRollout
 
+        from .vllm_sharding_manager import VLLMShardingManager
+
         # NOTE(sgm): If the QKV and gate_up projection layer are concate together in actor,
         # we will reorganize their weight format when resharding from actor to rollout.
 
@@ -168,17 +175,14 @@ def init_model(self):
         )
         log_gpu_memory_usage("After building vllm rollout", logger=logger)
 
-        from .detach_sharding_manager import DetachShardingManager
-
-        rollout_sharding_manager = DetachShardingManager(
-            inference_engine=rollout.inference_engine, device_mesh=rollout_device_mesh
+        sharding_manager = VLLMShardingManager(
+            inference_engine=rollout.inference_engine,
+            device_mesh=rollout_device_mesh,
         )
-
         log_gpu_memory_usage("After building sharding manager", logger=logger)
 
-        self.rollout = rollout
-        self.sharding_manager = rollout_sharding_manager
-        self.rollout.sharding_manager = rollout_sharding_manager
+        self.rollout, self.sharding_manager = rollout, sharding_manager
+        self.rollout.sharding_manager = sharding_manager
 
     @register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO, blocking=False)
     def async_generate_sequences(self, *args, **kwargs):
@@ -190,11 +194,6 @@ def set_actor_weights_info(self, weights_info):
         self._weights_info = weights_info
 
 
-class DetachAsyncRolloutWorker(AsyncActorRolloutRefWorker, DetachRolloutWorker):
-    def __init__(self, config: DictConfig, role: str):
-        print(DetachAsyncRolloutWorker.__mro__)
-        DetachRolloutWorker.__init__(self, config, role)
-
-    @register(dispatch_mode=Dispatch.ONE_TO_ALL)
-    def init_model(self):
-        DetachRolloutWorker.init_model(self)
+class AsyncActorRolloutRefWorker(ActorRolloutRefWorker):
+    def __init__(self, *args, **kwargs):
+        raise NotImplementedError
diff --git a/recipe/one_step_off_policy/ray_trainer.py b/recipe/one_step_off_policy/ray_trainer.py
index ef8d6d8792e..1f7011bdf54 100644
--- a/recipe/one_step_off_policy/ray_trainer.py
+++ b/recipe/one_step_off_policy/ray_trainer.py
@@ -18,24 +18,40 @@
 This trainer supports model-agonistic model initialization with huggingface
 """
 
-import warnings
+import uuid
 from pprint import pprint
 
+import numpy as np
 import ray
+import torch
 from omegaconf import OmegaConf
 from torch.utils.data import Dataset, Sampler
 from tqdm import tqdm
 
+from verl import DataProto
 from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup
+from verl.single_controller.ray.base import create_colocated_worker_cls
 from verl.trainer.ppo import core_algos
-from verl.trainer.ppo.core_algos import AdvantageEstimator
+from verl.trainer.ppo.core_algos import AdvantageEstimator, agg_loss
+from verl.trainer.ppo.metric_utils import (
+    compute_data_metrics,
+    compute_throughout_metrics,
+    compute_timing_metrics,
+)
 from verl.trainer.ppo.ray_trainer import (
     RayPPOTrainer,
     ResourcePoolManager,
     Role,
     WorkerType,
+    apply_kl_penalty,
+    compute_advantage,
+    compute_response_mask,
 )
+from verl.trainer.ppo.reward import compute_reward, compute_reward_async
 from verl.utils.debug import marked_timer
+from verl.utils.metric import (
+    reduce_metrics,
+)
 from verl.utils.tracking import ValidationGenerationsLogger
 
 
@@ -89,7 +105,7 @@ def __init__(
         val_dataset: Dataset | None = None,
         collate_fn=None,
         train_sampler: Sampler | None = None,
-        device_name=None,
+        device_name="cuda",
     ):
         """
         Initialize distributed PPO trainer with Ray backend.
@@ -127,31 +143,32 @@ def __init__(
         self.use_reference_policy = Role.RefPolicy in role_worker_mapping
         self.use_rm = Role.RewardModel in role_worker_mapping
         self.ray_worker_group_cls = ray_worker_group_cls
-        self.device_name = device_name if device_name else self.config.trainer.device
-        self.validation_generations_logger = ValidationGenerationsLogger(
-            project_name=self.config.trainer.project_name,
-            experiment_name=self.config.trainer.experiment_name,
-        )
+        self.device_name = device_name
+        self.validation_generations_logger = ValidationGenerationsLogger()
 
         # if ref_in_actor is True, the reference policy will be actor without lora applied
         self.ref_in_actor = config.actor_rollout_ref.model.get("lora_rank", 0) > 0
 
         # define in-reward KL control
         # kl loss control currently not suppoorted
-        if self.config.algorithm.use_kl_in_reward:
-            self.kl_ctrl_in_reward = core_algos.get_kl_controller(self.config.algorithm.kl_ctrl)
+        if config.algorithm.use_kl_in_reward:
+            self.kl_ctrl_in_reward = core_algos.get_kl_controller(config.algorithm.kl_ctrl)
 
-        if config.critic.enable is not None:
-            self.use_critic = bool(config.critic.enable)
-        elif self.config.algorithm.adv_estimator == AdvantageEstimator.GAE:
+        if self.config.algorithm.adv_estimator == AdvantageEstimator.GAE:
             self.use_critic = True
-        else:
-            warnings.warn(
-                "Disabled critic as algorithm.adv_estimator != gae. "
-                "If it is not intended, please set critic.enable=True",
-                stacklevel=2,
-            )
+        elif self.config.algorithm.adv_estimator in [
+            AdvantageEstimator.GRPO,
+            AdvantageEstimator.GRPO_PASSK,
+            AdvantageEstimator.REINFORCE_PLUS_PLUS,
+            # AdvantageEstimator.REMAX, # TODO:REMAX advantage estimator is not yet supported in one_step_off_policy
+            AdvantageEstimator.RLOO,
+            AdvantageEstimator.OPO,
+            AdvantageEstimator.REINFORCE_PLUS_PLUS_BASELINE,
+            AdvantageEstimator.GPG,
+        ]:
             self.use_critic = False
+        else:
+            raise NotImplementedError
 
         self._validate_config()
         self._create_dataloader(train_dataset, val_dataset, collate_fn, train_sampler)
@@ -162,32 +179,94 @@ def _validate(self):
         self.actor_rollout_wg = self.actor_wg
         return ret
 
-    def _create_actor_rollout_classes(self):
+    def init_workers(self):
+        """Initialize distributed training workers using Ray backend.
+
+        Creates:
+        1. Ray resource pools from configuration
+        2. Worker groups for each role (actor, critic, etc.)
+        """
+        self.resource_pool_manager.create_resource_pool()
+
+        self.resource_pool_to_cls = {pool: {} for pool in self.resource_pool_manager.resource_pool_dict.values()}
+
         # create actor and rollout
-        for role in [Role.Actor, Role.Rollout]:
+        for role, role_name in [(Role.Actor, "actor"), (Role.Rollout, "rollout")]:
             resource_pool = self.resource_pool_manager.get_resource_pool(role)
             role_cls = RayClassWithInitArgs(
                 cls=self.role_worker_mapping[role],
                 config=self.config.actor_rollout_ref,
-                role=str(role),
+                role=role_name,
+            )
+            self.resource_pool_to_cls[resource_pool][role_name] = role_cls
+
+        # create critic
+        if self.use_critic:
+            resource_pool = self.resource_pool_manager.get_resource_pool(Role.Critic)
+            critic_cls = RayClassWithInitArgs(cls=self.role_worker_mapping[Role.Critic], config=self.config.critic)
+            self.resource_pool_to_cls[resource_pool]["critic"] = critic_cls
+
+        # create reference policy if needed
+        if self.use_reference_policy:
+            resource_pool = self.resource_pool_manager.get_resource_pool(Role.RefPolicy)
+            ref_policy_cls = RayClassWithInitArgs(
+                self.role_worker_mapping[Role.RefPolicy],
+                config=self.config.actor_rollout_ref,
+                role="ref",
+                profile_option=self.config.trainer.npu_profile.options,
+            )
+            self.resource_pool_to_cls[resource_pool]["ref"] = ref_policy_cls
+
+        # create a reward model if reward_fn is None
+        if self.use_rm:
+            # we create a RM here
+            resource_pool = self.resource_pool_manager.get_resource_pool(Role.RewardModel)
+            rm_cls = RayClassWithInitArgs(self.role_worker_mapping[Role.RewardModel], config=self.config.reward_model)
+            self.resource_pool_to_cls[resource_pool]["rm"] = rm_cls
+
+        # initialize WorkerGroup
+        # NOTE: if you want to use a different resource pool for each role, which can support different parallel size,
+        # you should not use `create_colocated_worker_cls`.
+        # Instead, directly pass different resource pool to different worker groups.
+        # See https://github.com/volcengine/verl/blob/master/examples/ray/tutorial.ipynb for more information.
+        all_wg = {}
+        wg_kwargs = {}  # Setting up kwargs for RayWorkerGroup
+        if OmegaConf.select(self.config.trainer, "ray_wait_register_center_timeout") is not None:
+            wg_kwargs["ray_wait_register_center_timeout"] = self.config.trainer.ray_wait_register_center_timeout
+        if OmegaConf.select(self.config.trainer, "profile_steps") is not None:
+            wg_kwargs["profile_steps"] = OmegaConf.select(self.config.trainer, "profile_steps")
+            assert OmegaConf.select(self.config.trainer, "worker_nsight_options") is not None, (
+                "worker_nsight_options must be set when profile_steps is set"
+            )
+            wg_kwargs["worker_nsight_options"] = OmegaConf.to_container(
+                OmegaConf.select(self.config.trainer, "worker_nsight_options")
+            )
+
+        for resource_pool, class_dict in self.resource_pool_to_cls.items():
+            worker_dict_cls = create_colocated_worker_cls(class_dict=class_dict)
+            wg_dict = self.ray_worker_group_cls(
+                resource_pool=resource_pool,
+                ray_cls_with_init=worker_dict_cls,
+                device_name=self.device_name,
+                **wg_kwargs,
             )
-            self.resource_pool_to_cls[resource_pool][str(role)] = role_cls
+            spawn_wg = wg_dict.spawn(prefix_set=class_dict.keys())
+            all_wg.update(spawn_wg)
 
-    def _init_models(self):
         if self.use_critic:
-            self.critic_wg = self.all_wg[str(Role.Critic)]
+            self.critic_wg = all_wg["critic"]
             self.critic_wg.init_model()
 
         if self.use_reference_policy and not self.ref_in_actor:
-            self.ref_policy_wg = self.all_wg[str(Role.RefPolicy)]
+            self.ref_policy_wg = all_wg["ref"]
             self.ref_policy_wg.init_model()
 
         if self.use_rm:
-            self.rm_wg = self.all_wg[str(Role.RewardModel)]
+            self.rm_wg = all_wg["rm"]
             self.rm_wg.init_model()
 
-        self.actor_wg = self.all_wg[str(Role.Actor)]
-        self.rollout_wg = self.all_wg[str(Role.Rollout)]
+        self.actor_wg = all_wg["actor"]
+        self.rollout_wg = all_wg["rollout"]
         self.actor_wg.init_model()
         self.rollout_wg.init_model()
         self.actor_rollout_wg = self.actor_wg  # to be compatible with the functions that not be modified
@@ -205,9 +284,21 @@ def _init_models(self):
         )
         self.sync_rollout_weights()
 
+        # create async rollout manager and request scheduler
+        self.async_rollout_mode = False
+        if self.config.actor_rollout_ref.rollout.mode == "async" and self._is_rollout:
+            from verl.workers.rollout.async_server import AsyncLLMServerManager
+
+            self.async_rollout_mode = True
+            self.async_rollout_manager = AsyncLLMServerManager(
+                config=self.config,
+                worker_group=self.rollout_wg,
+            )
+
     def sync_rollout_weights(self):
-        self.actor_wg.sync_rollout_weights()
-        ray.get(self.rollout_wg.sync_rollout_weights())
+        if not self.hybrid_engine:
+            self.actor_wg.sync_rollout_weights()
+            ray.get(self.rollout_wg.sync_rollout_weights())
 
     def _create_continuous_iterator(self):
         """
@@ -229,7 +320,23 @@ def _async_gen_next_batch(self, continuous_iterator):
         except Exception as e:
             print(f"Error in async_gen_next_batch: {e}")
             return None
-        batch, gen_batch = self._prepare_generate_batch(batch_dict)
+        batch = DataProto.from_single_dict(batch_dict)
+        # pop those keys for generation
+        batch_keys_to_pop = ["input_ids", "attention_mask", "position_ids"]
+        non_tensor_batch_keys_to_pop = ["raw_prompt_ids"]
+        if "multi_modal_data" in batch.non_tensor_batch:
+            non_tensor_batch_keys_to_pop.append("multi_modal_data")
+        if "raw_prompt" in batch.non_tensor_batch:
+            non_tensor_batch_keys_to_pop.append("raw_prompt")
+        if "tools_kwargs" in batch.non_tensor_batch:
+            non_tensor_batch_keys_to_pop.append("tools_kwargs")
+        if "interaction_kwargs" in batch.non_tensor_batch:
+            non_tensor_batch_keys_to_pop.append("interaction_kwargs")
+        gen_batch = batch.pop(
+            batch_keys=batch_keys_to_pop,
+            non_tensor_batch_keys=non_tensor_batch_keys_to_pop,
+        )
+        gen_batch = gen_batch.repeat(repeat_times=self.config.actor_rollout_ref.rollout.n, interleave=True)
         # sync weights from actor to rollout
         self.sync_rollout_weights()
         # async generation
@@ -243,6 +350,7 @@ def fit(self):
         to construct the PPO dataflow.
         The light-weight advantage computation is done on the driver process.
         """
+        from omegaconf import OmegaConf
 
         from verl.utils.tracking import Tracking
 
@@ -274,7 +382,6 @@ def fit(self):
         # we start from step 1
         self.global_steps += 1
         last_val_metrics = None
-        self.max_steps_duration = 0
 
         # across epoch iterator
         continuous_iterator = self._create_continuous_iterator()
@@ -283,16 +390,24 @@ def fit(self):
         batch_data_future = self._async_gen_next_batch(continuous_iterator)
 
         while batch_data_future is not None:
-            metrics = {}
-            timing_raw = {}
-
             do_profile = (
                 self.global_steps in self.config.trainer.profile_steps
                 if self.config.trainer.profile_steps is not None
                 else False
             )
-            self._start_profiling(do_profile, timing_raw)
+            if do_profile:
+                self.actor_wg.start_profile()
+                if not self.hybrid_engine:
+                    self.rollout_wg.start_profile()
+                if self.use_reference_policy:
+                    self.ref_policy_wg.start_profile()
+                if self.use_critic:
+                    self.critic_wg.start_profile()
+                if self.use_rm:
+                    self.rm_wg.start_profile()
 
+            metrics = {}
+            timing_raw = {}
             is_last_step = self.global_steps >= self.total_training_steps
 
             with marked_timer("step", timing_raw):
@@ -307,15 +422,184 @@ def fit(self):
                     if not is_last_step:
                         batch_data_future = self._async_gen_next_batch(continuous_iterator)
 
-                batch = self._post_generate_batch(batch, gen_batch_output, metrics)
-                batch, reward_extra_infos_dict = self._process_batch_common(batch, metrics, timing_raw)
-                self._log_rollout(batch, reward_extra_infos_dict, timing_raw)
-                last_val_metrics = self._validate_metrics(is_last_step, last_val_metrics, metrics, timing_raw)
-                self._check_save_checkpoint(is_last_step, timing_raw)
-
-            self._stop_profiling(do_profile, timing_raw)
-            self._collect_metrics(batch, epoch, metrics, timing_raw)
-            self._post_batch_processing(batch)
+                batch.non_tensor_batch["uid"] = np.array(
+                    [str(uuid.uuid4()) for _ in range(len(batch.batch))], dtype=object
+                )
+                # repeat to align with repeated responses in rollout
+                batch = batch.repeat(repeat_times=self.config.actor_rollout_ref.rollout.n, interleave=True)
+                batch = batch.union(gen_batch_output)
+
+                batch.batch["response_mask"] = compute_response_mask(batch)
+                # Balance the number of valid tokens across DP ranks.
+                # NOTE: This usually changes the order of data in the `batch`,
+                # which won't affect the advantage calculation (since it's based on uid),
+                # but might affect the loss calculation (due to the change of mini-batching).
+                # TODO: Decouple the DP balancing and mini-batching.
+                if self.config.trainer.balance_batch:
+                    self._balance_batch(batch, metrics=metrics)
+
+                # compute global_valid tokens
+                batch.meta_info["global_token_num"] = torch.sum(batch.batch["attention_mask"], dim=-1).tolist()
+
+                with marked_timer("reward", timing_raw, color="yellow"):
+                    # compute reward model score
+                    if self.use_rm:
+                        reward_tensor = self.rm_wg.compute_rm_score(batch)
+                        batch = batch.union(reward_tensor)
+
+                    if self.config.reward_model.launch_reward_fn_async:
+                        future_reward = compute_reward_async.remote(batch, self.config, self.tokenizer)
+                    else:
+                        reward_tensor, reward_extra_infos_dict = compute_reward(batch, self.reward_fn)
+
+                # recompute old_log_probs
+                with marked_timer("old_log_prob", timing_raw, color="blue"):
+                    old_log_prob = self.actor_wg.compute_log_prob(batch)
+                    entropys = old_log_prob.batch["entropys"]
+                    response_masks = batch.batch["response_mask"]
+                    loss_agg_mode = self.config.actor_rollout_ref.actor.loss_agg_mode
+                    entropy_agg = agg_loss(loss_mat=entropys, loss_mask=response_masks, loss_agg_mode=loss_agg_mode)
+                    old_log_prob_metrics = {"actor/entropy": entropy_agg.detach().item()}
+                    metrics.update(old_log_prob_metrics)
+                    old_log_prob.batch.pop("entropys")
+                    batch = batch.union(old_log_prob)
+
+                    if "rollout_log_probs" in batch.batch.keys():
+                        # TODO: we may want to add diff of probs too.
+                        rollout_old_log_probs = batch.batch["rollout_log_probs"]
+                        actor_old_log_probs = batch.batch["old_log_probs"]
+                        attention_mask = batch.batch["attention_mask"]
+                        responses = batch.batch["responses"]
+                        response_length = responses.size(1)
+                        response_mask = attention_mask[:, -response_length:]
+
+                        rollout_probs = torch.exp(rollout_old_log_probs)
+                        actor_probs = torch.exp(actor_old_log_probs)
+                        rollout_probs_diff = torch.abs(rollout_probs - actor_probs)
+                        rollout_probs_diff = torch.masked_select(rollout_probs_diff, response_mask.bool())
+                        rollout_probs_diff_max = torch.max(rollout_probs_diff)
+                        rollout_probs_diff_mean = torch.mean(rollout_probs_diff)
+                        rollout_probs_diff_std = torch.std(rollout_probs_diff)
+                        metrics.update(
+                            {
+                                "training/rollout_probs_diff_max": rollout_probs_diff_max.detach().item(),
+                                "training/rollout_probs_diff_mean": rollout_probs_diff_mean.detach().item(),
+                                "training/rollout_probs_diff_std": rollout_probs_diff_std.detach().item(),
+                            }
+                        )
+
+                if self.use_reference_policy:
+                    # compute reference log_prob
+                    with marked_timer("ref", timing_raw, color="olive"):
+                        if not self.ref_in_actor:
+                            ref_log_prob = self.ref_policy_wg.compute_ref_log_prob(batch)
+                        else:
+                            ref_log_prob = self.actor_wg.compute_ref_log_prob(batch)
+                        batch = batch.union(ref_log_prob)
+
+                # compute values
+                if self.use_critic:
+                    with marked_timer("values", timing_raw, color="cyan"):
+                        values = self.critic_wg.compute_values(batch)
+                        batch = batch.union(values)
+
+                with marked_timer("adv", timing_raw, color="brown"):
+                    # we combine with rule-based rm
+                    reward_extra_infos_dict: dict[str, list]
+                    if self.config.reward_model.launch_reward_fn_async:
+                        reward_tensor, reward_extra_infos_dict = ray.get(future_reward)
+                    batch.batch["token_level_scores"] = reward_tensor
+
+                    if reward_extra_infos_dict:
+                        batch.non_tensor_batch.update({k: np.array(v) for k, v in reward_extra_infos_dict.items()})
+
+                    # compute rewards. apply_kl_penalty if available
+                    if self.config.algorithm.use_kl_in_reward:
+                        batch, kl_metrics = apply_kl_penalty(
+                            batch, kl_ctrl=self.kl_ctrl_in_reward, kl_penalty=self.config.algorithm.kl_penalty
+                        )
+                        metrics.update(kl_metrics)
+                    else:
+                        batch.batch["token_level_rewards"] = batch.batch["token_level_scores"]
+
+                    # compute advantages, executed on the driver process
+
+                    norm_adv_by_std_in_grpo = self.config.algorithm.get(
+                        "norm_adv_by_std_in_grpo", True
+                    )  # GRPO adv normalization factor
+
+                    batch = compute_advantage(
+                        batch,
+                        adv_estimator=self.config.algorithm.adv_estimator,
+                        gamma=self.config.algorithm.gamma,
+                        lam=self.config.algorithm.lam,
+                        num_repeat=self.config.actor_rollout_ref.rollout.n,
+                        norm_adv_by_std_in_grpo=norm_adv_by_std_in_grpo,
+                        config=self.config.algorithm,
+                    )
+
+                # update critic
+                if self.use_critic:
+                    with marked_timer("update_critic", timing_raw, color="pink"):
+                        critic_output = self.critic_wg.update_critic(batch)
+                    critic_output_metrics = reduce_metrics(critic_output.meta_info["metrics"])
+                    metrics.update(critic_output_metrics)
+
+                # implement critic warmup
+                if self.config.trainer.critic_warmup <= self.global_steps:
+                    # update actor
+                    with marked_timer("update_actor", timing_raw, color="red"):
+                        batch.meta_info["multi_turn"] = self.config.actor_rollout_ref.rollout.multi_turn.enable
+                        actor_output = self.actor_wg.update_actor(batch)
+                    actor_output_metrics = reduce_metrics(actor_output.meta_info["metrics"])
+                    metrics.update(actor_output_metrics)
+
+                # Log rollout generations if enabled
+                rollout_data_dir = self.config.trainer.get("rollout_data_dir", None)
+                if rollout_data_dir:
+                    with marked_timer("dump_rollout_generations", timing_raw, color="green"):
+                        inputs = self.tokenizer.batch_decode(batch.batch["prompts"], skip_special_tokens=True)
+                        outputs = self.tokenizer.batch_decode(batch.batch["responses"], skip_special_tokens=True)
+                        scores = batch.batch["token_level_scores"].sum(-1).cpu().tolist()
+                        self._dump_generations(
+                            inputs=inputs,
+                            outputs=outputs,
+                            scores=scores,
+                            reward_extra_infos_dict=reward_extra_infos_dict,
+                            dump_path=rollout_data_dir,
+                        )
+
+                # validate
+                if (
+                    self.val_reward_fn is not None
+                    and self.config.trainer.test_freq > 0
+                    and (is_last_step or self.global_steps % self.config.trainer.test_freq == 0)
+                ):
+                    with marked_timer("testing", timing_raw, color="green"):
+                        val_metrics: dict = self._validate()
+                        if is_last_step:
+                            last_val_metrics = val_metrics
+                    metrics.update(val_metrics)
+
+                if self.config.trainer.save_freq > 0 and (
+                    is_last_step or self.global_steps % self.config.trainer.save_freq == 0
+                ):
+                    with marked_timer("save_checkpoint", timing_raw, color="green"):
+                        self._save_checkpoint()
+
+            # training metrics
+            metrics.update(
+                {
+                    "training/global_step": self.global_steps,
+                    "training/epoch": epoch,
+                }
+            )
+            # collect metrics
+            metrics.update(compute_data_metrics(batch=batch, use_critic=self.use_critic))
+            metrics.update(compute_timing_metrics(batch=batch, timing_raw=timing_raw))
+            # TODO: implement actual tflpo and theoretical tflpo
+            n_gpus = self.resource_pool_manager.get_n_gpus()
+            metrics.update(compute_throughout_metrics(batch=batch, timing_raw=timing_raw, n_gpus=n_gpus))
 
             # TODO: make a canonical logger that supports various backend
             logger.log(data=metrics, step=self.global_steps)
@@ -323,6 +607,17 @@ def fit(self):
             progress_bar.update(1)
             self.global_steps += 1
 
+            if do_profile:
+                self.actor_wg.stop_profile()
+                if not self.hybrid_engine:
+                    self.rollout_wg.stop_profile()
+                if self.use_reference_policy:
+                    self.ref_policy_wg.stop_profile()
+                if self.use_critic:
+                    self.critic_wg.stop_profile()
+                if self.use_rm:
+                    self.rm_wg.stop_profile()
+
             if is_last_step:
                 pprint(f"Final validation metrics: {last_val_metrics}")
                 progress_bar.close()
diff --git a/recipe/one_step_off_policy/vllm_sharding_manager.py b/recipe/one_step_off_policy/vllm_sharding_manager.py
new file mode 100644
index 00000000000..c33ba585470
--- /dev/null
+++ b/recipe/one_step_off_policy/vllm_sharding_manager.py
@@ -0,0 +1,74 @@
+# Copyright 2025 Bytedance Ltd. and/or its affiliates
+# Copyright 2025 Meituan Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import os
+
+from torch.distributed.device_mesh import DeviceMesh
+
+from verl import DataProto
+from verl.protocol import all_gather_data_proto
+from verl.third_party.vllm import parallel_state as vllm_ps
+from verl.utils.debug import GPUMemoryLogger
+from verl.utils.device import get_torch_device
+from verl.utils.torch_functional import check_device_is_available
+from verl.workers.sharding_manager.base import BaseShardingManager
+
+logger = logging.getLogger(__file__)
+logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN"))
+
+
+class VLLMShardingManager(BaseShardingManager):
+    @check_device_is_available()
+    def __init__(self, inference_engine, device_mesh: DeviceMesh):
+        self.device_mesh = device_mesh
+        self.inference_engine = inference_engine
+        inference_engine.wake_up()
+        assert device_mesh is not None
+        assert inference_engine is not None
+        self.tp_size = self.device_mesh["infer_tp"].size()
+        self.tp_rank = self.device_mesh["infer_tp"].get_local_rank()
+        self.timing = {}
+        gen_dp_rank = self.device_mesh["dp"].get_local_rank()
+        get_torch_device().manual_seed(gen_dp_rank + 1000)
+        self.gen_random_states = get_torch_device().get_rng_state()
+
+    @GPUMemoryLogger(role="vllm sharding_manager", logger=logger)
+    def __enter__(self):
+        get_torch_device().set_rng_state(self.gen_random_states)
+
+    @GPUMemoryLogger(role="vllm sharding_manager", logger=logger)
+    def __exit__(self, exc_type, exc_value, traceback):
+        self.gen_random_states = get_torch_device().get_rng_state()
+        self.inference_engine.reset_prefix_cache()
+
+    @GPUMemoryLogger(role="vllm sharding_manager", logger=logger)
+    def preprocess_data(self, data: DataProto) -> DataProto:
+        """All gather across tp group to make each rank has identical input."""
+        if self.tp_size == 1:
+            return data
+
+        group = vllm_ps.get_tensor_model_parallel_group().device_group
+
+        all_gather_data_proto(data=data, process_group=group)
+        return data
+
+    @GPUMemoryLogger(role="vllm sharding_manager", logger=logger)
+    def postprocess_data(self, data: DataProto) -> DataProto:
+        """Get chunk data of this tp rank since we do all gather in preprocess."""
+        if self.tp_size == 1:
+            return data
+
+        return data.chunk(chunks=self.tp_size)[self.tp_rank]
diff --git a/tests/special_e2e/run_fully_async_policy.sh b/tests/special_e2e/run_fully_async_policy.sh
index 142ee3e8806..63cfcf622a6 100644
--- a/tests/special_e2e/run_fully_async_policy.sh
+++ b/tests/special_e2e/run_fully_async_policy.sh
@@ -59,7 +59,7 @@ train_prompt_mini_bsz=32
 total_rollout_steps=$(((128*2)))
 test_freq=10
 staleness_threshold=1
-trigger_parameter_sync_step=1
+trigger_parameter_sync_step=16
 partial_rollout=True
 
 exp_name="$(basename "${MODEL_ID,,}")-fully-async-policy-${ACTOR_STRATEGY}-minimal"

From aa57cd48a150b3e968d88a8b66f217f25097b3e7 Mon Sep 17 00:00:00 2001
From: wangshulin02 <953550366@qq.com>
Date: Fri, 5 Sep 2025 17:32:39 +0800
Subject: [PATCH 099/182] fix some metrics aggregate

---
 recipe/fully_async_policy/detach_utils.py     | 129 ++++++------------
 .../fully_async_policy/fully_async_trainer.py |   4 +-
 2 files changed, 41 insertions(+), 92 deletions(-)

diff --git a/recipe/fully_async_policy/detach_utils.py b/recipe/fully_async_policy/detach_utils.py
index 48a41443612..e01d82c1726 100644
--- a/recipe/fully_async_policy/detach_utils.py
+++ b/recipe/fully_async_policy/detach_utils.py
@@ -234,7 +234,7 @@ def assemble_batch_from_rollout_samples(
 class MetricsAggregator:
     """Metrics aggregator, used to combine metrics from multiple training steps"""
     
-    def __init__(self):
+    def __init__(self, total_gpus: int):
         # Store all values ​​for each metric
         self.metric_values: Dict[str, List[float]] = defaultdict(list)
         # Store the number of samples at each step for weighted averaging
@@ -243,6 +243,8 @@ def __init__(self):
         self.timestamps: List[float] = []
         # Step Count
         self.step_count = 0
+        # total num gpus used
+        self.total_gpus = total_gpus
         
         # Metric aggregation rule configuration
         self.aggregation_rules = self._init_aggregation_rules()
@@ -250,57 +252,7 @@ def __init__(self):
     def _init_aggregation_rules(self) -> Dict[str, Dict[str, List[str]]]:
         """Initialize metrics aggregation rules"""
         return {
-            # # Cumulative metrics - take the last value
-            # 'last': [
-            #     'fully_async/stale_samples_processed',
-            #     'fully_async/current_param_version',
-            #     'global_steps',
-            #     'epoch',
-            # ],
-            
-            # # Weighted average metrics - weighted by sample size
-            # 'weighted_avg': [
-            #     'fully_async/stale_samples_ratio',
-            #     'policy_loss',
-            #     'value_loss',
-            #     'entropy_loss',
-            #     'kl_divergence',
-            #     'advantage_mean',
-            #     'advantage_std',
-            #     'learning_rate',
-            # ],
-            
-            # # Summation type metrics - direct accumulation
-            # 'sum': [
-            #     'fully_async/total_wait_time',
-            #     'processed_samples',
-            #     'total_tokens',
-            # ],
-            
-            # Average metrics - Simple Average
-            # 'avg': [
-            #     'perf/throughput',
-            #     'fully_async/avg_processing_time',
-            #     'fully_async/tp50_processing_time',
-            #     'fully_async/tp95_processing_time',
-            #     'fully_async/tp99_processing_time',
-            #     'grad_norm',
-            # ],
-            
-            # # Maximum value metrics
-            # 'max': [
-            #     'fully_async/max_processing_time',
-            #     'max_grad_norm',
-            #     'peak_memory_usage',
-            # ],
-            
-            # # Minimum value metrics
-            # 'min': [
-            #     'fully_async/min_processing_time',
-            #     'min_learning_rate',
-            # ],
-            
-            # Time-Based metrics - Special Treatment
+            # Time-Based metrics, can add metrics here
             'time_sum': [
                 'timing_s/adv',
                 'timing_s/gen',
@@ -332,35 +284,26 @@ def _get_aggregation_type(self, metric_name: str) -> str:
         for agg_type, metric_list in self.aggregation_rules.items():
             if metric_name in metric_list:
                 return agg_type
+                
+        metric_lower = metric_name.lower()
+        if any(keyword in metric_lower for keyword in ['timing_s/']):
+            return 'time_sum'
+        if any(keyword in metric_lower for keyword in ['mean', 'avg', 'average']):
+            return 'avg'
+        if any(keyword in metric_lower for keyword in ['max', 'maximum']):
+            return 'max'
+        if any(keyword in metric_lower for keyword in ['min', 'minimum']):
+            return 'min'
+        if any(keyword in metric_lower for keyword in ['sum', 'total']):
+           return 'sum'
+        if any(keyword in metric_lower for keyword in ['weighted_avg']):
+            return 'weighted_avg'
+        
         import warnings
         warnings.warn(f"No aggregation rule is matched in init_aggregation_rules. \
-                      For metric {metric_name}, the 'last' method is used")
-        return 'last'
+                      For metric {metric_name}, the 'avg' method is used")
+        return 'avg'
 
-        # raise ValueError(f"No aggregation rule is matched in init_aggregation_rules. \
-        #                 Metric name: {metric_name}")    # TODO: 删除
-
-        
-        # Aggregation rules based on naming patterns
-        if metric_name.startswith('time/'):
-            aggregation_type = 'time_sum'
-        elif metric_name.endswith('_ratio') or metric_name.endswith('_rate'):
-            aggregation_type = 'weighted_avg'
-        elif metric_name.endswith('_count') or metric_name.endswith('_total'):
-            aggregation_type = 'sum'
-        elif metric_name.startswith('max_') or metric_name.endswith('_max'):
-            aggregation_type = 'max'
-        elif metric_name.startswith('min_') or metric_name.endswith('_min'):
-            aggregation_type = 'min'
-        else:
-            # The default is weighted average.
-            aggregation_type = 'weighted_avg'
-        import warnings
-        warnings.simplefilter("always", DeprecationWarning)
-        warnings.warn("No aggregation rule is matched in init_aggregation_rules. \
-                      Aggregation rule is matched based on name prefix:", aggregation_type)
-        return aggregation_type
-    
     def _aggregate_single_metric(self, metric_name: str, values: List[float]) -> float:
         """Aggregating a single metric"""
         if not values:
@@ -402,6 +345,7 @@ def _aggregate_single_metric(self, metric_name: str, values: List[float]) -> flo
     
     def get_aggregated_metrics(self) -> Dict[str, Any]:
         """aggregated metrics"""
+        t = time.time()
         if self.step_count == 0:
             return {}
         
@@ -411,21 +355,24 @@ def get_aggregated_metrics(self) -> Dict[str, Any]:
         for metric_name, values in self.metric_values.items():
             aggregated[metric_name] = self._aggregate_single_metric(metric_name, values)
         
-        # # Adding aggregate statistics
-        # aggregated.update({
-        #     'aggregation/step_count': self.step_count,
-        #     'aggregation/total_samples': sum(self.sample_counts),
-        #     'aggregation/avg_samples_per_step': sum(self.sample_counts) / self.step_count,
-        #     'aggregation/time_span': self.timestamps[-1] - self.timestamps[0] if len(self.timestamps) > 1 else 0,
-        # })
+        # Aggregate special metrics  
+        aggregated = self._special_metrics_aggergate(aggregated)
+
+        print(f"******************************aggregated metrics done. cost {time.time() - t}")
         
-        # # Add statistics on sample size
-        # if self.sample_counts:
-        #     aggregated.update({
-        #         'aggregation/min_samples_per_step': min(self.sample_counts),
-        #         'aggregation/max_samples_per_step': max(self.sample_counts),
-        #     })
+        return aggregated
+    
+    def _special_metrics_aggergate(self, aggregated: Dict[str, Any]) -> Dict[str, Any]:
+        """calculate special metrics"""
+
+        if "global_seqlen/minmax_diff" in aggregated.keys():
+            aggregated["global_seqlen/minmax_diff"] = aggregated["global_seqlen/max"] - aggregated["global_seqlen/min"]
         
+        REQUIRED_PERF_KEYS = {"perf/throughput", "perf/total_num_tokens", "perf/time_per_step"}
+        if REQUIRED_PERF_KEYS.issubset(aggregated):
+            aggregated["perf/throughput"] = aggregated['perf/total_num_tokens'] / \
+                (aggregated["perf/time_per_step"] * self.total_gpus)
+            
         return aggregated
     
     def reset(self):
diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py
index 9276d148b66..0a200e76b1d 100644
--- a/recipe/fully_async_policy/fully_async_trainer.py
+++ b/recipe/fully_async_policy/fully_async_trainer.py
@@ -121,7 +121,9 @@ def __init__(
         self.required_samples = int(
             self.minimal_bsz * config.actor_rollout_ref.actor.ppo_mini_batch_size / config.actor_rollout_ref.rollout.n
         )
-        self.metrics_aggregator = MetricsAggregator()
+        total_gpus = config.trainer.nnodes * config.trainer.n_gpus_per_node + \
+            config.rollout.nnodes * config.rollout.n_gpus_per_node
+        self.metrics_aggregator = MetricsAggregator(total_gpus=total_gpus)
 
     def set_message_queue_client(self, message_queue_client: MessageQueueClient):
         """Set message queue client"""

From 570eb3ba9fda31f35118e99cc407dc36ca87a61b Mon Sep 17 00:00:00 2001
From: wangshulin02 <953550366@qq.com>
Date: Fri, 5 Sep 2025 17:53:20 +0800
Subject: [PATCH 100/182] temporarily fix log_prob

---
 verl/trainer/ppo/ray_trainer.py | 75 +++++++++++++++++----------------
 1 file changed, 39 insertions(+), 36 deletions(-)

diff --git a/verl/trainer/ppo/ray_trainer.py b/verl/trainer/ppo/ray_trainer.py
index e61b1dc5fe0..42c728fa79d 100644
--- a/verl/trainer/ppo/ray_trainer.py
+++ b/verl/trainer/ppo/ray_trainer.py
@@ -1247,42 +1247,45 @@ def _process_batch_common(self, batch, metrics, timing_raw):
                 reward_tensor, reward_extra_infos_dict = compute_reward(batch, self.reward_fn)
         # recompute old_log_probs
         with marked_timer("old_log_prob", timing_raw, color="blue"):
-            old_log_prob = self.actor_rollout_wg.compute_log_prob(batch)
-            entropys = old_log_prob.batch["entropys"]
-            response_masks = batch.batch["response_mask"]
-            loss_agg_mode = self.config.actor_rollout_ref.actor.loss_agg_mode
-            entropy_agg = agg_loss(loss_mat=entropys, loss_mask=response_masks, loss_agg_mode=loss_agg_mode)
-            old_log_prob_metrics = {"actor/entropy": entropy_agg.detach().item()}
-            metrics.update(old_log_prob_metrics)
-            old_log_prob.batch.pop("entropys")
-            batch = batch.union(old_log_prob)
-
-            if "rollout_log_probs" in batch.batch.keys():
-                # TODO: we may want to add diff of probs too.
-                rollout_old_log_probs = batch.batch["rollout_log_probs"]
-                actor_old_log_probs = batch.batch["old_log_probs"]
-                attention_mask = batch.batch["attention_mask"]
-                responses = batch.batch["responses"]
-                response_length = responses.size(1)
-                response_mask = attention_mask[:, -response_length:]
-
-                rollout_probs = torch.exp(rollout_old_log_probs)
-                actor_probs = torch.exp(actor_old_log_probs)
-                rollout_probs_diff = torch.abs(rollout_probs - actor_probs)
-                rollout_probs_diff = torch.masked_select(rollout_probs_diff, response_mask.bool())
-                rollout_probs_diff_max = torch.max(rollout_probs_diff)
-                rollout_probs_diff_mean = torch.mean(rollout_probs_diff)
-                rollout_probs_diff_std = torch.std(rollout_probs_diff)
-                metrics.update(
-                    {
-                        "training/rollout_probs_diff_max": rollout_probs_diff_max.detach().item(),
-                        "training/rollout_probs_diff_mean": rollout_probs_diff_mean.detach().item(),
-                        "training/rollout_probs_diff_std": rollout_probs_diff_std.detach().item(),
-                    }
-                )
-                if self.config.async_training and self.config.async_training.use_rollout_log_probs:
-                    batch.batch["old_log_probs"] = batch.batch["rollout_log_probs"]
-                    del actor_old_log_probs
+            if self.config.async_training and self.config.async_training.use_rollout_log_probs:
+                batch.batch["old_log_probs"] = batch.batch["rollout_log_probs"]
+                del actor_old_log_probs
+            else:
+
+                old_log_prob = self.actor_rollout_wg.compute_log_prob(batch)
+                entropys = old_log_prob.batch["entropys"]
+                response_masks = batch.batch["response_mask"]
+                loss_agg_mode = self.config.actor_rollout_ref.actor.loss_agg_mode
+                entropy_agg = agg_loss(loss_mat=entropys, loss_mask=response_masks, loss_agg_mode=loss_agg_mode)
+                old_log_prob_metrics = {"actor/entropy": entropy_agg.detach().item()}
+                metrics.update(old_log_prob_metrics)
+                old_log_prob.batch.pop("entropys")
+                batch = batch.union(old_log_prob)
+
+                if "rollout_log_probs" in batch.batch.keys():
+                    # TODO: we may want to add diff of probs too.
+                    rollout_old_log_probs = batch.batch["rollout_log_probs"]
+                    actor_old_log_probs = batch.batch["old_log_probs"]
+                    attention_mask = batch.batch["attention_mask"]
+                    responses = batch.batch["responses"]
+                    response_length = responses.size(1)
+                    response_mask = attention_mask[:, -response_length:]
+
+                    rollout_probs = torch.exp(rollout_old_log_probs)
+                    actor_probs = torch.exp(actor_old_log_probs)
+                    rollout_probs_diff = torch.abs(rollout_probs - actor_probs)
+                    rollout_probs_diff = torch.masked_select(rollout_probs_diff, response_mask.bool())
+                    rollout_probs_diff_max = torch.max(rollout_probs_diff)
+                    rollout_probs_diff_mean = torch.mean(rollout_probs_diff)
+                    rollout_probs_diff_std = torch.std(rollout_probs_diff)
+                    metrics.update(
+                        {
+                            "training/rollout_probs_diff_max": rollout_probs_diff_max.detach().item(),
+                            "training/rollout_probs_diff_mean": rollout_probs_diff_mean.detach().item(),
+                            "training/rollout_probs_diff_std": rollout_probs_diff_std.detach().item(),
+                        }
+                    )
+
 
         if self.use_reference_policy:
             # compute reference log_prob

From 5a85685bf472fab84d3719664cf456097426b588 Mon Sep 17 00:00:00 2001
From: wangshulin02 <953550366@qq.com>
Date: Fri, 5 Sep 2025 17:54:17 +0800
Subject: [PATCH 101/182] add exp  folder

---
 .../dapo_7b_math_fsdp2_colocate.sh            | 137 ++++++++++++++
 .../fsdp2_colocate/runtime_env.yaml           |   5 +
 ..._fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh | 174 ++++++++++++++++++
 .../fsdp2_fully-async_16-16/runtime_env.yaml  |   5 +
 ..._fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh | 174 ++++++++++++++++++
 .../fsdp2_fully-async_24-8/runtime_env.yaml   |   5 +
 ..._fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh | 174 ++++++++++++++++++
 .../fsdp2_fully-async_8-24/runtime_env.yaml   |   5 +
 8 files changed, 679 insertions(+)
 create mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_colocate/dapo_7b_math_fsdp2_colocate.sh
 create mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_colocate/runtime_env.yaml
 create mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_fully-async_16-16/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh
 create mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_fully-async_16-16/runtime_env.yaml
 create mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_fully-async_24-8/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh
 create mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_fully-async_24-8/runtime_env.yaml
 create mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_fully-async_8-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh
 create mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_fully-async_8-24/runtime_env.yaml

diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_colocate/dapo_7b_math_fsdp2_colocate.sh b/recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_colocate/dapo_7b_math_fsdp2_colocate.sh
new file mode 100644
index 00000000000..8b627fd6eed
--- /dev/null
+++ b/recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_colocate/dapo_7b_math_fsdp2_colocate.sh
@@ -0,0 +1,137 @@
+#!/usr/bin/env bash
+set -xeuo pipefail
+
+project_name='DAPO'
+exp_name='dapo_qwen2-7B-math_28k_fsdp2_colocate_32_mbs32_tfq20'
+
+adv_estimator=grpo
+
+use_kl_in_reward=False
+kl_coef=0.0
+use_kl_loss=False
+kl_loss_coef=0.0
+
+clip_ratio_low=0.2
+clip_ratio_high=0.28
+
+max_prompt_length=$((1024 * 2))
+max_response_length=$((1024 * 28))
+enable_overlong_buffer=True
+overlong_buffer_len=$((1024 * 4))
+overlong_penalty_factor=1.0
+
+loss_agg_mode="token-mean"
+
+train_prompt_bsz=512
+n_resp_per_prompt=16
+train_prompt_mini_bsz=32
+
+# Ray
+# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
+# WORKING_DIR=${WORKING_DIR:-"${PWD}"}
+# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
+NNODES=${NNODES:-4}
+NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
+# Paths
+RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
+# very important! please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface
+MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen2.5-Math-7B"}
+CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"}
+TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"}
+TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"}
+
+MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B
+CKPTS_DIR=./ckpts/${project_name}/${exp_name}
+TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet
+TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet
+# Algorithm
+temperature=1.0
+top_p=1.0
+top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
+val_top_p=0.7
+
+# Performance Related Parameter
+use_dynamic_bsz=True
+actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
+infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
+offload=True
+gen_tp=4
+sp_size=4
+fsdp_size=2
+
+# reference run wandb: https://wandb.ai/verl-org/DAPO%20Reproduction%20on%20verl/runs/ow47vvon?nw=nwusertongyuxuan361
+
+
+python -m verl.trainer.main_ppo \
+    data.train_files="${TRAIN_FILE}" \
+    data.val_files="${TEST_FILE}" \
+    data.prompt_key=prompt \
+    data.truncation='left' \
+    data.max_prompt_length=${max_prompt_length} \
+    data.max_response_length=${max_response_length} \
+    data.train_batch_size=${train_prompt_bsz} \
+    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
+    algorithm.adv_estimator=${adv_estimator} \
+    algorithm.use_kl_in_reward=${use_kl_in_reward} \
+    algorithm.kl_ctrl.kl_coef=${kl_coef} \
+    actor_rollout_ref.actor.strategy=fsdp2 \
+    critic.strategy=fsdp2 \
+    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
+    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
+    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
+    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
+    actor_rollout_ref.actor.clip_ratio_c=10.0 \
+    actor_rollout_ref.model.use_remove_padding=True \
+    +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \
+    actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
+    actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+    actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+    actor_rollout_ref.model.path="${MODEL_PATH}" \
+    actor_rollout_ref.model.enable_gradient_checkpointing=True \
+    actor_rollout_ref.actor.optim.lr=1e-6 \
+    actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
+    actor_rollout_ref.actor.optim.weight_decay=0.1 \
+    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
+    actor_rollout_ref.actor.fsdp_config.param_offload=${offload} \
+    actor_rollout_ref.actor.fsdp_config.optimizer_offload=${offload} \
+    actor_rollout_ref.actor.entropy_coeff=0 \
+    actor_rollout_ref.actor.grad_clip=1.0 \
+    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
+    actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \
+    actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
+    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
+    actor_rollout_ref.rollout.enable_chunked_prefill=True \
+    actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
+    actor_rollout_ref.rollout.temperature=${temperature} \
+    actor_rollout_ref.rollout.top_p=${top_p} \
+    actor_rollout_ref.rollout.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
+    actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
+    actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
+    actor_rollout_ref.rollout.val_kwargs.n=1 \
+    actor_rollout_ref.ref.fsdp_config.param_offload=${offload} \
+    actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \
+    actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \
+    reward_model.reward_manager=dapo \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
+    +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
+    trainer.logger=['console','tensorboard'] \
+    trainer.project_name="${project_name}" \
+    trainer.experiment_name="${exp_name}" \
+    trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
+    trainer.nnodes="${NNODES}" \
+    trainer.val_before_train=True \
+    trainer.test_freq=20 \
+    trainer.save_freq=-1 \
+    trainer.total_epochs=10 \
+    trainer.total_training_steps=400 \
+    trainer.default_local_dir="${CKPTS_DIR}" \
+    trainer.resume_mode=auto \
+    trainer.log_val_generations=10
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_colocate/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_colocate/runtime_env.yaml
new file mode 100644
index 00000000000..dcca08e67f7
--- /dev/null
+++ b/recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_colocate/runtime_env.yaml
@@ -0,0 +1,5 @@
+env_vars:
+  VLLM_USE_V1: "1"
+  TENSORBOARD_DIR: "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/tensorboard/dapo_7b_math_fsdp2_4_12"
+  NCCL_DEBUG: "INFO"
+  HYDRA_FULL_ERROR: "1"
\ No newline at end of file
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_fully-async_16-16/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh b/recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_fully-async_16-16/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh
new file mode 100644
index 00000000000..618497c0257
--- /dev/null
+++ b/recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_fully-async_16-16/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh
@@ -0,0 +1,174 @@
+#!/usr/bin/env bash
+set -xeuo pipefail
+# dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20
+project_name='DAPO'
+exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20'
+
+# Ray
+# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
+# WORKING_DIR=${WORKING_DIR:-"${PWD}"}
+# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
+# Paths
+RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
+# very important! please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface
+MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen2.5-Math-7B"}
+CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"}
+TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"}
+TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"}
+
+# MODEL_PATH=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/modelscope/hub/models/Qwen/Qwen2___5-Math-7B
+MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B
+CKPTS_DIR=./ckpts/${project_name}/${exp_name}
+# TRAIN_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/dapo-math-17k.parquet
+# TEST_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/aime-2024.parquet
+TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet
+TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet
+
+rollout_mode="async"
+rollout_name="vllm" # sglang or vllm
+if [ "$rollout_mode" = "async" ]; then
+    export VLLM_USE_V1=1
+    return_raw_chat="True"
+fi
+
+# Algorithm parameters
+adv_estimator=grpo
+
+use_kl_in_reward=False
+kl_coef=0.0
+use_kl_loss=False
+kl_loss_coef=0.0
+
+clip_ratio_low=0.2
+clip_ratio_high=0.28
+
+# Response length parameters
+max_prompt_length=$((1024 * 2))
+max_response_length=$((1024 *8))
+enable_overlong_buffer=True
+overlong_buffer_len=$((1024 * 4))
+overlong_penalty_factor=1.0
+
+# Training parameters
+loss_agg_mode="token-mean"
+
+# Algorithm
+temperature=1.0
+top_p=1.0
+top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
+val_top_p=0.7
+
+# Performance Related Parameter
+use_dynamic_bsz=True
+actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
+infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
+ref_offload=True
+actor_offload=False
+gen_tp=1
+sp_size=1
+fsdp_size=2
+
+# Fully async specific parameters
+NNODES=${NNODES:-2}
+NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
+
+n_gpus_rollout=6
+n_gpus_training=$((NGPUS_PER_NODE - n_gpus_rollout))
+
+train_prompt_bsz=0
+gen_prompt_bsz=1
+n_resp_per_prompt=16
+train_prompt_mini_bsz=32
+total_rollout_steps=$(((512*100)))
+test_freq=10
+staleness_threshold=1
+trigger_parameter_sync_step=64
+partial_rollout=True
+
+PYTHON_INTERPRETER="/home/hadoop-djst-algoplat/miniconda3/bin/python"
+if [ ! -x "$PYTHON_INTERPRETER" ]; then
+    PYTHON_INTERPRETER="python3"
+fi
+
+$PYTHON_INTERPRETER -m recipe.fully_async_policy.fully_async_main \
+    data.train_files="${TRAIN_FILE}" \
+    data.val_files="${TEST_FILE}" \
+    data.prompt_key=prompt \
+    data.truncation='left' \
+    data.max_prompt_length=${max_prompt_length} \
+    data.max_response_length=${max_response_length} \
+    data.train_batch_size=${train_prompt_bsz} \
+    data.gen_batch_size=${gen_prompt_bsz} \
+    data.return_raw_chat=${return_raw_chat} \
+    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
+    algorithm.adv_estimator=${adv_estimator} \
+    algorithm.use_kl_in_reward=${use_kl_in_reward} \
+    algorithm.kl_ctrl.kl_coef=${kl_coef} \
+    actor_rollout_ref.actor.strategy=fsdp2 \
+    critic.strategy=fsdp2 \
+    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
+    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
+    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
+    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
+    actor_rollout_ref.actor.clip_ratio_c=10.0 \
+    actor_rollout_ref.model.use_remove_padding=True \
+    actor_rollout_ref.hybrid_engine=False \
+    +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \
+    actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
+    actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+    actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+    actor_rollout_ref.model.path="${MODEL_PATH}" \
+    actor_rollout_ref.actor.optim.lr=1e-6 \
+    actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
+    actor_rollout_ref.actor.optim.weight_decay=0.1 \
+    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
+    actor_rollout_ref.actor.fsdp_config.param_offload=${actor_offload} \
+    actor_rollout_ref.actor.fsdp_config.optimizer_offload=${actor_offload} \
+    actor_rollout_ref.actor.entropy_coeff=0 \
+    actor_rollout_ref.actor.grad_clip=1.0 \
+    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
+    actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \
+    actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
+    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
+    actor_rollout_ref.rollout.enable_chunked_prefill=True \
+    actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
+    actor_rollout_ref.rollout.temperature=${temperature} \
+    actor_rollout_ref.rollout.top_p=${top_p} \
+    actor_rollout_ref.rollout.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
+    actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
+    actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
+    actor_rollout_ref.rollout.val_kwargs.n=1 \
+    actor_rollout_ref.ref.fsdp_config.param_offload=${ref_offload} \
+    actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \
+    actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \
+    actor_rollout_ref.rollout.name=${rollout_name} \
+    actor_rollout_ref.rollout.mode=${rollout_mode} \
+    actor_rollout_ref.rollout.calculate_log_probs=True \
+    reward_model.reward_manager=dapo \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
+    +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
+    trainer.logger=['console','tensorboard'] \
+    trainer.project_name="${project_name}" \
+    trainer.experiment_name="${exp_name}" \
+    trainer.val_before_train=True \
+    trainer.save_freq=-1 \
+    trainer.default_local_dir="${CKPTS_DIR}" \
+    trainer.resume_mode=auto \
+    trainer.nnodes="${NNODES}" \
+    trainer.n_gpus_per_node="${n_gpus_training}" \
+    rollout.nnodes="${NNODES}" \
+    rollout.n_gpus_per_node="${n_gpus_rollout}" \
+    rollout.total_rollout_steps="${total_rollout_steps}" \
+    rollout.total_epochs=10 \
+    rollout.test_freq="${test_freq}" \
+    async_training.staleness_threshold="${staleness_threshold}" \
+    async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \
+    async_training.partial_rollout="${partial_rollout}"
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_fully-async_16-16/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_fully-async_16-16/runtime_env.yaml
new file mode 100644
index 00000000000..dcca08e67f7
--- /dev/null
+++ b/recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_fully-async_16-16/runtime_env.yaml
@@ -0,0 +1,5 @@
+env_vars:
+  VLLM_USE_V1: "1"
+  TENSORBOARD_DIR: "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/tensorboard/dapo_7b_math_fsdp2_4_12"
+  NCCL_DEBUG: "INFO"
+  HYDRA_FULL_ERROR: "1"
\ No newline at end of file
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_fully-async_24-8/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh b/recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_fully-async_24-8/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh
new file mode 100644
index 00000000000..618497c0257
--- /dev/null
+++ b/recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_fully-async_24-8/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh
@@ -0,0 +1,174 @@
+#!/usr/bin/env bash
+set -xeuo pipefail
+# dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20
+project_name='DAPO'
+exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20'
+
+# Ray
+# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
+# WORKING_DIR=${WORKING_DIR:-"${PWD}"}
+# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
+# Paths
+RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
+# very important! please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface
+MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen2.5-Math-7B"}
+CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"}
+TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"}
+TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"}
+
+# MODEL_PATH=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/modelscope/hub/models/Qwen/Qwen2___5-Math-7B
+MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B
+CKPTS_DIR=./ckpts/${project_name}/${exp_name}
+# TRAIN_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/dapo-math-17k.parquet
+# TEST_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/aime-2024.parquet
+TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet
+TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet
+
+rollout_mode="async"
+rollout_name="vllm" # sglang or vllm
+if [ "$rollout_mode" = "async" ]; then
+    export VLLM_USE_V1=1
+    return_raw_chat="True"
+fi
+
+# Algorithm parameters
+adv_estimator=grpo
+
+use_kl_in_reward=False
+kl_coef=0.0
+use_kl_loss=False
+kl_loss_coef=0.0
+
+clip_ratio_low=0.2
+clip_ratio_high=0.28
+
+# Response length parameters
+max_prompt_length=$((1024 * 2))
+max_response_length=$((1024 *8))
+enable_overlong_buffer=True
+overlong_buffer_len=$((1024 * 4))
+overlong_penalty_factor=1.0
+
+# Training parameters
+loss_agg_mode="token-mean"
+
+# Algorithm
+temperature=1.0
+top_p=1.0
+top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
+val_top_p=0.7
+
+# Performance Related Parameter
+use_dynamic_bsz=True
+actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
+infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
+ref_offload=True
+actor_offload=False
+gen_tp=1
+sp_size=1
+fsdp_size=2
+
+# Fully async specific parameters
+NNODES=${NNODES:-2}
+NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
+
+n_gpus_rollout=6
+n_gpus_training=$((NGPUS_PER_NODE - n_gpus_rollout))
+
+train_prompt_bsz=0
+gen_prompt_bsz=1
+n_resp_per_prompt=16
+train_prompt_mini_bsz=32
+total_rollout_steps=$(((512*100)))
+test_freq=10
+staleness_threshold=1
+trigger_parameter_sync_step=64
+partial_rollout=True
+
+PYTHON_INTERPRETER="/home/hadoop-djst-algoplat/miniconda3/bin/python"
+if [ ! -x "$PYTHON_INTERPRETER" ]; then
+    PYTHON_INTERPRETER="python3"
+fi
+
+$PYTHON_INTERPRETER -m recipe.fully_async_policy.fully_async_main \
+    data.train_files="${TRAIN_FILE}" \
+    data.val_files="${TEST_FILE}" \
+    data.prompt_key=prompt \
+    data.truncation='left' \
+    data.max_prompt_length=${max_prompt_length} \
+    data.max_response_length=${max_response_length} \
+    data.train_batch_size=${train_prompt_bsz} \
+    data.gen_batch_size=${gen_prompt_bsz} \
+    data.return_raw_chat=${return_raw_chat} \
+    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
+    algorithm.adv_estimator=${adv_estimator} \
+    algorithm.use_kl_in_reward=${use_kl_in_reward} \
+    algorithm.kl_ctrl.kl_coef=${kl_coef} \
+    actor_rollout_ref.actor.strategy=fsdp2 \
+    critic.strategy=fsdp2 \
+    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
+    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
+    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
+    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
+    actor_rollout_ref.actor.clip_ratio_c=10.0 \
+    actor_rollout_ref.model.use_remove_padding=True \
+    actor_rollout_ref.hybrid_engine=False \
+    +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \
+    actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
+    actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+    actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+    actor_rollout_ref.model.path="${MODEL_PATH}" \
+    actor_rollout_ref.actor.optim.lr=1e-6 \
+    actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
+    actor_rollout_ref.actor.optim.weight_decay=0.1 \
+    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
+    actor_rollout_ref.actor.fsdp_config.param_offload=${actor_offload} \
+    actor_rollout_ref.actor.fsdp_config.optimizer_offload=${actor_offload} \
+    actor_rollout_ref.actor.entropy_coeff=0 \
+    actor_rollout_ref.actor.grad_clip=1.0 \
+    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
+    actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \
+    actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
+    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
+    actor_rollout_ref.rollout.enable_chunked_prefill=True \
+    actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
+    actor_rollout_ref.rollout.temperature=${temperature} \
+    actor_rollout_ref.rollout.top_p=${top_p} \
+    actor_rollout_ref.rollout.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
+    actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
+    actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
+    actor_rollout_ref.rollout.val_kwargs.n=1 \
+    actor_rollout_ref.ref.fsdp_config.param_offload=${ref_offload} \
+    actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \
+    actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \
+    actor_rollout_ref.rollout.name=${rollout_name} \
+    actor_rollout_ref.rollout.mode=${rollout_mode} \
+    actor_rollout_ref.rollout.calculate_log_probs=True \
+    reward_model.reward_manager=dapo \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
+    +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
+    trainer.logger=['console','tensorboard'] \
+    trainer.project_name="${project_name}" \
+    trainer.experiment_name="${exp_name}" \
+    trainer.val_before_train=True \
+    trainer.save_freq=-1 \
+    trainer.default_local_dir="${CKPTS_DIR}" \
+    trainer.resume_mode=auto \
+    trainer.nnodes="${NNODES}" \
+    trainer.n_gpus_per_node="${n_gpus_training}" \
+    rollout.nnodes="${NNODES}" \
+    rollout.n_gpus_per_node="${n_gpus_rollout}" \
+    rollout.total_rollout_steps="${total_rollout_steps}" \
+    rollout.total_epochs=10 \
+    rollout.test_freq="${test_freq}" \
+    async_training.staleness_threshold="${staleness_threshold}" \
+    async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \
+    async_training.partial_rollout="${partial_rollout}"
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_fully-async_24-8/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_fully-async_24-8/runtime_env.yaml
new file mode 100644
index 00000000000..dcca08e67f7
--- /dev/null
+++ b/recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_fully-async_24-8/runtime_env.yaml
@@ -0,0 +1,5 @@
+env_vars:
+  VLLM_USE_V1: "1"
+  TENSORBOARD_DIR: "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/tensorboard/dapo_7b_math_fsdp2_4_12"
+  NCCL_DEBUG: "INFO"
+  HYDRA_FULL_ERROR: "1"
\ No newline at end of file
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_fully-async_8-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh b/recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_fully-async_8-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh
new file mode 100644
index 00000000000..618497c0257
--- /dev/null
+++ b/recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_fully-async_8-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh
@@ -0,0 +1,174 @@
+#!/usr/bin/env bash
+set -xeuo pipefail
+# dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20
+project_name='DAPO'
+exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20'
+
+# Ray
+# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
+# WORKING_DIR=${WORKING_DIR:-"${PWD}"}
+# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
+# Paths
+RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
+# very important! please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface
+MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen2.5-Math-7B"}
+CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"}
+TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"}
+TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"}
+
+# MODEL_PATH=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/modelscope/hub/models/Qwen/Qwen2___5-Math-7B
+MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B
+CKPTS_DIR=./ckpts/${project_name}/${exp_name}
+# TRAIN_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/dapo-math-17k.parquet
+# TEST_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/aime-2024.parquet
+TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet
+TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet
+
+rollout_mode="async"
+rollout_name="vllm" # sglang or vllm
+if [ "$rollout_mode" = "async" ]; then
+    export VLLM_USE_V1=1
+    return_raw_chat="True"
+fi
+
+# Algorithm parameters
+adv_estimator=grpo
+
+use_kl_in_reward=False
+kl_coef=0.0
+use_kl_loss=False
+kl_loss_coef=0.0
+
+clip_ratio_low=0.2
+clip_ratio_high=0.28
+
+# Response length parameters
+max_prompt_length=$((1024 * 2))
+max_response_length=$((1024 *8))
+enable_overlong_buffer=True
+overlong_buffer_len=$((1024 * 4))
+overlong_penalty_factor=1.0
+
+# Training parameters
+loss_agg_mode="token-mean"
+
+# Algorithm
+temperature=1.0
+top_p=1.0
+top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
+val_top_p=0.7
+
+# Performance Related Parameter
+use_dynamic_bsz=True
+actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
+infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
+ref_offload=True
+actor_offload=False
+gen_tp=1
+sp_size=1
+fsdp_size=2
+
+# Fully async specific parameters
+NNODES=${NNODES:-2}
+NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
+
+n_gpus_rollout=6
+n_gpus_training=$((NGPUS_PER_NODE - n_gpus_rollout))
+
+train_prompt_bsz=0
+gen_prompt_bsz=1
+n_resp_per_prompt=16
+train_prompt_mini_bsz=32
+total_rollout_steps=$(((512*100)))
+test_freq=10
+staleness_threshold=1
+trigger_parameter_sync_step=64
+partial_rollout=True
+
+PYTHON_INTERPRETER="/home/hadoop-djst-algoplat/miniconda3/bin/python"
+if [ ! -x "$PYTHON_INTERPRETER" ]; then
+    PYTHON_INTERPRETER="python3"
+fi
+
+$PYTHON_INTERPRETER -m recipe.fully_async_policy.fully_async_main \
+    data.train_files="${TRAIN_FILE}" \
+    data.val_files="${TEST_FILE}" \
+    data.prompt_key=prompt \
+    data.truncation='left' \
+    data.max_prompt_length=${max_prompt_length} \
+    data.max_response_length=${max_response_length} \
+    data.train_batch_size=${train_prompt_bsz} \
+    data.gen_batch_size=${gen_prompt_bsz} \
+    data.return_raw_chat=${return_raw_chat} \
+    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
+    algorithm.adv_estimator=${adv_estimator} \
+    algorithm.use_kl_in_reward=${use_kl_in_reward} \
+    algorithm.kl_ctrl.kl_coef=${kl_coef} \
+    actor_rollout_ref.actor.strategy=fsdp2 \
+    critic.strategy=fsdp2 \
+    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
+    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
+    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
+    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
+    actor_rollout_ref.actor.clip_ratio_c=10.0 \
+    actor_rollout_ref.model.use_remove_padding=True \
+    actor_rollout_ref.hybrid_engine=False \
+    +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \
+    actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
+    actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+    actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+    actor_rollout_ref.model.path="${MODEL_PATH}" \
+    actor_rollout_ref.actor.optim.lr=1e-6 \
+    actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
+    actor_rollout_ref.actor.optim.weight_decay=0.1 \
+    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
+    actor_rollout_ref.actor.fsdp_config.param_offload=${actor_offload} \
+    actor_rollout_ref.actor.fsdp_config.optimizer_offload=${actor_offload} \
+    actor_rollout_ref.actor.entropy_coeff=0 \
+    actor_rollout_ref.actor.grad_clip=1.0 \
+    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
+    actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \
+    actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
+    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
+    actor_rollout_ref.rollout.enable_chunked_prefill=True \
+    actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
+    actor_rollout_ref.rollout.temperature=${temperature} \
+    actor_rollout_ref.rollout.top_p=${top_p} \
+    actor_rollout_ref.rollout.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
+    actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
+    actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
+    actor_rollout_ref.rollout.val_kwargs.n=1 \
+    actor_rollout_ref.ref.fsdp_config.param_offload=${ref_offload} \
+    actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \
+    actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \
+    actor_rollout_ref.rollout.name=${rollout_name} \
+    actor_rollout_ref.rollout.mode=${rollout_mode} \
+    actor_rollout_ref.rollout.calculate_log_probs=True \
+    reward_model.reward_manager=dapo \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
+    +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
+    trainer.logger=['console','tensorboard'] \
+    trainer.project_name="${project_name}" \
+    trainer.experiment_name="${exp_name}" \
+    trainer.val_before_train=True \
+    trainer.save_freq=-1 \
+    trainer.default_local_dir="${CKPTS_DIR}" \
+    trainer.resume_mode=auto \
+    trainer.nnodes="${NNODES}" \
+    trainer.n_gpus_per_node="${n_gpus_training}" \
+    rollout.nnodes="${NNODES}" \
+    rollout.n_gpus_per_node="${n_gpus_rollout}" \
+    rollout.total_rollout_steps="${total_rollout_steps}" \
+    rollout.total_epochs=10 \
+    rollout.test_freq="${test_freq}" \
+    async_training.staleness_threshold="${staleness_threshold}" \
+    async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \
+    async_training.partial_rollout="${partial_rollout}"
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_fully-async_8-24/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_fully-async_8-24/runtime_env.yaml
new file mode 100644
index 00000000000..dcca08e67f7
--- /dev/null
+++ b/recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_fully-async_8-24/runtime_env.yaml
@@ -0,0 +1,5 @@
+env_vars:
+  VLLM_USE_V1: "1"
+  TENSORBOARD_DIR: "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/tensorboard/dapo_7b_math_fsdp2_4_12"
+  NCCL_DEBUG: "INFO"
+  HYDRA_FULL_ERROR: "1"
\ No newline at end of file

From 9199f56726ae78266bf536d728495c91bb46ff44 Mon Sep 17 00:00:00 2001
From: wangshulin02 <953550366@qq.com>
Date: Fri, 5 Sep 2025 21:16:01 +0800
Subject: [PATCH 102/182] exp shell files qwen3-32B_32 megatron colocate

---
 .../dapo_7b_math_fsdp2_colocate.sh            |   6 +-
 .../fsdp2_colocate/runtime_env.yaml           |   3 +
 .../dapo_7b_math_fsdp2_colocate.sh            | 133 +++++++++++++
 .../fsdp2_colocate_64/runtime_env.yaml        |   3 +
 ..._fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh |   0
 .../fsdp2_fully-async_16-16}/runtime_env.yaml |   0
 ..._fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh |   0
 .../fsdp2_fully-async_24-8}/runtime_env.yaml  |   0
 ..._fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh |   0
 .../fsdp2_fully-async_8-24}/runtime_env.yaml  |   0
 .../dapo_7b_math_megatron_colocate.sh         | 135 ++++++++++++++
 .../megatron_colocate/runtime_env.yaml        |   3 +
 .../dapo_7b_math_fsdp2_colocate.sh            | 133 +++++++++++++
 .../fsdp2_colocate/runtime_env.yaml           |   3 +
 ..._fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh | 174 ++++++++++++++++++
 .../fsdp2_fully-async_16-16}/runtime_env.yaml |   0
 ..._fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh | 174 ++++++++++++++++++
 .../fsdp2_fully-async_24-8/runtime_env.yaml   |   5 +
 ..._fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh | 174 ++++++++++++++++++
 .../fsdp2_fully-async_8-24/runtime_env.yaml   |   5 +
 .../dapo_7b_math_megatron_colocate.sh         | 135 ++++++++++++++
 .../megatron_colocate/runtime_env.yaml        |   3 +
 .../early_megatron_colocate.sh                | 154 ++++++++++++++++
 .../megatron_colocate/runtime_env.yaml        |   5 +
 verl/trainer/ppo/ray_trainer.py               |   6 +-
 25 files changed, 1247 insertions(+), 7 deletions(-)
 rename recipe/fully_async_policy/exp/{qwen2-7B-math => qwen2-7B-math_32}/fsdp2_colocate/dapo_7b_math_fsdp2_colocate.sh (94%)
 create mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_colocate/runtime_env.yaml
 create mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_colocate_64/dapo_7b_math_fsdp2_colocate.sh
 create mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_colocate_64/runtime_env.yaml
 rename recipe/fully_async_policy/exp/{qwen2-7B-math => qwen2-7B-math_32}/fsdp2_fully-async_16-16/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh (100%)
 rename recipe/fully_async_policy/exp/{qwen2-7B-math/fsdp2_colocate => qwen2-7B-math_32/fsdp2_fully-async_16-16}/runtime_env.yaml (100%)
 rename recipe/fully_async_policy/exp/{qwen2-7B-math => qwen2-7B-math_32}/fsdp2_fully-async_24-8/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh (100%)
 rename recipe/fully_async_policy/exp/{qwen2-7B-math/fsdp2_fully-async_16-16 => qwen2-7B-math_32/fsdp2_fully-async_24-8}/runtime_env.yaml (100%)
 rename recipe/fully_async_policy/exp/{qwen2-7B-math => qwen2-7B-math_32}/fsdp2_fully-async_8-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh (100%)
 rename recipe/fully_async_policy/exp/{qwen2-7B-math/fsdp2_fully-async_24-8 => qwen2-7B-math_32/fsdp2_fully-async_8-24}/runtime_env.yaml (100%)
 create mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_32/megatron_colocate/dapo_7b_math_megatron_colocate.sh
 create mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_32/megatron_colocate/runtime_env.yaml
 create mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_colocate/dapo_7b_math_fsdp2_colocate.sh
 create mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_colocate/runtime_env.yaml
 create mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_16-16/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh
 rename recipe/fully_async_policy/exp/{qwen2-7B-math/fsdp2_fully-async_8-24 => qwen2-7B-math_64/fsdp2_fully-async_16-16}/runtime_env.yaml (100%)
 create mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_24-8/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh
 create mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_24-8/runtime_env.yaml
 create mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_8-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh
 create mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_8-24/runtime_env.yaml
 create mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_64/megatron_colocate/dapo_7b_math_megatron_colocate.sh
 create mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_64/megatron_colocate/runtime_env.yaml
 create mode 100644 recipe/fully_async_policy/exp/qwen3-32B_32/megatron_colocate/early_megatron_colocate.sh
 create mode 100644 recipe/fully_async_policy/exp/qwen3-32B_32/megatron_colocate/runtime_env.yaml

diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_colocate/dapo_7b_math_fsdp2_colocate.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_colocate/dapo_7b_math_fsdp2_colocate.sh
similarity index 94%
rename from recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_colocate/dapo_7b_math_fsdp2_colocate.sh
rename to recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_colocate/dapo_7b_math_fsdp2_colocate.sh
index 8b627fd6eed..8d42dca04ca 100644
--- a/recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_colocate/dapo_7b_math_fsdp2_colocate.sh
+++ b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_colocate/dapo_7b_math_fsdp2_colocate.sh
@@ -2,7 +2,7 @@
 set -xeuo pipefail
 
 project_name='DAPO'
-exp_name='dapo_qwen2-7B-math_28k_fsdp2_colocate_32_mbs32_tfq20'
+exp_name='dapo_qwen2-7B-math_28k_fsdp2_colocate_32_mbs32'
 
 adv_estimator=grpo
 
@@ -35,10 +35,6 @@ NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
 # Paths
 RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
 # very important! please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface
-MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen2.5-Math-7B"}
-CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"}
-TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"}
-TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"}
 
 MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B
 CKPTS_DIR=./ckpts/${project_name}/${exp_name}
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_colocate/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_colocate/runtime_env.yaml
new file mode 100644
index 00000000000..39c5a3593e8
--- /dev/null
+++ b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_colocate/runtime_env.yaml
@@ -0,0 +1,3 @@
+env_vars:
+  TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-7B-math/dapo_qwen2-7B-math_28k_fsdp2_colocate_32_mbs32"
+  HYDRA_FULL_ERROR: "1"
\ No newline at end of file
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_colocate_64/dapo_7b_math_fsdp2_colocate.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_colocate_64/dapo_7b_math_fsdp2_colocate.sh
new file mode 100644
index 00000000000..8d42dca04ca
--- /dev/null
+++ b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_colocate_64/dapo_7b_math_fsdp2_colocate.sh
@@ -0,0 +1,133 @@
+#!/usr/bin/env bash
+set -xeuo pipefail
+
+project_name='DAPO'
+exp_name='dapo_qwen2-7B-math_28k_fsdp2_colocate_32_mbs32'
+
+adv_estimator=grpo
+
+use_kl_in_reward=False
+kl_coef=0.0
+use_kl_loss=False
+kl_loss_coef=0.0
+
+clip_ratio_low=0.2
+clip_ratio_high=0.28
+
+max_prompt_length=$((1024 * 2))
+max_response_length=$((1024 * 28))
+enable_overlong_buffer=True
+overlong_buffer_len=$((1024 * 4))
+overlong_penalty_factor=1.0
+
+loss_agg_mode="token-mean"
+
+train_prompt_bsz=512
+n_resp_per_prompt=16
+train_prompt_mini_bsz=32
+
+# Ray
+# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
+# WORKING_DIR=${WORKING_DIR:-"${PWD}"}
+# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
+NNODES=${NNODES:-4}
+NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
+# Paths
+RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
+# very important! please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface
+
+MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B
+CKPTS_DIR=./ckpts/${project_name}/${exp_name}
+TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet
+TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet
+# Algorithm
+temperature=1.0
+top_p=1.0
+top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
+val_top_p=0.7
+
+# Performance Related Parameter
+use_dynamic_bsz=True
+actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
+infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
+offload=True
+gen_tp=4
+sp_size=4
+fsdp_size=2
+
+# reference run wandb: https://wandb.ai/verl-org/DAPO%20Reproduction%20on%20verl/runs/ow47vvon?nw=nwusertongyuxuan361
+
+
+python -m verl.trainer.main_ppo \
+    data.train_files="${TRAIN_FILE}" \
+    data.val_files="${TEST_FILE}" \
+    data.prompt_key=prompt \
+    data.truncation='left' \
+    data.max_prompt_length=${max_prompt_length} \
+    data.max_response_length=${max_response_length} \
+    data.train_batch_size=${train_prompt_bsz} \
+    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
+    algorithm.adv_estimator=${adv_estimator} \
+    algorithm.use_kl_in_reward=${use_kl_in_reward} \
+    algorithm.kl_ctrl.kl_coef=${kl_coef} \
+    actor_rollout_ref.actor.strategy=fsdp2 \
+    critic.strategy=fsdp2 \
+    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
+    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
+    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
+    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
+    actor_rollout_ref.actor.clip_ratio_c=10.0 \
+    actor_rollout_ref.model.use_remove_padding=True \
+    +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \
+    actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
+    actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+    actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+    actor_rollout_ref.model.path="${MODEL_PATH}" \
+    actor_rollout_ref.model.enable_gradient_checkpointing=True \
+    actor_rollout_ref.actor.optim.lr=1e-6 \
+    actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
+    actor_rollout_ref.actor.optim.weight_decay=0.1 \
+    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
+    actor_rollout_ref.actor.fsdp_config.param_offload=${offload} \
+    actor_rollout_ref.actor.fsdp_config.optimizer_offload=${offload} \
+    actor_rollout_ref.actor.entropy_coeff=0 \
+    actor_rollout_ref.actor.grad_clip=1.0 \
+    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
+    actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \
+    actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
+    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
+    actor_rollout_ref.rollout.enable_chunked_prefill=True \
+    actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
+    actor_rollout_ref.rollout.temperature=${temperature} \
+    actor_rollout_ref.rollout.top_p=${top_p} \
+    actor_rollout_ref.rollout.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
+    actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
+    actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
+    actor_rollout_ref.rollout.val_kwargs.n=1 \
+    actor_rollout_ref.ref.fsdp_config.param_offload=${offload} \
+    actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \
+    actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \
+    reward_model.reward_manager=dapo \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
+    +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
+    trainer.logger=['console','tensorboard'] \
+    trainer.project_name="${project_name}" \
+    trainer.experiment_name="${exp_name}" \
+    trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
+    trainer.nnodes="${NNODES}" \
+    trainer.val_before_train=True \
+    trainer.test_freq=20 \
+    trainer.save_freq=-1 \
+    trainer.total_epochs=10 \
+    trainer.total_training_steps=400 \
+    trainer.default_local_dir="${CKPTS_DIR}" \
+    trainer.resume_mode=auto \
+    trainer.log_val_generations=10
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_colocate_64/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_colocate_64/runtime_env.yaml
new file mode 100644
index 00000000000..39c5a3593e8
--- /dev/null
+++ b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_colocate_64/runtime_env.yaml
@@ -0,0 +1,3 @@
+env_vars:
+  TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-7B-math/dapo_qwen2-7B-math_28k_fsdp2_colocate_32_mbs32"
+  HYDRA_FULL_ERROR: "1"
\ No newline at end of file
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_fully-async_16-16/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh
similarity index 100%
rename from recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_fully-async_16-16/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh
rename to recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_colocate/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/runtime_env.yaml
similarity index 100%
rename from recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_colocate/runtime_env.yaml
rename to recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/runtime_env.yaml
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_fully-async_24-8/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_24-8/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh
similarity index 100%
rename from recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_fully-async_24-8/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh
rename to recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_24-8/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_fully-async_16-16/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_24-8/runtime_env.yaml
similarity index 100%
rename from recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_fully-async_16-16/runtime_env.yaml
rename to recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_24-8/runtime_env.yaml
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_fully-async_8-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_8-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh
similarity index 100%
rename from recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_fully-async_8-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh
rename to recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_8-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_fully-async_24-8/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_8-24/runtime_env.yaml
similarity index 100%
rename from recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_fully-async_24-8/runtime_env.yaml
rename to recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_8-24/runtime_env.yaml
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_32/megatron_colocate/dapo_7b_math_megatron_colocate.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_32/megatron_colocate/dapo_7b_math_megatron_colocate.sh
new file mode 100644
index 00000000000..8bf1af32da8
--- /dev/null
+++ b/recipe/fully_async_policy/exp/qwen2-7B-math_32/megatron_colocate/dapo_7b_math_megatron_colocate.sh
@@ -0,0 +1,135 @@
+#!/usr/bin/env bash
+set -xeuo pipefail
+
+project_name='DAPO'
+exp_name='dapo_qwen2-7B-math_28k_megatron_colocate_32_mbs32'
+
+adv_estimator=grpo
+
+use_kl_in_reward=False
+kl_coef=0.0
+use_kl_loss=False
+kl_loss_coef=0.0
+
+clip_ratio_low=0.2
+clip_ratio_high=0.28
+
+max_prompt_length=$((1024 * 2))
+max_response_length=$((1024 * 28))
+enable_overlong_buffer=True
+overlong_buffer_len=$((1024 * 4))
+overlong_penalty_factor=1.0
+
+loss_agg_mode="token-mean"
+
+train_prompt_bsz=512
+n_resp_per_prompt=16
+train_prompt_mini_bsz=32
+
+# Ray
+# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
+# WORKING_DIR=${WORKING_DIR:-"${PWD}"}
+# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
+NNODES=${NNODES:-4}
+NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
+# Paths
+MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B
+CKPTS_DIR=./ckpts/${project_name}/${exp_name}
+TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet
+TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet
+
+# Algorithm
+temperature=1.0
+top_p=1.0
+top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
+val_top_p=0.7
+
+# Performance Related Parameter
+use_dynamic_bsz=True
+actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
+infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
+offload=True
+gen_tp=4
+train_tp=2
+train_pp=1
+
+# TODO: support dynamic_bsz for megatron
+# actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
+# actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+# actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+# actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
+# actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+# actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+
+python3 -m verl.trainer.main_ppo \
+    --config-path=config \
+    --config-name='ppo_megatron_trainer.yaml' \
+    data.train_files="${TRAIN_FILE}" \
+    data.val_files="${TEST_FILE}" \
+    data.prompt_key=prompt \
+    data.truncation='left' \
+    data.max_prompt_length=${max_prompt_length} \
+    data.max_response_length=${max_response_length} \
+    data.train_batch_size=${train_prompt_bsz} \
+    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
+    algorithm.adv_estimator=${adv_estimator} \
+    algorithm.use_kl_in_reward=${use_kl_in_reward} \
+    algorithm.kl_ctrl.kl_coef=${kl_coef} \
+    actor_rollout_ref.actor.strategy=megatron \
+    critic.strategy=megatron \
+    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
+    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
+    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
+    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
+    actor_rollout_ref.actor.clip_ratio_c=10.0 \
+    +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \
+    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \
+    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
+    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
+    actor_rollout_ref.model.path="${MODEL_PATH}" \
+    actor_rollout_ref.actor.optim.lr=1e-6 \
+    actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
+    actor_rollout_ref.actor.optim.weight_decay=0.1 \
+    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
+    actor_rollout_ref.actor.megatron.param_offload=${offload} \
+    actor_rollout_ref.actor.megatron.optimizer_offload=${offload} \
+    actor_rollout_ref.actor.megatron.grad_offload=${offload} \
+    actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${train_pp} \
+    actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${train_tp} \
+    actor_rollout_ref.actor.entropy_coeff=0 \
+    actor_rollout_ref.actor.optim.clip_grad=1.0 \
+    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
+    actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
+    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
+    actor_rollout_ref.rollout.enable_chunked_prefill=True \
+    actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
+    actor_rollout_ref.rollout.temperature=${temperature} \
+    actor_rollout_ref.rollout.top_p=${top_p} \
+    actor_rollout_ref.rollout.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
+    actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
+    actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
+    actor_rollout_ref.rollout.val_kwargs.n=1 \
+    actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=${train_pp} \
+    actor_rollout_ref.ref.megatron.tensor_model_parallel_size=${train_tp} \
+    actor_rollout_ref.ref.megatron.param_offload=${offload} \
+    reward_model.reward_manager=dapo \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
+    +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
+    trainer.logger=['console','tensorboard'] \
+    trainer.project_name="${project_name}" \
+    trainer.experiment_name="${exp_name}" \
+    trainer.n_gpus_per_node=8 \
+    trainer.nnodes="${NNODES}" \
+    trainer.val_before_train=True \
+    trainer.test_freq=20 \
+    trainer.save_freq=-1 \
+    trainer.total_epochs=10 \
+    trainer.total_training_steps=400 \
+    trainer.default_local_dir="${CKPTS_DIR}" \
+    trainer.resume_mode=auto \
+    trainer.log_val_generations=10
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_32/megatron_colocate/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_32/megatron_colocate/runtime_env.yaml
new file mode 100644
index 00000000000..3a35b4a52ad
--- /dev/null
+++ b/recipe/fully_async_policy/exp/qwen2-7B-math_32/megatron_colocate/runtime_env.yaml
@@ -0,0 +1,3 @@
+env_vars:
+  TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-7B-math/dapo_qwen2-7B-math_28k_megatron_colocate_32_mbs32"
+  HYDRA_FULL_ERROR: "1"
\ No newline at end of file
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_colocate/dapo_7b_math_fsdp2_colocate.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_colocate/dapo_7b_math_fsdp2_colocate.sh
new file mode 100644
index 00000000000..e6ab551869d
--- /dev/null
+++ b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_colocate/dapo_7b_math_fsdp2_colocate.sh
@@ -0,0 +1,133 @@
+#!/usr/bin/env bash
+set -xeuo pipefail
+
+project_name='DAPO'
+exp_name='dapo_qwen2-7B-math_28k_fsdp2_colocate_64_mbs32'
+
+adv_estimator=grpo
+
+use_kl_in_reward=False
+kl_coef=0.0
+use_kl_loss=False
+kl_loss_coef=0.0
+
+clip_ratio_low=0.2
+clip_ratio_high=0.28
+
+max_prompt_length=$((1024 * 2))
+max_response_length=$((1024 * 28))
+enable_overlong_buffer=True
+overlong_buffer_len=$((1024 * 4))
+overlong_penalty_factor=1.0
+
+loss_agg_mode="token-mean"
+
+train_prompt_bsz=512
+n_resp_per_prompt=16
+train_prompt_mini_bsz=32
+
+# Ray
+# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
+# WORKING_DIR=${WORKING_DIR:-"${PWD}"}
+# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
+NNODES=${NNODES:-8}
+NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
+# Paths
+RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
+# very important! please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface
+
+MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B
+CKPTS_DIR=./ckpts/${project_name}/${exp_name}
+TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet
+TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet
+# Algorithm
+temperature=1.0
+top_p=1.0
+top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
+val_top_p=0.7
+
+# Performance Related Parameter
+use_dynamic_bsz=True
+actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
+infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
+offload=True
+gen_tp=4
+sp_size=4
+fsdp_size=2
+
+# reference run wandb: https://wandb.ai/verl-org/DAPO%20Reproduction%20on%20verl/runs/ow47vvon?nw=nwusertongyuxuan361
+
+
+python -m verl.trainer.main_ppo \
+    data.train_files="${TRAIN_FILE}" \
+    data.val_files="${TEST_FILE}" \
+    data.prompt_key=prompt \
+    data.truncation='left' \
+    data.max_prompt_length=${max_prompt_length} \
+    data.max_response_length=${max_response_length} \
+    data.train_batch_size=${train_prompt_bsz} \
+    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
+    algorithm.adv_estimator=${adv_estimator} \
+    algorithm.use_kl_in_reward=${use_kl_in_reward} \
+    algorithm.kl_ctrl.kl_coef=${kl_coef} \
+    actor_rollout_ref.actor.strategy=fsdp2 \
+    critic.strategy=fsdp2 \
+    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
+    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
+    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
+    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
+    actor_rollout_ref.actor.clip_ratio_c=10.0 \
+    actor_rollout_ref.model.use_remove_padding=True \
+    +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \
+    actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
+    actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+    actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+    actor_rollout_ref.model.path="${MODEL_PATH}" \
+    actor_rollout_ref.model.enable_gradient_checkpointing=True \
+    actor_rollout_ref.actor.optim.lr=1e-6 \
+    actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
+    actor_rollout_ref.actor.optim.weight_decay=0.1 \
+    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
+    actor_rollout_ref.actor.fsdp_config.param_offload=${offload} \
+    actor_rollout_ref.actor.fsdp_config.optimizer_offload=${offload} \
+    actor_rollout_ref.actor.entropy_coeff=0 \
+    actor_rollout_ref.actor.grad_clip=1.0 \
+    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
+    actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \
+    actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
+    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
+    actor_rollout_ref.rollout.enable_chunked_prefill=True \
+    actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
+    actor_rollout_ref.rollout.temperature=${temperature} \
+    actor_rollout_ref.rollout.top_p=${top_p} \
+    actor_rollout_ref.rollout.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
+    actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
+    actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
+    actor_rollout_ref.rollout.val_kwargs.n=1 \
+    actor_rollout_ref.ref.fsdp_config.param_offload=${offload} \
+    actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \
+    actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \
+    reward_model.reward_manager=dapo \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
+    +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
+    trainer.logger=['console','tensorboard'] \
+    trainer.project_name="${project_name}" \
+    trainer.experiment_name="${exp_name}" \
+    trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
+    trainer.nnodes="${NNODES}" \
+    trainer.val_before_train=True \
+    trainer.test_freq=20 \
+    trainer.save_freq=-1 \
+    trainer.total_epochs=10 \
+    trainer.total_training_steps=400 \
+    trainer.default_local_dir="${CKPTS_DIR}" \
+    trainer.resume_mode=auto \
+    trainer.log_val_generations=10
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_colocate/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_colocate/runtime_env.yaml
new file mode 100644
index 00000000000..514ab9a73f0
--- /dev/null
+++ b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_colocate/runtime_env.yaml
@@ -0,0 +1,3 @@
+env_vars:
+  TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-7B-math-64/dapo_qwen2-7B-math_28k_fsdp2_colocate_64_mbs32"
+  HYDRA_FULL_ERROR: "1"
\ No newline at end of file
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_16-16/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_16-16/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh
new file mode 100644
index 00000000000..618497c0257
--- /dev/null
+++ b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_16-16/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh
@@ -0,0 +1,174 @@
+#!/usr/bin/env bash
+set -xeuo pipefail
+# dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20
+project_name='DAPO'
+exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20'
+
+# Ray
+# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
+# WORKING_DIR=${WORKING_DIR:-"${PWD}"}
+# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
+# Paths
+RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
+# very important! please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface
+MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen2.5-Math-7B"}
+CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"}
+TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"}
+TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"}
+
+# MODEL_PATH=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/modelscope/hub/models/Qwen/Qwen2___5-Math-7B
+MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B
+CKPTS_DIR=./ckpts/${project_name}/${exp_name}
+# TRAIN_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/dapo-math-17k.parquet
+# TEST_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/aime-2024.parquet
+TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet
+TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet
+
+rollout_mode="async"
+rollout_name="vllm" # sglang or vllm
+if [ "$rollout_mode" = "async" ]; then
+    export VLLM_USE_V1=1
+    return_raw_chat="True"
+fi
+
+# Algorithm parameters
+adv_estimator=grpo
+
+use_kl_in_reward=False
+kl_coef=0.0
+use_kl_loss=False
+kl_loss_coef=0.0
+
+clip_ratio_low=0.2
+clip_ratio_high=0.28
+
+# Response length parameters
+max_prompt_length=$((1024 * 2))
+max_response_length=$((1024 *8))
+enable_overlong_buffer=True
+overlong_buffer_len=$((1024 * 4))
+overlong_penalty_factor=1.0
+
+# Training parameters
+loss_agg_mode="token-mean"
+
+# Algorithm
+temperature=1.0
+top_p=1.0
+top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
+val_top_p=0.7
+
+# Performance Related Parameter
+use_dynamic_bsz=True
+actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
+infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
+ref_offload=True
+actor_offload=False
+gen_tp=1
+sp_size=1
+fsdp_size=2
+
+# Fully async specific parameters
+NNODES=${NNODES:-2}
+NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
+
+n_gpus_rollout=6
+n_gpus_training=$((NGPUS_PER_NODE - n_gpus_rollout))
+
+train_prompt_bsz=0
+gen_prompt_bsz=1
+n_resp_per_prompt=16
+train_prompt_mini_bsz=32
+total_rollout_steps=$(((512*100)))
+test_freq=10
+staleness_threshold=1
+trigger_parameter_sync_step=64
+partial_rollout=True
+
+PYTHON_INTERPRETER="/home/hadoop-djst-algoplat/miniconda3/bin/python"
+if [ ! -x "$PYTHON_INTERPRETER" ]; then
+    PYTHON_INTERPRETER="python3"
+fi
+
+$PYTHON_INTERPRETER -m recipe.fully_async_policy.fully_async_main \
+    data.train_files="${TRAIN_FILE}" \
+    data.val_files="${TEST_FILE}" \
+    data.prompt_key=prompt \
+    data.truncation='left' \
+    data.max_prompt_length=${max_prompt_length} \
+    data.max_response_length=${max_response_length} \
+    data.train_batch_size=${train_prompt_bsz} \
+    data.gen_batch_size=${gen_prompt_bsz} \
+    data.return_raw_chat=${return_raw_chat} \
+    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
+    algorithm.adv_estimator=${adv_estimator} \
+    algorithm.use_kl_in_reward=${use_kl_in_reward} \
+    algorithm.kl_ctrl.kl_coef=${kl_coef} \
+    actor_rollout_ref.actor.strategy=fsdp2 \
+    critic.strategy=fsdp2 \
+    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
+    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
+    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
+    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
+    actor_rollout_ref.actor.clip_ratio_c=10.0 \
+    actor_rollout_ref.model.use_remove_padding=True \
+    actor_rollout_ref.hybrid_engine=False \
+    +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \
+    actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
+    actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+    actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+    actor_rollout_ref.model.path="${MODEL_PATH}" \
+    actor_rollout_ref.actor.optim.lr=1e-6 \
+    actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
+    actor_rollout_ref.actor.optim.weight_decay=0.1 \
+    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
+    actor_rollout_ref.actor.fsdp_config.param_offload=${actor_offload} \
+    actor_rollout_ref.actor.fsdp_config.optimizer_offload=${actor_offload} \
+    actor_rollout_ref.actor.entropy_coeff=0 \
+    actor_rollout_ref.actor.grad_clip=1.0 \
+    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
+    actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \
+    actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
+    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
+    actor_rollout_ref.rollout.enable_chunked_prefill=True \
+    actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
+    actor_rollout_ref.rollout.temperature=${temperature} \
+    actor_rollout_ref.rollout.top_p=${top_p} \
+    actor_rollout_ref.rollout.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
+    actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
+    actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
+    actor_rollout_ref.rollout.val_kwargs.n=1 \
+    actor_rollout_ref.ref.fsdp_config.param_offload=${ref_offload} \
+    actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \
+    actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \
+    actor_rollout_ref.rollout.name=${rollout_name} \
+    actor_rollout_ref.rollout.mode=${rollout_mode} \
+    actor_rollout_ref.rollout.calculate_log_probs=True \
+    reward_model.reward_manager=dapo \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
+    +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
+    trainer.logger=['console','tensorboard'] \
+    trainer.project_name="${project_name}" \
+    trainer.experiment_name="${exp_name}" \
+    trainer.val_before_train=True \
+    trainer.save_freq=-1 \
+    trainer.default_local_dir="${CKPTS_DIR}" \
+    trainer.resume_mode=auto \
+    trainer.nnodes="${NNODES}" \
+    trainer.n_gpus_per_node="${n_gpus_training}" \
+    rollout.nnodes="${NNODES}" \
+    rollout.n_gpus_per_node="${n_gpus_rollout}" \
+    rollout.total_rollout_steps="${total_rollout_steps}" \
+    rollout.total_epochs=10 \
+    rollout.test_freq="${test_freq}" \
+    async_training.staleness_threshold="${staleness_threshold}" \
+    async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \
+    async_training.partial_rollout="${partial_rollout}"
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_fully-async_8-24/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_16-16/runtime_env.yaml
similarity index 100%
rename from recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_fully-async_8-24/runtime_env.yaml
rename to recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_16-16/runtime_env.yaml
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_24-8/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_24-8/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh
new file mode 100644
index 00000000000..618497c0257
--- /dev/null
+++ b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_24-8/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh
@@ -0,0 +1,174 @@
+#!/usr/bin/env bash
+set -xeuo pipefail
+# dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20
+project_name='DAPO'
+exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20'
+
+# Ray
+# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
+# WORKING_DIR=${WORKING_DIR:-"${PWD}"}
+# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
+# Paths
+RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
+# very important! please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface
+MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen2.5-Math-7B"}
+CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"}
+TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"}
+TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"}
+
+# MODEL_PATH=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/modelscope/hub/models/Qwen/Qwen2___5-Math-7B
+MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B
+CKPTS_DIR=./ckpts/${project_name}/${exp_name}
+# TRAIN_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/dapo-math-17k.parquet
+# TEST_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/aime-2024.parquet
+TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet
+TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet
+
+rollout_mode="async"
+rollout_name="vllm" # sglang or vllm
+if [ "$rollout_mode" = "async" ]; then
+    export VLLM_USE_V1=1
+    return_raw_chat="True"
+fi
+
+# Algorithm parameters
+adv_estimator=grpo
+
+use_kl_in_reward=False
+kl_coef=0.0
+use_kl_loss=False
+kl_loss_coef=0.0
+
+clip_ratio_low=0.2
+clip_ratio_high=0.28
+
+# Response length parameters
+max_prompt_length=$((1024 * 2))
+max_response_length=$((1024 *8))
+enable_overlong_buffer=True
+overlong_buffer_len=$((1024 * 4))
+overlong_penalty_factor=1.0
+
+# Training parameters
+loss_agg_mode="token-mean"
+
+# Algorithm
+temperature=1.0
+top_p=1.0
+top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
+val_top_p=0.7
+
+# Performance Related Parameter
+use_dynamic_bsz=True
+actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
+infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
+ref_offload=True
+actor_offload=False
+gen_tp=1
+sp_size=1
+fsdp_size=2
+
+# Fully async specific parameters
+NNODES=${NNODES:-2}
+NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
+
+n_gpus_rollout=6
+n_gpus_training=$((NGPUS_PER_NODE - n_gpus_rollout))
+
+train_prompt_bsz=0
+gen_prompt_bsz=1
+n_resp_per_prompt=16
+train_prompt_mini_bsz=32
+total_rollout_steps=$(((512*100)))
+test_freq=10
+staleness_threshold=1
+trigger_parameter_sync_step=64
+partial_rollout=True
+
+PYTHON_INTERPRETER="/home/hadoop-djst-algoplat/miniconda3/bin/python"
+if [ ! -x "$PYTHON_INTERPRETER" ]; then
+    PYTHON_INTERPRETER="python3"
+fi
+
+$PYTHON_INTERPRETER -m recipe.fully_async_policy.fully_async_main \
+    data.train_files="${TRAIN_FILE}" \
+    data.val_files="${TEST_FILE}" \
+    data.prompt_key=prompt \
+    data.truncation='left' \
+    data.max_prompt_length=${max_prompt_length} \
+    data.max_response_length=${max_response_length} \
+    data.train_batch_size=${train_prompt_bsz} \
+    data.gen_batch_size=${gen_prompt_bsz} \
+    data.return_raw_chat=${return_raw_chat} \
+    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
+    algorithm.adv_estimator=${adv_estimator} \
+    algorithm.use_kl_in_reward=${use_kl_in_reward} \
+    algorithm.kl_ctrl.kl_coef=${kl_coef} \
+    actor_rollout_ref.actor.strategy=fsdp2 \
+    critic.strategy=fsdp2 \
+    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
+    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
+    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
+    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
+    actor_rollout_ref.actor.clip_ratio_c=10.0 \
+    actor_rollout_ref.model.use_remove_padding=True \
+    actor_rollout_ref.hybrid_engine=False \
+    +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \
+    actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
+    actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+    actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+    actor_rollout_ref.model.path="${MODEL_PATH}" \
+    actor_rollout_ref.actor.optim.lr=1e-6 \
+    actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
+    actor_rollout_ref.actor.optim.weight_decay=0.1 \
+    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
+    actor_rollout_ref.actor.fsdp_config.param_offload=${actor_offload} \
+    actor_rollout_ref.actor.fsdp_config.optimizer_offload=${actor_offload} \
+    actor_rollout_ref.actor.entropy_coeff=0 \
+    actor_rollout_ref.actor.grad_clip=1.0 \
+    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
+    actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \
+    actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
+    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
+    actor_rollout_ref.rollout.enable_chunked_prefill=True \
+    actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
+    actor_rollout_ref.rollout.temperature=${temperature} \
+    actor_rollout_ref.rollout.top_p=${top_p} \
+    actor_rollout_ref.rollout.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
+    actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
+    actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
+    actor_rollout_ref.rollout.val_kwargs.n=1 \
+    actor_rollout_ref.ref.fsdp_config.param_offload=${ref_offload} \
+    actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \
+    actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \
+    actor_rollout_ref.rollout.name=${rollout_name} \
+    actor_rollout_ref.rollout.mode=${rollout_mode} \
+    actor_rollout_ref.rollout.calculate_log_probs=True \
+    reward_model.reward_manager=dapo \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
+    +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
+    trainer.logger=['console','tensorboard'] \
+    trainer.project_name="${project_name}" \
+    trainer.experiment_name="${exp_name}" \
+    trainer.val_before_train=True \
+    trainer.save_freq=-1 \
+    trainer.default_local_dir="${CKPTS_DIR}" \
+    trainer.resume_mode=auto \
+    trainer.nnodes="${NNODES}" \
+    trainer.n_gpus_per_node="${n_gpus_training}" \
+    rollout.nnodes="${NNODES}" \
+    rollout.n_gpus_per_node="${n_gpus_rollout}" \
+    rollout.total_rollout_steps="${total_rollout_steps}" \
+    rollout.total_epochs=10 \
+    rollout.test_freq="${test_freq}" \
+    async_training.staleness_threshold="${staleness_threshold}" \
+    async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \
+    async_training.partial_rollout="${partial_rollout}"
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_24-8/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_24-8/runtime_env.yaml
new file mode 100644
index 00000000000..dcca08e67f7
--- /dev/null
+++ b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_24-8/runtime_env.yaml
@@ -0,0 +1,5 @@
+env_vars:
+  VLLM_USE_V1: "1"
+  TENSORBOARD_DIR: "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/tensorboard/dapo_7b_math_fsdp2_4_12"
+  NCCL_DEBUG: "INFO"
+  HYDRA_FULL_ERROR: "1"
\ No newline at end of file
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_8-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_8-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh
new file mode 100644
index 00000000000..618497c0257
--- /dev/null
+++ b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_8-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh
@@ -0,0 +1,174 @@
+#!/usr/bin/env bash
+set -xeuo pipefail
+# dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20
+project_name='DAPO'
+exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20'
+
+# Ray
+# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
+# WORKING_DIR=${WORKING_DIR:-"${PWD}"}
+# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
+# Paths
+RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
+# very important! please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface
+MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen2.5-Math-7B"}
+CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"}
+TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"}
+TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"}
+
+# MODEL_PATH=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/modelscope/hub/models/Qwen/Qwen2___5-Math-7B
+MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B
+CKPTS_DIR=./ckpts/${project_name}/${exp_name}
+# TRAIN_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/dapo-math-17k.parquet
+# TEST_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/aime-2024.parquet
+TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet
+TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet
+
+rollout_mode="async"
+rollout_name="vllm" # sglang or vllm
+if [ "$rollout_mode" = "async" ]; then
+    export VLLM_USE_V1=1
+    return_raw_chat="True"
+fi
+
+# Algorithm parameters
+adv_estimator=grpo
+
+use_kl_in_reward=False
+kl_coef=0.0
+use_kl_loss=False
+kl_loss_coef=0.0
+
+clip_ratio_low=0.2
+clip_ratio_high=0.28
+
+# Response length parameters
+max_prompt_length=$((1024 * 2))
+max_response_length=$((1024 *8))
+enable_overlong_buffer=True
+overlong_buffer_len=$((1024 * 4))
+overlong_penalty_factor=1.0
+
+# Training parameters
+loss_agg_mode="token-mean"
+
+# Algorithm
+temperature=1.0
+top_p=1.0
+top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
+val_top_p=0.7
+
+# Performance Related Parameter
+use_dynamic_bsz=True
+actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
+infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
+ref_offload=True
+actor_offload=False
+gen_tp=1
+sp_size=1
+fsdp_size=2
+
+# Fully async specific parameters
+NNODES=${NNODES:-2}
+NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
+
+n_gpus_rollout=6
+n_gpus_training=$((NGPUS_PER_NODE - n_gpus_rollout))
+
+train_prompt_bsz=0
+gen_prompt_bsz=1
+n_resp_per_prompt=16
+train_prompt_mini_bsz=32
+total_rollout_steps=$(((512*100)))
+test_freq=10
+staleness_threshold=1
+trigger_parameter_sync_step=64
+partial_rollout=True
+
+PYTHON_INTERPRETER="/home/hadoop-djst-algoplat/miniconda3/bin/python"
+if [ ! -x "$PYTHON_INTERPRETER" ]; then
+    PYTHON_INTERPRETER="python3"
+fi
+
+$PYTHON_INTERPRETER -m recipe.fully_async_policy.fully_async_main \
+    data.train_files="${TRAIN_FILE}" \
+    data.val_files="${TEST_FILE}" \
+    data.prompt_key=prompt \
+    data.truncation='left' \
+    data.max_prompt_length=${max_prompt_length} \
+    data.max_response_length=${max_response_length} \
+    data.train_batch_size=${train_prompt_bsz} \
+    data.gen_batch_size=${gen_prompt_bsz} \
+    data.return_raw_chat=${return_raw_chat} \
+    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
+    algorithm.adv_estimator=${adv_estimator} \
+    algorithm.use_kl_in_reward=${use_kl_in_reward} \
+    algorithm.kl_ctrl.kl_coef=${kl_coef} \
+    actor_rollout_ref.actor.strategy=fsdp2 \
+    critic.strategy=fsdp2 \
+    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
+    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
+    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
+    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
+    actor_rollout_ref.actor.clip_ratio_c=10.0 \
+    actor_rollout_ref.model.use_remove_padding=True \
+    actor_rollout_ref.hybrid_engine=False \
+    +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \
+    actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
+    actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+    actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+    actor_rollout_ref.model.path="${MODEL_PATH}" \
+    actor_rollout_ref.actor.optim.lr=1e-6 \
+    actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
+    actor_rollout_ref.actor.optim.weight_decay=0.1 \
+    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
+    actor_rollout_ref.actor.fsdp_config.param_offload=${actor_offload} \
+    actor_rollout_ref.actor.fsdp_config.optimizer_offload=${actor_offload} \
+    actor_rollout_ref.actor.entropy_coeff=0 \
+    actor_rollout_ref.actor.grad_clip=1.0 \
+    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
+    actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \
+    actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
+    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
+    actor_rollout_ref.rollout.enable_chunked_prefill=True \
+    actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
+    actor_rollout_ref.rollout.temperature=${temperature} \
+    actor_rollout_ref.rollout.top_p=${top_p} \
+    actor_rollout_ref.rollout.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
+    actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
+    actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
+    actor_rollout_ref.rollout.val_kwargs.n=1 \
+    actor_rollout_ref.ref.fsdp_config.param_offload=${ref_offload} \
+    actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \
+    actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \
+    actor_rollout_ref.rollout.name=${rollout_name} \
+    actor_rollout_ref.rollout.mode=${rollout_mode} \
+    actor_rollout_ref.rollout.calculate_log_probs=True \
+    reward_model.reward_manager=dapo \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
+    +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
+    trainer.logger=['console','tensorboard'] \
+    trainer.project_name="${project_name}" \
+    trainer.experiment_name="${exp_name}" \
+    trainer.val_before_train=True \
+    trainer.save_freq=-1 \
+    trainer.default_local_dir="${CKPTS_DIR}" \
+    trainer.resume_mode=auto \
+    trainer.nnodes="${NNODES}" \
+    trainer.n_gpus_per_node="${n_gpus_training}" \
+    rollout.nnodes="${NNODES}" \
+    rollout.n_gpus_per_node="${n_gpus_rollout}" \
+    rollout.total_rollout_steps="${total_rollout_steps}" \
+    rollout.total_epochs=10 \
+    rollout.test_freq="${test_freq}" \
+    async_training.staleness_threshold="${staleness_threshold}" \
+    async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \
+    async_training.partial_rollout="${partial_rollout}"
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_8-24/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_8-24/runtime_env.yaml
new file mode 100644
index 00000000000..dcca08e67f7
--- /dev/null
+++ b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_8-24/runtime_env.yaml
@@ -0,0 +1,5 @@
+env_vars:
+  VLLM_USE_V1: "1"
+  TENSORBOARD_DIR: "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/tensorboard/dapo_7b_math_fsdp2_4_12"
+  NCCL_DEBUG: "INFO"
+  HYDRA_FULL_ERROR: "1"
\ No newline at end of file
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_64/megatron_colocate/dapo_7b_math_megatron_colocate.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_64/megatron_colocate/dapo_7b_math_megatron_colocate.sh
new file mode 100644
index 00000000000..7444ec90c99
--- /dev/null
+++ b/recipe/fully_async_policy/exp/qwen2-7B-math_64/megatron_colocate/dapo_7b_math_megatron_colocate.sh
@@ -0,0 +1,135 @@
+#!/usr/bin/env bash
+set -xeuo pipefail
+
+project_name='DAPO'
+exp_name='dapo_qwen2-7B-math_28k_megatron_colocate_64_mbs32'
+
+adv_estimator=grpo
+
+use_kl_in_reward=False
+kl_coef=0.0
+use_kl_loss=False
+kl_loss_coef=0.0
+
+clip_ratio_low=0.2
+clip_ratio_high=0.28
+
+max_prompt_length=$((1024 * 2))
+max_response_length=$((1024 * 28))
+enable_overlong_buffer=True
+overlong_buffer_len=$((1024 * 4))
+overlong_penalty_factor=1.0
+
+loss_agg_mode="token-mean"
+
+train_prompt_bsz=512
+n_resp_per_prompt=16
+train_prompt_mini_bsz=32
+
+# Ray
+# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
+# WORKING_DIR=${WORKING_DIR:-"${PWD}"}
+# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
+NNODES=${NNODES:-8}
+NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
+# Paths
+MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B
+CKPTS_DIR=./ckpts/${project_name}/${exp_name}
+TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet
+TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet
+
+# Algorithm
+temperature=1.0
+top_p=1.0
+top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
+val_top_p=0.7
+
+# Performance Related Parameter
+use_dynamic_bsz=True
+actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
+infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
+offload=True
+gen_tp=4
+train_tp=2
+train_pp=1
+
+# TODO: support dynamic_bsz for megatron
+# actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
+# actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+# actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+# actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
+# actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+# actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+
+python3 -m verl.trainer.main_ppo \
+    --config-path=config \
+    --config-name='ppo_megatron_trainer.yaml' \
+    data.train_files="${TRAIN_FILE}" \
+    data.val_files="${TEST_FILE}" \
+    data.prompt_key=prompt \
+    data.truncation='left' \
+    data.max_prompt_length=${max_prompt_length} \
+    data.max_response_length=${max_response_length} \
+    data.train_batch_size=${train_prompt_bsz} \
+    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
+    algorithm.adv_estimator=${adv_estimator} \
+    algorithm.use_kl_in_reward=${use_kl_in_reward} \
+    algorithm.kl_ctrl.kl_coef=${kl_coef} \
+    actor_rollout_ref.actor.strategy=megatron \
+    critic.strategy=megatron \
+    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
+    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
+    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
+    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
+    actor_rollout_ref.actor.clip_ratio_c=10.0 \
+    +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \
+    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \
+    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
+    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
+    actor_rollout_ref.model.path="${MODEL_PATH}" \
+    actor_rollout_ref.actor.optim.lr=1e-6 \
+    actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
+    actor_rollout_ref.actor.optim.weight_decay=0.1 \
+    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
+    actor_rollout_ref.actor.megatron.param_offload=${offload} \
+    actor_rollout_ref.actor.megatron.optimizer_offload=${offload} \
+    actor_rollout_ref.actor.megatron.grad_offload=${offload} \
+    actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${train_pp} \
+    actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${train_tp} \
+    actor_rollout_ref.actor.entropy_coeff=0 \
+    actor_rollout_ref.actor.optim.clip_grad=1.0 \
+    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
+    actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
+    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
+    actor_rollout_ref.rollout.enable_chunked_prefill=True \
+    actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
+    actor_rollout_ref.rollout.temperature=${temperature} \
+    actor_rollout_ref.rollout.top_p=${top_p} \
+    actor_rollout_ref.rollout.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
+    actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
+    actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
+    actor_rollout_ref.rollout.val_kwargs.n=1 \
+    actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=${train_pp} \
+    actor_rollout_ref.ref.megatron.tensor_model_parallel_size=${train_tp} \
+    actor_rollout_ref.ref.megatron.param_offload=${offload} \
+    reward_model.reward_manager=dapo \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
+    +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
+    trainer.logger=['console','tensorboard'] \
+    trainer.project_name="${project_name}" \
+    trainer.experiment_name="${exp_name}" \
+    trainer.n_gpus_per_node=8 \
+    trainer.nnodes="${NNODES}" \
+    trainer.val_before_train=True \
+    trainer.test_freq=20 \
+    trainer.save_freq=-1 \
+    trainer.total_epochs=10 \
+    trainer.total_training_steps=400 \
+    trainer.default_local_dir="${CKPTS_DIR}" \
+    trainer.resume_mode=auto \
+    trainer.log_val_generations=10
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_64/megatron_colocate/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_64/megatron_colocate/runtime_env.yaml
new file mode 100644
index 00000000000..a8cd045e180
--- /dev/null
+++ b/recipe/fully_async_policy/exp/qwen2-7B-math_64/megatron_colocate/runtime_env.yaml
@@ -0,0 +1,3 @@
+env_vars:
+  TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-7B-math-64/dapo_qwen2-7B-math_28k_megatron_colocate_64_mbs32"
+  HYDRA_FULL_ERROR: "1"
\ No newline at end of file
diff --git a/recipe/fully_async_policy/exp/qwen3-32B_32/megatron_colocate/early_megatron_colocate.sh b/recipe/fully_async_policy/exp/qwen3-32B_32/megatron_colocate/early_megatron_colocate.sh
new file mode 100644
index 00000000000..55e8733a9fb
--- /dev/null
+++ b/recipe/fully_async_policy/exp/qwen3-32B_32/megatron_colocate/early_megatron_colocate.sh
@@ -0,0 +1,154 @@
+#!/usr/bin/env bash
+set -xeuo pipefail
+
+project_name='DAPO'
+exp_name='dapo_qwen3-32B_32k_megatron_colocate_32_mbs32'
+
+adv_estimator=grpo
+
+use_kl_in_reward=False
+kl_coef=0.0
+use_kl_loss=False
+kl_loss_coef=0.0
+
+clip_ratio_low=0.2
+clip_ratio_high=0.28
+
+max_prompt_length=$((1024 * 2))
+max_response_length=$((1024 * 32))
+enable_overlong_buffer=True
+overlong_buffer_len=$((1024 * 4))
+overlong_penalty_factor=1.0
+
+loss_agg_mode="token-mean"
+
+train_prompt_bsz=512
+train_prompt_mini_bsz=32
+n_resp_per_prompt=16
+
+NNODES=${NNODES:-4}
+NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
+# Paths
+RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
+MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/.friday/models/basemodel/Qwen/Qwen3-32B
+CKPTS_DIR=./ckpts/${project_name}/${exp_name}
+TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet
+TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet
+
+# Algorithm
+temperature=1.0
+top_p=1.0
+top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
+val_top_p=0.7
+
+
+# Performance Related Parameter
+use_dynamic_bsz=True
+actor_ppo_max_token_len=$((max_prompt_length + max_response_length))
+infer_ppo_max_token_len=$((max_prompt_length + max_response_length))
+offload=True
+gen_tp=4
+train_tp=4
+train_pp=2
+EP=1
+ETP=1
+CP=1
+
+python3 -m verl.trainer.main_ppo \
+    --config-path=config \
+    --config-name='ppo_megatron_trainer.yaml' \
+    data.train_files="${TRAIN_FILE}" \
+    data.val_files="${TEST_FILE}" \
+    data.prompt_key=prompt \
+    data.truncation='left' \
+    data.max_prompt_length=${max_prompt_length} \
+    data.max_response_length=${max_response_length} \
+    data.train_batch_size=${train_prompt_bsz} \
+    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
+    algorithm.adv_estimator=${adv_estimator} \
+    algorithm.use_kl_in_reward=${use_kl_in_reward} \
+    algorithm.kl_ctrl.kl_coef=${kl_coef} \
+    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
+    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
+    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
+    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
+    actor_rollout_ref.actor.clip_ratio_c=10.0 \
+    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \
+    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
+    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
+    actor_rollout_ref.model.path="${MODEL_PATH}" \
+    actor_rollout_ref.actor.optim.lr=1e-6 \
+    actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
+    actor_rollout_ref.actor.optim.weight_decay=0.1 \
+    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
+    actor_rollout_ref.actor.entropy_coeff=0 \
+    actor_rollout_ref.actor.optim.clip_grad=1.0 \
+    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
+    actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
+    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
+    actor_rollout_ref.rollout.enable_chunked_prefill=True \
+    actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
+    actor_rollout_ref.rollout.temperature=${temperature} \
+    actor_rollout_ref.rollout.top_p=${top_p} \
+    actor_rollout_ref.rollout.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
+    actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
+    actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
+    actor_rollout_ref.rollout.val_kwargs.n=1 \
+    reward_model.reward_manager=dapo \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
+    +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
+    trainer.logger='["console","tensorboard"]' \
+    trainer.project_name="${project_name}" \
+    trainer.experiment_name="${exp_name}" \
+    trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
+    trainer.nnodes="${NNODES}" \
+    trainer.val_before_train=True \
+    trainer.test_freq=20 \
+    trainer.save_freq=-1 \
+    trainer.total_epochs=10 \
+    trainer.total_training_steps=400 \
+    trainer.default_local_dir="${CKPTS_DIR}" \
+    trainer.resume_mode=auto \
+    trainer.log_val_generations=10 \
+    actor_rollout_ref.actor.megatron.param_offload=${offload} \
+    actor_rollout_ref.actor.megatron.optimizer_offload=${offload} \
+    actor_rollout_ref.actor.megatron.grad_offload=${offload} \
+    actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${train_pp} \
+    actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${train_tp} \
+    actor_rollout_ref.actor.megatron.expert_model_parallel_size=${EP} \
+    actor_rollout_ref.actor.megatron.expert_tensor_parallel_size=${ETP} \
+    actor_rollout_ref.actor.megatron.context_parallel_size=${CP} \
+    +actor_rollout_ref.actor.megatron.override_transformer_config.apply_rope_fusion=True \
+    +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=selective \
+    +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_modules=["core_attn","layernorm","mlp"] \
+    +actor_rollout_ref.actor.megatron.override_transformer_config.gradient_accumulation_fusion=True \
+    +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_embedding_in_pipeline_split=False \
+    +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_loss_in_pipeline_split=False \
+    actor_rollout_ref.ref.megatron.param_offload=${offload} \
+    actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=${train_pp} \
+    actor_rollout_ref.ref.megatron.tensor_model_parallel_size=${train_tp} \
+    actor_rollout_ref.ref.megatron.expert_model_parallel_size=${EP} \
+    actor_rollout_ref.ref.megatron.expert_tensor_parallel_size=${ETP} \
+    actor_rollout_ref.ref.megatron.context_parallel_size=${CP} \
+    actor_rollout_ref.actor.megatron.use_mbridge=True
+
+    # +actor_rollout_ref.actor.megatron.override_transformer_config.apply_rope_fusion=False \
+    # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_router_dtype=fp32 \
+    # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_shared_expert_overlap=False \
+    # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_enable_deepep=False \
+    # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_token_dispatcher_type=alltoall \
+    # +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_method=uniform \
+    # +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=full \
+    # actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity="selective" \
+    # actor_rollout_ref.actor.megatron.override_transformer_config.recompute_modules=["core_attn","moe_act","layernorm","mlp","moe"] \
+    # +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_num_layers=1 \
+    # +actor_rollout_ref.actor.megatron.override_transformer_config.gradient_accumulation_fusion=True \
+    # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_permute_fusion=True \
+    # +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_embedding_in_pipeline_split=False \
+    # +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_loss_in_pipeline_split=False \
+    # +actor_rollout_ref.actor.megatron.override_transformer_config.num_layers_in_last_pipeline_stage=${last_layer} \
\ No newline at end of file
diff --git a/recipe/fully_async_policy/exp/qwen3-32B_32/megatron_colocate/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen3-32B_32/megatron_colocate/runtime_env.yaml
new file mode 100644
index 00000000000..0d5684b1c73
--- /dev/null
+++ b/recipe/fully_async_policy/exp/qwen3-32B_32/megatron_colocate/runtime_env.yaml
@@ -0,0 +1,5 @@
+env_vars:
+  TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen3-32B-32/dapo_qwen3-32B_32k_fsdp2_colocate_32_mbs32"
+  HYDRA_FULL_ERROR: "1"
+  TORCH_NCCL_AVOID_RECORD_STREAMS: "1"
+  CUDA_DEVICE_MAX_CONNECTIONS: "1"
\ No newline at end of file
diff --git a/verl/trainer/ppo/ray_trainer.py b/verl/trainer/ppo/ray_trainer.py
index 42c728fa79d..2d5a0538616 100644
--- a/verl/trainer/ppo/ray_trainer.py
+++ b/verl/trainer/ppo/ray_trainer.py
@@ -1247,9 +1247,11 @@ def _process_batch_common(self, batch, metrics, timing_raw):
                 reward_tensor, reward_extra_infos_dict = compute_reward(batch, self.reward_fn)
         # recompute old_log_probs
         with marked_timer("old_log_prob", timing_raw, color="blue"):
-            if self.config.async_training and self.config.async_training.use_rollout_log_probs:
+            async_training = self.config.get("async_training", None)
+            if async_training and async_training.use_rollout_log_prob:
                 batch.batch["old_log_probs"] = batch.batch["rollout_log_probs"]
-                del actor_old_log_probs
+                batch.meta_info["temperature"] = self.config.actor_rollout_ref.rollout.temperature
+
             else:
 
                 old_log_prob = self.actor_rollout_wg.compute_log_prob(batch)

From 363d12d1a0517029d380c3d0001dfe1b3c2f5bee Mon Sep 17 00:00:00 2001
From: wangshulin02 <953550366@qq.com>
Date: Fri, 5 Sep 2025 22:11:01 +0800
Subject: [PATCH 103/182] exp shell file colocate done

---
 .../early_megatron_colocate.sh                | 159 ++++++++++++++++++
 .../megatron_colocate/runtime_env.yaml        |   5 +
 .../early_megatron_colocate.sh                | 159 ++++++++++++++++++
 .../megatron_colocate/runtime_env.yaml        |   5 +
 .../early_megatron_colocate.sh                | 159 ++++++++++++++++++
 .../megatron_colocate/runtime_env.yaml        |   5 +
 .../early_megatron_colocate.sh                | 154 +++++++++++++++++
 .../megatron_colocate/runtime_env.yaml        |   5 +
 .../megatron_colocate/runtime_env.yaml        |   2 +-
 .../early_megatron_colocate.sh                | 154 +++++++++++++++++
 .../megatron_colocate/runtime_env.yaml        |   5 +
 11 files changed, 811 insertions(+), 1 deletion(-)
 create mode 100644 recipe/fully_async_policy/exp/qwen3-30BA3B_128/megatron_colocate/early_megatron_colocate.sh
 create mode 100644 recipe/fully_async_policy/exp/qwen3-30BA3B_128/megatron_colocate/runtime_env.yaml
 create mode 100644 recipe/fully_async_policy/exp/qwen3-30BA3B_32/megatron_colocate/early_megatron_colocate.sh
 create mode 100644 recipe/fully_async_policy/exp/qwen3-30BA3B_32/megatron_colocate/runtime_env.yaml
 create mode 100644 recipe/fully_async_policy/exp/qwen3-30BA3B_64/megatron_colocate/early_megatron_colocate.sh
 create mode 100644 recipe/fully_async_policy/exp/qwen3-30BA3B_64/megatron_colocate/runtime_env.yaml
 create mode 100644 recipe/fully_async_policy/exp/qwen3-32B_128/megatron_colocate/early_megatron_colocate.sh
 create mode 100644 recipe/fully_async_policy/exp/qwen3-32B_128/megatron_colocate/runtime_env.yaml
 create mode 100644 recipe/fully_async_policy/exp/qwen3-32B_64/megatron_colocate/early_megatron_colocate.sh
 create mode 100644 recipe/fully_async_policy/exp/qwen3-32B_64/megatron_colocate/runtime_env.yaml

diff --git a/recipe/fully_async_policy/exp/qwen3-30BA3B_128/megatron_colocate/early_megatron_colocate.sh b/recipe/fully_async_policy/exp/qwen3-30BA3B_128/megatron_colocate/early_megatron_colocate.sh
new file mode 100644
index 00000000000..26507694635
--- /dev/null
+++ b/recipe/fully_async_policy/exp/qwen3-30BA3B_128/megatron_colocate/early_megatron_colocate.sh
@@ -0,0 +1,159 @@
+#!/usr/bin/env bash
+set -xeuo pipefail
+
+project_name='DAPO'
+exp_name='dapo_qwen3-30BA3B_32k_megatron_colocate_128_mbs32'
+
+adv_estimator=grpo
+
+use_kl_in_reward=False
+kl_coef=0.0
+use_kl_loss=False
+kl_loss_coef=0.0
+
+clip_ratio_low=0.2
+clip_ratio_high=0.28
+
+max_prompt_length=$((1024 * 2))
+max_response_length=$((1024 * 32))
+enable_overlong_buffer=True
+overlong_buffer_len=$((1024 * 4))
+overlong_penalty_factor=1.0
+
+loss_agg_mode="token-mean"
+
+train_prompt_bsz=512
+train_prompt_mini_bsz=32
+n_resp_per_prompt=16
+
+NNODES=${NNODES:-16}
+NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
+# Paths
+RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
+MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/.friday/models/basemodel/Qwen/Qwen3-30B-A3B
+CKPTS_DIR=./ckpts/${project_name}/${exp_name}
+TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet
+TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet
+
+# Algorithm
+temperature=1.0
+top_p=1.0
+top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
+val_top_p=0.7
+
+
+# Performance Related Parameter
+use_dynamic_bsz=True
+actor_ppo_max_token_len=$((max_prompt_length + max_response_length))
+infer_ppo_max_token_len=$((max_prompt_length + max_response_length))
+offload=True
+gen_tp=4
+train_tp=1
+train_pp=1
+EP=8
+ETP=1
+CP=1
+
+python3 -m verl.trainer.main_ppo \
+    --config-path=config \
+    --config-name='ppo_megatron_trainer.yaml' \
+    data.train_files="${TRAIN_FILE}" \
+    data.val_files="${TEST_FILE}" \
+    data.prompt_key=prompt \
+    data.truncation='left' \
+    data.max_prompt_length=${max_prompt_length} \
+    data.max_response_length=${max_response_length} \
+    data.train_batch_size=${train_prompt_bsz} \
+    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
+    algorithm.adv_estimator=${adv_estimator} \
+    algorithm.use_kl_in_reward=${use_kl_in_reward} \
+    algorithm.kl_ctrl.kl_coef=${kl_coef} \
+    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
+    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
+    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
+    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
+    actor_rollout_ref.actor.clip_ratio_c=10.0 \
+    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \
+    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
+    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
+    actor_rollout_ref.model.path="${MODEL_PATH}" \
+    actor_rollout_ref.actor.optim.lr=1e-6 \
+    actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
+    actor_rollout_ref.actor.optim.weight_decay=0.1 \
+    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
+    actor_rollout_ref.actor.entropy_coeff=0 \
+    actor_rollout_ref.actor.optim.clip_grad=1.0 \
+    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
+    actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
+    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
+    actor_rollout_ref.rollout.enable_chunked_prefill=True \
+    actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
+    actor_rollout_ref.rollout.temperature=${temperature} \
+    actor_rollout_ref.rollout.top_p=${top_p} \
+    actor_rollout_ref.rollout.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
+    actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
+    actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
+    actor_rollout_ref.rollout.val_kwargs.n=1 \
+    reward_model.reward_manager=dapo \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
+    +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
+    trainer.logger='["console","tensorboard"]' \
+    trainer.project_name="${project_name}" \
+    trainer.experiment_name="${exp_name}" \
+    trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
+    trainer.nnodes="${NNODES}" \
+    trainer.val_before_train=True \
+    trainer.test_freq=20 \
+    trainer.save_freq=-1 \
+    trainer.total_epochs=10 \
+    trainer.total_training_steps=400 \
+    trainer.default_local_dir="${CKPTS_DIR}" \
+    trainer.resume_mode=auto \
+    trainer.log_val_generations=10 \
+    actor_rollout_ref.actor.megatron.param_offload=${offload} \
+    actor_rollout_ref.actor.megatron.optimizer_offload=${offload} \
+    actor_rollout_ref.actor.megatron.grad_offload=${offload} \
+    actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${train_pp} \
+    actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${train_tp} \
+    actor_rollout_ref.actor.megatron.expert_model_parallel_size=${EP} \
+    actor_rollout_ref.actor.megatron.expert_tensor_parallel_size=${ETP} \
+    actor_rollout_ref.actor.megatron.context_parallel_size=${CP} \
+    +actor_rollout_ref.actor.megatron.override_transformer_config.apply_rope_fusion=True \
+    +actor_rollout_ref.actor.megatron.override_transformer_config.moe_router_dtype=fp32 \
+    +actor_rollout_ref.actor.megatron.override_transformer_config.moe_shared_expert_overlap=False \
+    +actor_rollout_ref.actor.megatron.override_transformer_config.moe_enable_deepep=True \
+    +actor_rollout_ref.actor.megatron.override_transformer_config.moe_token_dispatcher_type=flex \
+    +actor_rollout_ref.actor.megatron.override_transformer_config.moe_permute_fusion=True \
+    +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=selective \
+    +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_modules=["core_attn","moe_act","layernorm","mlp","moe"] \
+    +actor_rollout_ref.actor.megatron.override_transformer_config.gradient_accumulation_fusion=True \
+    +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_embedding_in_pipeline_split=False \
+    +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_loss_in_pipeline_split=False \
+    actor_rollout_ref.ref.megatron.param_offload=${offload} \
+    actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=${train_pp} \
+    actor_rollout_ref.ref.megatron.tensor_model_parallel_size=${train_tp} \
+    actor_rollout_ref.ref.megatron.expert_model_parallel_size=${EP} \
+    actor_rollout_ref.ref.megatron.expert_tensor_parallel_size=${ETP} \
+    actor_rollout_ref.ref.megatron.context_parallel_size=${CP} \
+    actor_rollout_ref.actor.megatron.use_mbridge=True
+
+    # +actor_rollout_ref.actor.megatron.override_transformer_config.apply_rope_fusion=False \
+    # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_router_dtype=fp32 \
+    # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_shared_expert_overlap=False \
+    # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_enable_deepep=False \
+    # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_token_dispatcher_type=alltoall \
+    # +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_method=uniform \
+    # +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=full \
+    # actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity="selective" \
+    # actor_rollout_ref.actor.megatron.override_transformer_config.recompute_modules=["core_attn","moe_act","layernorm","mlp","moe"] \
+    # +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_num_layers=1 \
+    # +actor_rollout_ref.actor.megatron.override_transformer_config.gradient_accumulation_fusion=True \
+    # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_permute_fusion=True \
+    # +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_embedding_in_pipeline_split=False \
+    # +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_loss_in_pipeline_split=False \
+    # +actor_rollout_ref.actor.megatron.override_transformer_config.num_layers_in_last_pipeline_stage=${last_layer} \
\ No newline at end of file
diff --git a/recipe/fully_async_policy/exp/qwen3-30BA3B_128/megatron_colocate/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen3-30BA3B_128/megatron_colocate/runtime_env.yaml
new file mode 100644
index 00000000000..4a714f40f43
--- /dev/null
+++ b/recipe/fully_async_policy/exp/qwen3-30BA3B_128/megatron_colocate/runtime_env.yaml
@@ -0,0 +1,5 @@
+env_vars:
+  TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen3-30BA3B-128/dapo_qwen3-30BA3B_32k_megatron_colocate_128_mbs32"
+  HYDRA_FULL_ERROR: "1"
+  TORCH_NCCL_AVOID_RECORD_STREAMS: "1"
+  CUDA_DEVICE_MAX_CONNECTIONS: "1"
\ No newline at end of file
diff --git a/recipe/fully_async_policy/exp/qwen3-30BA3B_32/megatron_colocate/early_megatron_colocate.sh b/recipe/fully_async_policy/exp/qwen3-30BA3B_32/megatron_colocate/early_megatron_colocate.sh
new file mode 100644
index 00000000000..69e5a723e9b
--- /dev/null
+++ b/recipe/fully_async_policy/exp/qwen3-30BA3B_32/megatron_colocate/early_megatron_colocate.sh
@@ -0,0 +1,159 @@
+#!/usr/bin/env bash
+set -xeuo pipefail
+
+project_name='DAPO'
+exp_name='dapo_qwen3-30BA3B_32k_megatron_colocate_32_mbs32'
+
+adv_estimator=grpo
+
+use_kl_in_reward=False
+kl_coef=0.0
+use_kl_loss=False
+kl_loss_coef=0.0
+
+clip_ratio_low=0.2
+clip_ratio_high=0.28
+
+max_prompt_length=$((1024 * 2))
+max_response_length=$((1024 * 32))
+enable_overlong_buffer=True
+overlong_buffer_len=$((1024 * 4))
+overlong_penalty_factor=1.0
+
+loss_agg_mode="token-mean"
+
+train_prompt_bsz=512
+train_prompt_mini_bsz=32
+n_resp_per_prompt=16
+
+NNODES=${NNODES:-4}
+NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
+# Paths
+RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
+MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/.friday/models/basemodel/Qwen/Qwen3-30B-A3B
+CKPTS_DIR=./ckpts/${project_name}/${exp_name}
+TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet
+TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet
+
+# Algorithm
+temperature=1.0
+top_p=1.0
+top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
+val_top_p=0.7
+
+
+# Performance Related Parameter
+use_dynamic_bsz=True
+actor_ppo_max_token_len=$((max_prompt_length + max_response_length))
+infer_ppo_max_token_len=$((max_prompt_length + max_response_length))
+offload=True
+gen_tp=4
+train_tp=1
+train_pp=1
+EP=8
+ETP=1
+CP=1
+
+python3 -m verl.trainer.main_ppo \
+    --config-path=config \
+    --config-name='ppo_megatron_trainer.yaml' \
+    data.train_files="${TRAIN_FILE}" \
+    data.val_files="${TEST_FILE}" \
+    data.prompt_key=prompt \
+    data.truncation='left' \
+    data.max_prompt_length=${max_prompt_length} \
+    data.max_response_length=${max_response_length} \
+    data.train_batch_size=${train_prompt_bsz} \
+    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
+    algorithm.adv_estimator=${adv_estimator} \
+    algorithm.use_kl_in_reward=${use_kl_in_reward} \
+    algorithm.kl_ctrl.kl_coef=${kl_coef} \
+    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
+    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
+    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
+    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
+    actor_rollout_ref.actor.clip_ratio_c=10.0 \
+    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \
+    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
+    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
+    actor_rollout_ref.model.path="${MODEL_PATH}" \
+    actor_rollout_ref.actor.optim.lr=1e-6 \
+    actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
+    actor_rollout_ref.actor.optim.weight_decay=0.1 \
+    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
+    actor_rollout_ref.actor.entropy_coeff=0 \
+    actor_rollout_ref.actor.optim.clip_grad=1.0 \
+    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
+    actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
+    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
+    actor_rollout_ref.rollout.enable_chunked_prefill=True \
+    actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
+    actor_rollout_ref.rollout.temperature=${temperature} \
+    actor_rollout_ref.rollout.top_p=${top_p} \
+    actor_rollout_ref.rollout.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
+    actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
+    actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
+    actor_rollout_ref.rollout.val_kwargs.n=1 \
+    reward_model.reward_manager=dapo \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
+    +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
+    trainer.logger='["console","tensorboard"]' \
+    trainer.project_name="${project_name}" \
+    trainer.experiment_name="${exp_name}" \
+    trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
+    trainer.nnodes="${NNODES}" \
+    trainer.val_before_train=True \
+    trainer.test_freq=20 \
+    trainer.save_freq=-1 \
+    trainer.total_epochs=10 \
+    trainer.total_training_steps=400 \
+    trainer.default_local_dir="${CKPTS_DIR}" \
+    trainer.resume_mode=auto \
+    trainer.log_val_generations=10 \
+    actor_rollout_ref.actor.megatron.param_offload=${offload} \
+    actor_rollout_ref.actor.megatron.optimizer_offload=${offload} \
+    actor_rollout_ref.actor.megatron.grad_offload=${offload} \
+    actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${train_pp} \
+    actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${train_tp} \
+    actor_rollout_ref.actor.megatron.expert_model_parallel_size=${EP} \
+    actor_rollout_ref.actor.megatron.expert_tensor_parallel_size=${ETP} \
+    actor_rollout_ref.actor.megatron.context_parallel_size=${CP} \
+    +actor_rollout_ref.actor.megatron.override_transformer_config.apply_rope_fusion=True \
+    +actor_rollout_ref.actor.megatron.override_transformer_config.moe_router_dtype=fp32 \
+    +actor_rollout_ref.actor.megatron.override_transformer_config.moe_shared_expert_overlap=False \
+    +actor_rollout_ref.actor.megatron.override_transformer_config.moe_enable_deepep=True \
+    +actor_rollout_ref.actor.megatron.override_transformer_config.moe_token_dispatcher_type=flex \
+    +actor_rollout_ref.actor.megatron.override_transformer_config.moe_permute_fusion=True \
+    +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=selective \
+    +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_modules=["core_attn","moe_act","layernorm","mlp","moe"] \
+    +actor_rollout_ref.actor.megatron.override_transformer_config.gradient_accumulation_fusion=True \
+    +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_embedding_in_pipeline_split=False \
+    +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_loss_in_pipeline_split=False \
+    actor_rollout_ref.ref.megatron.param_offload=${offload} \
+    actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=${train_pp} \
+    actor_rollout_ref.ref.megatron.tensor_model_parallel_size=${train_tp} \
+    actor_rollout_ref.ref.megatron.expert_model_parallel_size=${EP} \
+    actor_rollout_ref.ref.megatron.expert_tensor_parallel_size=${ETP} \
+    actor_rollout_ref.ref.megatron.context_parallel_size=${CP} \
+    actor_rollout_ref.actor.megatron.use_mbridge=True
+
+    # +actor_rollout_ref.actor.megatron.override_transformer_config.apply_rope_fusion=False \
+    # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_router_dtype=fp32 \
+    # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_shared_expert_overlap=False \
+    # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_enable_deepep=False \
+    # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_token_dispatcher_type=alltoall \
+    # +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_method=uniform \
+    # +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=full \
+    # actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity="selective" \
+    # actor_rollout_ref.actor.megatron.override_transformer_config.recompute_modules=["core_attn","moe_act","layernorm","mlp","moe"] \
+    # +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_num_layers=1 \
+    # +actor_rollout_ref.actor.megatron.override_transformer_config.gradient_accumulation_fusion=True \
+    # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_permute_fusion=True \
+    # +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_embedding_in_pipeline_split=False \
+    # +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_loss_in_pipeline_split=False \
+    # +actor_rollout_ref.actor.megatron.override_transformer_config.num_layers_in_last_pipeline_stage=${last_layer} \
\ No newline at end of file
diff --git a/recipe/fully_async_policy/exp/qwen3-30BA3B_32/megatron_colocate/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen3-30BA3B_32/megatron_colocate/runtime_env.yaml
new file mode 100644
index 00000000000..052557120ad
--- /dev/null
+++ b/recipe/fully_async_policy/exp/qwen3-30BA3B_32/megatron_colocate/runtime_env.yaml
@@ -0,0 +1,5 @@
+env_vars:
+  TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen3-30BA3B-32/dapo_qwen3-30BA3B_32k_megatron_colocate_32_mbs32"
+  HYDRA_FULL_ERROR: "1"
+  TORCH_NCCL_AVOID_RECORD_STREAMS: "1"
+  CUDA_DEVICE_MAX_CONNECTIONS: "1"
\ No newline at end of file
diff --git a/recipe/fully_async_policy/exp/qwen3-30BA3B_64/megatron_colocate/early_megatron_colocate.sh b/recipe/fully_async_policy/exp/qwen3-30BA3B_64/megatron_colocate/early_megatron_colocate.sh
new file mode 100644
index 00000000000..8e632a9dbfb
--- /dev/null
+++ b/recipe/fully_async_policy/exp/qwen3-30BA3B_64/megatron_colocate/early_megatron_colocate.sh
@@ -0,0 +1,159 @@
+#!/usr/bin/env bash
+set -xeuo pipefail
+
+project_name='DAPO'
+exp_name='dapo_qwen3-30BA3B_32k_megatron_colocate_64_mbs32'
+
+adv_estimator=grpo
+
+use_kl_in_reward=False
+kl_coef=0.0
+use_kl_loss=False
+kl_loss_coef=0.0
+
+clip_ratio_low=0.2
+clip_ratio_high=0.28
+
+max_prompt_length=$((1024 * 2))
+max_response_length=$((1024 * 32))
+enable_overlong_buffer=True
+overlong_buffer_len=$((1024 * 4))
+overlong_penalty_factor=1.0
+
+loss_agg_mode="token-mean"
+
+train_prompt_bsz=512
+train_prompt_mini_bsz=32
+n_resp_per_prompt=16
+
+NNODES=${NNODES:-8}
+NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
+# Paths
+RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
+MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/.friday/models/basemodel/Qwen/Qwen3-30B-A3B
+CKPTS_DIR=./ckpts/${project_name}/${exp_name}
+TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet
+TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet
+
+# Algorithm
+temperature=1.0
+top_p=1.0
+top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
+val_top_p=0.7
+
+
+# Performance Related Parameter
+use_dynamic_bsz=True
+actor_ppo_max_token_len=$((max_prompt_length + max_response_length))
+infer_ppo_max_token_len=$((max_prompt_length + max_response_length))
+offload=True
+gen_tp=4
+train_tp=1
+train_pp=1
+EP=8
+ETP=1
+CP=1
+
+python3 -m verl.trainer.main_ppo \
+    --config-path=config \
+    --config-name='ppo_megatron_trainer.yaml' \
+    data.train_files="${TRAIN_FILE}" \
+    data.val_files="${TEST_FILE}" \
+    data.prompt_key=prompt \
+    data.truncation='left' \
+    data.max_prompt_length=${max_prompt_length} \
+    data.max_response_length=${max_response_length} \
+    data.train_batch_size=${train_prompt_bsz} \
+    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
+    algorithm.adv_estimator=${adv_estimator} \
+    algorithm.use_kl_in_reward=${use_kl_in_reward} \
+    algorithm.kl_ctrl.kl_coef=${kl_coef} \
+    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
+    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
+    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
+    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
+    actor_rollout_ref.actor.clip_ratio_c=10.0 \
+    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \
+    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
+    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
+    actor_rollout_ref.model.path="${MODEL_PATH}" \
+    actor_rollout_ref.actor.optim.lr=1e-6 \
+    actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
+    actor_rollout_ref.actor.optim.weight_decay=0.1 \
+    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
+    actor_rollout_ref.actor.entropy_coeff=0 \
+    actor_rollout_ref.actor.optim.clip_grad=1.0 \
+    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
+    actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
+    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
+    actor_rollout_ref.rollout.enable_chunked_prefill=True \
+    actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
+    actor_rollout_ref.rollout.temperature=${temperature} \
+    actor_rollout_ref.rollout.top_p=${top_p} \
+    actor_rollout_ref.rollout.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
+    actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
+    actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
+    actor_rollout_ref.rollout.val_kwargs.n=1 \
+    reward_model.reward_manager=dapo \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
+    +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
+    trainer.logger='["console","tensorboard"]' \
+    trainer.project_name="${project_name}" \
+    trainer.experiment_name="${exp_name}" \
+    trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
+    trainer.nnodes="${NNODES}" \
+    trainer.val_before_train=True \
+    trainer.test_freq=20 \
+    trainer.save_freq=-1 \
+    trainer.total_epochs=10 \
+    trainer.total_training_steps=400 \
+    trainer.default_local_dir="${CKPTS_DIR}" \
+    trainer.resume_mode=auto \
+    trainer.log_val_generations=10 \
+    actor_rollout_ref.actor.megatron.param_offload=${offload} \
+    actor_rollout_ref.actor.megatron.optimizer_offload=${offload} \
+    actor_rollout_ref.actor.megatron.grad_offload=${offload} \
+    actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${train_pp} \
+    actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${train_tp} \
+    actor_rollout_ref.actor.megatron.expert_model_parallel_size=${EP} \
+    actor_rollout_ref.actor.megatron.expert_tensor_parallel_size=${ETP} \
+    actor_rollout_ref.actor.megatron.context_parallel_size=${CP} \
+    +actor_rollout_ref.actor.megatron.override_transformer_config.apply_rope_fusion=True \
+    +actor_rollout_ref.actor.megatron.override_transformer_config.moe_router_dtype=fp32 \
+    +actor_rollout_ref.actor.megatron.override_transformer_config.moe_shared_expert_overlap=False \
+    +actor_rollout_ref.actor.megatron.override_transformer_config.moe_enable_deepep=True \
+    +actor_rollout_ref.actor.megatron.override_transformer_config.moe_token_dispatcher_type=flex \
+    +actor_rollout_ref.actor.megatron.override_transformer_config.moe_permute_fusion=True \
+    +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=selective \
+    +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_modules=["core_attn","moe_act","layernorm","mlp","moe"] \
+    +actor_rollout_ref.actor.megatron.override_transformer_config.gradient_accumulation_fusion=True \
+    +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_embedding_in_pipeline_split=False \
+    +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_loss_in_pipeline_split=False \
+    actor_rollout_ref.ref.megatron.param_offload=${offload} \
+    actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=${train_pp} \
+    actor_rollout_ref.ref.megatron.tensor_model_parallel_size=${train_tp} \
+    actor_rollout_ref.ref.megatron.expert_model_parallel_size=${EP} \
+    actor_rollout_ref.ref.megatron.expert_tensor_parallel_size=${ETP} \
+    actor_rollout_ref.ref.megatron.context_parallel_size=${CP} \
+    actor_rollout_ref.actor.megatron.use_mbridge=True
+
+    # +actor_rollout_ref.actor.megatron.override_transformer_config.apply_rope_fusion=False \
+    # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_router_dtype=fp32 \
+    # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_shared_expert_overlap=False \
+    # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_enable_deepep=False \
+    # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_token_dispatcher_type=alltoall \
+    # +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_method=uniform \
+    # +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=full \
+    # actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity="selective" \
+    # actor_rollout_ref.actor.megatron.override_transformer_config.recompute_modules=["core_attn","moe_act","layernorm","mlp","moe"] \
+    # +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_num_layers=1 \
+    # +actor_rollout_ref.actor.megatron.override_transformer_config.gradient_accumulation_fusion=True \
+    # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_permute_fusion=True \
+    # +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_embedding_in_pipeline_split=False \
+    # +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_loss_in_pipeline_split=False \
+    # +actor_rollout_ref.actor.megatron.override_transformer_config.num_layers_in_last_pipeline_stage=${last_layer} \
\ No newline at end of file
diff --git a/recipe/fully_async_policy/exp/qwen3-30BA3B_64/megatron_colocate/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen3-30BA3B_64/megatron_colocate/runtime_env.yaml
new file mode 100644
index 00000000000..3a497e90dd0
--- /dev/null
+++ b/recipe/fully_async_policy/exp/qwen3-30BA3B_64/megatron_colocate/runtime_env.yaml
@@ -0,0 +1,5 @@
+env_vars:
+  TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen3-30BA3B-64/dapo_qwen3-30BA3B_32k_megatron_colocate_64_mbs32"
+  HYDRA_FULL_ERROR: "1"
+  TORCH_NCCL_AVOID_RECORD_STREAMS: "1"
+  CUDA_DEVICE_MAX_CONNECTIONS: "1"
\ No newline at end of file
diff --git a/recipe/fully_async_policy/exp/qwen3-32B_128/megatron_colocate/early_megatron_colocate.sh b/recipe/fully_async_policy/exp/qwen3-32B_128/megatron_colocate/early_megatron_colocate.sh
new file mode 100644
index 00000000000..3b9ce953d85
--- /dev/null
+++ b/recipe/fully_async_policy/exp/qwen3-32B_128/megatron_colocate/early_megatron_colocate.sh
@@ -0,0 +1,154 @@
+#!/usr/bin/env bash
+set -xeuo pipefail
+
+project_name='DAPO'
+exp_name='dapo_qwen3-32B_32k_megatron_colocate_128_mbs32'
+
+adv_estimator=grpo
+
+use_kl_in_reward=False
+kl_coef=0.0
+use_kl_loss=False
+kl_loss_coef=0.0
+
+clip_ratio_low=0.2
+clip_ratio_high=0.28
+
+max_prompt_length=$((1024 * 2))
+max_response_length=$((1024 * 32))
+enable_overlong_buffer=True
+overlong_buffer_len=$((1024 * 4))
+overlong_penalty_factor=1.0
+
+loss_agg_mode="token-mean"
+
+train_prompt_bsz=512
+train_prompt_mini_bsz=32
+n_resp_per_prompt=16
+
+NNODES=${NNODES:-16}
+NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
+# Paths
+RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
+MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/.friday/models/basemodel/Qwen/Qwen3-32B
+CKPTS_DIR=./ckpts/${project_name}/${exp_name}
+TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet
+TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet
+
+# Algorithm
+temperature=1.0
+top_p=1.0
+top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
+val_top_p=0.7
+
+
+# Performance Related Parameter
+use_dynamic_bsz=True
+actor_ppo_max_token_len=$((max_prompt_length + max_response_length))
+infer_ppo_max_token_len=$((max_prompt_length + max_response_length))
+offload=True
+gen_tp=4
+train_tp=4
+train_pp=2
+EP=1
+ETP=1
+CP=1
+
+python3 -m verl.trainer.main_ppo \
+    --config-path=config \
+    --config-name='ppo_megatron_trainer.yaml' \
+    data.train_files="${TRAIN_FILE}" \
+    data.val_files="${TEST_FILE}" \
+    data.prompt_key=prompt \
+    data.truncation='left' \
+    data.max_prompt_length=${max_prompt_length} \
+    data.max_response_length=${max_response_length} \
+    data.train_batch_size=${train_prompt_bsz} \
+    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
+    algorithm.adv_estimator=${adv_estimator} \
+    algorithm.use_kl_in_reward=${use_kl_in_reward} \
+    algorithm.kl_ctrl.kl_coef=${kl_coef} \
+    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
+    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
+    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
+    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
+    actor_rollout_ref.actor.clip_ratio_c=10.0 \
+    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \
+    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
+    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
+    actor_rollout_ref.model.path="${MODEL_PATH}" \
+    actor_rollout_ref.actor.optim.lr=1e-6 \
+    actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
+    actor_rollout_ref.actor.optim.weight_decay=0.1 \
+    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
+    actor_rollout_ref.actor.entropy_coeff=0 \
+    actor_rollout_ref.actor.optim.clip_grad=1.0 \
+    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
+    actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
+    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
+    actor_rollout_ref.rollout.enable_chunked_prefill=True \
+    actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
+    actor_rollout_ref.rollout.temperature=${temperature} \
+    actor_rollout_ref.rollout.top_p=${top_p} \
+    actor_rollout_ref.rollout.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
+    actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
+    actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
+    actor_rollout_ref.rollout.val_kwargs.n=1 \
+    reward_model.reward_manager=dapo \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
+    +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
+    trainer.logger='["console","tensorboard"]' \
+    trainer.project_name="${project_name}" \
+    trainer.experiment_name="${exp_name}" \
+    trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
+    trainer.nnodes="${NNODES}" \
+    trainer.val_before_train=True \
+    trainer.test_freq=20 \
+    trainer.save_freq=-1 \
+    trainer.total_epochs=10 \
+    trainer.total_training_steps=400 \
+    trainer.default_local_dir="${CKPTS_DIR}" \
+    trainer.resume_mode=auto \
+    trainer.log_val_generations=10 \
+    actor_rollout_ref.actor.megatron.param_offload=${offload} \
+    actor_rollout_ref.actor.megatron.optimizer_offload=${offload} \
+    actor_rollout_ref.actor.megatron.grad_offload=${offload} \
+    actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${train_pp} \
+    actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${train_tp} \
+    actor_rollout_ref.actor.megatron.expert_model_parallel_size=${EP} \
+    actor_rollout_ref.actor.megatron.expert_tensor_parallel_size=${ETP} \
+    actor_rollout_ref.actor.megatron.context_parallel_size=${CP} \
+    +actor_rollout_ref.actor.megatron.override_transformer_config.apply_rope_fusion=True \
+    +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=selective \
+    +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_modules=["core_attn","layernorm","mlp"] \
+    +actor_rollout_ref.actor.megatron.override_transformer_config.gradient_accumulation_fusion=True \
+    +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_embedding_in_pipeline_split=False \
+    +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_loss_in_pipeline_split=False \
+    actor_rollout_ref.ref.megatron.param_offload=${offload} \
+    actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=${train_pp} \
+    actor_rollout_ref.ref.megatron.tensor_model_parallel_size=${train_tp} \
+    actor_rollout_ref.ref.megatron.expert_model_parallel_size=${EP} \
+    actor_rollout_ref.ref.megatron.expert_tensor_parallel_size=${ETP} \
+    actor_rollout_ref.ref.megatron.context_parallel_size=${CP} \
+    actor_rollout_ref.actor.megatron.use_mbridge=True
+
+    # +actor_rollout_ref.actor.megatron.override_transformer_config.apply_rope_fusion=False \
+    # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_router_dtype=fp32 \
+    # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_shared_expert_overlap=False \
+    # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_enable_deepep=False \
+    # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_token_dispatcher_type=alltoall \
+    # +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_method=uniform \
+    # +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=full \
+    # actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity="selective" \
+    # actor_rollout_ref.actor.megatron.override_transformer_config.recompute_modules=["core_attn","moe_act","layernorm","mlp","moe"] \
+    # +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_num_layers=1 \
+    # +actor_rollout_ref.actor.megatron.override_transformer_config.gradient_accumulation_fusion=True \
+    # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_permute_fusion=True \
+    # +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_embedding_in_pipeline_split=False \
+    # +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_loss_in_pipeline_split=False \
+    # +actor_rollout_ref.actor.megatron.override_transformer_config.num_layers_in_last_pipeline_stage=${last_layer} \
\ No newline at end of file
diff --git a/recipe/fully_async_policy/exp/qwen3-32B_128/megatron_colocate/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen3-32B_128/megatron_colocate/runtime_env.yaml
new file mode 100644
index 00000000000..1bbc3faadc9
--- /dev/null
+++ b/recipe/fully_async_policy/exp/qwen3-32B_128/megatron_colocate/runtime_env.yaml
@@ -0,0 +1,5 @@
+env_vars:
+  TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen3-32B-128/dapo_qwen3-32B_32k_megatron_colocate_128_mbs32"
+  HYDRA_FULL_ERROR: "1"
+  TORCH_NCCL_AVOID_RECORD_STREAMS: "1"
+  CUDA_DEVICE_MAX_CONNECTIONS: "1"
\ No newline at end of file
diff --git a/recipe/fully_async_policy/exp/qwen3-32B_32/megatron_colocate/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen3-32B_32/megatron_colocate/runtime_env.yaml
index 0d5684b1c73..2d0930d13ab 100644
--- a/recipe/fully_async_policy/exp/qwen3-32B_32/megatron_colocate/runtime_env.yaml
+++ b/recipe/fully_async_policy/exp/qwen3-32B_32/megatron_colocate/runtime_env.yaml
@@ -1,5 +1,5 @@
 env_vars:
-  TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen3-32B-32/dapo_qwen3-32B_32k_fsdp2_colocate_32_mbs32"
+  TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen3-32B-32/dapo_qwen3-32B_32k_megatron_colocate_32_mbs32"
   HYDRA_FULL_ERROR: "1"
   TORCH_NCCL_AVOID_RECORD_STREAMS: "1"
   CUDA_DEVICE_MAX_CONNECTIONS: "1"
\ No newline at end of file
diff --git a/recipe/fully_async_policy/exp/qwen3-32B_64/megatron_colocate/early_megatron_colocate.sh b/recipe/fully_async_policy/exp/qwen3-32B_64/megatron_colocate/early_megatron_colocate.sh
new file mode 100644
index 00000000000..280d3e19dbf
--- /dev/null
+++ b/recipe/fully_async_policy/exp/qwen3-32B_64/megatron_colocate/early_megatron_colocate.sh
@@ -0,0 +1,154 @@
+#!/usr/bin/env bash
+set -xeuo pipefail
+
+project_name='DAPO'
+exp_name='dapo_qwen3-32B_32k_megatron_colocate_64_mbs32'
+
+adv_estimator=grpo
+
+use_kl_in_reward=False
+kl_coef=0.0
+use_kl_loss=False
+kl_loss_coef=0.0
+
+clip_ratio_low=0.2
+clip_ratio_high=0.28
+
+max_prompt_length=$((1024 * 2))
+max_response_length=$((1024 * 32))
+enable_overlong_buffer=True
+overlong_buffer_len=$((1024 * 4))
+overlong_penalty_factor=1.0
+
+loss_agg_mode="token-mean"
+
+train_prompt_bsz=512
+train_prompt_mini_bsz=32
+n_resp_per_prompt=16
+
+NNODES=${NNODES:-8}
+NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
+# Paths
+RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
+MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/.friday/models/basemodel/Qwen/Qwen3-32B
+CKPTS_DIR=./ckpts/${project_name}/${exp_name}
+TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet
+TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet
+
+# Algorithm
+temperature=1.0
+top_p=1.0
+top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
+val_top_p=0.7
+
+
+# Performance Related Parameter
+use_dynamic_bsz=True
+actor_ppo_max_token_len=$((max_prompt_length + max_response_length))
+infer_ppo_max_token_len=$((max_prompt_length + max_response_length))
+offload=True
+gen_tp=4
+train_tp=4
+train_pp=2
+EP=1
+ETP=1
+CP=1
+
+python3 -m verl.trainer.main_ppo \
+    --config-path=config \
+    --config-name='ppo_megatron_trainer.yaml' \
+    data.train_files="${TRAIN_FILE}" \
+    data.val_files="${TEST_FILE}" \
+    data.prompt_key=prompt \
+    data.truncation='left' \
+    data.max_prompt_length=${max_prompt_length} \
+    data.max_response_length=${max_response_length} \
+    data.train_batch_size=${train_prompt_bsz} \
+    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
+    algorithm.adv_estimator=${adv_estimator} \
+    algorithm.use_kl_in_reward=${use_kl_in_reward} \
+    algorithm.kl_ctrl.kl_coef=${kl_coef} \
+    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
+    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
+    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
+    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
+    actor_rollout_ref.actor.clip_ratio_c=10.0 \
+    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \
+    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
+    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
+    actor_rollout_ref.model.path="${MODEL_PATH}" \
+    actor_rollout_ref.actor.optim.lr=1e-6 \
+    actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
+    actor_rollout_ref.actor.optim.weight_decay=0.1 \
+    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
+    actor_rollout_ref.actor.entropy_coeff=0 \
+    actor_rollout_ref.actor.optim.clip_grad=1.0 \
+    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
+    actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
+    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
+    actor_rollout_ref.rollout.enable_chunked_prefill=True \
+    actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
+    actor_rollout_ref.rollout.temperature=${temperature} \
+    actor_rollout_ref.rollout.top_p=${top_p} \
+    actor_rollout_ref.rollout.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
+    actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
+    actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
+    actor_rollout_ref.rollout.val_kwargs.n=1 \
+    reward_model.reward_manager=dapo \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
+    +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
+    trainer.logger='["console","tensorboard"]' \
+    trainer.project_name="${project_name}" \
+    trainer.experiment_name="${exp_name}" \
+    trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
+    trainer.nnodes="${NNODES}" \
+    trainer.val_before_train=True \
+    trainer.test_freq=20 \
+    trainer.save_freq=-1 \
+    trainer.total_epochs=10 \
+    trainer.total_training_steps=400 \
+    trainer.default_local_dir="${CKPTS_DIR}" \
+    trainer.resume_mode=auto \
+    trainer.log_val_generations=10 \
+    actor_rollout_ref.actor.megatron.param_offload=${offload} \
+    actor_rollout_ref.actor.megatron.optimizer_offload=${offload} \
+    actor_rollout_ref.actor.megatron.grad_offload=${offload} \
+    actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${train_pp} \
+    actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${train_tp} \
+    actor_rollout_ref.actor.megatron.expert_model_parallel_size=${EP} \
+    actor_rollout_ref.actor.megatron.expert_tensor_parallel_size=${ETP} \
+    actor_rollout_ref.actor.megatron.context_parallel_size=${CP} \
+    +actor_rollout_ref.actor.megatron.override_transformer_config.apply_rope_fusion=True \
+    +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=selective \
+    +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_modules=["core_attn","layernorm","mlp"] \
+    +actor_rollout_ref.actor.megatron.override_transformer_config.gradient_accumulation_fusion=True \
+    +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_embedding_in_pipeline_split=False \
+    +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_loss_in_pipeline_split=False \
+    actor_rollout_ref.ref.megatron.param_offload=${offload} \
+    actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=${train_pp} \
+    actor_rollout_ref.ref.megatron.tensor_model_parallel_size=${train_tp} \
+    actor_rollout_ref.ref.megatron.expert_model_parallel_size=${EP} \
+    actor_rollout_ref.ref.megatron.expert_tensor_parallel_size=${ETP} \
+    actor_rollout_ref.ref.megatron.context_parallel_size=${CP} \
+    actor_rollout_ref.actor.megatron.use_mbridge=True
+
+    # +actor_rollout_ref.actor.megatron.override_transformer_config.apply_rope_fusion=False \
+    # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_router_dtype=fp32 \
+    # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_shared_expert_overlap=False \
+    # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_enable_deepep=False \
+    # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_token_dispatcher_type=alltoall \
+    # +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_method=uniform \
+    # +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=full \
+    # actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity="selective" \
+    # actor_rollout_ref.actor.megatron.override_transformer_config.recompute_modules=["core_attn","moe_act","layernorm","mlp","moe"] \
+    # +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_num_layers=1 \
+    # +actor_rollout_ref.actor.megatron.override_transformer_config.gradient_accumulation_fusion=True \
+    # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_permute_fusion=True \
+    # +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_embedding_in_pipeline_split=False \
+    # +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_loss_in_pipeline_split=False \
+    # +actor_rollout_ref.actor.megatron.override_transformer_config.num_layers_in_last_pipeline_stage=${last_layer} \
\ No newline at end of file
diff --git a/recipe/fully_async_policy/exp/qwen3-32B_64/megatron_colocate/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen3-32B_64/megatron_colocate/runtime_env.yaml
new file mode 100644
index 00000000000..d3dc7176f0a
--- /dev/null
+++ b/recipe/fully_async_policy/exp/qwen3-32B_64/megatron_colocate/runtime_env.yaml
@@ -0,0 +1,5 @@
+env_vars:
+  TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen3-32B-64/dapo_qwen3-32B_32k_megatron_colocate_64_mbs32"
+  HYDRA_FULL_ERROR: "1"
+  TORCH_NCCL_AVOID_RECORD_STREAMS: "1"
+  CUDA_DEVICE_MAX_CONNECTIONS: "1"
\ No newline at end of file

From b0586059f38495a7b81b9cb35a8ee86dfcf9c316 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Sat, 6 Sep 2025 01:28:38 +0800
Subject: [PATCH 104/182] megatron fix

---
 .../megatron_colocate/early_megatron_colocate.sh                | 2 ++
 .../megatron_colocate/early_megatron_colocate.sh                | 2 ++
 .../megatron_colocate/early_megatron_colocate.sh                | 2 ++
 .../qwen3-32B_128/megatron_colocate/early_megatron_colocate.sh  | 2 ++
 .../qwen3-32B_32/megatron_colocate/early_megatron_colocate.sh   | 2 ++
 .../qwen3-32B_64/megatron_colocate/early_megatron_colocate.sh   | 2 ++
 recipe/r1/tasks/DocQA.py                                        | 0
 7 files changed, 12 insertions(+)
 create mode 100644 recipe/r1/tasks/DocQA.py

diff --git a/recipe/fully_async_policy/exp/qwen3-30BA3B_128/megatron_colocate/early_megatron_colocate.sh b/recipe/fully_async_policy/exp/qwen3-30BA3B_128/megatron_colocate/early_megatron_colocate.sh
index 26507694635..c666034ffc3 100644
--- a/recipe/fully_async_policy/exp/qwen3-30BA3B_128/megatron_colocate/early_megatron_colocate.sh
+++ b/recipe/fully_async_policy/exp/qwen3-30BA3B_128/megatron_colocate/early_megatron_colocate.sh
@@ -115,6 +115,8 @@ python3 -m verl.trainer.main_ppo \
     trainer.default_local_dir="${CKPTS_DIR}" \
     trainer.resume_mode=auto \
     trainer.log_val_generations=10 \
+    critic.strategy=megatron \
+    actor_rollout_ref.actor.strategy=megatron \
     actor_rollout_ref.actor.megatron.param_offload=${offload} \
     actor_rollout_ref.actor.megatron.optimizer_offload=${offload} \
     actor_rollout_ref.actor.megatron.grad_offload=${offload} \
diff --git a/recipe/fully_async_policy/exp/qwen3-30BA3B_32/megatron_colocate/early_megatron_colocate.sh b/recipe/fully_async_policy/exp/qwen3-30BA3B_32/megatron_colocate/early_megatron_colocate.sh
index 69e5a723e9b..b2d735f8704 100644
--- a/recipe/fully_async_policy/exp/qwen3-30BA3B_32/megatron_colocate/early_megatron_colocate.sh
+++ b/recipe/fully_async_policy/exp/qwen3-30BA3B_32/megatron_colocate/early_megatron_colocate.sh
@@ -115,6 +115,8 @@ python3 -m verl.trainer.main_ppo \
     trainer.default_local_dir="${CKPTS_DIR}" \
     trainer.resume_mode=auto \
     trainer.log_val_generations=10 \
+    critic.strategy=megatron \
+    actor_rollout_ref.actor.strategy=megatron \
     actor_rollout_ref.actor.megatron.param_offload=${offload} \
     actor_rollout_ref.actor.megatron.optimizer_offload=${offload} \
     actor_rollout_ref.actor.megatron.grad_offload=${offload} \
diff --git a/recipe/fully_async_policy/exp/qwen3-30BA3B_64/megatron_colocate/early_megatron_colocate.sh b/recipe/fully_async_policy/exp/qwen3-30BA3B_64/megatron_colocate/early_megatron_colocate.sh
index 8e632a9dbfb..336d105cc5c 100644
--- a/recipe/fully_async_policy/exp/qwen3-30BA3B_64/megatron_colocate/early_megatron_colocate.sh
+++ b/recipe/fully_async_policy/exp/qwen3-30BA3B_64/megatron_colocate/early_megatron_colocate.sh
@@ -115,6 +115,8 @@ python3 -m verl.trainer.main_ppo \
     trainer.default_local_dir="${CKPTS_DIR}" \
     trainer.resume_mode=auto \
     trainer.log_val_generations=10 \
+    critic.strategy=megatron \
+    actor_rollout_ref.actor.strategy=megatron \
     actor_rollout_ref.actor.megatron.param_offload=${offload} \
     actor_rollout_ref.actor.megatron.optimizer_offload=${offload} \
     actor_rollout_ref.actor.megatron.grad_offload=${offload} \
diff --git a/recipe/fully_async_policy/exp/qwen3-32B_128/megatron_colocate/early_megatron_colocate.sh b/recipe/fully_async_policy/exp/qwen3-32B_128/megatron_colocate/early_megatron_colocate.sh
index 3b9ce953d85..a7535e3575d 100644
--- a/recipe/fully_async_policy/exp/qwen3-32B_128/megatron_colocate/early_megatron_colocate.sh
+++ b/recipe/fully_async_policy/exp/qwen3-32B_128/megatron_colocate/early_megatron_colocate.sh
@@ -115,6 +115,8 @@ python3 -m verl.trainer.main_ppo \
     trainer.default_local_dir="${CKPTS_DIR}" \
     trainer.resume_mode=auto \
     trainer.log_val_generations=10 \
+    critic.strategy=megatron \
+    actor_rollout_ref.actor.strategy=megatron \
     actor_rollout_ref.actor.megatron.param_offload=${offload} \
     actor_rollout_ref.actor.megatron.optimizer_offload=${offload} \
     actor_rollout_ref.actor.megatron.grad_offload=${offload} \
diff --git a/recipe/fully_async_policy/exp/qwen3-32B_32/megatron_colocate/early_megatron_colocate.sh b/recipe/fully_async_policy/exp/qwen3-32B_32/megatron_colocate/early_megatron_colocate.sh
index 55e8733a9fb..085c7231c59 100644
--- a/recipe/fully_async_policy/exp/qwen3-32B_32/megatron_colocate/early_megatron_colocate.sh
+++ b/recipe/fully_async_policy/exp/qwen3-32B_32/megatron_colocate/early_megatron_colocate.sh
@@ -115,6 +115,8 @@ python3 -m verl.trainer.main_ppo \
     trainer.default_local_dir="${CKPTS_DIR}" \
     trainer.resume_mode=auto \
     trainer.log_val_generations=10 \
+    critic.strategy=megatron \
+    actor_rollout_ref.actor.strategy=megatron \
     actor_rollout_ref.actor.megatron.param_offload=${offload} \
     actor_rollout_ref.actor.megatron.optimizer_offload=${offload} \
     actor_rollout_ref.actor.megatron.grad_offload=${offload} \
diff --git a/recipe/fully_async_policy/exp/qwen3-32B_64/megatron_colocate/early_megatron_colocate.sh b/recipe/fully_async_policy/exp/qwen3-32B_64/megatron_colocate/early_megatron_colocate.sh
index 280d3e19dbf..145ea3dbec9 100644
--- a/recipe/fully_async_policy/exp/qwen3-32B_64/megatron_colocate/early_megatron_colocate.sh
+++ b/recipe/fully_async_policy/exp/qwen3-32B_64/megatron_colocate/early_megatron_colocate.sh
@@ -115,6 +115,8 @@ python3 -m verl.trainer.main_ppo \
     trainer.default_local_dir="${CKPTS_DIR}" \
     trainer.resume_mode=auto \
     trainer.log_val_generations=10 \
+    critic.strategy=megatron \
+    actor_rollout_ref.actor.strategy=megatron \
     actor_rollout_ref.actor.megatron.param_offload=${offload} \
     actor_rollout_ref.actor.megatron.optimizer_offload=${offload} \
     actor_rollout_ref.actor.megatron.grad_offload=${offload} \
diff --git a/recipe/r1/tasks/DocQA.py b/recipe/r1/tasks/DocQA.py
new file mode 100644
index 00000000000..e69de29bb2d

From 0be55004b9f20f525c93d92c2f7f3e91d8641b2b Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Sat, 6 Sep 2025 01:29:01 +0800
Subject: [PATCH 105/182] rm DocQA

---
 recipe/r1/tasks/DocQA.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 recipe/r1/tasks/DocQA.py

diff --git a/recipe/r1/tasks/DocQA.py b/recipe/r1/tasks/DocQA.py
deleted file mode 100644
index e69de29bb2d..00000000000

From 7f837ac3a4a2b98b217d393a4170456223cdbb05 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Sun, 7 Sep 2025 01:56:36 +0800
Subject: [PATCH 106/182] update 7b 128

---
 .../dapo_7b_math_fsdp2_colocate.sh            | 133 +++++++++++++
 .../fsdp2_colocate/runtime_env.yaml           |   3 +
 ..._fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh | 174 ++++++++++++++++++
 .../fsdp2_fully-async_16-16/runtime_env.yaml  |   5 +
 ..._fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh | 174 ++++++++++++++++++
 .../fsdp2_fully-async_24-8/runtime_env.yaml   |   5 +
 ..._fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh | 174 ++++++++++++++++++
 .../fsdp2_fully-async_8-24/runtime_env.yaml   |   5 +
 .../dapo_7b_math_megatron_colocate.sh         | 135 ++++++++++++++
 .../megatron_colocate/runtime_env.yaml        |   3 +
 .../dapo_7b_math_megatron_colocate.sh         |   4 +-
 .../dapo_7b_math_megatron_colocate.sh         |   4 +-
 .../fsdp2_colocate/runtime_env.yaml           |   3 +
 .../test_dapo_qwen3_30b_math.sh               | 125 +++++++++++++
 .../exp/qwen3-32B_128/fsdp2_colocate/fsdp2.sh | 125 +++++++++++++
 .../fsdp2_colocate/runtime_env.yaml           |   3 +
 16 files changed, 1071 insertions(+), 4 deletions(-)
 create mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_colocate/dapo_7b_math_fsdp2_colocate.sh
 create mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_colocate/runtime_env.yaml
 create mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_16-16/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh
 create mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_16-16/runtime_env.yaml
 create mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_24-8/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh
 create mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_24-8/runtime_env.yaml
 create mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_8-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh
 create mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_8-24/runtime_env.yaml
 create mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_128/megatron_colocate/dapo_7b_math_megatron_colocate.sh
 create mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_128/megatron_colocate/runtime_env.yaml
 create mode 100644 recipe/fully_async_policy/exp/qwen3-30BA3B_128/fsdp2_colocate/runtime_env.yaml
 create mode 100644 recipe/fully_async_policy/exp/qwen3-30BA3B_128/fsdp2_colocate/test_dapo_qwen3_30b_math.sh
 create mode 100644 recipe/fully_async_policy/exp/qwen3-32B_128/fsdp2_colocate/fsdp2.sh
 create mode 100644 recipe/fully_async_policy/exp/qwen3-32B_128/fsdp2_colocate/runtime_env.yaml

diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_colocate/dapo_7b_math_fsdp2_colocate.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_colocate/dapo_7b_math_fsdp2_colocate.sh
new file mode 100644
index 00000000000..3538722d8a1
--- /dev/null
+++ b/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_colocate/dapo_7b_math_fsdp2_colocate.sh
@@ -0,0 +1,133 @@
+#!/usr/bin/env bash
+set -xeuo pipefail
+
+project_name='DAPO'
+exp_name='dapo_qwen2-7B-math_28k_fsdp2_colocate_128_mbs32'
+
+adv_estimator=grpo
+
+use_kl_in_reward=False
+kl_coef=0.0
+use_kl_loss=False
+kl_loss_coef=0.0
+
+clip_ratio_low=0.2
+clip_ratio_high=0.28
+
+max_prompt_length=$((1024 * 2))
+max_response_length=$((1024 * 28))
+enable_overlong_buffer=True
+overlong_buffer_len=$((1024 * 4))
+overlong_penalty_factor=1.0
+
+loss_agg_mode="token-mean"
+
+train_prompt_bsz=512
+n_resp_per_prompt=16
+train_prompt_mini_bsz=32
+
+# Ray
+# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
+# WORKING_DIR=${WORKING_DIR:-"${PWD}"}
+# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
+NNODES=${NNODES:-16}
+NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
+# Paths
+RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
+# very important! please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface
+
+MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B
+CKPTS_DIR=./ckpts/${project_name}/${exp_name}
+TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet
+TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet
+# Algorithm
+temperature=1.0
+top_p=1.0
+top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
+val_top_p=0.7
+
+# Performance Related Parameter
+use_dynamic_bsz=True
+actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
+infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
+offload=True
+gen_tp=4
+sp_size=4
+fsdp_size=2
+
+# reference run wandb: https://wandb.ai/verl-org/DAPO%20Reproduction%20on%20verl/runs/ow47vvon?nw=nwusertongyuxuan361
+
+
+python -m verl.trainer.main_ppo \
+    data.train_files="${TRAIN_FILE}" \
+    data.val_files="${TEST_FILE}" \
+    data.prompt_key=prompt \
+    data.truncation='left' \
+    data.max_prompt_length=${max_prompt_length} \
+    data.max_response_length=${max_response_length} \
+    data.train_batch_size=${train_prompt_bsz} \
+    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
+    algorithm.adv_estimator=${adv_estimator} \
+    algorithm.use_kl_in_reward=${use_kl_in_reward} \
+    algorithm.kl_ctrl.kl_coef=${kl_coef} \
+    actor_rollout_ref.actor.strategy=fsdp2 \
+    critic.strategy=fsdp2 \
+    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
+    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
+    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
+    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
+    actor_rollout_ref.actor.clip_ratio_c=10.0 \
+    actor_rollout_ref.model.use_remove_padding=True \
+    +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \
+    actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
+    actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+    actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+    actor_rollout_ref.model.path="${MODEL_PATH}" \
+    actor_rollout_ref.model.enable_gradient_checkpointing=True \
+    actor_rollout_ref.actor.optim.lr=1e-6 \
+    actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
+    actor_rollout_ref.actor.optim.weight_decay=0.1 \
+    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
+    actor_rollout_ref.actor.fsdp_config.param_offload=${offload} \
+    actor_rollout_ref.actor.fsdp_config.optimizer_offload=${offload} \
+    actor_rollout_ref.actor.entropy_coeff=0 \
+    actor_rollout_ref.actor.grad_clip=1.0 \
+    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
+    actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \
+    actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
+    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
+    actor_rollout_ref.rollout.enable_chunked_prefill=True \
+    actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
+    actor_rollout_ref.rollout.temperature=${temperature} \
+    actor_rollout_ref.rollout.top_p=${top_p} \
+    actor_rollout_ref.rollout.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
+    actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
+    actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
+    actor_rollout_ref.rollout.val_kwargs.n=1 \
+    actor_rollout_ref.ref.fsdp_config.param_offload=${offload} \
+    actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \
+    actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \
+    reward_model.reward_manager=dapo \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
+    +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
+    trainer.logger=['console','tensorboard'] \
+    trainer.project_name="${project_name}" \
+    trainer.experiment_name="${exp_name}" \
+    trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
+    trainer.nnodes="${NNODES}" \
+    trainer.val_before_train=True \
+    trainer.test_freq=20 \
+    trainer.save_freq=-1 \
+    trainer.total_epochs=10 \
+    trainer.total_training_steps=400 \
+    trainer.default_local_dir="${CKPTS_DIR}" \
+    trainer.resume_mode=auto \
+    trainer.log_val_generations=10
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_colocate/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_colocate/runtime_env.yaml
new file mode 100644
index 00000000000..8fc2de3e70b
--- /dev/null
+++ b/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_colocate/runtime_env.yaml
@@ -0,0 +1,3 @@
+env_vars:
+  TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-7B-math-128/dapo_qwen2-7B-math_28k_fsdp2_colocate_128_mbs32"
+  HYDRA_FULL_ERROR: "1"
\ No newline at end of file
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_16-16/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_16-16/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh
new file mode 100644
index 00000000000..618497c0257
--- /dev/null
+++ b/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_16-16/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh
@@ -0,0 +1,174 @@
+#!/usr/bin/env bash
+set -xeuo pipefail
+# dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20
+project_name='DAPO'
+exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20'
+
+# Ray
+# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
+# WORKING_DIR=${WORKING_DIR:-"${PWD}"}
+# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
+# Paths
+RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
+# very important! please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface
+MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen2.5-Math-7B"}
+CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"}
+TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"}
+TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"}
+
+# MODEL_PATH=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/modelscope/hub/models/Qwen/Qwen2___5-Math-7B
+MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B
+CKPTS_DIR=./ckpts/${project_name}/${exp_name}
+# TRAIN_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/dapo-math-17k.parquet
+# TEST_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/aime-2024.parquet
+TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet
+TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet
+
+rollout_mode="async"
+rollout_name="vllm" # sglang or vllm
+if [ "$rollout_mode" = "async" ]; then
+    export VLLM_USE_V1=1
+    return_raw_chat="True"
+fi
+
+# Algorithm parameters
+adv_estimator=grpo
+
+use_kl_in_reward=False
+kl_coef=0.0
+use_kl_loss=False
+kl_loss_coef=0.0
+
+clip_ratio_low=0.2
+clip_ratio_high=0.28
+
+# Response length parameters
+max_prompt_length=$((1024 * 2))
+max_response_length=$((1024 *8))
+enable_overlong_buffer=True
+overlong_buffer_len=$((1024 * 4))
+overlong_penalty_factor=1.0
+
+# Training parameters
+loss_agg_mode="token-mean"
+
+# Algorithm
+temperature=1.0
+top_p=1.0
+top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
+val_top_p=0.7
+
+# Performance Related Parameter
+use_dynamic_bsz=True
+actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
+infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
+ref_offload=True
+actor_offload=False
+gen_tp=1
+sp_size=1
+fsdp_size=2
+
+# Fully async specific parameters
+NNODES=${NNODES:-2}
+NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
+
+n_gpus_rollout=6
+n_gpus_training=$((NGPUS_PER_NODE - n_gpus_rollout))
+
+train_prompt_bsz=0
+gen_prompt_bsz=1
+n_resp_per_prompt=16
+train_prompt_mini_bsz=32
+total_rollout_steps=$(((512*100)))
+test_freq=10
+staleness_threshold=1
+trigger_parameter_sync_step=64
+partial_rollout=True
+
+PYTHON_INTERPRETER="/home/hadoop-djst-algoplat/miniconda3/bin/python"
+if [ ! -x "$PYTHON_INTERPRETER" ]; then
+    PYTHON_INTERPRETER="python3"
+fi
+
+$PYTHON_INTERPRETER -m recipe.fully_async_policy.fully_async_main \
+    data.train_files="${TRAIN_FILE}" \
+    data.val_files="${TEST_FILE}" \
+    data.prompt_key=prompt \
+    data.truncation='left' \
+    data.max_prompt_length=${max_prompt_length} \
+    data.max_response_length=${max_response_length} \
+    data.train_batch_size=${train_prompt_bsz} \
+    data.gen_batch_size=${gen_prompt_bsz} \
+    data.return_raw_chat=${return_raw_chat} \
+    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
+    algorithm.adv_estimator=${adv_estimator} \
+    algorithm.use_kl_in_reward=${use_kl_in_reward} \
+    algorithm.kl_ctrl.kl_coef=${kl_coef} \
+    actor_rollout_ref.actor.strategy=fsdp2 \
+    critic.strategy=fsdp2 \
+    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
+    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
+    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
+    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
+    actor_rollout_ref.actor.clip_ratio_c=10.0 \
+    actor_rollout_ref.model.use_remove_padding=True \
+    actor_rollout_ref.hybrid_engine=False \
+    +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \
+    actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
+    actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+    actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+    actor_rollout_ref.model.path="${MODEL_PATH}" \
+    actor_rollout_ref.actor.optim.lr=1e-6 \
+    actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
+    actor_rollout_ref.actor.optim.weight_decay=0.1 \
+    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
+    actor_rollout_ref.actor.fsdp_config.param_offload=${actor_offload} \
+    actor_rollout_ref.actor.fsdp_config.optimizer_offload=${actor_offload} \
+    actor_rollout_ref.actor.entropy_coeff=0 \
+    actor_rollout_ref.actor.grad_clip=1.0 \
+    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
+    actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \
+    actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
+    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
+    actor_rollout_ref.rollout.enable_chunked_prefill=True \
+    actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
+    actor_rollout_ref.rollout.temperature=${temperature} \
+    actor_rollout_ref.rollout.top_p=${top_p} \
+    actor_rollout_ref.rollout.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
+    actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
+    actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
+    actor_rollout_ref.rollout.val_kwargs.n=1 \
+    actor_rollout_ref.ref.fsdp_config.param_offload=${ref_offload} \
+    actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \
+    actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \
+    actor_rollout_ref.rollout.name=${rollout_name} \
+    actor_rollout_ref.rollout.mode=${rollout_mode} \
+    actor_rollout_ref.rollout.calculate_log_probs=True \
+    reward_model.reward_manager=dapo \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
+    +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
+    trainer.logger=['console','tensorboard'] \
+    trainer.project_name="${project_name}" \
+    trainer.experiment_name="${exp_name}" \
+    trainer.val_before_train=True \
+    trainer.save_freq=-1 \
+    trainer.default_local_dir="${CKPTS_DIR}" \
+    trainer.resume_mode=auto \
+    trainer.nnodes="${NNODES}" \
+    trainer.n_gpus_per_node="${n_gpus_training}" \
+    rollout.nnodes="${NNODES}" \
+    rollout.n_gpus_per_node="${n_gpus_rollout}" \
+    rollout.total_rollout_steps="${total_rollout_steps}" \
+    rollout.total_epochs=10 \
+    rollout.test_freq="${test_freq}" \
+    async_training.staleness_threshold="${staleness_threshold}" \
+    async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \
+    async_training.partial_rollout="${partial_rollout}"
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_16-16/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_16-16/runtime_env.yaml
new file mode 100644
index 00000000000..dcca08e67f7
--- /dev/null
+++ b/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_16-16/runtime_env.yaml
@@ -0,0 +1,5 @@
+env_vars:
+  VLLM_USE_V1: "1"
+  TENSORBOARD_DIR: "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/tensorboard/dapo_7b_math_fsdp2_4_12"
+  NCCL_DEBUG: "INFO"
+  HYDRA_FULL_ERROR: "1"
\ No newline at end of file
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_24-8/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_24-8/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh
new file mode 100644
index 00000000000..618497c0257
--- /dev/null
+++ b/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_24-8/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh
@@ -0,0 +1,174 @@
+#!/usr/bin/env bash
+set -xeuo pipefail
+# dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20
+project_name='DAPO'
+exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20'
+
+# Ray
+# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
+# WORKING_DIR=${WORKING_DIR:-"${PWD}"}
+# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
+# Paths
+RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
+# very important! please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface
+MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen2.5-Math-7B"}
+CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"}
+TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"}
+TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"}
+
+# MODEL_PATH=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/modelscope/hub/models/Qwen/Qwen2___5-Math-7B
+MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B
+CKPTS_DIR=./ckpts/${project_name}/${exp_name}
+# TRAIN_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/dapo-math-17k.parquet
+# TEST_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/aime-2024.parquet
+TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet
+TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet
+
+rollout_mode="async"
+rollout_name="vllm" # sglang or vllm
+if [ "$rollout_mode" = "async" ]; then
+    export VLLM_USE_V1=1
+    return_raw_chat="True"
+fi
+
+# Algorithm parameters
+adv_estimator=grpo
+
+use_kl_in_reward=False
+kl_coef=0.0
+use_kl_loss=False
+kl_loss_coef=0.0
+
+clip_ratio_low=0.2
+clip_ratio_high=0.28
+
+# Response length parameters
+max_prompt_length=$((1024 * 2))
+max_response_length=$((1024 *8))
+enable_overlong_buffer=True
+overlong_buffer_len=$((1024 * 4))
+overlong_penalty_factor=1.0
+
+# Training parameters
+loss_agg_mode="token-mean"
+
+# Algorithm
+temperature=1.0
+top_p=1.0
+top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
+val_top_p=0.7
+
+# Performance Related Parameter
+use_dynamic_bsz=True
+actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
+infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
+ref_offload=True
+actor_offload=False
+gen_tp=1
+sp_size=1
+fsdp_size=2
+
+# Fully async specific parameters
+NNODES=${NNODES:-2}
+NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
+
+n_gpus_rollout=6
+n_gpus_training=$((NGPUS_PER_NODE - n_gpus_rollout))
+
+train_prompt_bsz=0
+gen_prompt_bsz=1
+n_resp_per_prompt=16
+train_prompt_mini_bsz=32
+total_rollout_steps=$(((512*100)))
+test_freq=10
+staleness_threshold=1
+trigger_parameter_sync_step=64
+partial_rollout=True
+
+PYTHON_INTERPRETER="/home/hadoop-djst-algoplat/miniconda3/bin/python"
+if [ ! -x "$PYTHON_INTERPRETER" ]; then
+    PYTHON_INTERPRETER="python3"
+fi
+
+$PYTHON_INTERPRETER -m recipe.fully_async_policy.fully_async_main \
+    data.train_files="${TRAIN_FILE}" \
+    data.val_files="${TEST_FILE}" \
+    data.prompt_key=prompt \
+    data.truncation='left' \
+    data.max_prompt_length=${max_prompt_length} \
+    data.max_response_length=${max_response_length} \
+    data.train_batch_size=${train_prompt_bsz} \
+    data.gen_batch_size=${gen_prompt_bsz} \
+    data.return_raw_chat=${return_raw_chat} \
+    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
+    algorithm.adv_estimator=${adv_estimator} \
+    algorithm.use_kl_in_reward=${use_kl_in_reward} \
+    algorithm.kl_ctrl.kl_coef=${kl_coef} \
+    actor_rollout_ref.actor.strategy=fsdp2 \
+    critic.strategy=fsdp2 \
+    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
+    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
+    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
+    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
+    actor_rollout_ref.actor.clip_ratio_c=10.0 \
+    actor_rollout_ref.model.use_remove_padding=True \
+    actor_rollout_ref.hybrid_engine=False \
+    +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \
+    actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
+    actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+    actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+    actor_rollout_ref.model.path="${MODEL_PATH}" \
+    actor_rollout_ref.actor.optim.lr=1e-6 \
+    actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
+    actor_rollout_ref.actor.optim.weight_decay=0.1 \
+    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
+    actor_rollout_ref.actor.fsdp_config.param_offload=${actor_offload} \
+    actor_rollout_ref.actor.fsdp_config.optimizer_offload=${actor_offload} \
+    actor_rollout_ref.actor.entropy_coeff=0 \
+    actor_rollout_ref.actor.grad_clip=1.0 \
+    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
+    actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \
+    actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
+    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
+    actor_rollout_ref.rollout.enable_chunked_prefill=True \
+    actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
+    actor_rollout_ref.rollout.temperature=${temperature} \
+    actor_rollout_ref.rollout.top_p=${top_p} \
+    actor_rollout_ref.rollout.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
+    actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
+    actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
+    actor_rollout_ref.rollout.val_kwargs.n=1 \
+    actor_rollout_ref.ref.fsdp_config.param_offload=${ref_offload} \
+    actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \
+    actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \
+    actor_rollout_ref.rollout.name=${rollout_name} \
+    actor_rollout_ref.rollout.mode=${rollout_mode} \
+    actor_rollout_ref.rollout.calculate_log_probs=True \
+    reward_model.reward_manager=dapo \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
+    +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
+    trainer.logger=['console','tensorboard'] \
+    trainer.project_name="${project_name}" \
+    trainer.experiment_name="${exp_name}" \
+    trainer.val_before_train=True \
+    trainer.save_freq=-1 \
+    trainer.default_local_dir="${CKPTS_DIR}" \
+    trainer.resume_mode=auto \
+    trainer.nnodes="${NNODES}" \
+    trainer.n_gpus_per_node="${n_gpus_training}" \
+    rollout.nnodes="${NNODES}" \
+    rollout.n_gpus_per_node="${n_gpus_rollout}" \
+    rollout.total_rollout_steps="${total_rollout_steps}" \
+    rollout.total_epochs=10 \
+    rollout.test_freq="${test_freq}" \
+    async_training.staleness_threshold="${staleness_threshold}" \
+    async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \
+    async_training.partial_rollout="${partial_rollout}"
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_24-8/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_24-8/runtime_env.yaml
new file mode 100644
index 00000000000..dcca08e67f7
--- /dev/null
+++ b/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_24-8/runtime_env.yaml
@@ -0,0 +1,5 @@
+env_vars:
+  VLLM_USE_V1: "1"
+  TENSORBOARD_DIR: "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/tensorboard/dapo_7b_math_fsdp2_4_12"
+  NCCL_DEBUG: "INFO"
+  HYDRA_FULL_ERROR: "1"
\ No newline at end of file
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_8-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_8-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh
new file mode 100644
index 00000000000..618497c0257
--- /dev/null
+++ b/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_8-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh
@@ -0,0 +1,174 @@
+#!/usr/bin/env bash
+set -xeuo pipefail
+# dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20
+project_name='DAPO'
+exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20'
+
+# Ray
+# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
+# WORKING_DIR=${WORKING_DIR:-"${PWD}"}
+# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
+# Paths
+RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
+# very important! please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface
+MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen2.5-Math-7B"}
+CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"}
+TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"}
+TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"}
+
+# MODEL_PATH=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/modelscope/hub/models/Qwen/Qwen2___5-Math-7B
+MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B
+CKPTS_DIR=./ckpts/${project_name}/${exp_name}
+# TRAIN_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/dapo-math-17k.parquet
+# TEST_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/aime-2024.parquet
+TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet
+TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet
+
+rollout_mode="async"
+rollout_name="vllm" # sglang or vllm
+if [ "$rollout_mode" = "async" ]; then
+    export VLLM_USE_V1=1
+    return_raw_chat="True"
+fi
+
+# Algorithm parameters
+adv_estimator=grpo
+
+use_kl_in_reward=False
+kl_coef=0.0
+use_kl_loss=False
+kl_loss_coef=0.0
+
+clip_ratio_low=0.2
+clip_ratio_high=0.28
+
+# Response length parameters
+max_prompt_length=$((1024 * 2))
+max_response_length=$((1024 *8))
+enable_overlong_buffer=True
+overlong_buffer_len=$((1024 * 4))
+overlong_penalty_factor=1.0
+
+# Training parameters
+loss_agg_mode="token-mean"
+
+# Algorithm
+temperature=1.0
+top_p=1.0
+top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
+val_top_p=0.7
+
+# Performance Related Parameter
+use_dynamic_bsz=True
+actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
+infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
+ref_offload=True
+actor_offload=False
+gen_tp=1
+sp_size=1
+fsdp_size=2
+
+# Fully async specific parameters
+NNODES=${NNODES:-2}
+NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
+
+n_gpus_rollout=6
+n_gpus_training=$((NGPUS_PER_NODE - n_gpus_rollout))
+
+train_prompt_bsz=0
+gen_prompt_bsz=1
+n_resp_per_prompt=16
+train_prompt_mini_bsz=32
+total_rollout_steps=$(((512*100)))
+test_freq=10
+staleness_threshold=1
+trigger_parameter_sync_step=64
+partial_rollout=True
+
+PYTHON_INTERPRETER="/home/hadoop-djst-algoplat/miniconda3/bin/python"
+if [ ! -x "$PYTHON_INTERPRETER" ]; then
+    PYTHON_INTERPRETER="python3"
+fi
+
+$PYTHON_INTERPRETER -m recipe.fully_async_policy.fully_async_main \
+    data.train_files="${TRAIN_FILE}" \
+    data.val_files="${TEST_FILE}" \
+    data.prompt_key=prompt \
+    data.truncation='left' \
+    data.max_prompt_length=${max_prompt_length} \
+    data.max_response_length=${max_response_length} \
+    data.train_batch_size=${train_prompt_bsz} \
+    data.gen_batch_size=${gen_prompt_bsz} \
+    data.return_raw_chat=${return_raw_chat} \
+    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
+    algorithm.adv_estimator=${adv_estimator} \
+    algorithm.use_kl_in_reward=${use_kl_in_reward} \
+    algorithm.kl_ctrl.kl_coef=${kl_coef} \
+    actor_rollout_ref.actor.strategy=fsdp2 \
+    critic.strategy=fsdp2 \
+    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
+    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
+    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
+    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
+    actor_rollout_ref.actor.clip_ratio_c=10.0 \
+    actor_rollout_ref.model.use_remove_padding=True \
+    actor_rollout_ref.hybrid_engine=False \
+    +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \
+    actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
+    actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+    actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+    actor_rollout_ref.model.path="${MODEL_PATH}" \
+    actor_rollout_ref.actor.optim.lr=1e-6 \
+    actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
+    actor_rollout_ref.actor.optim.weight_decay=0.1 \
+    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
+    actor_rollout_ref.actor.fsdp_config.param_offload=${actor_offload} \
+    actor_rollout_ref.actor.fsdp_config.optimizer_offload=${actor_offload} \
+    actor_rollout_ref.actor.entropy_coeff=0 \
+    actor_rollout_ref.actor.grad_clip=1.0 \
+    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
+    actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \
+    actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
+    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
+    actor_rollout_ref.rollout.enable_chunked_prefill=True \
+    actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
+    actor_rollout_ref.rollout.temperature=${temperature} \
+    actor_rollout_ref.rollout.top_p=${top_p} \
+    actor_rollout_ref.rollout.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
+    actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
+    actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
+    actor_rollout_ref.rollout.val_kwargs.n=1 \
+    actor_rollout_ref.ref.fsdp_config.param_offload=${ref_offload} \
+    actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \
+    actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \
+    actor_rollout_ref.rollout.name=${rollout_name} \
+    actor_rollout_ref.rollout.mode=${rollout_mode} \
+    actor_rollout_ref.rollout.calculate_log_probs=True \
+    reward_model.reward_manager=dapo \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
+    +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
+    trainer.logger=['console','tensorboard'] \
+    trainer.project_name="${project_name}" \
+    trainer.experiment_name="${exp_name}" \
+    trainer.val_before_train=True \
+    trainer.save_freq=-1 \
+    trainer.default_local_dir="${CKPTS_DIR}" \
+    trainer.resume_mode=auto \
+    trainer.nnodes="${NNODES}" \
+    trainer.n_gpus_per_node="${n_gpus_training}" \
+    rollout.nnodes="${NNODES}" \
+    rollout.n_gpus_per_node="${n_gpus_rollout}" \
+    rollout.total_rollout_steps="${total_rollout_steps}" \
+    rollout.total_epochs=10 \
+    rollout.test_freq="${test_freq}" \
+    async_training.staleness_threshold="${staleness_threshold}" \
+    async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \
+    async_training.partial_rollout="${partial_rollout}"
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_8-24/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_8-24/runtime_env.yaml
new file mode 100644
index 00000000000..dcca08e67f7
--- /dev/null
+++ b/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_8-24/runtime_env.yaml
@@ -0,0 +1,5 @@
+env_vars:
+  VLLM_USE_V1: "1"
+  TENSORBOARD_DIR: "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/tensorboard/dapo_7b_math_fsdp2_4_12"
+  NCCL_DEBUG: "INFO"
+  HYDRA_FULL_ERROR: "1"
\ No newline at end of file
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_128/megatron_colocate/dapo_7b_math_megatron_colocate.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_128/megatron_colocate/dapo_7b_math_megatron_colocate.sh
new file mode 100644
index 00000000000..f98aeb86b57
--- /dev/null
+++ b/recipe/fully_async_policy/exp/qwen2-7B-math_128/megatron_colocate/dapo_7b_math_megatron_colocate.sh
@@ -0,0 +1,135 @@
+#!/usr/bin/env bash
+set -xeuo pipefail
+
+project_name='DAPO'
+exp_name='dapo_qwen2-7B-math_28k_megatron_colocate_64_mbs32'
+
+adv_estimator=grpo
+
+use_kl_in_reward=False
+kl_coef=0.0
+use_kl_loss=False
+kl_loss_coef=0.0
+
+clip_ratio_low=0.2
+clip_ratio_high=0.28
+
+max_prompt_length=$((1024 * 2))
+max_response_length=$((1024 * 28))
+enable_overlong_buffer=True
+overlong_buffer_len=$((1024 * 4))
+overlong_penalty_factor=1.0
+
+loss_agg_mode="token-mean"
+
+train_prompt_bsz=512
+n_resp_per_prompt=16
+train_prompt_mini_bsz=32
+
+# Ray
+# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
+# WORKING_DIR=${WORKING_DIR:-"${PWD}"}
+# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
+NNODES=${NNODES:-8}
+NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
+# Paths
+MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B
+CKPTS_DIR=./ckpts/${project_name}/${exp_name}
+TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet
+TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet
+
+# Algorithm
+temperature=1.0
+top_p=1.0
+top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
+val_top_p=0.7
+
+# Performance Related Parameter
+use_dynamic_bsz=True
+actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
+infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
+offload=True
+gen_tp=4
+train_tp=4
+train_pp=2
+
+# TODO: support dynamic_bsz for megatron
+# actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
+# actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+# actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+# actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
+# actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+# actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+
+python3 -m verl.trainer.main_ppo \
+    --config-path=config \
+    --config-name='ppo_megatron_trainer.yaml' \
+    data.train_files="${TRAIN_FILE}" \
+    data.val_files="${TEST_FILE}" \
+    data.prompt_key=prompt \
+    data.truncation='left' \
+    data.max_prompt_length=${max_prompt_length} \
+    data.max_response_length=${max_response_length} \
+    data.train_batch_size=${train_prompt_bsz} \
+    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
+    algorithm.adv_estimator=${adv_estimator} \
+    algorithm.use_kl_in_reward=${use_kl_in_reward} \
+    algorithm.kl_ctrl.kl_coef=${kl_coef} \
+    actor_rollout_ref.actor.strategy=megatron \
+    critic.strategy=megatron \
+    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
+    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
+    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
+    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
+    actor_rollout_ref.actor.clip_ratio_c=10.0 \
+    +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \
+    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \
+    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
+    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
+    actor_rollout_ref.model.path="${MODEL_PATH}" \
+    actor_rollout_ref.actor.optim.lr=1e-6 \
+    actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
+    actor_rollout_ref.actor.optim.weight_decay=0.1 \
+    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
+    actor_rollout_ref.actor.megatron.param_offload=${offload} \
+    actor_rollout_ref.actor.megatron.optimizer_offload=${offload} \
+    actor_rollout_ref.actor.megatron.grad_offload=${offload} \
+    actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${train_pp} \
+    actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${train_tp} \
+    actor_rollout_ref.actor.entropy_coeff=0 \
+    actor_rollout_ref.actor.optim.clip_grad=1.0 \
+    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
+    actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
+    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
+    actor_rollout_ref.rollout.enable_chunked_prefill=True \
+    actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
+    actor_rollout_ref.rollout.temperature=${temperature} \
+    actor_rollout_ref.rollout.top_p=${top_p} \
+    actor_rollout_ref.rollout.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
+    actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
+    actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
+    actor_rollout_ref.rollout.val_kwargs.n=1 \
+    actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=${train_pp} \
+    actor_rollout_ref.ref.megatron.tensor_model_parallel_size=${train_tp} \
+    actor_rollout_ref.ref.megatron.param_offload=${offload} \
+    reward_model.reward_manager=dapo \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
+    +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
+    trainer.logger=['console','tensorboard'] \
+    trainer.project_name="${project_name}" \
+    trainer.experiment_name="${exp_name}" \
+    trainer.n_gpus_per_node=8 \
+    trainer.nnodes="${NNODES}" \
+    trainer.val_before_train=True \
+    trainer.test_freq=20 \
+    trainer.save_freq=-1 \
+    trainer.total_epochs=10 \
+    trainer.total_training_steps=400 \
+    trainer.default_local_dir="${CKPTS_DIR}" \
+    trainer.resume_mode=auto \
+    trainer.log_val_generations=10
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_128/megatron_colocate/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_128/megatron_colocate/runtime_env.yaml
new file mode 100644
index 00000000000..6e33f46a65a
--- /dev/null
+++ b/recipe/fully_async_policy/exp/qwen2-7B-math_128/megatron_colocate/runtime_env.yaml
@@ -0,0 +1,3 @@
+env_vars:
+  TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-7B-math-128/dapo_qwen2-7B-math_28k_megatron_colocate_128_mbs32"
+  HYDRA_FULL_ERROR: "1"
\ No newline at end of file
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_32/megatron_colocate/dapo_7b_math_megatron_colocate.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_32/megatron_colocate/dapo_7b_math_megatron_colocate.sh
index 8bf1af32da8..3879a99df67 100644
--- a/recipe/fully_async_policy/exp/qwen2-7B-math_32/megatron_colocate/dapo_7b_math_megatron_colocate.sh
+++ b/recipe/fully_async_policy/exp/qwen2-7B-math_32/megatron_colocate/dapo_7b_math_megatron_colocate.sh
@@ -50,8 +50,8 @@ actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
 infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
 offload=True
 gen_tp=4
-train_tp=2
-train_pp=1
+train_tp=4
+train_pp=2
 
 # TODO: support dynamic_bsz for megatron
 # actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_64/megatron_colocate/dapo_7b_math_megatron_colocate.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_64/megatron_colocate/dapo_7b_math_megatron_colocate.sh
index 7444ec90c99..f98aeb86b57 100644
--- a/recipe/fully_async_policy/exp/qwen2-7B-math_64/megatron_colocate/dapo_7b_math_megatron_colocate.sh
+++ b/recipe/fully_async_policy/exp/qwen2-7B-math_64/megatron_colocate/dapo_7b_math_megatron_colocate.sh
@@ -50,8 +50,8 @@ actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
 infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
 offload=True
 gen_tp=4
-train_tp=2
-train_pp=1
+train_tp=4
+train_pp=2
 
 # TODO: support dynamic_bsz for megatron
 # actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
diff --git a/recipe/fully_async_policy/exp/qwen3-30BA3B_128/fsdp2_colocate/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen3-30BA3B_128/fsdp2_colocate/runtime_env.yaml
new file mode 100644
index 00000000000..069b1f14aa0
--- /dev/null
+++ b/recipe/fully_async_policy/exp/qwen3-30BA3B_128/fsdp2_colocate/runtime_env.yaml
@@ -0,0 +1,3 @@
+env_vars:
+  TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/dapo_qwen3-30BA3B/dapo_qwen3-30BA3B-math_32k_fsdp2_colocate_128_mbs32"
+  HYDRA_FULL_ERROR: "1"
\ No newline at end of file
diff --git a/recipe/fully_async_policy/exp/qwen3-30BA3B_128/fsdp2_colocate/test_dapo_qwen3_30b_math.sh b/recipe/fully_async_policy/exp/qwen3-30BA3B_128/fsdp2_colocate/test_dapo_qwen3_30b_math.sh
new file mode 100644
index 00000000000..591ac8533ee
--- /dev/null
+++ b/recipe/fully_async_policy/exp/qwen3-30BA3B_128/fsdp2_colocate/test_dapo_qwen3_30b_math.sh
@@ -0,0 +1,125 @@
+#!/usr/bin/env bash
+set -xeuo pipefail
+
+project_name='DAPO'
+exp_name='dapo_qwen3-30BA3B-math_32k_fsdp2_colocate_128_mbs32'
+
+adv_estimator=grpo
+
+use_kl_in_reward=False
+kl_coef=0.0
+use_kl_loss=False
+kl_loss_coef=0.0
+
+clip_ratio_low=0.2
+clip_ratio_high=0.28
+
+max_prompt_length=$((1024 * 2))
+max_response_length=$((1024 * 32))
+enable_overlong_buffer=True
+overlong_buffer_len=$((1024 * 4))
+overlong_penalty_factor=1.0
+
+loss_agg_mode="token-mean"
+
+train_prompt_bsz=512
+n_resp_per_prompt=16
+train_prompt_mini_bsz=32
+
+# Ray
+# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
+# WORKING_DIR=${WORKING_DIR:-"${PWD}"}
+# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
+NNODES=${NNODES:-16}
+NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
+# Paths
+MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/.friday/models/basemodel/Qwen/Qwen3-30B-A3B
+CKPTS_DIR=./ckpts/${project_name}/${exp_name}
+TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet
+TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet
+
+# Algorithm
+temperature=1.0
+top_p=1.0
+top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
+val_top_p=0.7
+
+# Performance Related Parameter
+sp_size=4
+use_dynamic_bsz=True
+actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
+infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
+offload=True
+gen_tp=4
+fsdp_size=32
+
+python3 -m verl.trainer.main_ppo \
+    data.train_files="${TRAIN_FILE}" \
+    data.val_files="${TEST_FILE}" \
+    data.prompt_key=prompt \
+    data.truncation='left' \
+    data.max_prompt_length=${max_prompt_length} \
+    data.max_response_length=${max_response_length} \
+    data.train_batch_size=${train_prompt_bsz} \
+    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
+    algorithm.adv_estimator=${adv_estimator} \
+    algorithm.use_kl_in_reward=${use_kl_in_reward} \
+    algorithm.kl_ctrl.kl_coef=${kl_coef} \
+    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
+    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
+    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
+    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
+    actor_rollout_ref.actor.clip_ratio_c=10.0 \
+    actor_rollout_ref.model.use_remove_padding=True \
+    actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
+    actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+    actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+    actor_rollout_ref.model.path="${MODEL_PATH}" \
+    actor_rollout_ref.model.enable_gradient_checkpointing=True \
+    actor_rollout_ref.actor.optim.lr=1e-6 \
+    actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
+    actor_rollout_ref.actor.optim.weight_decay=0.1 \
+    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
+    actor_rollout_ref.actor.fsdp_config.param_offload=${offload} \
+    actor_rollout_ref.actor.fsdp_config.optimizer_offload=${offload} \
+    actor_rollout_ref.actor.entropy_coeff=0 \
+    actor_rollout_ref.actor.grad_clip=1.0 \
+    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
+    actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \
+    actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
+    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
+    actor_rollout_ref.rollout.enable_chunked_prefill=True \
+    actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
+    actor_rollout_ref.rollout.temperature=${temperature} \
+    actor_rollout_ref.rollout.top_p=${top_p} \
+    actor_rollout_ref.rollout.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
+    actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
+    actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
+    actor_rollout_ref.rollout.val_kwargs.n=1 \
+    actor_rollout_ref.ref.fsdp_config.param_offload=${offload} \
+    actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \
+    actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \
+    reward_model.reward_manager=dapo \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
+    +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
+    trainer.logger='["console","tensorboard"]' \
+    trainer.project_name="${project_name}" \
+    trainer.experiment_name="${exp_name}" \
+    trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
+    trainer.nnodes="${NNODES}" \
+    trainer.val_before_train=True \
+    trainer.test_freq=20 \
+    trainer.save_freq=-1 \
+    trainer.total_epochs=10 \
+    trainer.total_training_steps=400 \
+    trainer.default_local_dir="${CKPTS_DIR}" \
+    trainer.resume_mode=auto \
+    trainer.log_val_generations=10
diff --git a/recipe/fully_async_policy/exp/qwen3-32B_128/fsdp2_colocate/fsdp2.sh b/recipe/fully_async_policy/exp/qwen3-32B_128/fsdp2_colocate/fsdp2.sh
new file mode 100644
index 00000000000..8f2e636c59f
--- /dev/null
+++ b/recipe/fully_async_policy/exp/qwen3-32B_128/fsdp2_colocate/fsdp2.sh
@@ -0,0 +1,125 @@
+#!/usr/bin/env bash
+set -xeuo pipefail
+
+project_name='DAPO'
+exp_name='dapo_qwen3-32B-math_32k_fsdp2_colocate_128_mbs32'
+
+adv_estimator=grpo
+
+use_kl_in_reward=False
+kl_coef=0.0
+use_kl_loss=False
+kl_loss_coef=0.0
+
+clip_ratio_low=0.2
+clip_ratio_high=0.28
+
+max_prompt_length=$((1024 * 2))
+max_response_length=$((1024 * 32))
+enable_overlong_buffer=True
+overlong_buffer_len=$((1024 * 4))
+overlong_penalty_factor=1.0
+
+loss_agg_mode="token-mean"
+
+train_prompt_bsz=512
+n_resp_per_prompt=16
+train_prompt_mini_bsz=32
+
+# Ray
+# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
+# WORKING_DIR=${WORKING_DIR:-"${PWD}"}
+# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
+NNODES=${NNODES:-16}
+NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
+# Paths
+MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/.friday/models/basemodel/Qwen/Qwen3-32B
+CKPTS_DIR=./ckpts/${project_name}/${exp_name}
+TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet
+TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet
+
+# Algorithm
+temperature=1.0
+top_p=1.0
+top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
+val_top_p=0.7
+
+# Performance Related Parameter
+sp_size=4
+use_dynamic_bsz=True
+actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
+infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
+offload=True
+gen_tp=4
+fsdp_size=32
+
+python3 -m verl.trainer.main_ppo \
+    data.train_files="${TRAIN_FILE}" \
+    data.val_files="${TEST_FILE}" \
+    data.prompt_key=prompt \
+    data.truncation='left' \
+    data.max_prompt_length=${max_prompt_length} \
+    data.max_response_length=${max_response_length} \
+    data.train_batch_size=${train_prompt_bsz} \
+    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
+    algorithm.adv_estimator=${adv_estimator} \
+    algorithm.use_kl_in_reward=${use_kl_in_reward} \
+    algorithm.kl_ctrl.kl_coef=${kl_coef} \
+    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
+    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
+    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
+    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
+    actor_rollout_ref.actor.clip_ratio_c=10.0 \
+    actor_rollout_ref.model.use_remove_padding=True \
+    actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
+    actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+    actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+    actor_rollout_ref.model.path="${MODEL_PATH}" \
+    actor_rollout_ref.model.enable_gradient_checkpointing=True \
+    actor_rollout_ref.actor.optim.lr=1e-6 \
+    actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
+    actor_rollout_ref.actor.optim.weight_decay=0.1 \
+    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
+    actor_rollout_ref.actor.fsdp_config.param_offload=${offload} \
+    actor_rollout_ref.actor.fsdp_config.optimizer_offload=${offload} \
+    actor_rollout_ref.actor.entropy_coeff=0 \
+    actor_rollout_ref.actor.grad_clip=1.0 \
+    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
+    actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \
+    actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
+    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
+    actor_rollout_ref.rollout.enable_chunked_prefill=True \
+    actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
+    actor_rollout_ref.rollout.temperature=${temperature} \
+    actor_rollout_ref.rollout.top_p=${top_p} \
+    actor_rollout_ref.rollout.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
+    actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
+    actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
+    actor_rollout_ref.rollout.val_kwargs.n=1 \
+    actor_rollout_ref.ref.fsdp_config.param_offload=${offload} \
+    actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \
+    actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \
+    reward_model.reward_manager=dapo \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
+    +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
+    trainer.logger='["console","tensorboard"]' \
+    trainer.project_name="${project_name}" \
+    trainer.experiment_name="${exp_name}" \
+    trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
+    trainer.nnodes="${NNODES}" \
+    trainer.val_before_train=True \
+    trainer.test_freq=20 \
+    trainer.save_freq=-1 \
+    trainer.total_epochs=10 \
+    trainer.total_training_steps=400 \
+    trainer.default_local_dir="${CKPTS_DIR}" \
+    trainer.resume_mode=auto \
+    trainer.log_val_generations=10
diff --git a/recipe/fully_async_policy/exp/qwen3-32B_128/fsdp2_colocate/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen3-32B_128/fsdp2_colocate/runtime_env.yaml
new file mode 100644
index 00000000000..1b4a8ff4b82
--- /dev/null
+++ b/recipe/fully_async_policy/exp/qwen3-32B_128/fsdp2_colocate/runtime_env.yaml
@@ -0,0 +1,3 @@
+env_vars:
+  TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/dapo_qwen3-32B/dapo_qwen3-32B-math_32k_fsdp2_colocate_128_mbs32"
+  HYDRA_FULL_ERROR: "1"
\ No newline at end of file

From 3c239be789d18c14023d0b1b7c12f2857a726a42 Mon Sep 17 00:00:00 2001
From: wangshulin02 <953550366@qq.com>
Date: Sun, 7 Sep 2025 11:51:15 +0800
Subject: [PATCH 107/182] fix typo in use_rollout_log_probs

---
 recipe/fully_async_policy/detach_utils.py | 2 +-
 verl/trainer/ppo/ray_trainer.py           | 3 +--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/recipe/fully_async_policy/detach_utils.py b/recipe/fully_async_policy/detach_utils.py
index e01d82c1726..31738234c09 100644
--- a/recipe/fully_async_policy/detach_utils.py
+++ b/recipe/fully_async_policy/detach_utils.py
@@ -358,7 +358,7 @@ def get_aggregated_metrics(self) -> Dict[str, Any]:
         # Aggregate special metrics  
         aggregated = self._special_metrics_aggergate(aggregated)
 
-        print(f"******************************aggregated metrics done. cost {time.time() - t}")
+        print(f"aggregated metrics done. cost {time.time() - t}")
         
         return aggregated
     
diff --git a/verl/trainer/ppo/ray_trainer.py b/verl/trainer/ppo/ray_trainer.py
index 2d5a0538616..8d2c19d3364 100644
--- a/verl/trainer/ppo/ray_trainer.py
+++ b/verl/trainer/ppo/ray_trainer.py
@@ -1248,12 +1248,11 @@ def _process_batch_common(self, batch, metrics, timing_raw):
         # recompute old_log_probs
         with marked_timer("old_log_prob", timing_raw, color="blue"):
             async_training = self.config.get("async_training", None)
-            if async_training and async_training.use_rollout_log_prob:
+            if async_training and async_training.use_rollout_log_probs:
                 batch.batch["old_log_probs"] = batch.batch["rollout_log_probs"]
                 batch.meta_info["temperature"] = self.config.actor_rollout_ref.rollout.temperature
 
             else:
-
                 old_log_prob = self.actor_rollout_wg.compute_log_prob(batch)
                 entropys = old_log_prob.batch["entropys"]
                 response_masks = batch.batch["response_mask"]

From 9cbce52395fa01ccf936f12828e1b1853cf1b964 Mon Sep 17 00:00:00 2001
From: wangshulin02 <953550366@qq.com>
Date: Sun, 7 Sep 2025 12:42:42 +0800
Subject: [PATCH 108/182] remove unused code

---
 recipe/fully_async_policy/detach_utils.py | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/recipe/fully_async_policy/detach_utils.py b/recipe/fully_async_policy/detach_utils.py
index 31738234c09..127afca6881 100644
--- a/recipe/fully_async_policy/detach_utils.py
+++ b/recipe/fully_async_policy/detach_utils.py
@@ -254,12 +254,7 @@ def _init_aggregation_rules(self) -> Dict[str, Dict[str, List[str]]]:
         return {
             # Time-Based metrics, can add metrics here
             'time_sum': [
-                'timing_s/adv',
-                'timing_s/gen',
-                'timing_s/old_log_prob',
-                'timing_s/reward',
-                'timing_s/step',
-                'timing_s/update_actor',
+                'perf/time_per_step'
             ],
         }
     

From 5539e037035a71bb93dd892fd6f28ac0af75afe7 Mon Sep 17 00:00:00 2001
From: wangshulin02 <953550366@qq.com>
Date: Mon, 8 Sep 2025 15:21:43 +0800
Subject: [PATCH 109/182] add exp fully_async 32, 64

---
 .../dapo_7b_math_fsdp2_colocate.sh            | 133 -------------
 .../fsdp2_colocate_64/runtime_env.yaml        |   3 -
 ...8k_fsdp2_fully-async_16-16_mbs32_tfq16.sh} |  36 ++--
 .../fsdp2_fully-async_16-16/runtime_env.yaml  |   2 +-
 ...28k_fsdp2_fully-async_24-8_mbs32_tfq32.sh} |  36 ++--
 .../fsdp2_fully-async_24-8/runtime_env.yaml   |   2 +-
 ...28k_fsdp2_fully-async_8-24_mbs32_tfq11.sh} |  36 ++--
 .../fsdp2_fully-async_8-24/runtime_env.yaml   |   2 +-
 .../fsdp2_fully-async_16-16/runtime_env.yaml  |   5 -
 ...28k_fsdp2_fully-async_24-40_mbs32_tfq6.sh} |  36 ++--
 .../fsdp2_fully-async_24-40/runtime_env.yaml  |   5 +
 ..._fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh | 174 ------------------
 .../fsdp2_fully-async_24-8/runtime_env.yaml   |   5 -
 ..._28k_fsdp2_fully-async_32-32_mbs32_tfq8.sh | 168 +++++++++++++++++
 .../fsdp2_fully-async_32-32/runtime_env.yaml  |   5 +
 ...28k_fsdp2_fully-async_40-24_mbs32_tfq11.sh | 168 +++++++++++++++++
 .../fsdp2_fully-async_40-24/runtime_env.yaml  |   5 +
 ..._fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh | 174 ------------------
 .../fsdp2_fully-async_8-24/runtime_env.yaml   |   5 -
 19 files changed, 414 insertions(+), 586 deletions(-)
 delete mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_colocate_64/dapo_7b_math_fsdp2_colocate.sh
 delete mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_colocate_64/runtime_env.yaml
 rename recipe/fully_async_policy/exp/qwen2-7B-math_32/{fsdp2_fully-async_8-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh => fsdp2_fully-async_16-16/dapo_qwen2-7B-math_28k_fsdp2_fully-async_16-16_mbs32_tfq16.sh} (90%)
 rename recipe/fully_async_policy/exp/qwen2-7B-math_32/{fsdp2_fully-async_16-16/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh => fsdp2_fully-async_24-8/dapo_qwen2-7B-math_28k_fsdp2_fully-async_24-8_mbs32_tfq32.sh} (90%)
 rename recipe/fully_async_policy/exp/{qwen2-7B-math_64/fsdp2_fully-async_16-16/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh => qwen2-7B-math_32/fsdp2_fully-async_8-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tfq11.sh} (91%)
 delete mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_16-16/runtime_env.yaml
 rename recipe/fully_async_policy/exp/{qwen2-7B-math_32/fsdp2_fully-async_24-8/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh => qwen2-7B-math_64/fsdp2_fully-async_24-40/dapo_qwen2-7B-math_28k_fsdp2_fully-async_24-40_mbs32_tfq6.sh} (90%)
 create mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_24-40/runtime_env.yaml
 delete mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_24-8/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh
 delete mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_24-8/runtime_env.yaml
 create mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_32-32/dapo_qwen2-7B-math_28k_fsdp2_fully-async_32-32_mbs32_tfq8.sh
 create mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_32-32/runtime_env.yaml
 create mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_40-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_40-24_mbs32_tfq11.sh
 create mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_40-24/runtime_env.yaml
 delete mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_8-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh
 delete mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_8-24/runtime_env.yaml

diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_colocate_64/dapo_7b_math_fsdp2_colocate.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_colocate_64/dapo_7b_math_fsdp2_colocate.sh
deleted file mode 100644
index 8d42dca04ca..00000000000
--- a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_colocate_64/dapo_7b_math_fsdp2_colocate.sh
+++ /dev/null
@@ -1,133 +0,0 @@
-#!/usr/bin/env bash
-set -xeuo pipefail
-
-project_name='DAPO'
-exp_name='dapo_qwen2-7B-math_28k_fsdp2_colocate_32_mbs32'
-
-adv_estimator=grpo
-
-use_kl_in_reward=False
-kl_coef=0.0
-use_kl_loss=False
-kl_loss_coef=0.0
-
-clip_ratio_low=0.2
-clip_ratio_high=0.28
-
-max_prompt_length=$((1024 * 2))
-max_response_length=$((1024 * 28))
-enable_overlong_buffer=True
-overlong_buffer_len=$((1024 * 4))
-overlong_penalty_factor=1.0
-
-loss_agg_mode="token-mean"
-
-train_prompt_bsz=512
-n_resp_per_prompt=16
-train_prompt_mini_bsz=32
-
-# Ray
-# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
-# WORKING_DIR=${WORKING_DIR:-"${PWD}"}
-# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
-NNODES=${NNODES:-4}
-NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
-# Paths
-RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
-# very important! please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface
-
-MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B
-CKPTS_DIR=./ckpts/${project_name}/${exp_name}
-TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet
-TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet
-# Algorithm
-temperature=1.0
-top_p=1.0
-top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
-val_top_p=0.7
-
-# Performance Related Parameter
-use_dynamic_bsz=True
-actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
-infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
-offload=True
-gen_tp=4
-sp_size=4
-fsdp_size=2
-
-# reference run wandb: https://wandb.ai/verl-org/DAPO%20Reproduction%20on%20verl/runs/ow47vvon?nw=nwusertongyuxuan361
-
-
-python -m verl.trainer.main_ppo \
-    data.train_files="${TRAIN_FILE}" \
-    data.val_files="${TEST_FILE}" \
-    data.prompt_key=prompt \
-    data.truncation='left' \
-    data.max_prompt_length=${max_prompt_length} \
-    data.max_response_length=${max_response_length} \
-    data.train_batch_size=${train_prompt_bsz} \
-    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
-    algorithm.adv_estimator=${adv_estimator} \
-    algorithm.use_kl_in_reward=${use_kl_in_reward} \
-    algorithm.kl_ctrl.kl_coef=${kl_coef} \
-    actor_rollout_ref.actor.strategy=fsdp2 \
-    critic.strategy=fsdp2 \
-    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
-    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
-    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
-    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
-    actor_rollout_ref.actor.clip_ratio_c=10.0 \
-    actor_rollout_ref.model.use_remove_padding=True \
-    +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \
-    actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
-    actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
-    actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
-    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
-    actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
-    actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
-    actor_rollout_ref.model.path="${MODEL_PATH}" \
-    actor_rollout_ref.model.enable_gradient_checkpointing=True \
-    actor_rollout_ref.actor.optim.lr=1e-6 \
-    actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
-    actor_rollout_ref.actor.optim.weight_decay=0.1 \
-    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
-    actor_rollout_ref.actor.fsdp_config.param_offload=${offload} \
-    actor_rollout_ref.actor.fsdp_config.optimizer_offload=${offload} \
-    actor_rollout_ref.actor.entropy_coeff=0 \
-    actor_rollout_ref.actor.grad_clip=1.0 \
-    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
-    actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \
-    actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
-    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
-    actor_rollout_ref.rollout.enable_chunked_prefill=True \
-    actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
-    actor_rollout_ref.rollout.temperature=${temperature} \
-    actor_rollout_ref.rollout.top_p=${top_p} \
-    actor_rollout_ref.rollout.top_k=${top_k} \
-    actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
-    actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
-    actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
-    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
-    actor_rollout_ref.rollout.val_kwargs.n=1 \
-    actor_rollout_ref.ref.fsdp_config.param_offload=${offload} \
-    actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \
-    actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \
-    reward_model.reward_manager=dapo \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
-    +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
-    trainer.logger=['console','tensorboard'] \
-    trainer.project_name="${project_name}" \
-    trainer.experiment_name="${exp_name}" \
-    trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
-    trainer.nnodes="${NNODES}" \
-    trainer.val_before_train=True \
-    trainer.test_freq=20 \
-    trainer.save_freq=-1 \
-    trainer.total_epochs=10 \
-    trainer.total_training_steps=400 \
-    trainer.default_local_dir="${CKPTS_DIR}" \
-    trainer.resume_mode=auto \
-    trainer.log_val_generations=10
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_colocate_64/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_colocate_64/runtime_env.yaml
deleted file mode 100644
index 39c5a3593e8..00000000000
--- a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_colocate_64/runtime_env.yaml
+++ /dev/null
@@ -1,3 +0,0 @@
-env_vars:
-  TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-7B-math/dapo_qwen2-7B-math_28k_fsdp2_colocate_32_mbs32"
-  HYDRA_FULL_ERROR: "1"
\ No newline at end of file
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_8-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/dapo_qwen2-7B-math_28k_fsdp2_fully-async_16-16_mbs32_tfq16.sh
similarity index 90%
rename from recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_8-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh
rename to recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/dapo_qwen2-7B-math_28k_fsdp2_fully-async_16-16_mbs32_tfq16.sh
index 618497c0257..c49a6460696 100644
--- a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_8-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh
+++ b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/dapo_qwen2-7B-math_28k_fsdp2_fully-async_16-16_mbs32_tfq16.sh
@@ -1,8 +1,8 @@
 #!/usr/bin/env bash
 set -xeuo pipefail
-# dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20
+
 project_name='DAPO'
-exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20'
+exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_16-16_mbs32_tpf16'
 
 # Ray
 # RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
@@ -44,7 +44,7 @@ clip_ratio_high=0.28
 
 # Response length parameters
 max_prompt_length=$((1024 * 2))
-max_response_length=$((1024 *8))
+max_response_length=$((1024 * 28))
 enable_overlong_buffer=True
 overlong_buffer_len=$((1024 * 4))
 overlong_penalty_factor=1.0
@@ -64,33 +64,27 @@ actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
 infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
 ref_offload=True
 actor_offload=False
-gen_tp=1
-sp_size=1
+gen_tp=4
+sp_size=4
 fsdp_size=2
 
 # Fully async specific parameters
-NNODES=${NNODES:-2}
+NNODES_ROLLOUT=${NNODES_ROLLOUT:-2}
+NNODES_TRAIN=${NNODES_TRAIN:-2}
 NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
 
-n_gpus_rollout=6
-n_gpus_training=$((NGPUS_PER_NODE - n_gpus_rollout))
 
 train_prompt_bsz=0
 gen_prompt_bsz=1
 n_resp_per_prompt=16
 train_prompt_mini_bsz=32
-total_rollout_steps=$(((512*100)))
-test_freq=10
+total_rollout_steps=$(((512*400)))
+test_freq=20
 staleness_threshold=1
-trigger_parameter_sync_step=64
+trigger_parameter_sync_step=16
 partial_rollout=True
 
-PYTHON_INTERPRETER="/home/hadoop-djst-algoplat/miniconda3/bin/python"
-if [ ! -x "$PYTHON_INTERPRETER" ]; then
-    PYTHON_INTERPRETER="python3"
-fi
-
-$PYTHON_INTERPRETER -m recipe.fully_async_policy.fully_async_main \
+python -m recipe.fully_async_policy.fully_async_main \
     data.train_files="${TRAIN_FILE}" \
     data.val_files="${TEST_FILE}" \
     data.prompt_key=prompt \
@@ -162,10 +156,10 @@ $PYTHON_INTERPRETER -m recipe.fully_async_policy.fully_async_main \
     trainer.save_freq=-1 \
     trainer.default_local_dir="${CKPTS_DIR}" \
     trainer.resume_mode=auto \
-    trainer.nnodes="${NNODES}" \
-    trainer.n_gpus_per_node="${n_gpus_training}" \
-    rollout.nnodes="${NNODES}" \
-    rollout.n_gpus_per_node="${n_gpus_rollout}" \
+    trainer.nnodes="${NNODES_TRAIN}" \
+    trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
+    rollout.nnodes="${NNODES_ROLLOUT}" \
+    rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \
     rollout.total_rollout_steps="${total_rollout_steps}" \
     rollout.total_epochs=10 \
     rollout.test_freq="${test_freq}" \
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/runtime_env.yaml
index dcca08e67f7..de7e1aa0e1c 100644
--- a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/runtime_env.yaml
+++ b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/runtime_env.yaml
@@ -1,5 +1,5 @@
 env_vars:
   VLLM_USE_V1: "1"
-  TENSORBOARD_DIR: "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/tensorboard/dapo_7b_math_fsdp2_4_12"
+  TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-7B-math/dapo_qwen2-7B-math_28k_fsdp2_fully-async_16-16_mbs32_tpf16"
   NCCL_DEBUG: "INFO"
   HYDRA_FULL_ERROR: "1"
\ No newline at end of file
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_24-8/dapo_qwen2-7B-math_28k_fsdp2_fully-async_24-8_mbs32_tfq32.sh
similarity index 90%
rename from recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh
rename to recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_24-8/dapo_qwen2-7B-math_28k_fsdp2_fully-async_24-8_mbs32_tfq32.sh
index 618497c0257..6c6cb13cf45 100644
--- a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh
+++ b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_24-8/dapo_qwen2-7B-math_28k_fsdp2_fully-async_24-8_mbs32_tfq32.sh
@@ -1,8 +1,8 @@
 #!/usr/bin/env bash
 set -xeuo pipefail
-# dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20
+
 project_name='DAPO'
-exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20'
+exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_24-8_mbs32_tpf32'
 
 # Ray
 # RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
@@ -44,7 +44,7 @@ clip_ratio_high=0.28
 
 # Response length parameters
 max_prompt_length=$((1024 * 2))
-max_response_length=$((1024 *8))
+max_response_length=$((1024 * 28))
 enable_overlong_buffer=True
 overlong_buffer_len=$((1024 * 4))
 overlong_penalty_factor=1.0
@@ -64,33 +64,27 @@ actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
 infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
 ref_offload=True
 actor_offload=False
-gen_tp=1
-sp_size=1
+gen_tp=4
+sp_size=4
 fsdp_size=2
 
 # Fully async specific parameters
-NNODES=${NNODES:-2}
+NNODES_ROLLOUT=${NNODES_ROLLOUT:-3}
+NNODES_TRAIN=${NNODES_TRAIN:-1}
 NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
 
-n_gpus_rollout=6
-n_gpus_training=$((NGPUS_PER_NODE - n_gpus_rollout))
 
 train_prompt_bsz=0
 gen_prompt_bsz=1
 n_resp_per_prompt=16
 train_prompt_mini_bsz=32
-total_rollout_steps=$(((512*100)))
-test_freq=10
+total_rollout_steps=$(((512*400)))
+test_freq=20
 staleness_threshold=1
-trigger_parameter_sync_step=64
+trigger_parameter_sync_step=32
 partial_rollout=True
 
-PYTHON_INTERPRETER="/home/hadoop-djst-algoplat/miniconda3/bin/python"
-if [ ! -x "$PYTHON_INTERPRETER" ]; then
-    PYTHON_INTERPRETER="python3"
-fi
-
-$PYTHON_INTERPRETER -m recipe.fully_async_policy.fully_async_main \
+python -m recipe.fully_async_policy.fully_async_main \
     data.train_files="${TRAIN_FILE}" \
     data.val_files="${TEST_FILE}" \
     data.prompt_key=prompt \
@@ -162,10 +156,10 @@ $PYTHON_INTERPRETER -m recipe.fully_async_policy.fully_async_main \
     trainer.save_freq=-1 \
     trainer.default_local_dir="${CKPTS_DIR}" \
     trainer.resume_mode=auto \
-    trainer.nnodes="${NNODES}" \
-    trainer.n_gpus_per_node="${n_gpus_training}" \
-    rollout.nnodes="${NNODES}" \
-    rollout.n_gpus_per_node="${n_gpus_rollout}" \
+    trainer.nnodes="${NNODES_TRAIN}" \
+    trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
+    rollout.nnodes="${NNODES_ROLLOUT}" \
+    rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \
     rollout.total_rollout_steps="${total_rollout_steps}" \
     rollout.total_epochs=10 \
     rollout.test_freq="${test_freq}" \
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_24-8/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_24-8/runtime_env.yaml
index dcca08e67f7..7402c1b37b0 100644
--- a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_24-8/runtime_env.yaml
+++ b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_24-8/runtime_env.yaml
@@ -1,5 +1,5 @@
 env_vars:
   VLLM_USE_V1: "1"
-  TENSORBOARD_DIR: "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/tensorboard/dapo_7b_math_fsdp2_4_12"
+  TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-7B-math/dapo_qwen2-7B-math_28k_fsdp2_fully-async_24-8_mbs32_tpf32"
   NCCL_DEBUG: "INFO"
   HYDRA_FULL_ERROR: "1"
\ No newline at end of file
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_16-16/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_8-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tfq11.sh
similarity index 91%
rename from recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_16-16/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh
rename to recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_8-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tfq11.sh
index 618497c0257..9add4e0e8bb 100644
--- a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_16-16/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh
+++ b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_8-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tfq11.sh
@@ -1,8 +1,8 @@
 #!/usr/bin/env bash
 set -xeuo pipefail
-# dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20
+
 project_name='DAPO'
-exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20'
+exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf11'
 
 # Ray
 # RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
@@ -44,7 +44,7 @@ clip_ratio_high=0.28
 
 # Response length parameters
 max_prompt_length=$((1024 * 2))
-max_response_length=$((1024 *8))
+max_response_length=$((1024 * 28))
 enable_overlong_buffer=True
 overlong_buffer_len=$((1024 * 4))
 overlong_penalty_factor=1.0
@@ -64,33 +64,27 @@ actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
 infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
 ref_offload=True
 actor_offload=False
-gen_tp=1
-sp_size=1
+gen_tp=4
+sp_size=4
 fsdp_size=2
 
 # Fully async specific parameters
-NNODES=${NNODES:-2}
+NNODES_ROLLOUT=${NNODES_ROLLOUT:-1}
+NNODES_TRAIN=${NNODES_TRAIN:-3}
 NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
 
-n_gpus_rollout=6
-n_gpus_training=$((NGPUS_PER_NODE - n_gpus_rollout))
 
 train_prompt_bsz=0
 gen_prompt_bsz=1
 n_resp_per_prompt=16
 train_prompt_mini_bsz=32
-total_rollout_steps=$(((512*100)))
-test_freq=10
+total_rollout_steps=$(((512*400)))
+test_freq=20
 staleness_threshold=1
-trigger_parameter_sync_step=64
+trigger_parameter_sync_step=11
 partial_rollout=True
 
-PYTHON_INTERPRETER="/home/hadoop-djst-algoplat/miniconda3/bin/python"
-if [ ! -x "$PYTHON_INTERPRETER" ]; then
-    PYTHON_INTERPRETER="python3"
-fi
-
-$PYTHON_INTERPRETER -m recipe.fully_async_policy.fully_async_main \
+python -m recipe.fully_async_policy.fully_async_main \
     data.train_files="${TRAIN_FILE}" \
     data.val_files="${TEST_FILE}" \
     data.prompt_key=prompt \
@@ -162,10 +156,10 @@ $PYTHON_INTERPRETER -m recipe.fully_async_policy.fully_async_main \
     trainer.save_freq=-1 \
     trainer.default_local_dir="${CKPTS_DIR}" \
     trainer.resume_mode=auto \
-    trainer.nnodes="${NNODES}" \
-    trainer.n_gpus_per_node="${n_gpus_training}" \
-    rollout.nnodes="${NNODES}" \
-    rollout.n_gpus_per_node="${n_gpus_rollout}" \
+    trainer.nnodes="${NNODES_TRAIN}" \
+    trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
+    rollout.nnodes="${NNODES_ROLLOUT}" \
+    rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \
     rollout.total_rollout_steps="${total_rollout_steps}" \
     rollout.total_epochs=10 \
     rollout.test_freq="${test_freq}" \
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_8-24/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_8-24/runtime_env.yaml
index dcca08e67f7..fc404cfd985 100644
--- a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_8-24/runtime_env.yaml
+++ b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_8-24/runtime_env.yaml
@@ -1,5 +1,5 @@
 env_vars:
   VLLM_USE_V1: "1"
-  TENSORBOARD_DIR: "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/tensorboard/dapo_7b_math_fsdp2_4_12"
+  TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-7B-math/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf11"
   NCCL_DEBUG: "INFO"
   HYDRA_FULL_ERROR: "1"
\ No newline at end of file
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_16-16/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_16-16/runtime_env.yaml
deleted file mode 100644
index dcca08e67f7..00000000000
--- a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_16-16/runtime_env.yaml
+++ /dev/null
@@ -1,5 +0,0 @@
-env_vars:
-  VLLM_USE_V1: "1"
-  TENSORBOARD_DIR: "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/tensorboard/dapo_7b_math_fsdp2_4_12"
-  NCCL_DEBUG: "INFO"
-  HYDRA_FULL_ERROR: "1"
\ No newline at end of file
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_24-8/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_24-40/dapo_qwen2-7B-math_28k_fsdp2_fully-async_24-40_mbs32_tfq6.sh
similarity index 90%
rename from recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_24-8/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh
rename to recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_24-40/dapo_qwen2-7B-math_28k_fsdp2_fully-async_24-40_mbs32_tfq6.sh
index 618497c0257..5da2116ef80 100644
--- a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_24-8/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh
+++ b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_24-40/dapo_qwen2-7B-math_28k_fsdp2_fully-async_24-40_mbs32_tfq6.sh
@@ -1,8 +1,8 @@
 #!/usr/bin/env bash
 set -xeuo pipefail
-# dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20
+
 project_name='DAPO'
-exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20'
+exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_24-40_mbs32_tpf6'
 
 # Ray
 # RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
@@ -44,7 +44,7 @@ clip_ratio_high=0.28
 
 # Response length parameters
 max_prompt_length=$((1024 * 2))
-max_response_length=$((1024 *8))
+max_response_length=$((1024 * 28))
 enable_overlong_buffer=True
 overlong_buffer_len=$((1024 * 4))
 overlong_penalty_factor=1.0
@@ -64,33 +64,27 @@ actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
 infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
 ref_offload=True
 actor_offload=False
-gen_tp=1
-sp_size=1
+gen_tp=4
+sp_size=4
 fsdp_size=2
 
 # Fully async specific parameters
-NNODES=${NNODES:-2}
+NNODES_ROLLOUT=${NNODES_ROLLOUT:-3}
+NNODES_TRAIN=${NNODES_TRAIN:-5}
 NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
 
-n_gpus_rollout=6
-n_gpus_training=$((NGPUS_PER_NODE - n_gpus_rollout))
 
 train_prompt_bsz=0
 gen_prompt_bsz=1
 n_resp_per_prompt=16
 train_prompt_mini_bsz=32
-total_rollout_steps=$(((512*100)))
-test_freq=10
+total_rollout_steps=$(((512*400)))
+test_freq=20
 staleness_threshold=1
-trigger_parameter_sync_step=64
+trigger_parameter_sync_step=6
 partial_rollout=True
 
-PYTHON_INTERPRETER="/home/hadoop-djst-algoplat/miniconda3/bin/python"
-if [ ! -x "$PYTHON_INTERPRETER" ]; then
-    PYTHON_INTERPRETER="python3"
-fi
-
-$PYTHON_INTERPRETER -m recipe.fully_async_policy.fully_async_main \
+python -m recipe.fully_async_policy.fully_async_main \
     data.train_files="${TRAIN_FILE}" \
     data.val_files="${TEST_FILE}" \
     data.prompt_key=prompt \
@@ -162,10 +156,10 @@ $PYTHON_INTERPRETER -m recipe.fully_async_policy.fully_async_main \
     trainer.save_freq=-1 \
     trainer.default_local_dir="${CKPTS_DIR}" \
     trainer.resume_mode=auto \
-    trainer.nnodes="${NNODES}" \
-    trainer.n_gpus_per_node="${n_gpus_training}" \
-    rollout.nnodes="${NNODES}" \
-    rollout.n_gpus_per_node="${n_gpus_rollout}" \
+    trainer.nnodes="${NNODES_TRAIN}" \
+    trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
+    rollout.nnodes="${NNODES_ROLLOUT}" \
+    rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \
     rollout.total_rollout_steps="${total_rollout_steps}" \
     rollout.total_epochs=10 \
     rollout.test_freq="${test_freq}" \
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_24-40/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_24-40/runtime_env.yaml
new file mode 100644
index 00000000000..ef67409ba6f
--- /dev/null
+++ b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_24-40/runtime_env.yaml
@@ -0,0 +1,5 @@
+env_vars:
+  VLLM_USE_V1: "1"
+  TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-7B-math-64/dapo_qwen2-7B-math_28k_fsdp2_fully-async_24-40_mbs32_tpf6"
+  NCCL_DEBUG: "INFO"
+  HYDRA_FULL_ERROR: "1"
\ No newline at end of file
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_24-8/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_24-8/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh
deleted file mode 100644
index 618497c0257..00000000000
--- a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_24-8/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh
+++ /dev/null
@@ -1,174 +0,0 @@
-#!/usr/bin/env bash
-set -xeuo pipefail
-# dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20
-project_name='DAPO'
-exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20'
-
-# Ray
-# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
-# WORKING_DIR=${WORKING_DIR:-"${PWD}"}
-# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
-# Paths
-RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
-# very important! please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface
-MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen2.5-Math-7B"}
-CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"}
-TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"}
-TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"}
-
-# MODEL_PATH=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/modelscope/hub/models/Qwen/Qwen2___5-Math-7B
-MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B
-CKPTS_DIR=./ckpts/${project_name}/${exp_name}
-# TRAIN_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/dapo-math-17k.parquet
-# TEST_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/aime-2024.parquet
-TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet
-TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet
-
-rollout_mode="async"
-rollout_name="vllm" # sglang or vllm
-if [ "$rollout_mode" = "async" ]; then
-    export VLLM_USE_V1=1
-    return_raw_chat="True"
-fi
-
-# Algorithm parameters
-adv_estimator=grpo
-
-use_kl_in_reward=False
-kl_coef=0.0
-use_kl_loss=False
-kl_loss_coef=0.0
-
-clip_ratio_low=0.2
-clip_ratio_high=0.28
-
-# Response length parameters
-max_prompt_length=$((1024 * 2))
-max_response_length=$((1024 *8))
-enable_overlong_buffer=True
-overlong_buffer_len=$((1024 * 4))
-overlong_penalty_factor=1.0
-
-# Training parameters
-loss_agg_mode="token-mean"
-
-# Algorithm
-temperature=1.0
-top_p=1.0
-top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
-val_top_p=0.7
-
-# Performance Related Parameter
-use_dynamic_bsz=True
-actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
-infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
-ref_offload=True
-actor_offload=False
-gen_tp=1
-sp_size=1
-fsdp_size=2
-
-# Fully async specific parameters
-NNODES=${NNODES:-2}
-NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
-
-n_gpus_rollout=6
-n_gpus_training=$((NGPUS_PER_NODE - n_gpus_rollout))
-
-train_prompt_bsz=0
-gen_prompt_bsz=1
-n_resp_per_prompt=16
-train_prompt_mini_bsz=32
-total_rollout_steps=$(((512*100)))
-test_freq=10
-staleness_threshold=1
-trigger_parameter_sync_step=64
-partial_rollout=True
-
-PYTHON_INTERPRETER="/home/hadoop-djst-algoplat/miniconda3/bin/python"
-if [ ! -x "$PYTHON_INTERPRETER" ]; then
-    PYTHON_INTERPRETER="python3"
-fi
-
-$PYTHON_INTERPRETER -m recipe.fully_async_policy.fully_async_main \
-    data.train_files="${TRAIN_FILE}" \
-    data.val_files="${TEST_FILE}" \
-    data.prompt_key=prompt \
-    data.truncation='left' \
-    data.max_prompt_length=${max_prompt_length} \
-    data.max_response_length=${max_response_length} \
-    data.train_batch_size=${train_prompt_bsz} \
-    data.gen_batch_size=${gen_prompt_bsz} \
-    data.return_raw_chat=${return_raw_chat} \
-    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
-    algorithm.adv_estimator=${adv_estimator} \
-    algorithm.use_kl_in_reward=${use_kl_in_reward} \
-    algorithm.kl_ctrl.kl_coef=${kl_coef} \
-    actor_rollout_ref.actor.strategy=fsdp2 \
-    critic.strategy=fsdp2 \
-    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
-    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
-    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
-    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
-    actor_rollout_ref.actor.clip_ratio_c=10.0 \
-    actor_rollout_ref.model.use_remove_padding=True \
-    actor_rollout_ref.hybrid_engine=False \
-    +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \
-    actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
-    actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
-    actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
-    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
-    actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
-    actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
-    actor_rollout_ref.model.path="${MODEL_PATH}" \
-    actor_rollout_ref.actor.optim.lr=1e-6 \
-    actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
-    actor_rollout_ref.actor.optim.weight_decay=0.1 \
-    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
-    actor_rollout_ref.actor.fsdp_config.param_offload=${actor_offload} \
-    actor_rollout_ref.actor.fsdp_config.optimizer_offload=${actor_offload} \
-    actor_rollout_ref.actor.entropy_coeff=0 \
-    actor_rollout_ref.actor.grad_clip=1.0 \
-    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
-    actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \
-    actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
-    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
-    actor_rollout_ref.rollout.enable_chunked_prefill=True \
-    actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
-    actor_rollout_ref.rollout.temperature=${temperature} \
-    actor_rollout_ref.rollout.top_p=${top_p} \
-    actor_rollout_ref.rollout.top_k=${top_k} \
-    actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
-    actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
-    actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
-    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
-    actor_rollout_ref.rollout.val_kwargs.n=1 \
-    actor_rollout_ref.ref.fsdp_config.param_offload=${ref_offload} \
-    actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \
-    actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \
-    actor_rollout_ref.rollout.name=${rollout_name} \
-    actor_rollout_ref.rollout.mode=${rollout_mode} \
-    actor_rollout_ref.rollout.calculate_log_probs=True \
-    reward_model.reward_manager=dapo \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
-    +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
-    trainer.logger=['console','tensorboard'] \
-    trainer.project_name="${project_name}" \
-    trainer.experiment_name="${exp_name}" \
-    trainer.val_before_train=True \
-    trainer.save_freq=-1 \
-    trainer.default_local_dir="${CKPTS_DIR}" \
-    trainer.resume_mode=auto \
-    trainer.nnodes="${NNODES}" \
-    trainer.n_gpus_per_node="${n_gpus_training}" \
-    rollout.nnodes="${NNODES}" \
-    rollout.n_gpus_per_node="${n_gpus_rollout}" \
-    rollout.total_rollout_steps="${total_rollout_steps}" \
-    rollout.total_epochs=10 \
-    rollout.test_freq="${test_freq}" \
-    async_training.staleness_threshold="${staleness_threshold}" \
-    async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \
-    async_training.partial_rollout="${partial_rollout}"
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_24-8/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_24-8/runtime_env.yaml
deleted file mode 100644
index dcca08e67f7..00000000000
--- a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_24-8/runtime_env.yaml
+++ /dev/null
@@ -1,5 +0,0 @@
-env_vars:
-  VLLM_USE_V1: "1"
-  TENSORBOARD_DIR: "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/tensorboard/dapo_7b_math_fsdp2_4_12"
-  NCCL_DEBUG: "INFO"
-  HYDRA_FULL_ERROR: "1"
\ No newline at end of file
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_32-32/dapo_qwen2-7B-math_28k_fsdp2_fully-async_32-32_mbs32_tfq8.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_32-32/dapo_qwen2-7B-math_28k_fsdp2_fully-async_32-32_mbs32_tfq8.sh
new file mode 100644
index 00000000000..c31c59df4db
--- /dev/null
+++ b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_32-32/dapo_qwen2-7B-math_28k_fsdp2_fully-async_32-32_mbs32_tfq8.sh
@@ -0,0 +1,168 @@
+#!/usr/bin/env bash
+set -xeuo pipefail
+
+project_name='DAPO'
+exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_32-32_mbs32_tpf8'
+
+# Ray
+# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
+# WORKING_DIR=${WORKING_DIR:-"${PWD}"}
+# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
+# Paths
+RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
+# very important! please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface
+MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen2.5-Math-7B"}
+CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"}
+TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"}
+TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"}
+
+# MODEL_PATH=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/modelscope/hub/models/Qwen/Qwen2___5-Math-7B
+MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B
+CKPTS_DIR=./ckpts/${project_name}/${exp_name}
+# TRAIN_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/dapo-math-17k.parquet
+# TEST_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/aime-2024.parquet
+TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet
+TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet
+
+rollout_mode="async"
+rollout_name="vllm" # sglang or vllm
+if [ "$rollout_mode" = "async" ]; then
+    export VLLM_USE_V1=1
+    return_raw_chat="True"
+fi
+
+# Algorithm parameters
+adv_estimator=grpo
+
+use_kl_in_reward=False
+kl_coef=0.0
+use_kl_loss=False
+kl_loss_coef=0.0
+
+clip_ratio_low=0.2
+clip_ratio_high=0.28
+
+# Response length parameters
+max_prompt_length=$((1024 * 2))
+max_response_length=$((1024 * 28))
+enable_overlong_buffer=True
+overlong_buffer_len=$((1024 * 4))
+overlong_penalty_factor=1.0
+
+# Training parameters
+loss_agg_mode="token-mean"
+
+# Algorithm
+temperature=1.0
+top_p=1.0
+top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
+val_top_p=0.7
+
+# Performance Related Parameter
+use_dynamic_bsz=True
+actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
+infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
+ref_offload=True
+actor_offload=False
+gen_tp=4
+sp_size=4
+fsdp_size=2
+
+# Fully async specific parameters
+NNODES_ROLLOUT=${NNODES_ROLLOUT:-4}
+NNODES_TRAIN=${NNODES_TRAIN:-4}
+NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
+
+
+train_prompt_bsz=0
+gen_prompt_bsz=1
+n_resp_per_prompt=16
+train_prompt_mini_bsz=32
+total_rollout_steps=$(((512*400)))
+test_freq=20
+staleness_threshold=1
+trigger_parameter_sync_step=8
+partial_rollout=True
+
+python -m recipe.fully_async_policy.fully_async_main \
+    data.train_files="${TRAIN_FILE}" \
+    data.val_files="${TEST_FILE}" \
+    data.prompt_key=prompt \
+    data.truncation='left' \
+    data.max_prompt_length=${max_prompt_length} \
+    data.max_response_length=${max_response_length} \
+    data.train_batch_size=${train_prompt_bsz} \
+    data.gen_batch_size=${gen_prompt_bsz} \
+    data.return_raw_chat=${return_raw_chat} \
+    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
+    algorithm.adv_estimator=${adv_estimator} \
+    algorithm.use_kl_in_reward=${use_kl_in_reward} \
+    algorithm.kl_ctrl.kl_coef=${kl_coef} \
+    actor_rollout_ref.actor.strategy=fsdp2 \
+    critic.strategy=fsdp2 \
+    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
+    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
+    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
+    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
+    actor_rollout_ref.actor.clip_ratio_c=10.0 \
+    actor_rollout_ref.model.use_remove_padding=True \
+    actor_rollout_ref.hybrid_engine=False \
+    +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \
+    actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
+    actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+    actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+    actor_rollout_ref.model.path="${MODEL_PATH}" \
+    actor_rollout_ref.actor.optim.lr=1e-6 \
+    actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
+    actor_rollout_ref.actor.optim.weight_decay=0.1 \
+    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
+    actor_rollout_ref.actor.fsdp_config.param_offload=${actor_offload} \
+    actor_rollout_ref.actor.fsdp_config.optimizer_offload=${actor_offload} \
+    actor_rollout_ref.actor.entropy_coeff=0 \
+    actor_rollout_ref.actor.grad_clip=1.0 \
+    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
+    actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \
+    actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
+    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
+    actor_rollout_ref.rollout.enable_chunked_prefill=True \
+    actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
+    actor_rollout_ref.rollout.temperature=${temperature} \
+    actor_rollout_ref.rollout.top_p=${top_p} \
+    actor_rollout_ref.rollout.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
+    actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
+    actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
+    actor_rollout_ref.rollout.val_kwargs.n=1 \
+    actor_rollout_ref.ref.fsdp_config.param_offload=${ref_offload} \
+    actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \
+    actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \
+    actor_rollout_ref.rollout.name=${rollout_name} \
+    actor_rollout_ref.rollout.mode=${rollout_mode} \
+    actor_rollout_ref.rollout.calculate_log_probs=True \
+    reward_model.reward_manager=dapo \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
+    +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
+    trainer.logger=['console','tensorboard'] \
+    trainer.project_name="${project_name}" \
+    trainer.experiment_name="${exp_name}" \
+    trainer.val_before_train=True \
+    trainer.save_freq=-1 \
+    trainer.default_local_dir="${CKPTS_DIR}" \
+    trainer.resume_mode=auto \
+    trainer.nnodes="${NNODES_TRAIN}" \
+    trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
+    rollout.nnodes="${NNODES_ROLLOUT}" \
+    rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \
+    rollout.total_rollout_steps="${total_rollout_steps}" \
+    rollout.total_epochs=10 \
+    rollout.test_freq="${test_freq}" \
+    async_training.staleness_threshold="${staleness_threshold}" \
+    async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \
+    async_training.partial_rollout="${partial_rollout}"
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_32-32/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_32-32/runtime_env.yaml
new file mode 100644
index 00000000000..20d464776b0
--- /dev/null
+++ b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_32-32/runtime_env.yaml
@@ -0,0 +1,5 @@
+env_vars:
+  VLLM_USE_V1: "1"
+  TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-7B-math-64/dapo_qwen2-7B-math_28k_fsdp2_fully-async_32-32_mbs32_tpf8"
+  NCCL_DEBUG: "INFO"
+  HYDRA_FULL_ERROR: "1"
\ No newline at end of file
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_40-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_40-24_mbs32_tfq11.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_40-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_40-24_mbs32_tfq11.sh
new file mode 100644
index 00000000000..a15cf990bd1
--- /dev/null
+++ b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_40-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_40-24_mbs32_tfq11.sh
@@ -0,0 +1,168 @@
+#!/usr/bin/env bash
+set -xeuo pipefail
+
+project_name='DAPO'
+exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_40-24_mbs32_tpf11'
+
+# Ray
+# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
+# WORKING_DIR=${WORKING_DIR:-"${PWD}"}
+# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
+# Paths
+RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
+# very important! please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface
+MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen2.5-Math-7B"}
+CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"}
+TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"}
+TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"}
+
+# MODEL_PATH=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/modelscope/hub/models/Qwen/Qwen2___5-Math-7B
+MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B
+CKPTS_DIR=./ckpts/${project_name}/${exp_name}
+# TRAIN_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/dapo-math-17k.parquet
+# TEST_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/aime-2024.parquet
+TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet
+TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet
+
+rollout_mode="async"
+rollout_name="vllm" # sglang or vllm
+if [ "$rollout_mode" = "async" ]; then
+    export VLLM_USE_V1=1
+    return_raw_chat="True"
+fi
+
+# Algorithm parameters
+adv_estimator=grpo
+
+use_kl_in_reward=False
+kl_coef=0.0
+use_kl_loss=False
+kl_loss_coef=0.0
+
+clip_ratio_low=0.2
+clip_ratio_high=0.28
+
+# Response length parameters
+max_prompt_length=$((1024 * 2))
+max_response_length=$((1024 * 28))
+enable_overlong_buffer=True
+overlong_buffer_len=$((1024 * 4))
+overlong_penalty_factor=1.0
+
+# Training parameters
+loss_agg_mode="token-mean"
+
+# Algorithm
+temperature=1.0
+top_p=1.0
+top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
+val_top_p=0.7
+
+# Performance Related Parameter
+use_dynamic_bsz=True
+actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
+infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
+ref_offload=True
+actor_offload=False
+gen_tp=4
+sp_size=4
+fsdp_size=2
+
+# Fully async specific parameters
+NNODES_ROLLOUT=${NNODES_ROLLOUT:-5}
+NNODES_TRAIN=${NNODES_TRAIN:-3}
+NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
+
+
+train_prompt_bsz=0
+gen_prompt_bsz=1
+n_resp_per_prompt=16
+train_prompt_mini_bsz=32
+total_rollout_steps=$(((512*400)))
+test_freq=20
+staleness_threshold=1
+trigger_parameter_sync_step=11
+partial_rollout=True
+
+python -m recipe.fully_async_policy.fully_async_main \
+    data.train_files="${TRAIN_FILE}" \
+    data.val_files="${TEST_FILE}" \
+    data.prompt_key=prompt \
+    data.truncation='left' \
+    data.max_prompt_length=${max_prompt_length} \
+    data.max_response_length=${max_response_length} \
+    data.train_batch_size=${train_prompt_bsz} \
+    data.gen_batch_size=${gen_prompt_bsz} \
+    data.return_raw_chat=${return_raw_chat} \
+    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
+    algorithm.adv_estimator=${adv_estimator} \
+    algorithm.use_kl_in_reward=${use_kl_in_reward} \
+    algorithm.kl_ctrl.kl_coef=${kl_coef} \
+    actor_rollout_ref.actor.strategy=fsdp2 \
+    critic.strategy=fsdp2 \
+    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
+    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
+    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
+    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
+    actor_rollout_ref.actor.clip_ratio_c=10.0 \
+    actor_rollout_ref.model.use_remove_padding=True \
+    actor_rollout_ref.hybrid_engine=False \
+    +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \
+    actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
+    actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+    actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+    actor_rollout_ref.model.path="${MODEL_PATH}" \
+    actor_rollout_ref.actor.optim.lr=1e-6 \
+    actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
+    actor_rollout_ref.actor.optim.weight_decay=0.1 \
+    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
+    actor_rollout_ref.actor.fsdp_config.param_offload=${actor_offload} \
+    actor_rollout_ref.actor.fsdp_config.optimizer_offload=${actor_offload} \
+    actor_rollout_ref.actor.entropy_coeff=0 \
+    actor_rollout_ref.actor.grad_clip=1.0 \
+    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
+    actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \
+    actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
+    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
+    actor_rollout_ref.rollout.enable_chunked_prefill=True \
+    actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
+    actor_rollout_ref.rollout.temperature=${temperature} \
+    actor_rollout_ref.rollout.top_p=${top_p} \
+    actor_rollout_ref.rollout.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
+    actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
+    actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
+    actor_rollout_ref.rollout.val_kwargs.n=1 \
+    actor_rollout_ref.ref.fsdp_config.param_offload=${ref_offload} \
+    actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \
+    actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \
+    actor_rollout_ref.rollout.name=${rollout_name} \
+    actor_rollout_ref.rollout.mode=${rollout_mode} \
+    actor_rollout_ref.rollout.calculate_log_probs=True \
+    reward_model.reward_manager=dapo \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
+    +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
+    trainer.logger=['console','tensorboard'] \
+    trainer.project_name="${project_name}" \
+    trainer.experiment_name="${exp_name}" \
+    trainer.val_before_train=True \
+    trainer.save_freq=-1 \
+    trainer.default_local_dir="${CKPTS_DIR}" \
+    trainer.resume_mode=auto \
+    trainer.nnodes="${NNODES_TRAIN}" \
+    trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
+    rollout.nnodes="${NNODES_ROLLOUT}" \
+    rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \
+    rollout.total_rollout_steps="${total_rollout_steps}" \
+    rollout.total_epochs=10 \
+    rollout.test_freq="${test_freq}" \
+    async_training.staleness_threshold="${staleness_threshold}" \
+    async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \
+    async_training.partial_rollout="${partial_rollout}"
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_40-24/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_40-24/runtime_env.yaml
new file mode 100644
index 00000000000..93ae17ebb6f
--- /dev/null
+++ b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_40-24/runtime_env.yaml
@@ -0,0 +1,5 @@
+env_vars:
+  VLLM_USE_V1: "1"
+  TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-7B-math-64/dapo_qwen2-7B-math_28k_fsdp2_fully-async_40-24_mbs32_tpf11"
+  NCCL_DEBUG: "INFO"
+  HYDRA_FULL_ERROR: "1"
\ No newline at end of file
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_8-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_8-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh
deleted file mode 100644
index 618497c0257..00000000000
--- a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_8-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh
+++ /dev/null
@@ -1,174 +0,0 @@
-#!/usr/bin/env bash
-set -xeuo pipefail
-# dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20
-project_name='DAPO'
-exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20'
-
-# Ray
-# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
-# WORKING_DIR=${WORKING_DIR:-"${PWD}"}
-# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
-# Paths
-RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
-# very important! please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface
-MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen2.5-Math-7B"}
-CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"}
-TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"}
-TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"}
-
-# MODEL_PATH=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/modelscope/hub/models/Qwen/Qwen2___5-Math-7B
-MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B
-CKPTS_DIR=./ckpts/${project_name}/${exp_name}
-# TRAIN_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/dapo-math-17k.parquet
-# TEST_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/aime-2024.parquet
-TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet
-TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet
-
-rollout_mode="async"
-rollout_name="vllm" # sglang or vllm
-if [ "$rollout_mode" = "async" ]; then
-    export VLLM_USE_V1=1
-    return_raw_chat="True"
-fi
-
-# Algorithm parameters
-adv_estimator=grpo
-
-use_kl_in_reward=False
-kl_coef=0.0
-use_kl_loss=False
-kl_loss_coef=0.0
-
-clip_ratio_low=0.2
-clip_ratio_high=0.28
-
-# Response length parameters
-max_prompt_length=$((1024 * 2))
-max_response_length=$((1024 *8))
-enable_overlong_buffer=True
-overlong_buffer_len=$((1024 * 4))
-overlong_penalty_factor=1.0
-
-# Training parameters
-loss_agg_mode="token-mean"
-
-# Algorithm
-temperature=1.0
-top_p=1.0
-top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
-val_top_p=0.7
-
-# Performance Related Parameter
-use_dynamic_bsz=True
-actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
-infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
-ref_offload=True
-actor_offload=False
-gen_tp=1
-sp_size=1
-fsdp_size=2
-
-# Fully async specific parameters
-NNODES=${NNODES:-2}
-NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
-
-n_gpus_rollout=6
-n_gpus_training=$((NGPUS_PER_NODE - n_gpus_rollout))
-
-train_prompt_bsz=0
-gen_prompt_bsz=1
-n_resp_per_prompt=16
-train_prompt_mini_bsz=32
-total_rollout_steps=$(((512*100)))
-test_freq=10
-staleness_threshold=1
-trigger_parameter_sync_step=64
-partial_rollout=True
-
-PYTHON_INTERPRETER="/home/hadoop-djst-algoplat/miniconda3/bin/python"
-if [ ! -x "$PYTHON_INTERPRETER" ]; then
-    PYTHON_INTERPRETER="python3"
-fi
-
-$PYTHON_INTERPRETER -m recipe.fully_async_policy.fully_async_main \
-    data.train_files="${TRAIN_FILE}" \
-    data.val_files="${TEST_FILE}" \
-    data.prompt_key=prompt \
-    data.truncation='left' \
-    data.max_prompt_length=${max_prompt_length} \
-    data.max_response_length=${max_response_length} \
-    data.train_batch_size=${train_prompt_bsz} \
-    data.gen_batch_size=${gen_prompt_bsz} \
-    data.return_raw_chat=${return_raw_chat} \
-    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
-    algorithm.adv_estimator=${adv_estimator} \
-    algorithm.use_kl_in_reward=${use_kl_in_reward} \
-    algorithm.kl_ctrl.kl_coef=${kl_coef} \
-    actor_rollout_ref.actor.strategy=fsdp2 \
-    critic.strategy=fsdp2 \
-    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
-    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
-    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
-    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
-    actor_rollout_ref.actor.clip_ratio_c=10.0 \
-    actor_rollout_ref.model.use_remove_padding=True \
-    actor_rollout_ref.hybrid_engine=False \
-    +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \
-    actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
-    actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
-    actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
-    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
-    actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
-    actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
-    actor_rollout_ref.model.path="${MODEL_PATH}" \
-    actor_rollout_ref.actor.optim.lr=1e-6 \
-    actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
-    actor_rollout_ref.actor.optim.weight_decay=0.1 \
-    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
-    actor_rollout_ref.actor.fsdp_config.param_offload=${actor_offload} \
-    actor_rollout_ref.actor.fsdp_config.optimizer_offload=${actor_offload} \
-    actor_rollout_ref.actor.entropy_coeff=0 \
-    actor_rollout_ref.actor.grad_clip=1.0 \
-    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
-    actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \
-    actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
-    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
-    actor_rollout_ref.rollout.enable_chunked_prefill=True \
-    actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
-    actor_rollout_ref.rollout.temperature=${temperature} \
-    actor_rollout_ref.rollout.top_p=${top_p} \
-    actor_rollout_ref.rollout.top_k=${top_k} \
-    actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
-    actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
-    actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
-    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
-    actor_rollout_ref.rollout.val_kwargs.n=1 \
-    actor_rollout_ref.ref.fsdp_config.param_offload=${ref_offload} \
-    actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \
-    actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \
-    actor_rollout_ref.rollout.name=${rollout_name} \
-    actor_rollout_ref.rollout.mode=${rollout_mode} \
-    actor_rollout_ref.rollout.calculate_log_probs=True \
-    reward_model.reward_manager=dapo \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
-    +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
-    trainer.logger=['console','tensorboard'] \
-    trainer.project_name="${project_name}" \
-    trainer.experiment_name="${exp_name}" \
-    trainer.val_before_train=True \
-    trainer.save_freq=-1 \
-    trainer.default_local_dir="${CKPTS_DIR}" \
-    trainer.resume_mode=auto \
-    trainer.nnodes="${NNODES}" \
-    trainer.n_gpus_per_node="${n_gpus_training}" \
-    rollout.nnodes="${NNODES}" \
-    rollout.n_gpus_per_node="${n_gpus_rollout}" \
-    rollout.total_rollout_steps="${total_rollout_steps}" \
-    rollout.total_epochs=10 \
-    rollout.test_freq="${test_freq}" \
-    async_training.staleness_threshold="${staleness_threshold}" \
-    async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \
-    async_training.partial_rollout="${partial_rollout}"
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_8-24/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_8-24/runtime_env.yaml
deleted file mode 100644
index dcca08e67f7..00000000000
--- a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_8-24/runtime_env.yaml
+++ /dev/null
@@ -1,5 +0,0 @@
-env_vars:
-  VLLM_USE_V1: "1"
-  TENSORBOARD_DIR: "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/tensorboard/dapo_7b_math_fsdp2_4_12"
-  NCCL_DEBUG: "INFO"
-  HYDRA_FULL_ERROR: "1"
\ No newline at end of file

From 07ae4a00ae30faa8275f71d3a95521cceaf3effe Mon Sep 17 00:00:00 2001
From: wangshulin02 <953550366@qq.com>
Date: Mon, 8 Sep 2025 17:26:01 +0800
Subject: [PATCH 110/182] add empty_cache after sync_rollout_weights

---
 recipe/one_step_off_policy/fsdp_workers.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/recipe/one_step_off_policy/fsdp_workers.py b/recipe/one_step_off_policy/fsdp_workers.py
index 086f109e434..dd941c26684 100644
--- a/recipe/one_step_off_policy/fsdp_workers.py
+++ b/recipe/one_step_off_policy/fsdp_workers.py
@@ -100,6 +100,7 @@ def sync_rollout_weights(self):
             collective.broadcast(tensor, src_rank=0, group_name="actor_rollout")
             if self._is_rollout:
                 inference_model.load_weights([(key, tensor)])
+        get_torch_device().empty_cache()
 
     @register(dispatch_mode=Dispatch.ONE_TO_ALL)
     def get_actor_weights_info(self):

From 34cf9e7b91fbabb6761a74047bf33fe13bfb5318 Mon Sep 17 00:00:00 2001
From: wangshulin02 <953550366@qq.com>
Date: Mon, 8 Sep 2025 17:42:01 +0800
Subject: [PATCH 111/182] add exp fully_async 128 64-64

---
 .../fsdp2_fully-async_16-16/runtime_env.yaml  |   5 -
 ..._fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh | 174 ------------------
 .../fsdp2_fully-async_24-8/runtime_env.yaml   |   5 -
 ...28k_fsdp2_fully-async_64-64_mbs32_tfq4.sh} |  36 ++--
 .../fsdp2_fully-async_64-64/runtime_env.yaml  |   4 +
 ..._fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh | 174 ------------------
 .../fsdp2_fully-async_8-24/runtime_env.yaml   |   5 -
 7 files changed, 19 insertions(+), 384 deletions(-)
 delete mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_16-16/runtime_env.yaml
 delete mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_24-8/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh
 delete mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_24-8/runtime_env.yaml
 rename recipe/fully_async_policy/exp/qwen2-7B-math_128/{fsdp2_fully-async_16-16/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh => fsdp2_fully-async_64-64/dapo_qwen2-7B-math_28k_fsdp2_fully-async_64-64_mbs32_tfq4.sh} (90%)
 create mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64/runtime_env.yaml
 delete mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_8-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh
 delete mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_8-24/runtime_env.yaml

diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_16-16/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_16-16/runtime_env.yaml
deleted file mode 100644
index dcca08e67f7..00000000000
--- a/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_16-16/runtime_env.yaml
+++ /dev/null
@@ -1,5 +0,0 @@
-env_vars:
-  VLLM_USE_V1: "1"
-  TENSORBOARD_DIR: "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/tensorboard/dapo_7b_math_fsdp2_4_12"
-  NCCL_DEBUG: "INFO"
-  HYDRA_FULL_ERROR: "1"
\ No newline at end of file
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_24-8/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_24-8/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh
deleted file mode 100644
index 618497c0257..00000000000
--- a/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_24-8/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh
+++ /dev/null
@@ -1,174 +0,0 @@
-#!/usr/bin/env bash
-set -xeuo pipefail
-# dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20
-project_name='DAPO'
-exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20'
-
-# Ray
-# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
-# WORKING_DIR=${WORKING_DIR:-"${PWD}"}
-# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
-# Paths
-RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
-# very important! please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface
-MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen2.5-Math-7B"}
-CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"}
-TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"}
-TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"}
-
-# MODEL_PATH=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/modelscope/hub/models/Qwen/Qwen2___5-Math-7B
-MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B
-CKPTS_DIR=./ckpts/${project_name}/${exp_name}
-# TRAIN_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/dapo-math-17k.parquet
-# TEST_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/aime-2024.parquet
-TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet
-TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet
-
-rollout_mode="async"
-rollout_name="vllm" # sglang or vllm
-if [ "$rollout_mode" = "async" ]; then
-    export VLLM_USE_V1=1
-    return_raw_chat="True"
-fi
-
-# Algorithm parameters
-adv_estimator=grpo
-
-use_kl_in_reward=False
-kl_coef=0.0
-use_kl_loss=False
-kl_loss_coef=0.0
-
-clip_ratio_low=0.2
-clip_ratio_high=0.28
-
-# Response length parameters
-max_prompt_length=$((1024 * 2))
-max_response_length=$((1024 *8))
-enable_overlong_buffer=True
-overlong_buffer_len=$((1024 * 4))
-overlong_penalty_factor=1.0
-
-# Training parameters
-loss_agg_mode="token-mean"
-
-# Algorithm
-temperature=1.0
-top_p=1.0
-top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
-val_top_p=0.7
-
-# Performance Related Parameter
-use_dynamic_bsz=True
-actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
-infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
-ref_offload=True
-actor_offload=False
-gen_tp=1
-sp_size=1
-fsdp_size=2
-
-# Fully async specific parameters
-NNODES=${NNODES:-2}
-NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
-
-n_gpus_rollout=6
-n_gpus_training=$((NGPUS_PER_NODE - n_gpus_rollout))
-
-train_prompt_bsz=0
-gen_prompt_bsz=1
-n_resp_per_prompt=16
-train_prompt_mini_bsz=32
-total_rollout_steps=$(((512*100)))
-test_freq=10
-staleness_threshold=1
-trigger_parameter_sync_step=64
-partial_rollout=True
-
-PYTHON_INTERPRETER="/home/hadoop-djst-algoplat/miniconda3/bin/python"
-if [ ! -x "$PYTHON_INTERPRETER" ]; then
-    PYTHON_INTERPRETER="python3"
-fi
-
-$PYTHON_INTERPRETER -m recipe.fully_async_policy.fully_async_main \
-    data.train_files="${TRAIN_FILE}" \
-    data.val_files="${TEST_FILE}" \
-    data.prompt_key=prompt \
-    data.truncation='left' \
-    data.max_prompt_length=${max_prompt_length} \
-    data.max_response_length=${max_response_length} \
-    data.train_batch_size=${train_prompt_bsz} \
-    data.gen_batch_size=${gen_prompt_bsz} \
-    data.return_raw_chat=${return_raw_chat} \
-    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
-    algorithm.adv_estimator=${adv_estimator} \
-    algorithm.use_kl_in_reward=${use_kl_in_reward} \
-    algorithm.kl_ctrl.kl_coef=${kl_coef} \
-    actor_rollout_ref.actor.strategy=fsdp2 \
-    critic.strategy=fsdp2 \
-    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
-    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
-    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
-    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
-    actor_rollout_ref.actor.clip_ratio_c=10.0 \
-    actor_rollout_ref.model.use_remove_padding=True \
-    actor_rollout_ref.hybrid_engine=False \
-    +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \
-    actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
-    actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
-    actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
-    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
-    actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
-    actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
-    actor_rollout_ref.model.path="${MODEL_PATH}" \
-    actor_rollout_ref.actor.optim.lr=1e-6 \
-    actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
-    actor_rollout_ref.actor.optim.weight_decay=0.1 \
-    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
-    actor_rollout_ref.actor.fsdp_config.param_offload=${actor_offload} \
-    actor_rollout_ref.actor.fsdp_config.optimizer_offload=${actor_offload} \
-    actor_rollout_ref.actor.entropy_coeff=0 \
-    actor_rollout_ref.actor.grad_clip=1.0 \
-    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
-    actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \
-    actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
-    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
-    actor_rollout_ref.rollout.enable_chunked_prefill=True \
-    actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
-    actor_rollout_ref.rollout.temperature=${temperature} \
-    actor_rollout_ref.rollout.top_p=${top_p} \
-    actor_rollout_ref.rollout.top_k=${top_k} \
-    actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
-    actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
-    actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
-    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
-    actor_rollout_ref.rollout.val_kwargs.n=1 \
-    actor_rollout_ref.ref.fsdp_config.param_offload=${ref_offload} \
-    actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \
-    actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \
-    actor_rollout_ref.rollout.name=${rollout_name} \
-    actor_rollout_ref.rollout.mode=${rollout_mode} \
-    actor_rollout_ref.rollout.calculate_log_probs=True \
-    reward_model.reward_manager=dapo \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
-    +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
-    trainer.logger=['console','tensorboard'] \
-    trainer.project_name="${project_name}" \
-    trainer.experiment_name="${exp_name}" \
-    trainer.val_before_train=True \
-    trainer.save_freq=-1 \
-    trainer.default_local_dir="${CKPTS_DIR}" \
-    trainer.resume_mode=auto \
-    trainer.nnodes="${NNODES}" \
-    trainer.n_gpus_per_node="${n_gpus_training}" \
-    rollout.nnodes="${NNODES}" \
-    rollout.n_gpus_per_node="${n_gpus_rollout}" \
-    rollout.total_rollout_steps="${total_rollout_steps}" \
-    rollout.total_epochs=10 \
-    rollout.test_freq="${test_freq}" \
-    async_training.staleness_threshold="${staleness_threshold}" \
-    async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \
-    async_training.partial_rollout="${partial_rollout}"
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_24-8/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_24-8/runtime_env.yaml
deleted file mode 100644
index dcca08e67f7..00000000000
--- a/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_24-8/runtime_env.yaml
+++ /dev/null
@@ -1,5 +0,0 @@
-env_vars:
-  VLLM_USE_V1: "1"
-  TENSORBOARD_DIR: "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/tensorboard/dapo_7b_math_fsdp2_4_12"
-  NCCL_DEBUG: "INFO"
-  HYDRA_FULL_ERROR: "1"
\ No newline at end of file
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_16-16/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64/dapo_qwen2-7B-math_28k_fsdp2_fully-async_64-64_mbs32_tfq4.sh
similarity index 90%
rename from recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_16-16/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh
rename to recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64/dapo_qwen2-7B-math_28k_fsdp2_fully-async_64-64_mbs32_tfq4.sh
index 618497c0257..ce69e60e2b6 100644
--- a/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_16-16/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh
+++ b/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64/dapo_qwen2-7B-math_28k_fsdp2_fully-async_64-64_mbs32_tfq4.sh
@@ -1,8 +1,8 @@
 #!/usr/bin/env bash
 set -xeuo pipefail
-# dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20
+
 project_name='DAPO'
-exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20'
+exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_64-64_mbs32_tpf4'
 
 # Ray
 # RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
@@ -44,7 +44,7 @@ clip_ratio_high=0.28
 
 # Response length parameters
 max_prompt_length=$((1024 * 2))
-max_response_length=$((1024 *8))
+max_response_length=$((1024 * 28))
 enable_overlong_buffer=True
 overlong_buffer_len=$((1024 * 4))
 overlong_penalty_factor=1.0
@@ -64,33 +64,27 @@ actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
 infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
 ref_offload=True
 actor_offload=False
-gen_tp=1
-sp_size=1
+gen_tp=4
+sp_size=4
 fsdp_size=2
 
 # Fully async specific parameters
-NNODES=${NNODES:-2}
+NNODES_ROLLOUT=${NNODES_ROLLOUT:-8}
+NNODES_TRAIN=${NNODES_TRAIN:-8}
 NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
 
-n_gpus_rollout=6
-n_gpus_training=$((NGPUS_PER_NODE - n_gpus_rollout))
 
 train_prompt_bsz=0
 gen_prompt_bsz=1
 n_resp_per_prompt=16
 train_prompt_mini_bsz=32
-total_rollout_steps=$(((512*100)))
-test_freq=10
+total_rollout_steps=$(((512*400)))
+test_freq=20
 staleness_threshold=1
-trigger_parameter_sync_step=64
+trigger_parameter_sync_step=4
 partial_rollout=True
 
-PYTHON_INTERPRETER="/home/hadoop-djst-algoplat/miniconda3/bin/python"
-if [ ! -x "$PYTHON_INTERPRETER" ]; then
-    PYTHON_INTERPRETER="python3"
-fi
-
-$PYTHON_INTERPRETER -m recipe.fully_async_policy.fully_async_main \
+python -m recipe.fully_async_policy.fully_async_main \
     data.train_files="${TRAIN_FILE}" \
     data.val_files="${TEST_FILE}" \
     data.prompt_key=prompt \
@@ -162,10 +156,10 @@ $PYTHON_INTERPRETER -m recipe.fully_async_policy.fully_async_main \
     trainer.save_freq=-1 \
     trainer.default_local_dir="${CKPTS_DIR}" \
     trainer.resume_mode=auto \
-    trainer.nnodes="${NNODES}" \
-    trainer.n_gpus_per_node="${n_gpus_training}" \
-    rollout.nnodes="${NNODES}" \
-    rollout.n_gpus_per_node="${n_gpus_rollout}" \
+    trainer.nnodes="${NNODES_TRAIN}" \
+    trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
+    rollout.nnodes="${NNODES_ROLLOUT}" \
+    rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \
     rollout.total_rollout_steps="${total_rollout_steps}" \
     rollout.total_epochs=10 \
     rollout.test_freq="${test_freq}" \
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64/runtime_env.yaml
new file mode 100644
index 00000000000..949fa4ef005
--- /dev/null
+++ b/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64/runtime_env.yaml
@@ -0,0 +1,4 @@
+env_vars:
+  VLLM_USE_V1: "1"
+  TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-7B-math-128/dapo_qwen2-7B-math_28k_fsdp2_fully-async_64-64_mbs32_tpf4"
+  HYDRA_FULL_ERROR: "1"
\ No newline at end of file
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_8-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_8-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh
deleted file mode 100644
index 618497c0257..00000000000
--- a/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_8-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh
+++ /dev/null
@@ -1,174 +0,0 @@
-#!/usr/bin/env bash
-set -xeuo pipefail
-# dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20
-project_name='DAPO'
-exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20'
-
-# Ray
-# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
-# WORKING_DIR=${WORKING_DIR:-"${PWD}"}
-# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
-# Paths
-RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
-# very important! please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface
-MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen2.5-Math-7B"}
-CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"}
-TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"}
-TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"}
-
-# MODEL_PATH=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/modelscope/hub/models/Qwen/Qwen2___5-Math-7B
-MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B
-CKPTS_DIR=./ckpts/${project_name}/${exp_name}
-# TRAIN_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/dapo-math-17k.parquet
-# TEST_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/aime-2024.parquet
-TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet
-TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet
-
-rollout_mode="async"
-rollout_name="vllm" # sglang or vllm
-if [ "$rollout_mode" = "async" ]; then
-    export VLLM_USE_V1=1
-    return_raw_chat="True"
-fi
-
-# Algorithm parameters
-adv_estimator=grpo
-
-use_kl_in_reward=False
-kl_coef=0.0
-use_kl_loss=False
-kl_loss_coef=0.0
-
-clip_ratio_low=0.2
-clip_ratio_high=0.28
-
-# Response length parameters
-max_prompt_length=$((1024 * 2))
-max_response_length=$((1024 *8))
-enable_overlong_buffer=True
-overlong_buffer_len=$((1024 * 4))
-overlong_penalty_factor=1.0
-
-# Training parameters
-loss_agg_mode="token-mean"
-
-# Algorithm
-temperature=1.0
-top_p=1.0
-top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
-val_top_p=0.7
-
-# Performance Related Parameter
-use_dynamic_bsz=True
-actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
-infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
-ref_offload=True
-actor_offload=False
-gen_tp=1
-sp_size=1
-fsdp_size=2
-
-# Fully async specific parameters
-NNODES=${NNODES:-2}
-NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
-
-n_gpus_rollout=6
-n_gpus_training=$((NGPUS_PER_NODE - n_gpus_rollout))
-
-train_prompt_bsz=0
-gen_prompt_bsz=1
-n_resp_per_prompt=16
-train_prompt_mini_bsz=32
-total_rollout_steps=$(((512*100)))
-test_freq=10
-staleness_threshold=1
-trigger_parameter_sync_step=64
-partial_rollout=True
-
-PYTHON_INTERPRETER="/home/hadoop-djst-algoplat/miniconda3/bin/python"
-if [ ! -x "$PYTHON_INTERPRETER" ]; then
-    PYTHON_INTERPRETER="python3"
-fi
-
-$PYTHON_INTERPRETER -m recipe.fully_async_policy.fully_async_main \
-    data.train_files="${TRAIN_FILE}" \
-    data.val_files="${TEST_FILE}" \
-    data.prompt_key=prompt \
-    data.truncation='left' \
-    data.max_prompt_length=${max_prompt_length} \
-    data.max_response_length=${max_response_length} \
-    data.train_batch_size=${train_prompt_bsz} \
-    data.gen_batch_size=${gen_prompt_bsz} \
-    data.return_raw_chat=${return_raw_chat} \
-    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
-    algorithm.adv_estimator=${adv_estimator} \
-    algorithm.use_kl_in_reward=${use_kl_in_reward} \
-    algorithm.kl_ctrl.kl_coef=${kl_coef} \
-    actor_rollout_ref.actor.strategy=fsdp2 \
-    critic.strategy=fsdp2 \
-    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
-    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
-    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
-    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
-    actor_rollout_ref.actor.clip_ratio_c=10.0 \
-    actor_rollout_ref.model.use_remove_padding=True \
-    actor_rollout_ref.hybrid_engine=False \
-    +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \
-    actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
-    actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
-    actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
-    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
-    actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
-    actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
-    actor_rollout_ref.model.path="${MODEL_PATH}" \
-    actor_rollout_ref.actor.optim.lr=1e-6 \
-    actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
-    actor_rollout_ref.actor.optim.weight_decay=0.1 \
-    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
-    actor_rollout_ref.actor.fsdp_config.param_offload=${actor_offload} \
-    actor_rollout_ref.actor.fsdp_config.optimizer_offload=${actor_offload} \
-    actor_rollout_ref.actor.entropy_coeff=0 \
-    actor_rollout_ref.actor.grad_clip=1.0 \
-    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
-    actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \
-    actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
-    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
-    actor_rollout_ref.rollout.enable_chunked_prefill=True \
-    actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
-    actor_rollout_ref.rollout.temperature=${temperature} \
-    actor_rollout_ref.rollout.top_p=${top_p} \
-    actor_rollout_ref.rollout.top_k=${top_k} \
-    actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
-    actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
-    actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
-    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
-    actor_rollout_ref.rollout.val_kwargs.n=1 \
-    actor_rollout_ref.ref.fsdp_config.param_offload=${ref_offload} \
-    actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \
-    actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \
-    actor_rollout_ref.rollout.name=${rollout_name} \
-    actor_rollout_ref.rollout.mode=${rollout_mode} \
-    actor_rollout_ref.rollout.calculate_log_probs=True \
-    reward_model.reward_manager=dapo \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
-    +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
-    trainer.logger=['console','tensorboard'] \
-    trainer.project_name="${project_name}" \
-    trainer.experiment_name="${exp_name}" \
-    trainer.val_before_train=True \
-    trainer.save_freq=-1 \
-    trainer.default_local_dir="${CKPTS_DIR}" \
-    trainer.resume_mode=auto \
-    trainer.nnodes="${NNODES}" \
-    trainer.n_gpus_per_node="${n_gpus_training}" \
-    rollout.nnodes="${NNODES}" \
-    rollout.n_gpus_per_node="${n_gpus_rollout}" \
-    rollout.total_rollout_steps="${total_rollout_steps}" \
-    rollout.total_epochs=10 \
-    rollout.test_freq="${test_freq}" \
-    async_training.staleness_threshold="${staleness_threshold}" \
-    async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \
-    async_training.partial_rollout="${partial_rollout}"
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_8-24/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_8-24/runtime_env.yaml
deleted file mode 100644
index dcca08e67f7..00000000000
--- a/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_8-24/runtime_env.yaml
+++ /dev/null
@@ -1,5 +0,0 @@
-env_vars:
-  VLLM_USE_V1: "1"
-  TENSORBOARD_DIR: "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/tensorboard/dapo_7b_math_fsdp2_4_12"
-  NCCL_DEBUG: "INFO"
-  HYDRA_FULL_ERROR: "1"
\ No newline at end of file

From a7c06551baf74e3774e66cecd78e8dfa34c5b116 Mon Sep 17 00:00:00 2001
From: wangshulin02 <953550366@qq.com>
Date: Mon, 8 Sep 2025 21:55:57 +0800
Subject: [PATCH 112/182] fix max_concurrent_samples, fix progress_bar

---
 recipe/fully_async_policy/fully_async_rollouter.py | 4 +++-
 recipe/fully_async_policy/fully_async_trainer.py   | 2 ++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py
index f3a25c2c30c..5d612dc4679 100644
--- a/recipe/fully_async_policy/fully_async_rollouter.py
+++ b/recipe/fully_async_policy/fully_async_rollouter.py
@@ -164,7 +164,9 @@ async def set_required_samples(self, required_samples: int):
             )
 
             # 单次最多扔一次更新需要的样本
-            self.max_concurrent_samples = self.required_samples
+            self.max_concurrent_samples = int(self.config.actor_rollout_ref.actor.ppo_mini_batch_size / \
+                self.config.actor_rollout_ref.rollout.n * self.async_rollout_manager.rollout_dp_size * 4)
+            self.max_concurrent_samples = min(self.max_concurrent_samples, self.max_required_samples)
             self.max_queue_size = self.max_required_samples
 
             print(
diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py
index 0a200e76b1d..2b549c0b621 100644
--- a/recipe/fully_async_policy/fully_async_trainer.py
+++ b/recipe/fully_async_policy/fully_async_trainer.py
@@ -335,6 +335,7 @@ def fit(self):
                 self.logger.log(data=val_data.metrics, step=val_data.param_version)
                 self.logger.log(data=val_data.timing_raw, step=val_data.param_version)     
         pprint(f"[FullyAsyncTrainer] Final validation metrics: {val_data.metrics}")
+        self.progress_bar.close()
 
         self._check_save_checkpoint(True, timing_raw) # TODO: 检查checkpoint
 
@@ -356,6 +357,7 @@ def _trigger_parameter_sync_after_step(self, validate: bool = False, global_step
             data=self.metrics_aggregator.get_aggregated_metrics(),
             step=self.current_param_version,
             )
+        self.progress_bar.update(1)
         self.metrics_aggregator.reset()
         ray.get(self.param_synchronizer.wait_last_sync.remote())
         ray.get(self.param_synchronizer.sync_weights.remote(self.current_param_version, 

From 0e1f2d79cf2b0921c36bd4515e0df8b3327a02c4 Mon Sep 17 00:00:00 2001
From: wangshulin02 <953550366@qq.com>
Date: Tue, 9 Sep 2025 00:31:02 +0800
Subject: [PATCH 113/182] change max_concurrent_samples num & change some exp

---
 ...apo_qwen2-7B-math_28k_fsdp2_fully-async_64-64_mbs32_tfq4.sh | 2 +-
 .../qwen2-7B-math_128/fsdp2_fully-async_64-64/runtime_env.yaml | 2 +-
 ...po_qwen2-7B-math_28k_fsdp2_fully-async_16-16_mbs32_tfq16.sh | 2 +-
 .../qwen2-7B-math_32/fsdp2_fully-async_16-16/runtime_env.yaml  | 3 +--
 ...apo_qwen2-7B-math_28k_fsdp2_fully-async_32-32_mbs32_tfq8.sh | 2 +-
 .../qwen2-7B-math_64/fsdp2_fully-async_32-32/runtime_env.yaml  | 3 +--
 recipe/fully_async_policy/fully_async_rollouter.py             | 2 +-
 7 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64/dapo_qwen2-7B-math_28k_fsdp2_fully-async_64-64_mbs32_tfq4.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64/dapo_qwen2-7B-math_28k_fsdp2_fully-async_64-64_mbs32_tfq4.sh
index ce69e60e2b6..9f410f95c6c 100644
--- a/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64/dapo_qwen2-7B-math_28k_fsdp2_fully-async_64-64_mbs32_tfq4.sh
+++ b/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64/dapo_qwen2-7B-math_28k_fsdp2_fully-async_64-64_mbs32_tfq4.sh
@@ -2,7 +2,7 @@
 set -xeuo pipefail
 
 project_name='DAPO'
-exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_64-64_mbs32_tpf4'
+exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_64-64_mbs32_tpf4_fixmcs'
 
 # Ray
 # RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64/runtime_env.yaml
index 949fa4ef005..5dfe2294911 100644
--- a/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64/runtime_env.yaml
+++ b/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64/runtime_env.yaml
@@ -1,4 +1,4 @@
 env_vars:
   VLLM_USE_V1: "1"
-  TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-7B-math-128/dapo_qwen2-7B-math_28k_fsdp2_fully-async_64-64_mbs32_tpf4"
+  TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-7B-math-128/dapo_qwen2-7B-math_28k_fsdp2_fully-async_64-64_mbs32_tpf4_fixmcs"
   HYDRA_FULL_ERROR: "1"
\ No newline at end of file
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/dapo_qwen2-7B-math_28k_fsdp2_fully-async_16-16_mbs32_tfq16.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/dapo_qwen2-7B-math_28k_fsdp2_fully-async_16-16_mbs32_tfq16.sh
index c49a6460696..fcc5f472d8c 100644
--- a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/dapo_qwen2-7B-math_28k_fsdp2_fully-async_16-16_mbs32_tfq16.sh
+++ b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/dapo_qwen2-7B-math_28k_fsdp2_fully-async_16-16_mbs32_tfq16.sh
@@ -2,7 +2,7 @@
 set -xeuo pipefail
 
 project_name='DAPO'
-exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_16-16_mbs32_tpf16'
+exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_16-16_mbs32_tpf16_fixmcs'
 
 # Ray
 # RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/runtime_env.yaml
index de7e1aa0e1c..b3063ebc7f1 100644
--- a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/runtime_env.yaml
+++ b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/runtime_env.yaml
@@ -1,5 +1,4 @@
 env_vars:
   VLLM_USE_V1: "1"
-  TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-7B-math/dapo_qwen2-7B-math_28k_fsdp2_fully-async_16-16_mbs32_tpf16"
-  NCCL_DEBUG: "INFO"
+  TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-7B-math/dapo_qwen2-7B-math_28k_fsdp2_fully-async_16-16_mbs32_tpf16_fixmcs"
   HYDRA_FULL_ERROR: "1"
\ No newline at end of file
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_32-32/dapo_qwen2-7B-math_28k_fsdp2_fully-async_32-32_mbs32_tfq8.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_32-32/dapo_qwen2-7B-math_28k_fsdp2_fully-async_32-32_mbs32_tfq8.sh
index c31c59df4db..221d3c4d5a6 100644
--- a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_32-32/dapo_qwen2-7B-math_28k_fsdp2_fully-async_32-32_mbs32_tfq8.sh
+++ b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_32-32/dapo_qwen2-7B-math_28k_fsdp2_fully-async_32-32_mbs32_tfq8.sh
@@ -2,7 +2,7 @@
 set -xeuo pipefail
 
 project_name='DAPO'
-exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_32-32_mbs32_tpf8'
+exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_32-32_mbs32_tpf8_fixmcs'
 
 # Ray
 # RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_32-32/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_32-32/runtime_env.yaml
index 20d464776b0..160cd46c499 100644
--- a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_32-32/runtime_env.yaml
+++ b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_32-32/runtime_env.yaml
@@ -1,5 +1,4 @@
 env_vars:
   VLLM_USE_V1: "1"
-  TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-7B-math-64/dapo_qwen2-7B-math_28k_fsdp2_fully-async_32-32_mbs32_tpf8"
-  NCCL_DEBUG: "INFO"
+  TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-7B-math-64/dapo_qwen2-7B-math_28k_fsdp2_fully-async_32-32_mbs32_tpf8_fixmcs"
   HYDRA_FULL_ERROR: "1"
\ No newline at end of file
diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py
index 5d612dc4679..35e199addcb 100644
--- a/recipe/fully_async_policy/fully_async_rollouter.py
+++ b/recipe/fully_async_policy/fully_async_rollouter.py
@@ -165,7 +165,7 @@ async def set_required_samples(self, required_samples: int):
 
             # 单次最多扔一次更新需要的样本
             self.max_concurrent_samples = int(self.config.actor_rollout_ref.actor.ppo_mini_batch_size / \
-                self.config.actor_rollout_ref.rollout.n * self.async_rollout_manager.rollout_dp_size * 4)
+                self.config.actor_rollout_ref.rollout.n * self.async_rollout_manager.rollout_dp_size * 8)
             self.max_concurrent_samples = min(self.max_concurrent_samples, self.max_required_samples)
             self.max_queue_size = self.max_required_samples
 

From 6c557d61659516d840145a0f9f2ce9bf07f74c51 Mon Sep 17 00:00:00 2001
From: wangshulin02 <953550366@qq.com>
Date: Tue, 9 Sep 2025 14:15:05 +0800
Subject: [PATCH 114/182] remove unused code, add stale 0.1 exp

---
 ...28k_fsdp2_fully-async_16-16_mbs32_tfq16.sh | 168 ++++++++++++++++++
 .../runtime_env.yaml                          |   4 +
 .../fully_async_rollouter.py                  |  24 +--
 recipe/fully_async_policy/message_queue.py    |   7 -
 4 files changed, 179 insertions(+), 24 deletions(-)
 create mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16_stal0.1/dapo_qwen2-7B-math_28k_fsdp2_fully-async_16-16_mbs32_tfq16.sh
 create mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16_stal0.1/runtime_env.yaml

diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16_stal0.1/dapo_qwen2-7B-math_28k_fsdp2_fully-async_16-16_mbs32_tfq16.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16_stal0.1/dapo_qwen2-7B-math_28k_fsdp2_fully-async_16-16_mbs32_tfq16.sh
new file mode 100644
index 00000000000..2217661dd33
--- /dev/null
+++ b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16_stal0.1/dapo_qwen2-7B-math_28k_fsdp2_fully-async_16-16_mbs32_tfq16.sh
@@ -0,0 +1,168 @@
+#!/usr/bin/env bash
+set -xeuo pipefail
+
+project_name='DAPO'
+exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_16-16_mbs32_tpf16_stal0.1'
+
+# Ray
+# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
+# WORKING_DIR=${WORKING_DIR:-"${PWD}"}
+# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
+# Paths
+RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
+# very important! please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface
+MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen2.5-Math-7B"}
+CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"}
+TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"}
+TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"}
+
+# MODEL_PATH=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/modelscope/hub/models/Qwen/Qwen2___5-Math-7B
+MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B
+CKPTS_DIR=./ckpts/${project_name}/${exp_name}
+# TRAIN_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/dapo-math-17k.parquet
+# TEST_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/aime-2024.parquet
+TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet
+TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet
+
+rollout_mode="async"
+rollout_name="vllm" # sglang or vllm
+if [ "$rollout_mode" = "async" ]; then
+    export VLLM_USE_V1=1
+    return_raw_chat="True"
+fi
+
+# Algorithm parameters
+adv_estimator=grpo
+
+use_kl_in_reward=False
+kl_coef=0.0
+use_kl_loss=False
+kl_loss_coef=0.0
+
+clip_ratio_low=0.2
+clip_ratio_high=0.28
+
+# Response length parameters
+max_prompt_length=$((1024 * 2))
+max_response_length=$((1024 * 28))
+enable_overlong_buffer=True
+overlong_buffer_len=$((1024 * 4))
+overlong_penalty_factor=1.0
+
+# Training parameters
+loss_agg_mode="token-mean"
+
+# Algorithm
+temperature=1.0
+top_p=1.0
+top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
+val_top_p=0.7
+
+# Performance Related Parameter
+use_dynamic_bsz=True
+actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
+infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
+ref_offload=True
+actor_offload=False
+gen_tp=4
+sp_size=4
+fsdp_size=2
+
+# Fully async specific parameters
+NNODES_ROLLOUT=${NNODES_ROLLOUT:-2}
+NNODES_TRAIN=${NNODES_TRAIN:-2}
+NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
+
+
+train_prompt_bsz=0
+gen_prompt_bsz=1
+n_resp_per_prompt=16
+train_prompt_mini_bsz=32
+total_rollout_steps=$(((512*400)))
+test_freq=20
+staleness_threshold=0.1
+trigger_parameter_sync_step=16
+partial_rollout=True
+
+python -m recipe.fully_async_policy.fully_async_main \
+    data.train_files="${TRAIN_FILE}" \
+    data.val_files="${TEST_FILE}" \
+    data.prompt_key=prompt \
+    data.truncation='left' \
+    data.max_prompt_length=${max_prompt_length} \
+    data.max_response_length=${max_response_length} \
+    data.train_batch_size=${train_prompt_bsz} \
+    data.gen_batch_size=${gen_prompt_bsz} \
+    data.return_raw_chat=${return_raw_chat} \
+    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
+    algorithm.adv_estimator=${adv_estimator} \
+    algorithm.use_kl_in_reward=${use_kl_in_reward} \
+    algorithm.kl_ctrl.kl_coef=${kl_coef} \
+    actor_rollout_ref.actor.strategy=fsdp2 \
+    critic.strategy=fsdp2 \
+    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
+    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
+    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
+    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
+    actor_rollout_ref.actor.clip_ratio_c=10.0 \
+    actor_rollout_ref.model.use_remove_padding=True \
+    actor_rollout_ref.hybrid_engine=False \
+    +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \
+    actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
+    actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+    actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+    actor_rollout_ref.model.path="${MODEL_PATH}" \
+    actor_rollout_ref.actor.optim.lr=1e-6 \
+    actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
+    actor_rollout_ref.actor.optim.weight_decay=0.1 \
+    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
+    actor_rollout_ref.actor.fsdp_config.param_offload=${actor_offload} \
+    actor_rollout_ref.actor.fsdp_config.optimizer_offload=${actor_offload} \
+    actor_rollout_ref.actor.entropy_coeff=0 \
+    actor_rollout_ref.actor.grad_clip=1.0 \
+    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
+    actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \
+    actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
+    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
+    actor_rollout_ref.rollout.enable_chunked_prefill=True \
+    actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
+    actor_rollout_ref.rollout.temperature=${temperature} \
+    actor_rollout_ref.rollout.top_p=${top_p} \
+    actor_rollout_ref.rollout.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
+    actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
+    actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
+    actor_rollout_ref.rollout.val_kwargs.n=1 \
+    actor_rollout_ref.ref.fsdp_config.param_offload=${ref_offload} \
+    actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \
+    actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \
+    actor_rollout_ref.rollout.name=${rollout_name} \
+    actor_rollout_ref.rollout.mode=${rollout_mode} \
+    actor_rollout_ref.rollout.calculate_log_probs=True \
+    reward_model.reward_manager=dapo \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
+    +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
+    trainer.logger=['console','tensorboard'] \
+    trainer.project_name="${project_name}" \
+    trainer.experiment_name="${exp_name}" \
+    trainer.val_before_train=True \
+    trainer.save_freq=-1 \
+    trainer.default_local_dir="${CKPTS_DIR}" \
+    trainer.resume_mode=auto \
+    trainer.nnodes="${NNODES_TRAIN}" \
+    trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
+    rollout.nnodes="${NNODES_ROLLOUT}" \
+    rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \
+    rollout.total_rollout_steps="${total_rollout_steps}" \
+    rollout.total_epochs=10 \
+    rollout.test_freq="${test_freq}" \
+    async_training.staleness_threshold="${staleness_threshold}" \
+    async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \
+    async_training.partial_rollout="${partial_rollout}"
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16_stal0.1/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16_stal0.1/runtime_env.yaml
new file mode 100644
index 00000000000..0b188206127
--- /dev/null
+++ b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16_stal0.1/runtime_env.yaml
@@ -0,0 +1,4 @@
+env_vars:
+  VLLM_USE_V1: "1"
+  TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-7B-math/dapo_qwen2-7B-math_28k_fsdp2_fully-async_16-16_mbs32_tpf16_stal0.1"
+  HYDRA_FULL_ERROR: "1"
\ No newline at end of file
diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py
index 35e199addcb..c25a52abbe0 100644
--- a/recipe/fully_async_policy/fully_async_rollouter.py
+++ b/recipe/fully_async_policy/fully_async_rollouter.py
@@ -113,7 +113,7 @@ def __init__(
         self.async_rollout_manager = None
 
         # Config
-        self.staleness_threshold: int = config.async_training.get("staleness_threshold", 1)
+        self.staleness_threshold: float = config.async_training.get("staleness_threshold", 1)
         self.required_samples = None
         self.max_required_samples = None
         # 单次最多扔一次更新需要的样本
@@ -153,7 +153,7 @@ async def set_message_queue_client(self, message_queue_client: MessageQueueClien
     async def set_required_samples(self, required_samples: int):
         async with self.lock:
             self.required_samples = int(required_samples)
-            self.max_required_samples = (
+            self.max_required_samples = int(
                 self.required_samples
                 * (self.staleness_threshold + 1)
                 * self.config.async_training.trigger_parameter_sync_step
@@ -164,8 +164,11 @@ async def set_required_samples(self, required_samples: int):
             )
 
             # 单次最多扔一次更新需要的样本
-            self.max_concurrent_samples = int(self.config.actor_rollout_ref.actor.ppo_mini_batch_size / \
-                self.config.actor_rollout_ref.rollout.n * self.async_rollout_manager.rollout_dp_size * 8)
+            self.max_concurrent_samples = int(
+                self.config.actor_rollout_ref.actor.ppo_mini_batch_size 
+                / self.config.actor_rollout_ref.rollout.n 
+                * self.async_rollout_manager.rollout_dp_size * 8
+                )
             self.max_concurrent_samples = min(self.max_concurrent_samples, self.max_required_samples)
             self.max_queue_size = self.max_required_samples
 
@@ -548,19 +551,6 @@ async def _should_pause_generation(self) -> bool:
         """Determine whether the build should be paused"""
         queue_stats = self.message_queue_client.get_statistics_sync()
         queue_size = queue_stats["queue_size"]
-        current_trainer_version = queue_stats["current_param_version"]
-
-        version_diff = self.current_param_version - current_trainer_version
-
-        if version_diff > self.staleness_threshold:
-            if not self.paused:
-                print(
-                    "[FullyAsyncRollouter][ShouldPause] "
-                    f"due to version_diff > self.staleness_threshold: "
-                    f"rollout_version={self.current_param_version}, "
-                    f"trainer_version={current_trainer_version}, diff={version_diff}"
-                )
-            return True
 
         if queue_size >= self.max_queue_size:
             if not self.paused:
diff --git a/recipe/fully_async_policy/message_queue.py b/recipe/fully_async_policy/message_queue.py
index 0520ec98034..da1780deb47 100644
--- a/recipe/fully_async_policy/message_queue.py
+++ b/recipe/fully_async_policy/message_queue.py
@@ -78,13 +78,6 @@ async def put_sample(self, sample: Any, param_version: int) -> bool:
             bool: Whether the sample was successfully put into the queue
         """
         async with self._lock:
-            # Check freshness
-            staleness = self.current_param_version - param_version
-            if staleness > self.staleness_threshold:
-                self.dropped_samples += 1
-                print(f"Dropped stale sample: staleness={staleness}, threshold={self.staleness_threshold}")
-                return False
-
             # If queue is full, remove the oldest sample (rarely happens)
             if len(self.queue) >= self.max_queue_size:
                 self.queue.popleft()

From 15b53c8fcf3d7040d8ffc2b1a2ea4ce2ed3eb667 Mon Sep 17 00:00:00 2001
From: hadoop-ai-search <hadoop-ai-search@set-zw04-mlp-codelab-pc1189.mt>
Date: Tue, 9 Sep 2025 16:35:06 +0800
Subject: [PATCH 115/182] reset one step

---
 recipe/fully_async_policy/fsdp_workers.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/recipe/fully_async_policy/fsdp_workers.py b/recipe/fully_async_policy/fsdp_workers.py
index 086f109e434..dd941c26684 100644
--- a/recipe/fully_async_policy/fsdp_workers.py
+++ b/recipe/fully_async_policy/fsdp_workers.py
@@ -100,6 +100,7 @@ def sync_rollout_weights(self):
             collective.broadcast(tensor, src_rank=0, group_name="actor_rollout")
             if self._is_rollout:
                 inference_model.load_weights([(key, tensor)])
+        get_torch_device().empty_cache()
 
     @register(dispatch_mode=Dispatch.ONE_TO_ALL)
     def get_actor_weights_info(self):

From 5249bcda3a7765173ba202e59662c6af6e9b5895 Mon Sep 17 00:00:00 2001
From: hadoop-ai-search <hadoop-ai-search@set-zw04-mlp-codelab-pc1189.mt>
Date: Tue, 9 Sep 2025 17:15:51 +0800
Subject: [PATCH 116/182] unchange protobuf

---
 .../unittest/test_protocol_split_merge.py     | 621 ------------------
 verl/protocol.py                              | 166 +----
 2 files changed, 2 insertions(+), 785 deletions(-)
 delete mode 100644 recipe/fully_async_policy/unittest/test_protocol_split_merge.py

diff --git a/recipe/fully_async_policy/unittest/test_protocol_split_merge.py b/recipe/fully_async_policy/unittest/test_protocol_split_merge.py
deleted file mode 100644
index a5c61f11ba6..00000000000
--- a/recipe/fully_async_policy/unittest/test_protocol_split_merge.py
+++ /dev/null
@@ -1,621 +0,0 @@
-# Copyright 2025 Meituan Ltd. and/or its affiliates
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import numpy as np
-import torch
-from tensordict import TensorDict
-
-from verl.protocol import DataProto, DataProtoItem
-
-
-def create_sample_dataproto():
-    """Create a DataProto similar to the provided example."""
-
-    # Create tensor data similar to the example
-    batch_size = 12
-
-    # Tensor data
-    attention_mask = torch.ones(batch_size, 3072, dtype=torch.int64)
-    input_ids = torch.randint(0, 32000, (batch_size, 3072), dtype=torch.int64)
-    position_ids = torch.arange(3072).unsqueeze(0).repeat(batch_size, 1).long()
-    prompts = torch.randint(0, 32000, (batch_size, 1024), dtype=torch.int64)
-    response_mask = torch.ones(batch_size, 2048, dtype=torch.int64)
-    responses = torch.randint(0, 32000, (batch_size, 2048), dtype=torch.int64)
-
-    # Non-tensor data similar to the example
-    data_source = np.array(["openai/gsm8k"] * batch_size, dtype=object)
-    ability = np.array(["math"] * batch_size, dtype=object)
-
-    reward_model = np.array(
-        [
-            {"ground_truth": "6", "style": "rule"},
-            {"ground_truth": "6", "style": "rule"},
-            {"ground_truth": "220000", "style": "rule"},
-            {"ground_truth": "277", "style": "rule"},
-            {"ground_truth": "277", "style": "rule"},
-            {"ground_truth": "35", "style": "rule"},
-            {"ground_truth": "6", "style": "rule"},
-            {"ground_truth": "220000", "style": "rule"},
-            {"ground_truth": "220000", "style": "rule"},
-            {"ground_truth": "277", "style": "rule"},
-            {"ground_truth": "35", "style": "rule"},
-            {"ground_truth": "35", "style": "rule"},
-        ],
-        dtype=object,
-    )
-
-    extra_info = np.array(
-        [
-            {"answer": "Answer 1", "index": 4570, "question": "Question 1", "split": "train"},
-            {"answer": "Answer 1", "index": 4570, "question": "Question 1", "split": "train"},
-            {"answer": "Answer 2", "index": 460, "question": "Question 2", "split": "train"},
-            {"answer": "Answer 3", "index": 6613, "question": "Question 3", "split": "train"},
-            {"answer": "Answer 3", "index": 6613, "question": "Question 3", "split": "train"},
-            {"answer": "Answer 4", "index": 1421, "question": "Question 4", "split": "train"},
-            {"answer": "Answer 1", "index": 4570, "question": "Question 1", "split": "train"},
-            {"answer": "Answer 2", "index": 460, "question": "Question 2", "split": "train"},
-            {"answer": "Answer 2", "index": 460, "question": "Question 2", "split": "train"},
-            {"answer": "Answer 3", "index": 6613, "question": "Question 3", "split": "train"},
-            {"answer": "Answer 4", "index": 1421, "question": "Question 4", "split": "train"},
-            {"answer": "Answer 4", "index": 1421, "question": "Question 4", "split": "train"},
-        ],
-        dtype=object,
-    )
-
-    uid = np.array(
-        [
-            "80ae1835-a8db-4faa-8b42-2ffa2ca63f28",
-            "80ae1835-a8db-4faa-8b42-2ffa2ca63f28",
-            "cc529271-c2ba-4fe1-a16e-50c5f090538d",
-            "237ea082-350f-4193-b9a2-3a153a3a38b9",
-            "237ea082-350f-4193-b9a2-3a153a3a38b9",
-            "fab3e910-67b3-4653-bc69-377250049267",
-            "80ae1835-a8db-4faa-8b42-2ffa2ca63f28",
-            "cc529271-c2ba-4fe1-a16e-50c5f090538d",
-            "cc529271-c2ba-4fe1-a16e-50c5f090538d",
-            "237ea082-350f-4193-b9a2-3a153a3a38b9",
-            "fab3e910-67b3-4653-bc69-377250049267",
-            "fab3e910-67b3-4653-bc69-377250049267",
-        ],
-        dtype=object,
-    )
-
-    tools_kwargs = np.array([{}] * batch_size, dtype=object)
-    interaction_kwargs = np.array([{}] * batch_size, dtype=object)
-    index = np.array([4570, 4570, 460, 6613, 6613, 1421, 4570, 460, 460, 6613, 1421, 1421], dtype=object)
-
-    # Create DataProto
-    data_proto = DataProto.from_dict(
-        tensors={
-            "attention_mask": attention_mask,
-            "input_ids": input_ids,
-            "position_ids": position_ids,
-            "prompts": prompts,
-            "response_mask": response_mask,
-            "responses": responses,
-        },
-        non_tensors={
-            "data_source": data_source,
-            "ability": ability,
-            "reward_model": reward_model,
-            "extra_info": extra_info,
-            "uid": uid,
-            "tools_kwargs": tools_kwargs,
-            "interaction_kwargs": interaction_kwargs,
-            "index": index,
-        },
-        meta_info={"global_token_num": [2141, 2141, 2161, 2151, 2151, 2130, 2141, 2161, 2161, 2151, 2130, 2130]},
-    )
-
-    return data_proto
-
-
-def test_basic_split_and_merge():
-    """Test basic split and merge functionality."""
-    print("=== Testing Basic Split and Merge ===")
-
-    # Create sample data
-    original_proto = create_sample_dataproto()
-    original_length = len(original_proto)
-
-    print(f"Original DataProto length: {original_length}")
-    print(f"Original tensor keys: {list(original_proto.batch.keys())}")
-    print(f"Original non_tensor keys: {list(original_proto.non_tensor_batch.keys())}")
-
-    # Test split
-    items = original_proto.to_items()
-
-    print(f"Split into {len(items)} items")
-    assert len(items) == original_length, f"Expected {original_length} items, got {len(items)}"
-
-    # Verify individual items
-    for i, item in enumerate(items):
-        print(f"Item {i}: batch_size={item.batch.batch_size}, non_tensor keys={list(item.non_tensor_batch.keys())}")
-
-        # Check that tensor shapes are correct (no batch dimension)
-        assert item.batch.batch_size == torch.Size([]), (
-            f"Item {i} should have empty batch_size, got {item.batch.batch_size}"
-        )
-
-        # Check tensor shapes
-        assert item.batch["attention_mask"].shape == torch.Size([3072]), (
-            f"Unexpected attention_mask shape: {item.batch['attention_mask'].shape}"
-        )
-        assert item.batch["input_ids"].shape == torch.Size([3072]), (
-            f"Unexpected input_ids shape: {item.batch['input_ids'].shape}"
-        )
-        assert item.batch["prompts"].shape == torch.Size([1024]), (
-            f"Unexpected prompts shape: {item.batch['prompts'].shape}"
-        )
-
-        # Check non-tensor data types
-        assert isinstance(item.non_tensor_batch["data_source"], str), (
-            f"data_source should be str, got {type(item.non_tensor_batch['data_source'])}"
-        )
-        assert isinstance(item.non_tensor_batch["reward_model"], dict), (
-            f"reward_model should be dict, got {type(item.non_tensor_batch['reward_model'])}"
-        )
-        assert isinstance(item.non_tensor_batch["extra_info"], dict), (
-            f"extra_info should be dict, got {type(item.non_tensor_batch['extra_info'])}"
-        )
-
-    # Test merge
-    merged_proto = DataProto.from_items(items)
-
-    print(f"Merged DataProto length: {len(merged_proto)}")
-    assert len(merged_proto) == original_length, f"Merged length should be {original_length}, got {len(merged_proto)}"
-
-    # Verify tensor data consistency
-    for key in original_proto.batch.keys():
-        original_tensor = original_proto.batch[key]
-        merged_tensor = merged_proto.batch[key]
-
-        assert original_tensor.shape == merged_tensor.shape, (
-            f"Shape mismatch for {key}: {original_tensor.shape} vs {merged_tensor.shape}"
-        )
-        assert torch.equal(original_tensor, merged_tensor), f"Tensor data mismatch for {key}"
-
-    # Verify non-tensor data consistency
-    for key in original_proto.non_tensor_batch.keys():
-        original_array = original_proto.non_tensor_batch[key]
-        merged_array = merged_proto.non_tensor_batch[key]
-
-        assert original_array.shape == merged_array.shape, (
-            f"Shape mismatch for {key}: {original_array.shape} vs {merged_array.shape}"
-        )
-        assert np.array_equal(original_array, merged_array), f"Non-tensor data mismatch for {key}"
-
-    # Verify meta_info consistency
-    assert original_proto.meta_info == merged_proto.meta_info, "Meta info mismatch"
-
-    print("✓ Basic split and merge test passed!")
-
-
-def test_individual_item_access():
-    """Test accessing individual items matches split results."""
-    print("\n=== Testing Individual Item Access ===")
-
-    original_proto = create_sample_dataproto()
-    items = original_proto.to_items()
-
-    # Compare direct indexing with split results
-    for i in range(len(original_proto)):
-        direct_item = original_proto[i]
-        split_item = items[i]
-
-        # Check tensor data
-        for key in original_proto.batch.keys():
-            assert torch.equal(direct_item.batch[key], split_item.batch[key]), (
-                f"Tensor mismatch at index {i}, key {key}"
-            )
-
-        # Check non-tensor data
-        for key in original_proto.non_tensor_batch.keys():
-            if isinstance(direct_item.non_tensor_batch[key], np.ndarray):
-                assert np.array_equal(direct_item.non_tensor_batch[key], split_item.non_tensor_batch[key]), (
-                    f"Non-tensor mismatch at index {i}, key {key}"
-                )
-            else:
-                assert direct_item.non_tensor_batch[key] == split_item.non_tensor_batch[key], (
-                    f"Non-tensor mismatch at index {i}, key {key}"
-                )
-
-    print("✓ Individual item access test passed!")
-
-
-def test_partial_merge():
-    """Test merging a subset of items."""
-    print("\n=== Testing Partial Merge ===")
-
-    original_proto = create_sample_dataproto()
-    items = original_proto.to_items()
-
-    # Take a subset of items
-    subset_indices = [0, 2, 4, 7, 9]
-    subset_items = [items[i] for i in subset_indices]
-
-    # Merge the subset
-    subset_proto = DataProto.from_items(subset_items)
-
-    assert len(subset_proto) == len(subset_indices), (
-        f"Subset length should be {len(subset_indices)}, got {len(subset_proto)}"
-    )
-
-    # Verify the subset contains correct data
-    for i, original_idx in enumerate(subset_indices):
-        # Compare with original data at original_idx
-        for key in original_proto.batch.keys():
-            expected_tensor = original_proto.batch[key][original_idx]
-            actual_tensor = subset_proto.batch[key][i]
-            assert torch.equal(expected_tensor, actual_tensor), f"Subset tensor mismatch at {i}, key {key}"
-
-        for key in original_proto.non_tensor_batch.keys():
-            expected_value = original_proto.non_tensor_batch[key][original_idx]
-            actual_value = subset_proto.non_tensor_batch[key][i]
-
-            if isinstance(expected_value, np.ndarray):
-                assert np.array_equal(expected_value, actual_value), f"Subset non-tensor mismatch at {i}, key {key}"
-            else:
-                assert expected_value == actual_value, f"Subset non-tensor mismatch at {i}, key {key}"
-
-    print("✓ Partial merge test passed!")
-
-
-def test_item_processing():
-    """Test processing individual items before merging."""
-    print("\n=== Testing Item Processing ===")
-
-    original_proto = create_sample_dataproto()
-    items = original_proto.to_items()
-
-    # Process each item (e.g., add a prefix to uid)
-    processed_items = []
-    for i, item in enumerate(items):
-        processed_item = item.copy()  # Create a copy to avoid modifying original
-
-        # Modify some data
-        processed_item.non_tensor_batch["uid"] = f"processed_{i}_{processed_item.non_tensor_batch['uid']}"
-        processed_item.non_tensor_batch["processing_step"] = i
-        processed_item.meta_info["processed"] = True
-
-        processed_items.append(processed_item)
-
-    # Merge processed items
-    processed_proto = DataProto.from_items(processed_items)
-
-    # Verify processing was applied
-    for i in range(len(processed_proto)):
-        expected_uid = f"processed_{i}_{items[i].non_tensor_batch['uid']}"
-        actual_uid = processed_proto.non_tensor_batch["uid"][i]
-        assert actual_uid == expected_uid, (
-            f"Processing failed for uid at {i}: expected {expected_uid}, got {actual_uid}"
-        )
-
-        expected_step = i
-        actual_step = processed_proto.non_tensor_batch["processing_step"][i]
-        assert actual_step == expected_step, (
-            f"Processing step mismatch at {i}: expected {expected_step}, got {actual_step}"
-        )
-
-    #    assert processed_proto.meta_info.get("processed") == True, "Meta info processing failed"
-
-    print("✓ Item processing test passed!")
-
-
-def test_error_conditions():
-    """Test error conditions."""
-    print("\n=== Testing Error Conditions ===")
-
-    # Test empty list
-    try:
-        DataProto.from_items([])
-    except ValueError as e:
-        print(f"✓ Correctly caught empty list error: {e}")
-
-    # Test inconsistent structure
-    try:
-        # Create items with different tensor keys
-        original_proto = create_sample_dataproto()
-        items = original_proto.to_items()
-
-        # Modify one item to have different keys
-        modified_item = items[1].copy()
-        modified_item.batch = TensorDict({"different_key": torch.randn(3072)}, batch_size=torch.Size([]))
-
-        inconsistent_items = [items[0], modified_item]
-        DataProto.from_items(inconsistent_items)
-    except ValueError as e:
-        print(f"✓ Correctly caught inconsistent structure error: {e}")
-
-    print("✓ Error conditions test passed!")
-
-
-def test_roundtrip_integrity():
-    """Test multiple split/merge cycles maintain data integrity."""
-    print("\n=== Testing Roundtrip Integrity ===")
-
-    original_proto = create_sample_dataproto()
-    current_proto = original_proto
-
-    # Perform multiple split/merge cycles
-    for cycle in range(3):
-        print(f"Cycle {cycle + 1}")
-
-        # Split
-        items = current_proto.to_items()
-
-        # Merge
-        current_proto = DataProto.from_items(items)
-
-        # Verify integrity
-        assert len(current_proto) == len(original_proto), f"Length changed in cycle {cycle + 1}"
-
-        for key in original_proto.batch.keys():
-            assert torch.equal(original_proto.batch[key], current_proto.batch[key]), (
-                f"Tensor {key} changed in cycle {cycle + 1}"
-            )
-
-        for key in original_proto.non_tensor_batch.keys():
-            assert np.array_equal(original_proto.non_tensor_batch[key], current_proto.non_tensor_batch[key]), (
-                f"Non-tensor {key} changed in cycle {cycle + 1}"
-            )
-
-        assert original_proto.meta_info == current_proto.meta_info, f"Meta info changed in cycle {cycle + 1}"
-
-    print("✓ Roundtrip integrity test passed!")
-
-
-def run_visual_comparison():
-    """Run a visual comparison similar to the user's example."""
-    print("\n=== Visual Comparison (Like User Example) ===")
-
-    original_proto = create_sample_dataproto()
-
-    print("Original DataProto:")
-    print(f"batch_size: {original_proto.batch.batch_size}")
-    print(f"tensor keys: {list(original_proto.batch.keys())}")
-    print(f"non_tensor keys: {list(original_proto.non_tensor_batch.keys())}")
-    print(f"Sample data_source: {original_proto.non_tensor_batch['data_source'][:3]}")
-    print(f"Sample uid: {original_proto.non_tensor_batch['uid'][:3]}")
-
-    print("\n" + "=" * 50)
-    print("============= SPLIT =============")
-    print("=" * 50)
-
-    items = original_proto.to_items()
-
-    # Show first few items
-    for i in range(min(3, len(items))):
-        print(f"\nDataProtoItem {i}:")
-        print(f"batch_size: {items[i].batch.batch_size}")
-        print(f"attention_mask shape: {items[i].batch['attention_mask'].shape}")
-        print(f"input_ids shape: {items[i].batch['input_ids'].shape}")
-        print(f"data_source: {items[i].non_tensor_batch['data_source']}")
-        print(f"uid: {items[i].non_tensor_batch['uid']}")
-        print(f"reward_model: {items[i].non_tensor_batch['reward_model']}")
-        print("-" * 30)
-
-    print("\n" + "=" * 50)
-    print("============= MERGE =============")
-    print("=" * 50)
-
-    merged_proto = DataProto.from_items(items)
-
-    print("Merged DataProto:")
-    print(f"batch_size: {merged_proto.batch.batch_size}")
-    print(f"tensor keys: {list(merged_proto.batch.keys())}")
-    print(f"non_tensor keys: {list(merged_proto.non_tensor_batch.keys())}")
-    print(f"Sample data_source: {merged_proto.non_tensor_batch['data_source'][:3]}")
-    print(f"Sample uid: {merged_proto.non_tensor_batch['uid'][:3]}")
-
-    # Verify they're identical
-    success = True
-    try:
-        for key in original_proto.batch.keys():
-            assert torch.equal(original_proto.batch[key], merged_proto.batch[key])
-        for key in original_proto.non_tensor_batch.keys():
-            assert np.array_equal(original_proto.non_tensor_batch[key], merged_proto.non_tensor_batch[key])
-        assert original_proto.meta_info == merged_proto.meta_info
-        print("\n✓ Original and merged DataProto are identical!")
-    except Exception as e:
-        print(f"\n✗ Verification failed: {e}")
-        success = False
-
-    return success
-
-
-def example_basic_split_merge():
-    """Basic example of splitting DataProto into DataProtoItems and merging back."""
-    print("=== Basic Split and Merge Example ===")
-
-    # Create sample data
-    batch_size = 3
-    seq_len = 5
-
-    # Create tensors
-    input_ids = torch.randint(0, 1000, (batch_size, seq_len))
-    attention_mask = torch.ones(batch_size, seq_len)
-
-    # Create non-tensor data
-    prompts = np.array(["Hello world", "How are you?", "Good morning"], dtype=object)
-    scores = np.array([0.8, 0.9, 0.7], dtype=object)
-
-    # Create DataProto
-    data_proto = DataProto.from_dict(
-        tensors={"input_ids": input_ids, "attention_mask": attention_mask},
-        non_tensors={"prompts": prompts, "scores": scores},
-        meta_info={"model_name": "test_model", "version": "1.0"},
-    )
-
-    print(f"Original DataProto length: {len(data_proto)}")
-    print(f"Input IDs shape: {data_proto.batch['input_ids'].shape}")
-    print(f"Prompts: {data_proto.non_tensor_batch['prompts']}")
-
-    # Split into DataProtoItems
-    items = data_proto.to_items()
-    print(f"\nSplit into {len(items)} items")
-
-    for i, item in enumerate(items):
-        print(f"Item {i}:")
-        print(f"  Input IDs shape: {item.batch['input_ids'].shape}")
-        print(f"  Prompt: {item.non_tensor_batch['prompts']}")
-        print(f"  Score: {item.non_tensor_batch['scores']}")
-
-    # Merge back to DataProto
-    merged_proto = DataProto.from_items(items)
-    print(f"\nMerged DataProto length: {len(merged_proto)}")
-    print(f"Merged Input IDs shape: {merged_proto.batch['input_ids'].shape}")
-    print(f"Merged prompts: {merged_proto.non_tensor_batch['prompts']}")
-
-    # Verify they're identical
-    assert torch.equal(data_proto.batch["input_ids"], merged_proto.batch["input_ids"])
-    assert torch.equal(data_proto.batch["attention_mask"], merged_proto.batch["attention_mask"])
-    assert np.array_equal(data_proto.non_tensor_batch["prompts"], merged_proto.non_tensor_batch["prompts"])
-    assert np.array_equal(data_proto.non_tensor_batch["scores"], merged_proto.non_tensor_batch["scores"])
-
-    print("\n✓ Original and merged DataProto are identical!")
-
-
-def example_item_processing():
-    """Example showing individual item processing before merging."""
-    print("\n=== Individual Item Processing Example ===")
-
-    # Create initial data
-    #    batch_size = 4
-
-    values = torch.tensor([1.0, 2.0, 3.0, 4.0]).unsqueeze(1)  # Shape: (4, 1)
-    labels = np.array(["A", "B", "C", "D"], dtype=object)
-
-    original_proto = DataProto.from_dict(
-        tensors={"values": values}, non_tensors={"labels": labels}, meta_info={"processing_step": 0}
-    )
-
-    print(f"Original values: {original_proto.batch['values'].flatten()}")
-    print(f"Original labels: {original_proto.non_tensor_batch['labels']}")
-
-    # Split and process each item individually
-    items = original_proto.to_items()
-    processed_items = []
-
-    for i, item in enumerate(items):
-        # Process the tensor data (multiply by 2)
-        processed_value = item.batch["values"] * 2
-
-        # Process the non-tensor data (add suffix)
-        processed_label = item.non_tensor_batch["labels"] + f"_processed_{i}"
-
-        # Create new processed item
-        processed_item = DataProtoItem(
-            batch=item.batch.clone(),  # Clone the TensorDict
-            non_tensor_batch=item.non_tensor_batch.copy(),
-            meta_info=item.meta_info.copy(),
-        )
-
-        # Update with processed data
-        processed_item.batch["values"] = processed_value
-        processed_item.non_tensor_batch["labels"] = processed_label
-        processed_item.meta_info["processing_step"] = 1
-
-        processed_items.append(processed_item)
-
-        print(f"Processed item {i}: value={processed_value.item()}, label='{processed_label}'")
-
-    # Merge processed items back
-    processed_proto = DataProto.from_items(processed_items)
-
-    print(f"\nProcessed values: {processed_proto.batch['values'].flatten()}")
-    print(f"Processed labels: {processed_proto.non_tensor_batch['labels']}")
-    print(f"Processing step: {processed_proto.meta_info['processing_step']}")
-
-
-def example_convenience_methods():
-    """Example showing convenience methods."""
-    print("\n=== Convenience Methods Example ===")
-
-    # Create a single DataProtoItem
-    single_tensor = torch.tensor([42]).unsqueeze(0)  # Shape: (1,)
-    single_item = DataProtoItem(
-        batch=None,  # We'll create TensorDict manually
-        non_tensor_batch={"text": "Hello"},
-        meta_info={"source": "manual"},
-    )
-
-    # Create TensorDict manually for the single item
-    from tensordict import TensorDict
-
-    single_item.batch = TensorDict({"data": single_tensor}, batch_size=(1,))
-
-    print(f"Single item data: {single_item.batch['data']}")
-    print(f"Single item text: {single_item.non_tensor_batch['text']}")
-
-    # Convert single item to DataProto using convenience method
-    single_proto = single_item.to_proto()
-    print(f"Converted to DataProto length: {len(single_proto)}")
-
-    # Create multiple items and use static convenience method
-    items = [single_item]
-    for i in range(2):
-        new_item = single_item.copy()  # Use the copy method
-        new_item.batch["data"] = torch.tensor([100 + i]).unsqueeze(0)
-        new_item.non_tensor_batch["text"] = f"Item {i + 1}"
-        items.append(new_item)
-
-    # Use DataProtoItem.from_items() convenience method
-    merged_proto = DataProtoItem.from_items(items)
-    print(f"Merged using convenience method - length: {len(merged_proto)}")
-    print(f"Data: {merged_proto.batch['data'].flatten()}")
-    print(f"Texts: {merged_proto.non_tensor_batch['text']}")
-
-
-def example_error_handling():
-    """Example showing error handling."""
-    print("\n=== Error Handling Example ===")
-
-    # Try to create DataProto from empty list
-    try:
-        DataProto.from_items([])
-        print("ERROR: Should have raised exception for empty list")
-    except ValueError as e:
-        print(f"✓ Correctly caught error for empty list: {e}")
-
-    # Try to merge items with inconsistent structure
-    try:
-        item1 = DataProtoItem(
-            batch=TensorDict({"data": torch.tensor([1]).unsqueeze(0)}, batch_size=(1,)),
-            non_tensor_batch={"text": "Hello"},
-        )
-        item2 = DataProtoItem(
-            batch=TensorDict({"different_key": torch.tensor([2]).unsqueeze(0)}, batch_size=(1,)),
-            non_tensor_batch={"text": "World"},
-        )
-
-        DataProto.from_items([item1, item2])
-        print("ERROR: Should have raised exception for inconsistent structure")
-    except ValueError as e:
-        print(f"✓ Correctly caught error for inconsistent structure: {e}")
-
-
-if __name__ == "__main__":
-    # Run all tests
-    test_basic_split_and_merge()
-    test_individual_item_access()
-    test_partial_merge()
-    test_item_processing()
-    test_error_conditions()
-    test_roundtrip_integrity()
-    example_basic_split_merge()
-    example_item_processing()
-    example_convenience_methods()
-    example_error_handling()
-    run_visual_comparison()
diff --git a/verl/protocol.py b/verl/protocol.py
index 17b3b10c1f6..a4d394af97d 100644
--- a/verl/protocol.py
+++ b/verl/protocol.py
@@ -38,7 +38,7 @@
 from verl.utils.py_functional import union_two_dict
 from verl.utils.torch_functional import allgather_dict_tensors
 
-__all__ = ["DataProto", "DataProtoItem", "union_tensor_dict"]
+__all__ = ["DataProto", "union_tensor_dict"]
 
 with contextlib.suppress(Exception):
     tensordict.set_lazy_legacy(False).set()
@@ -198,83 +198,11 @@ def collate_fn(x: list["DataProtoItem"]):
 
 @dataclass
 class DataProtoItem:
-    """
-    A single item from a DataProto batch, representing one sample.
-    This is typically used when accessing individual elements from a DataProto.
-    """
-
+    # TODO(zhangchi.usc1992) add consistency check
     batch: TensorDict = None
     non_tensor_batch: dict = field(default_factory=dict)
     meta_info: dict = field(default_factory=dict)
 
-    def __post_init__(self):
-        """Perform consistency checking after initialization."""
-        self._check_consistency()
-
-    def _check_consistency(self):
-        """Check the consistency of the DataProtoItem."""
-        # For DataProtoItem, batch can have no batch dimension (batch_size=[]) or batch size 1
-        if self.batch is not None:
-            # Allow both cases: tensors without batch dim (batch_size=[]) and tensors with batch size 1
-            if hasattr(self.batch, "batch_size") and len(self.batch.batch_size) > 0:
-                if self.batch.batch_size[0] > 1:
-                    raise ValueError(
-                        f"DataProtoItem batch should have batch size 0 or 1, got {self.batch.batch_size[0]}"
-                    )
-
-        # Check non_tensor_batch consistency
-        if self.non_tensor_batch:
-            for key, val in self.non_tensor_batch.items():
-                # For DataProtoItem, non_tensor values should be individual items, not arrays
-                if isinstance(val, np.ndarray) and val.shape != ():
-                    # Allow only scalar numpy arrays (shape=()) for individual items
-                    if val.shape[0] > 1:
-                        raise ValueError(
-                            f"DataProtoItem non_tensor_batch['{key}']"
-                            "should be a single item, got array with shape {val.shape}"
-                        )
-
-    def to_proto(self) -> "DataProto":
-        """Convert this DataProtoItem to a DataProto with batch size 1.
-
-        Returns:
-            DataProto: A DataProto containing this single item
-        """
-        return DataProto.from_items([self])
-
-    @staticmethod
-    def from_items(items: list["DataProtoItem"]) -> "DataProto":
-        """Create a DataProto from a list of DataProtoItem objects.
-
-        This is a convenience method that calls DataProto.from_items().
-
-        Args:
-            items (List[DataProtoItem]): A list of DataProtoItem objects to merge
-
-        Returns:
-            DataProto: A new DataProto containing all the items as a batch
-        """
-        return DataProto.from_items(items)
-
-    def copy(self) -> "DataProtoItem":
-        """Create a deep copy of this DataProtoItem.
-
-        Returns:
-            DataProtoItem: A deep copy of this item
-        """
-        import copy
-
-        # Deep copy the batch TensorDict
-        batch_copy = copy.deepcopy(self.batch) if self.batch is not None else None
-
-        # Deep copy non_tensor_batch
-        non_tensor_copy = copy.deepcopy(self.non_tensor_batch)
-
-        # Deep copy meta_info
-        meta_info_copy = copy.deepcopy(self.meta_info)
-
-        return DataProtoItem(batch=batch_copy, non_tensor_batch=non_tensor_copy, meta_info=meta_info_copy)
-
 
 @dataclass
 class DataProto:
@@ -810,96 +738,6 @@ def split(self, split_size: int) -> list["DataProto"]:
         """
         return [self[i : i + split_size] for i in range(0, len(self), split_size)]
 
-    def to_items(self) -> list["DataProtoItem"]:
-        """Convert DataProto to a list of DataProtoItem objects.
-
-        Returns:
-            List[DataProtoItem]: A list containing individual DataProtoItem objects,
-                                 one for each sample in the batch
-        """
-        items = []
-        for i in range(len(self)):
-            # Use the existing __getitem__ implementation for single integer access
-            items.append(self[i])
-        return items
-
-    @staticmethod
-    def from_items(items: list["DataProtoItem"]) -> "DataProto":
-        """Create a DataProto from a list of DataProtoItem objects.
-
-        Args:
-            items (List[DataProtoItem]): A list of DataProtoItem objects to merge
-
-        Returns:
-            DataProto: A new DataProto containing all the items as a batch
-
-        Raises:
-            ValueError: If the input list is empty or items have inconsistent structure
-        """
-        if not items:
-            raise ValueError("Cannot create DataProto from empty list of items")
-
-        # Get the first item to determine structure and meta_info
-        first_item = items[0]
-        meta_info = first_item.meta_info
-
-        # Collect all tensor batches
-        batch_tensors = {}
-        non_tensor_batches = {}
-
-        # Process tensor data
-        if first_item.batch is not None:
-            # Get all keys from the first item's batch
-            tensor_keys = list(first_item.batch.keys())
-
-            for key in tensor_keys:
-                tensor_list = []
-                for i, item in enumerate(items):
-                    if item.batch is None or key not in item.batch:
-                        raise ValueError(f"Item {i} missing tensor key '{key}' in batch")
-
-                    tensor = item.batch[key]
-                    # Handle tensors from DataProtoItem which may not have batch dimension
-                    # (as shown in the user's example where batch_size=torch.Size([]))
-                    if tensor.dim() == 0:
-                        # Scalar tensor - add batch dimension
-                        tensor = tensor.unsqueeze(0)
-                    else:
-                        # Multi-dimensional tensor without batch dimension - add batch dimension
-                        tensor = tensor.unsqueeze(0)
-
-                    tensor_list.append(tensor)
-
-                # Concatenate tensors along batch dimension
-                if tensor_list:
-                    batch_tensors[key] = torch.cat(tensor_list, dim=0)
-
-        # Process non-tensor data
-        if first_item.non_tensor_batch:
-            non_tensor_keys = list(first_item.non_tensor_batch.keys())
-
-            for key in non_tensor_keys:
-                non_tensor_list = []
-                for i, item in enumerate(items):
-                    if key not in item.non_tensor_batch:
-                        raise ValueError(f"Item {i} missing non_tensor key '{key}'")
-
-                    non_tensor_data = item.non_tensor_batch[key]
-                    non_tensor_list.append(non_tensor_data)
-
-                # Stack non-tensor data
-                if non_tensor_list:
-                    non_tensor_batches[key] = np.array(non_tensor_list, dtype=object)
-
-        # Create TensorDict for batch
-        if batch_tensors:
-            batch_size = len(items)
-            batch = TensorDict(source=batch_tensors, batch_size=(batch_size,))
-        else:
-            batch = None
-
-        return DataProto(batch=batch, non_tensor_batch=non_tensor_batches, meta_info=meta_info)
-
     @staticmethod
     def concat(data: list["DataProto"]) -> "DataProto":
         """Concat a list of DataProto. The batch is concatenated among dim=0.

From 174e762a05963a13fc16c9ca7d31928539df064f Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Tue, 9 Sep 2025 20:53:08 +0800
Subject: [PATCH 117/182] move shell

---
 .../fully_async_policy/agent_loop/__init__.py |  22 +
 .../agent_loop/agent_loop.py                  | 637 ++++++++++++++++++
 .../partial_single_turn_agent_loop.py         |  74 ++
 .../{ => shell}/dapo_7b_math_fsdp2_2_6.sh     |   2 +-
 .../{ => shell}/dapo_7b_math_fsdp2_4_12.sh    |   0
 .../{ => shell}/dapo_7b_math_fsdp2_8_8.sh     |   0
 .../dapo_7b_math_fsdp2_colocate.sh            |   0
 .../{ => shell}/dapo_7b_math_fsdp2_server.sh  |   0
 .../dapo_7b_math_megatron_colocate.sh         |   0
 .../{ => shell}/runtime_env.yaml              |   0
 10 files changed, 734 insertions(+), 1 deletion(-)
 create mode 100644 recipe/fully_async_policy/agent_loop/__init__.py
 create mode 100644 recipe/fully_async_policy/agent_loop/agent_loop.py
 create mode 100644 recipe/fully_async_policy/agent_loop/partial_single_turn_agent_loop.py
 rename recipe/fully_async_policy/{ => shell}/dapo_7b_math_fsdp2_2_6.sh (99%)
 rename recipe/fully_async_policy/{ => shell}/dapo_7b_math_fsdp2_4_12.sh (100%)
 rename recipe/fully_async_policy/{ => shell}/dapo_7b_math_fsdp2_8_8.sh (100%)
 rename recipe/fully_async_policy/{ => shell}/dapo_7b_math_fsdp2_colocate.sh (100%)
 rename recipe/fully_async_policy/{ => shell}/dapo_7b_math_fsdp2_server.sh (100%)
 rename recipe/fully_async_policy/{ => shell}/dapo_7b_math_megatron_colocate.sh (100%)
 rename recipe/fully_async_policy/{ => shell}/runtime_env.yaml (100%)

diff --git a/recipe/fully_async_policy/agent_loop/__init__.py b/recipe/fully_async_policy/agent_loop/__init__.py
new file mode 100644
index 00000000000..284f3e975c0
--- /dev/null
+++ b/recipe/fully_async_policy/agent_loop/__init__.py
@@ -0,0 +1,22 @@
+# Copyright 2025 Meituan Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .agent_loop import AgentLoopBase, AgentLoopManager
+from .single_turn_agent_loop import SingleTurnAgentLoop
+from .partial_single_turn_agent_loop import PartialSingleTurnAgentLoop
+from .tool_agent_loop import ToolAgentLoop
+
+_ = [SingleTurnAgentLoop, ToolAgentLoop, PartialSingleTurnAgentLoop]
+
+__all__ = ["AgentLoopBase", "AgentLoopManager"]
diff --git a/recipe/fully_async_policy/agent_loop/agent_loop.py b/recipe/fully_async_policy/agent_loop/agent_loop.py
new file mode 100644
index 00000000000..32d52df8804
--- /dev/null
+++ b/recipe/fully_async_policy/agent_loop/agent_loop.py
@@ -0,0 +1,637 @@
+# Copyright 2025 Meituan Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import asyncio
+import heapq
+import logging
+import os
+import random
+from abc import ABC, abstractmethod
+from typing import Any, Optional
+
+import hydra
+import numpy as np
+import ray
+import torch
+from cachetools import LRUCache
+from omegaconf import DictConfig, OmegaConf
+from pydantic import BaseModel
+from tensordict import TensorDict
+from transformers import AutoTokenizer
+
+from verl.protocol import DataProto
+from verl.single_controller.ray.base import RayWorkerGroup
+from verl.utils import hf_tokenizer
+from verl.utils.fs import copy_to_local
+from verl.utils.rollout_trace import RolloutTraceConfig, rollout_trace_attr, rollout_trace_op
+from verl.workers.rollout.async_server import async_server_class
+
+logger = logging.getLogger(__file__)
+logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN"))
+
+
+class AsyncLLMServerManager:
+    """
+    A class to manage multiple OpenAI compatible LLM servers. This class provides
+    - Load balance: least requests load balancing
+    - Sticky session: send multi-turn chat completions to same server for automatic prefix caching
+    """
+
+    def __init__(self, config: DictConfig, server_handles: list[ray.actor.ActorHandle], max_cache_size: int = 10000):
+        """Initialize the AsyncLLMServerManager.
+
+        Args:
+            config (DictConfig): YAML config.
+            server_handles (List[ray.actor.ActorHandle]): OpenAI compatible LLM server actor handles.
+            max_cache_size (int, optional): max cache size for request_id to server mapping. Defaults to 10000.
+        """
+        self.config = config
+        self.server_handles = server_handles
+        random.shuffle(self.server_handles)
+
+        # Least requests load balancing
+        self.weighted_serveres = [[0, (hash(server), server)] for server in server_handles]
+        heapq.heapify(self.weighted_serveres)
+
+        # LRU cache to map request_id to server
+        self.request_id_to_server = LRUCache(maxsize=max_cache_size)
+
+    def _choose_server(self, request_id: str) -> ray.actor.ActorHandle:
+        # TODO: implement server pressure awareness load balancing
+        if request_id in self.request_id_to_server:
+            return self.request_id_to_server[request_id]
+
+        server = self.weighted_serveres[0][1][1]
+        self.weighted_serveres[0][0] += 1
+        heapq.heapreplace(self.weighted_serveres, self.weighted_serveres[0])
+        self.request_id_to_server[request_id] = server
+        return server
+
+    @rollout_trace_op
+    async def generate(
+        self,
+        request_id,
+        *,
+        prompt_ids: list[int],
+        sampling_params: dict[str, Any],
+    ) -> list[int]:
+        """Generate tokens from prompt ids.
+
+        Args:
+            request_id (str): request id for sticky session.
+            prompt_ids (List[int]): List of prompt token ids.
+            sampling_params (Dict[str, Any]): Sampling parameters for the chat completion.
+
+        Returns:
+            List[int]: List of generated token ids.
+        """
+        server = self._choose_server(request_id)
+        output = await server.generate.remote(
+            request_id=request_id,
+            prompt_ids=prompt_ids,
+            sampling_params=sampling_params,
+        )
+        return output
+
+    async def generate_for_partial(self, request_id, prompt_ids, sampling_params):
+        """Generate tokens from prompt ids. with partial rollout function"""
+        server = self._choose_server(request_id)
+        output = await server.generate_for_partial.remote(
+            request_id=request_id,
+            prompt_ids=prompt_ids,
+            sampling_params=sampling_params,
+        )
+        return output
+
+
+class AgentLoopMetrics(BaseModel):
+    """Agent loop performance metrics."""
+
+    generate_sequences: float = 0.0
+    tool_calls: float = 0.0
+
+
+class AgentLoopOutput(BaseModel):
+    """Agent loop output."""
+
+    prompt_ids: list[int]
+    """Prompt token ids."""
+    response_ids: list[int]
+    """Response token ids including LLM generated token, tool response token."""
+    response_mask: list[int]
+    """Response mask, 1 for LLM generated token, 0 for tool response token."""
+    num_turns: int = 0
+    """Number of chat turns, including user, assistant, tool."""
+    metrics: AgentLoopMetrics
+    """Auxiliary performance metrics"""
+    is_cancel: bool = False
+    """Indicates whether the request was interrupted"""
+    log_probs: list[float] = None
+    """Response token log probs including LLM generated token, tool response token."""
+
+
+# make hydra.utils.instantiate happy
+class _DummyConfig:
+    def __init__(self, config: DictConfig) -> None:
+        self.config = config
+
+
+class AgentLoopBase(ABC):
+    """An agent loop takes a input message, chat with OpenAI compatible LLM server and interact with various
+    environments."""
+
+    _class_initialized = False
+
+    def __init__(
+        self, trainer_config: _DummyConfig, server_manager: AsyncLLMServerManager, tokenizer: AutoTokenizer, **kwargs
+    ):
+        """Initialize agent loop, each sample will have its own loop instance.
+
+        Args:
+            trainer_config (_DummyConfig): trainer config.
+            server_manager (AsyncLLMServerManager): OpenAI compatible LLM server manager.
+            tokenizer (AutoTokenizer): Tokenizer for tokenize messages.
+        """
+        self.init_class(trainer_config.config, tokenizer, **kwargs)
+        self.config = trainer_config.config
+        self.server_manager = server_manager
+        self.tokenizer = tokenizer
+        self.loop = asyncio.get_running_loop()
+
+    @classmethod
+    def init_class(cls, config: DictConfig, tokenizer: AutoTokenizer, **kwargs):
+        """This is used to do heavy initialization work that should shared across all instances. It's only called once.
+
+        Args:
+            config (DictConfig): trainer config.
+            tokenizer (AutoTokenizer): Tokenizer for tokenize messages.
+            **kwargs: extra kwargs from config file passed in by `hydra.utils.instantiate`.
+        """
+        if cls._class_initialized:
+            return
+        cls._class_initialized = True
+
+    @abstractmethod
+    async def run(
+        self, messages: list[dict[str, Any]], sampling_params: dict[str, Any], partial_output: Optional[AgentLoopOutput]
+    ) -> AgentLoopOutput:
+        """Run agent loop to interact with LLM server and environment.
+
+        Args:
+            messages (List[Dict[str, Any]]): Input messages.
+            sampling_params (Dict[str, Any]): LLM sampling params.
+            partial_output: Optional[AgentLoopOutput]: already rollout result.
+
+        Returns:
+            AgentLoopOutput: Agent loop output.
+        """
+        raise NotImplementedError
+
+
+def postprocess_agent_loop_outputs(inputs: list[AgentLoopOutput], tokenizer, config) -> DataProto:
+    """Static method to postprocess a list of AgentLoopOutput into DataProto
+
+    Args:
+        inputs: List of AgentLoopOutput
+        tokenizer: Tokenizer instance
+        config: Configuration object
+
+    Returns:
+        DataProto: Processed batch data
+    """
+    # NOTE: consistent with batch version of generate_sequences in vllm_rollout_spmd.py
+    # prompts: left pad
+    # responses: right pad
+    # input_ids: prompt + response
+    # attention_mask: [0,0,0,0,1,1,1,1, | 1,1,1,0,0,0,0,0]
+    # position_ids:   [0,0,0,0,0,1,2,3, | 4,5,6,7,8,9,10,11]
+
+    # prompts
+    tokenizer.padding_side = "left"
+    outputs = tokenizer.pad(
+        [{"input_ids": input.prompt_ids} for input in inputs],
+        padding="max_length",
+        max_length=config.actor_rollout_ref.rollout.prompt_length,
+        return_tensors="pt",
+        return_attention_mask=True,
+    )
+    prompt_ids, prompt_attention_mask = outputs["input_ids"], outputs["attention_mask"]
+
+    # responses
+    tokenizer.padding_side = "right"
+    outputs = tokenizer.pad(
+        [{"input_ids": input.response_ids} for input in inputs],
+        padding="max_length",
+        max_length=config.actor_rollout_ref.rollout.response_length,
+        return_tensors="pt",
+        return_attention_mask=True,
+    )
+    response_ids, response_attention_mask = outputs["input_ids"], outputs["attention_mask"]
+
+    # response_mask
+    outputs = tokenizer.pad(
+        [{"input_ids": input.response_mask} for input in inputs],
+        padding="max_length",
+        max_length=config.actor_rollout_ref.rollout.response_length,
+        return_tensors="pt",
+        return_attention_mask=False,
+    )
+    response_mask = outputs["input_ids"]
+    assert response_ids.shape == response_mask.shape, (
+        f"mismatch in response_ids and response_mask shape: {response_ids.shape} vs {response_mask.shape}"
+    )
+    response_mask = response_mask * response_attention_mask
+
+    input_ids = torch.cat([prompt_ids, response_ids], dim=1)
+    attention_mask = torch.cat([prompt_attention_mask, response_attention_mask], dim=1)
+    position_ids = (attention_mask.cumsum(dim=1) - 1) * attention_mask
+
+    batch = TensorDict(
+        {
+            "prompts": prompt_ids,  # [bsz, prompt_length]
+            "responses": response_ids,  # [bsz, response_length]
+            "response_mask": response_mask,  # [bsz, response_length]
+            "input_ids": input_ids,  # [bsz, prompt_length + response_length]
+            "attention_mask": attention_mask,  # [bsz, prompt_length + response_length]
+            "position_ids": position_ids,  # [bsz, prompt_length + response_length]
+        },
+        batch_size=len(input_ids),
+    )
+
+    num_turns = np.array([input.num_turns for input in inputs], dtype=np.int32)
+    metrics = [input.metrics.model_dump() for input in inputs]
+    return DataProto(batch=batch, non_tensor_batch={"__num_turns__": num_turns}, meta_info={"metrics": metrics})
+
+
+@ray.remote
+class AgentLoopWorker:
+    """Agent loop worker takes a batch of messages and run each message in an agent loop."""
+
+    def __init__(self, config: DictConfig, server_handles: list[ray.actor.ActorHandle]):
+        """Initialize agent loop manager.
+
+        Args:
+            config (DictConfig): YAML config.
+            server_handles (List[ray.actor.ActorHandle]): OpenAI compatible LLM server actor handles.
+        """
+        self.config = config
+        self.server_manager = AsyncLLMServerManager(config, server_handles)
+
+        model_path = config.actor_rollout_ref.model.path
+        self.model_name = "/".join(model_path.split("/")[-2:])
+        local_path = copy_to_local(config.actor_rollout_ref.model.path)
+        self.tokenizer = hf_tokenizer(local_path, trust_remote_code=True)
+
+        agent_loop_config_path = config.actor_rollout_ref.rollout.agent.agent_loop_config_path
+        if agent_loop_config_path:
+            agent_loop_configs = OmegaConf.load(agent_loop_config_path)
+            for agent_loop_config in agent_loop_configs:
+                _agent_loop_registry[agent_loop_config.name] = agent_loop_config
+
+        trace_config = config.trainer.get("rollout_trace", {})
+        trace_config = self.config.actor_rollout_ref.rollout.get("trace", {})
+        RolloutTraceConfig.init(
+            self.config.trainer.project_name,
+            self.config.trainer.experiment_name,
+            trace_config.get("backend"),
+            trace_config.get("token2text", False),
+        )
+
+    async def generate_sequences(self, batch: DataProto) -> DataProto:
+        """Generate sequences from agent loop.
+
+        Args:
+            batch (DataProto): Input batch.
+
+        Returns:
+            DataProto: Output batch.
+            - prompts: [bsz, prompt_length], prompt token ids from dataset.
+            - responses: [bsz, response_length], output token ids include response tokens
+              from LLM generation and observation tokens from tool_calls.
+            - response_mask: [bsz, response_length], 1 for LLM generated tokens, 0 for observation/padding tokens.
+            - input_ids: [bsz, prompt_length + response_length], whole sequence token ids, including prompt tokens
+              and response tokens.
+            - attention_mask: [bsz, prompt_length + response_length], 0 for padding tokens, 1 for other tokens.
+            - position_ids: [bsz, prompt_length + response_length], incremental position ids.
+
+            For multi-turn conversations:
+            responses:     |<- LLM generation ->|<- tool_calls ->|<- LLM generation ->|<- padding ->|
+            response_mask: | 1, 1, 1, ..., 1, 1 | 0, 0, .., 0, 0 | 1, 1, 1, ..., 1, 1 | 0, 0, ..., 0|
+        """
+        config = self.config.actor_rollout_ref.rollout
+        sampling_params = dict(
+            temperature=config.temperature,
+            top_p=config.top_p,
+            repetition_penalty=1.0,
+        )
+
+        # override sampling params for validation
+        if batch.meta_info.get("validate", False):
+            sampling_params["top_p"] = config.val_kwargs.top_p
+            sampling_params["temperature"] = config.val_kwargs.temperature
+
+        # by default, we assume it's a single turn agent
+        if "agent_name" not in batch.non_tensor_batch:
+            batch.non_tensor_batch["agent_name"] = np.array(["single_turn_agent"] * len(batch), dtype=object)
+
+        tasks = []
+        agent_names = batch.non_tensor_batch["agent_name"]
+        raw_prompts = batch.non_tensor_batch["raw_prompt"]
+        if "index" in batch.non_tensor_batch:
+            index = batch.non_tensor_batch["index"]
+        else:
+            index = np.arange(len(raw_prompts))
+
+        trajectory_info = await get_trajectory_info(
+            batch.meta_info.get("global_steps", -1), index, batch.meta_info.get("validate", False)
+        )
+
+        for agent_name, messages, trajectory in zip(agent_names, raw_prompts, trajectory_info, strict=True):
+            tasks.append(
+                asyncio.create_task(self._run_agent_loop(agent_name, messages.tolist(), sampling_params, trajectory))
+            )
+        outputs = await asyncio.gather(*tasks)
+
+        output = postprocess_agent_loop_outputs(outputs, self.tokenizer, self.config)
+        return output
+
+    async def generate_sequences_no_post(
+        self, batch: DataProto, partial_output_list: Optional[list[AgentLoopOutput]]
+    ) -> list[AgentLoopOutput]:
+        """Generate sequences from agent loop.
+
+        Args:
+            batch (DataProto): Input batch.
+            partial_output_list: Optional[List[AgentLoopOutput]]: already rollout result.
+
+        Returns:
+            list[AgentLoopOutput]: List of agent loop outputs, one per sample in the batch.
+            Each AgentLoopOutput contains:
+            - prompt_ids: prompt token ids
+            - response_ids: response token ids including LLM generated and tool response tokens
+            - response_mask: 1 for LLM generated tokens, 0 for tool response tokens
+            - num_turns: number of chat turns
+            - metrics: performance metrics
+        """
+        config = self.config.actor_rollout_ref.rollout
+        sampling_params = dict(
+            temperature=config.temperature,
+            top_p=config.top_p,
+            repetition_penalty=1.0,
+        )
+
+        # override sampling params for validation
+        if batch.meta_info.get("validate", False):
+            sampling_params["top_p"] = config.val_kwargs.top_p
+            sampling_params["temperature"] = config.val_kwargs.temperature
+
+        # by default, we assume it's a single turn agent
+        if "agent_name" not in batch.non_tensor_batch:
+            batch.non_tensor_batch["agent_name"] = np.array(["single_turn_agent"] * len(batch), dtype=object)
+
+        tasks = []
+        agent_names = batch.non_tensor_batch["agent_name"]
+        raw_prompts = batch.non_tensor_batch["raw_prompt"]
+        if "index" in batch.non_tensor_batch:
+            index = batch.non_tensor_batch["index"]
+        else:
+            index = np.arange(len(raw_prompts))
+
+        trajectory_info = await get_trajectory_info(
+            batch.meta_info.get("global_steps", -1), index, batch.meta_info.get("validate", False)
+        )
+        if not partial_output_list:
+            partial_output_list = [None] * len(batch)
+
+        for agent_name, messages, trajectory, partial_output in zip(
+            agent_names, raw_prompts, trajectory_info, partial_output_list, strict=True
+        ):
+            tasks.append(
+                asyncio.create_task(
+                    self._run_agent_loop(agent_name, messages.tolist(), sampling_params, trajectory, partial_output)
+                )
+            )
+        outputs = await asyncio.gather(*tasks)
+
+        return outputs
+
+    async def _run_agent_loop(
+        self,
+        agent_name: str,
+        messages: list[dict[str, Any]],
+        sampling_params: dict[str, Any],
+        trajectory: dict[str, Any],
+        partial_output: Optional[AgentLoopOutput] = None,
+    ) -> AgentLoopOutput:
+        with rollout_trace_attr(
+            step=trajectory["step"],
+            sample_index=trajectory["sample_index"],
+            rollout_n=trajectory["rollout_n"],
+            validate=trajectory["validate"],
+            name="agent_loop",
+        ):
+            assert agent_name in _agent_loop_registry, (
+                f"Agent loop {agent_name} not registered, registered agent loops: {_agent_loop_registry.keys()}"
+            )
+            agent_loop_config = _agent_loop_registry[agent_name]
+            agent_loop = hydra.utils.instantiate(
+                config=agent_loop_config,
+                trainer_config=_DummyConfig(config=self.config),
+                server_manager=self.server_manager,
+                tokenizer=self.tokenizer,
+            )
+            output = await agent_loop.run(messages, sampling_params, partial_output)
+            return output
+
+
+class AgentLoopManager:
+    """Agent loop manager that manages a group of agent loop workers."""
+
+    def __init__(self, config: DictConfig, worker_group: RayWorkerGroup):
+        """Initialize agent loop manager.
+
+        Args:
+            config (DictConfig): trainer config.
+            worker_group (RayWorkerGroup): AsyncActorRolloutRefWorker worker group.
+        """
+        self.config = config
+        self.worker_group = worker_group
+
+        self._initialize_llm_servers()
+        self._init_agent_loop_workers()
+
+        # Initially we're in sleep mode.
+        self.sleep()
+
+    def _initialize_llm_servers(self):
+        self.rollout_tp_size = self.config.actor_rollout_ref.rollout.tensor_model_parallel_size
+        self.rollout_dp_size = self.worker_group.world_size // self.rollout_tp_size
+
+        register_center = ray.get_actor(f"{self.worker_group.name_prefix}_register_center")
+        workers_info = ray.get(register_center.get_worker_info.remote())
+        assert len(workers_info) == self.worker_group.world_size
+
+        self.async_llm_servers = [None] * self.rollout_dp_size
+        self.server_addresses = [None] * self.rollout_dp_size
+
+        if self.config.actor_rollout_ref.rollout.agent.custom_async_server:
+            server_class = async_server_class(
+                rollout_backend=self.config.actor_rollout_ref.rollout.name,
+                rollout_backend_module=self.config.actor_rollout_ref.rollout.agent.custom_async_server.path,
+                rollout_backend_class=self.config.actor_rollout_ref.rollout.agent.custom_async_server.name,
+            )
+        else:
+            server_class = async_server_class(rollout_backend=self.config.actor_rollout_ref.rollout.name)
+
+        # Start all server instances, restart if address already in use.
+        unready_dp_ranks = set(range(self.rollout_dp_size))
+        while len(unready_dp_ranks) > 0:
+            servers = {
+                rollout_dp_rank: server_class.options(
+                    # make sure AsyncvLLMServer colocates with its corresponding workers
+                    scheduling_strategy=ray.util.scheduling_strategies.NodeAffinitySchedulingStrategy(
+                        node_id=workers_info[rollout_dp_rank * self.rollout_tp_size],
+                        soft=False,
+                    ),
+                    name=f"async_llm_server_{rollout_dp_rank}",
+                ).remote(self.config, self.rollout_dp_size, rollout_dp_rank, self.worker_group.name_prefix)
+                for rollout_dp_rank in unready_dp_ranks
+            }
+
+            for rollout_dp_rank, server in servers.items():
+                try:
+                    address = ray.get(server.get_server_address.remote())
+                    self.server_addresses[rollout_dp_rank] = address
+                    self.async_llm_servers[rollout_dp_rank] = server
+                    unready_dp_ranks.remove(rollout_dp_rank)
+                except Exception:
+                    ray.kill(server)
+                    print(f"rollout server {rollout_dp_rank} failed, maybe address already in use, restarting...")
+
+        # All server instances are ready, init AsyncLLM engine.
+        ray.get([server.init_engine.remote() for server in self.async_llm_servers])
+
+    def _init_agent_loop_workers(self):
+        self.agent_loop_workers = []
+        # 获取建议的资源配置
+        agent_config = self.config.actor_rollout_ref.rollout.agent
+        max_concurrency = agent_config.get("max_concurrency", 10)
+        num_cpus = agent_config.get("num_cpus", 2)  # 默认2个CPU核心
+
+        for i in range(agent_config.num_workers):
+            self.agent_loop_workers.append(
+                AgentLoopWorker.options(
+                    name=f"agent_loop_worker_{i}",
+                    max_concurrency=max_concurrency,  # 设置最大并发数
+                    num_cpus=num_cpus,  # 设置CPU资源需求
+                ).remote(self.config, self.async_llm_servers)
+            )
+
+    def generate_sequences(self, prompts: DataProto) -> DataProto:
+        """Split input batch and dispatch to agent loop workers.
+
+        Args:
+            prompts (DataProto): Input batch.
+
+        Returns:
+            DataProto: Output batch.
+        """
+        if self.config.actor_rollout_ref.rollout.free_cache_engine:
+            self.wake_up()
+        chunkes = prompts.chunk(len(self.agent_loop_workers))
+        outputs = ray.get(
+            [
+                worker.generate_sequences.remote(chunk)
+                for worker, chunk in zip(self.agent_loop_workers, chunkes, strict=True)
+            ]
+        )
+        output = DataProto.concat(outputs)
+        if self.config.actor_rollout_ref.rollout.free_cache_engine:
+            self.sleep()
+
+        # calculate performance metrics
+        metrics = [output.meta_info["metrics"] for output in outputs]  # List[List[Dict[str, str]]]
+        timing = self._performance_metrics(metrics, output)
+
+        output.meta_info = {"timing": timing}
+        return output
+
+    async def generate_single_sample_async(
+        self,
+        sample: DataProto,
+        partial_output_list: Optional[list[AgentLoopOutput]],
+    ) -> list[AgentLoopOutput]:
+        """
+        异步处理单个样本, 需要复制n次
+
+        Args:
+            sample: 单个样本数据
+            partial_output_list: Optional[List[AgentLoopOutput]]: already rollout result.
+
+        Returns:
+            tuple[AgentLoopOutput, float]: 处理结果和处理时间
+        """
+        # 使用负载均衡选择 worker
+        worker = self._select_best_worker()
+        # 异步处理单个样本 - 使用无后处理版本获取原始AgentLoopOutput
+        output_future = worker.generate_sequences_no_post.remote(sample, partial_output_list)
+        return await asyncio.wrap_future(output_future.future())
+
+    def _select_best_worker(self):
+        """选择最佳的 worker（简单的轮询负载均衡）"""
+        if not hasattr(self, "_worker_index"):
+            self._worker_index = 0
+
+        worker = self.agent_loop_workers[self._worker_index]
+        self._worker_index = (self._worker_index + 1) % len(self.agent_loop_workers)
+        return worker
+
+    def _performance_metrics(self, metrics: list[list[dict[str, str]]], output: DataProto) -> dict[str, float]:
+        timing = {}
+        t_generate_sequences = np.array([metric["generate_sequences"] for chunk in metrics for metric in chunk])
+        t_tool_calls = np.array([metric["tool_calls"] for chunk in metrics for metric in chunk])
+        timing["agent_loop/generate_sequences/min"] = t_generate_sequences.min()
+        timing["agent_loop/generate_sequences/max"] = t_generate_sequences.max()
+        timing["agent_loop/generate_sequences/mean"] = t_generate_sequences.mean()
+        timing["agent_loop/tool_calls/min"] = t_tool_calls.min()
+        timing["agent_loop/tool_calls/max"] = t_tool_calls.max()
+        timing["agent_loop/tool_calls/mean"] = t_tool_calls.mean()
+
+        # batch sequence generation is bounded by the slowest sample
+        slowest = np.argmax(t_generate_sequences + t_tool_calls)
+        attention_mask = output.batch["attention_mask"][slowest]
+        prompt_length = output.batch["prompts"].shape[1]
+        timing["agent_loop/slowest/generate_sequences"] = t_generate_sequences[slowest]
+        timing["agent_loop/slowest/tool_calls"] = t_tool_calls[slowest]
+        timing["agent_loop/slowest/prompt_length"] = attention_mask[:prompt_length].sum().item()
+        timing["agent_loop/slowest/response_length"] = attention_mask[prompt_length:].sum().item()
+
+        return timing
+
+    def wake_up(self):
+        """Wake up all rollout server instances."""
+        ray.get([server.wake_up.remote() for server in self.async_llm_servers])
+
+    def sleep(self):
+        """Sleep all rollout server instances."""
+        ray.get([server.sleep.remote() for server in self.async_llm_servers])
+
+    async def cancel_async(self):
+        """Cancel all rollout tasks asynchronously."""
+        futures = [server.cancel.remote() for server in self.async_llm_servers]
+        await asyncio.gather(*[asyncio.wrap_future(future.future()) for future in futures], return_exceptions=True)
+
+    async def resume_async(self):
+        """Cancel all rollout tasks asynchronously."""
+        futures = [server.resume.remote() for server in self.async_llm_servers]
+        await asyncio.gather(*[asyncio.wrap_future(future.future()) for future in futures], return_exceptions=True)
diff --git a/recipe/fully_async_policy/agent_loop/partial_single_turn_agent_loop.py b/recipe/fully_async_policy/agent_loop/partial_single_turn_agent_loop.py
new file mode 100644
index 00000000000..ccdb9084238
--- /dev/null
+++ b/recipe/fully_async_policy/agent_loop/partial_single_turn_agent_loop.py
@@ -0,0 +1,74 @@
+# Copyright 2025 Meituan Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+import os
+from typing import Any, Optional
+from uuid import uuid4
+
+from verl.experimental.agent_loop.agent_loop import AgentLoopBase, AgentLoopOutput, register
+from verl.utils.profiler import simple_timer
+
+logger = logging.getLogger(__file__)
+logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN"))
+
+
+@register("partial_single_turn_agent")
+class PartialSingleTurnAgentLoop(AgentLoopBase):
+    """Naive agent loop that only do single turn chat completion."""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.prompt_length = self.config.actor_rollout_ref.rollout.prompt_length
+        self.response_length = self.config.actor_rollout_ref.rollout.response_length
+
+    async def run(
+        self, messages: list[dict[str, Any]], sampling_params: dict[str, Any], output: Optional[AgentLoopOutput]
+    ) -> AgentLoopOutput:
+        if not output:
+            prompt_ids = await self.loop.run_in_executor(
+                None, lambda: self.tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=True)
+            )
+        else:
+            if output.is_cancel:
+                # 恢复暂停的样本，结果直接添加到 prompt_ids 后面
+                prompt_ids = output.prompt_ids + output.response_ids
+            else:
+                # 同一批样本，部分cancel，部分没有cancel， 没有cancel的样本直接返回
+                return output
+
+        metrics = {}
+        request_id = uuid4().hex
+        with simple_timer("generate_sequences", metrics):
+            response_ids, log_probs, is_cancel = await self.server_manager.generate_for_partial(
+                request_id=request_id, prompt_ids=prompt_ids, sampling_params=sampling_params
+            )
+
+        if not output:
+            response_mask = [1] * len(response_ids)
+        # 暂停待恢复样本, 把输出结果加到 response_ids 后，并重置 response_mask
+        else:
+            prompt_ids = output.prompt_ids
+            log_probs = output.log_probs + log_probs
+            response_ids = output.response_ids + response_ids
+            response_mask = [1] * len(response_ids)
+
+        return AgentLoopOutput(
+            prompt_ids=prompt_ids,
+            response_ids=response_ids[: self.response_length],
+            response_mask=response_mask[: self.response_length],
+            num_turns=2,
+            metrics=metrics,
+            is_cancel=is_cancel,
+            log_probs=log_probs,
+        )
diff --git a/recipe/fully_async_policy/dapo_7b_math_fsdp2_2_6.sh b/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_2_6.sh
similarity index 99%
rename from recipe/fully_async_policy/dapo_7b_math_fsdp2_2_6.sh
rename to recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_2_6.sh
index 5f654227d15..0d303bdde87 100644
--- a/recipe/fully_async_policy/dapo_7b_math_fsdp2_2_6.sh
+++ b/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_2_6.sh
@@ -155,7 +155,7 @@ $PYTHON_INTERPRETER -m recipe.fully_async_policy.fully_async_main \
     trainer.logger=['console','tensorboard'] \
     trainer.project_name="${project_name}" \
     trainer.experiment_name="${exp_name}" \
-    trainer.val_before_train=True \
+    trainer.val_before_train=False \
     trainer.save_freq=-1 \
     trainer.default_local_dir="${CKPTS_DIR}" \
     trainer.resume_mode=auto \
diff --git a/recipe/fully_async_policy/dapo_7b_math_fsdp2_4_12.sh b/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_4_12.sh
similarity index 100%
rename from recipe/fully_async_policy/dapo_7b_math_fsdp2_4_12.sh
rename to recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_4_12.sh
diff --git a/recipe/fully_async_policy/dapo_7b_math_fsdp2_8_8.sh b/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_8_8.sh
similarity index 100%
rename from recipe/fully_async_policy/dapo_7b_math_fsdp2_8_8.sh
rename to recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_8_8.sh
diff --git a/recipe/fully_async_policy/dapo_7b_math_fsdp2_colocate.sh b/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_colocate.sh
similarity index 100%
rename from recipe/fully_async_policy/dapo_7b_math_fsdp2_colocate.sh
rename to recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_colocate.sh
diff --git a/recipe/fully_async_policy/dapo_7b_math_fsdp2_server.sh b/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_server.sh
similarity index 100%
rename from recipe/fully_async_policy/dapo_7b_math_fsdp2_server.sh
rename to recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_server.sh
diff --git a/recipe/fully_async_policy/dapo_7b_math_megatron_colocate.sh b/recipe/fully_async_policy/shell/dapo_7b_math_megatron_colocate.sh
similarity index 100%
rename from recipe/fully_async_policy/dapo_7b_math_megatron_colocate.sh
rename to recipe/fully_async_policy/shell/dapo_7b_math_megatron_colocate.sh
diff --git a/recipe/fully_async_policy/runtime_env.yaml b/recipe/fully_async_policy/shell/runtime_env.yaml
similarity index 100%
rename from recipe/fully_async_policy/runtime_env.yaml
rename to recipe/fully_async_policy/shell/runtime_env.yaml

From 085f36795661f7d53fc0085a31fd85771e462dc6 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Tue, 9 Sep 2025 21:32:27 +0800
Subject: [PATCH 118/182] rm agent_loop

---
 .../fully_async_policy/agent_loop/__init__.py |  22 -
 .../agent_loop/agent_loop.py                  | 637 ------------------
 .../partial_single_turn_agent_loop.py         |  74 --
 3 files changed, 733 deletions(-)
 delete mode 100644 recipe/fully_async_policy/agent_loop/__init__.py
 delete mode 100644 recipe/fully_async_policy/agent_loop/agent_loop.py
 delete mode 100644 recipe/fully_async_policy/agent_loop/partial_single_turn_agent_loop.py

diff --git a/recipe/fully_async_policy/agent_loop/__init__.py b/recipe/fully_async_policy/agent_loop/__init__.py
deleted file mode 100644
index 284f3e975c0..00000000000
--- a/recipe/fully_async_policy/agent_loop/__init__.py
+++ /dev/null
@@ -1,22 +0,0 @@
-# Copyright 2025 Meituan Ltd. and/or its affiliates
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from .agent_loop import AgentLoopBase, AgentLoopManager
-from .single_turn_agent_loop import SingleTurnAgentLoop
-from .partial_single_turn_agent_loop import PartialSingleTurnAgentLoop
-from .tool_agent_loop import ToolAgentLoop
-
-_ = [SingleTurnAgentLoop, ToolAgentLoop, PartialSingleTurnAgentLoop]
-
-__all__ = ["AgentLoopBase", "AgentLoopManager"]
diff --git a/recipe/fully_async_policy/agent_loop/agent_loop.py b/recipe/fully_async_policy/agent_loop/agent_loop.py
deleted file mode 100644
index 32d52df8804..00000000000
--- a/recipe/fully_async_policy/agent_loop/agent_loop.py
+++ /dev/null
@@ -1,637 +0,0 @@
-# Copyright 2025 Meituan Ltd. and/or its affiliates
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import asyncio
-import heapq
-import logging
-import os
-import random
-from abc import ABC, abstractmethod
-from typing import Any, Optional
-
-import hydra
-import numpy as np
-import ray
-import torch
-from cachetools import LRUCache
-from omegaconf import DictConfig, OmegaConf
-from pydantic import BaseModel
-from tensordict import TensorDict
-from transformers import AutoTokenizer
-
-from verl.protocol import DataProto
-from verl.single_controller.ray.base import RayWorkerGroup
-from verl.utils import hf_tokenizer
-from verl.utils.fs import copy_to_local
-from verl.utils.rollout_trace import RolloutTraceConfig, rollout_trace_attr, rollout_trace_op
-from verl.workers.rollout.async_server import async_server_class
-
-logger = logging.getLogger(__file__)
-logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN"))
-
-
-class AsyncLLMServerManager:
-    """
-    A class to manage multiple OpenAI compatible LLM servers. This class provides
-    - Load balance: least requests load balancing
-    - Sticky session: send multi-turn chat completions to same server for automatic prefix caching
-    """
-
-    def __init__(self, config: DictConfig, server_handles: list[ray.actor.ActorHandle], max_cache_size: int = 10000):
-        """Initialize the AsyncLLMServerManager.
-
-        Args:
-            config (DictConfig): YAML config.
-            server_handles (List[ray.actor.ActorHandle]): OpenAI compatible LLM server actor handles.
-            max_cache_size (int, optional): max cache size for request_id to server mapping. Defaults to 10000.
-        """
-        self.config = config
-        self.server_handles = server_handles
-        random.shuffle(self.server_handles)
-
-        # Least requests load balancing
-        self.weighted_serveres = [[0, (hash(server), server)] for server in server_handles]
-        heapq.heapify(self.weighted_serveres)
-
-        # LRU cache to map request_id to server
-        self.request_id_to_server = LRUCache(maxsize=max_cache_size)
-
-    def _choose_server(self, request_id: str) -> ray.actor.ActorHandle:
-        # TODO: implement server pressure awareness load balancing
-        if request_id in self.request_id_to_server:
-            return self.request_id_to_server[request_id]
-
-        server = self.weighted_serveres[0][1][1]
-        self.weighted_serveres[0][0] += 1
-        heapq.heapreplace(self.weighted_serveres, self.weighted_serveres[0])
-        self.request_id_to_server[request_id] = server
-        return server
-
-    @rollout_trace_op
-    async def generate(
-        self,
-        request_id,
-        *,
-        prompt_ids: list[int],
-        sampling_params: dict[str, Any],
-    ) -> list[int]:
-        """Generate tokens from prompt ids.
-
-        Args:
-            request_id (str): request id for sticky session.
-            prompt_ids (List[int]): List of prompt token ids.
-            sampling_params (Dict[str, Any]): Sampling parameters for the chat completion.
-
-        Returns:
-            List[int]: List of generated token ids.
-        """
-        server = self._choose_server(request_id)
-        output = await server.generate.remote(
-            request_id=request_id,
-            prompt_ids=prompt_ids,
-            sampling_params=sampling_params,
-        )
-        return output
-
-    async def generate_for_partial(self, request_id, prompt_ids, sampling_params):
-        """Generate tokens from prompt ids. with partial rollout function"""
-        server = self._choose_server(request_id)
-        output = await server.generate_for_partial.remote(
-            request_id=request_id,
-            prompt_ids=prompt_ids,
-            sampling_params=sampling_params,
-        )
-        return output
-
-
-class AgentLoopMetrics(BaseModel):
-    """Agent loop performance metrics."""
-
-    generate_sequences: float = 0.0
-    tool_calls: float = 0.0
-
-
-class AgentLoopOutput(BaseModel):
-    """Agent loop output."""
-
-    prompt_ids: list[int]
-    """Prompt token ids."""
-    response_ids: list[int]
-    """Response token ids including LLM generated token, tool response token."""
-    response_mask: list[int]
-    """Response mask, 1 for LLM generated token, 0 for tool response token."""
-    num_turns: int = 0
-    """Number of chat turns, including user, assistant, tool."""
-    metrics: AgentLoopMetrics
-    """Auxiliary performance metrics"""
-    is_cancel: bool = False
-    """Indicates whether the request was interrupted"""
-    log_probs: list[float] = None
-    """Response token log probs including LLM generated token, tool response token."""
-
-
-# make hydra.utils.instantiate happy
-class _DummyConfig:
-    def __init__(self, config: DictConfig) -> None:
-        self.config = config
-
-
-class AgentLoopBase(ABC):
-    """An agent loop takes a input message, chat with OpenAI compatible LLM server and interact with various
-    environments."""
-
-    _class_initialized = False
-
-    def __init__(
-        self, trainer_config: _DummyConfig, server_manager: AsyncLLMServerManager, tokenizer: AutoTokenizer, **kwargs
-    ):
-        """Initialize agent loop, each sample will have its own loop instance.
-
-        Args:
-            trainer_config (_DummyConfig): trainer config.
-            server_manager (AsyncLLMServerManager): OpenAI compatible LLM server manager.
-            tokenizer (AutoTokenizer): Tokenizer for tokenize messages.
-        """
-        self.init_class(trainer_config.config, tokenizer, **kwargs)
-        self.config = trainer_config.config
-        self.server_manager = server_manager
-        self.tokenizer = tokenizer
-        self.loop = asyncio.get_running_loop()
-
-    @classmethod
-    def init_class(cls, config: DictConfig, tokenizer: AutoTokenizer, **kwargs):
-        """This is used to do heavy initialization work that should shared across all instances. It's only called once.
-
-        Args:
-            config (DictConfig): trainer config.
-            tokenizer (AutoTokenizer): Tokenizer for tokenize messages.
-            **kwargs: extra kwargs from config file passed in by `hydra.utils.instantiate`.
-        """
-        if cls._class_initialized:
-            return
-        cls._class_initialized = True
-
-    @abstractmethod
-    async def run(
-        self, messages: list[dict[str, Any]], sampling_params: dict[str, Any], partial_output: Optional[AgentLoopOutput]
-    ) -> AgentLoopOutput:
-        """Run agent loop to interact with LLM server and environment.
-
-        Args:
-            messages (List[Dict[str, Any]]): Input messages.
-            sampling_params (Dict[str, Any]): LLM sampling params.
-            partial_output: Optional[AgentLoopOutput]: already rollout result.
-
-        Returns:
-            AgentLoopOutput: Agent loop output.
-        """
-        raise NotImplementedError
-
-
-def postprocess_agent_loop_outputs(inputs: list[AgentLoopOutput], tokenizer, config) -> DataProto:
-    """Static method to postprocess a list of AgentLoopOutput into DataProto
-
-    Args:
-        inputs: List of AgentLoopOutput
-        tokenizer: Tokenizer instance
-        config: Configuration object
-
-    Returns:
-        DataProto: Processed batch data
-    """
-    # NOTE: consistent with batch version of generate_sequences in vllm_rollout_spmd.py
-    # prompts: left pad
-    # responses: right pad
-    # input_ids: prompt + response
-    # attention_mask: [0,0,0,0,1,1,1,1, | 1,1,1,0,0,0,0,0]
-    # position_ids:   [0,0,0,0,0,1,2,3, | 4,5,6,7,8,9,10,11]
-
-    # prompts
-    tokenizer.padding_side = "left"
-    outputs = tokenizer.pad(
-        [{"input_ids": input.prompt_ids} for input in inputs],
-        padding="max_length",
-        max_length=config.actor_rollout_ref.rollout.prompt_length,
-        return_tensors="pt",
-        return_attention_mask=True,
-    )
-    prompt_ids, prompt_attention_mask = outputs["input_ids"], outputs["attention_mask"]
-
-    # responses
-    tokenizer.padding_side = "right"
-    outputs = tokenizer.pad(
-        [{"input_ids": input.response_ids} for input in inputs],
-        padding="max_length",
-        max_length=config.actor_rollout_ref.rollout.response_length,
-        return_tensors="pt",
-        return_attention_mask=True,
-    )
-    response_ids, response_attention_mask = outputs["input_ids"], outputs["attention_mask"]
-
-    # response_mask
-    outputs = tokenizer.pad(
-        [{"input_ids": input.response_mask} for input in inputs],
-        padding="max_length",
-        max_length=config.actor_rollout_ref.rollout.response_length,
-        return_tensors="pt",
-        return_attention_mask=False,
-    )
-    response_mask = outputs["input_ids"]
-    assert response_ids.shape == response_mask.shape, (
-        f"mismatch in response_ids and response_mask shape: {response_ids.shape} vs {response_mask.shape}"
-    )
-    response_mask = response_mask * response_attention_mask
-
-    input_ids = torch.cat([prompt_ids, response_ids], dim=1)
-    attention_mask = torch.cat([prompt_attention_mask, response_attention_mask], dim=1)
-    position_ids = (attention_mask.cumsum(dim=1) - 1) * attention_mask
-
-    batch = TensorDict(
-        {
-            "prompts": prompt_ids,  # [bsz, prompt_length]
-            "responses": response_ids,  # [bsz, response_length]
-            "response_mask": response_mask,  # [bsz, response_length]
-            "input_ids": input_ids,  # [bsz, prompt_length + response_length]
-            "attention_mask": attention_mask,  # [bsz, prompt_length + response_length]
-            "position_ids": position_ids,  # [bsz, prompt_length + response_length]
-        },
-        batch_size=len(input_ids),
-    )
-
-    num_turns = np.array([input.num_turns for input in inputs], dtype=np.int32)
-    metrics = [input.metrics.model_dump() for input in inputs]
-    return DataProto(batch=batch, non_tensor_batch={"__num_turns__": num_turns}, meta_info={"metrics": metrics})
-
-
-@ray.remote
-class AgentLoopWorker:
-    """Agent loop worker takes a batch of messages and run each message in an agent loop."""
-
-    def __init__(self, config: DictConfig, server_handles: list[ray.actor.ActorHandle]):
-        """Initialize agent loop manager.
-
-        Args:
-            config (DictConfig): YAML config.
-            server_handles (List[ray.actor.ActorHandle]): OpenAI compatible LLM server actor handles.
-        """
-        self.config = config
-        self.server_manager = AsyncLLMServerManager(config, server_handles)
-
-        model_path = config.actor_rollout_ref.model.path
-        self.model_name = "/".join(model_path.split("/")[-2:])
-        local_path = copy_to_local(config.actor_rollout_ref.model.path)
-        self.tokenizer = hf_tokenizer(local_path, trust_remote_code=True)
-
-        agent_loop_config_path = config.actor_rollout_ref.rollout.agent.agent_loop_config_path
-        if agent_loop_config_path:
-            agent_loop_configs = OmegaConf.load(agent_loop_config_path)
-            for agent_loop_config in agent_loop_configs:
-                _agent_loop_registry[agent_loop_config.name] = agent_loop_config
-
-        trace_config = config.trainer.get("rollout_trace", {})
-        trace_config = self.config.actor_rollout_ref.rollout.get("trace", {})
-        RolloutTraceConfig.init(
-            self.config.trainer.project_name,
-            self.config.trainer.experiment_name,
-            trace_config.get("backend"),
-            trace_config.get("token2text", False),
-        )
-
-    async def generate_sequences(self, batch: DataProto) -> DataProto:
-        """Generate sequences from agent loop.
-
-        Args:
-            batch (DataProto): Input batch.
-
-        Returns:
-            DataProto: Output batch.
-            - prompts: [bsz, prompt_length], prompt token ids from dataset.
-            - responses: [bsz, response_length], output token ids include response tokens
-              from LLM generation and observation tokens from tool_calls.
-            - response_mask: [bsz, response_length], 1 for LLM generated tokens, 0 for observation/padding tokens.
-            - input_ids: [bsz, prompt_length + response_length], whole sequence token ids, including prompt tokens
-              and response tokens.
-            - attention_mask: [bsz, prompt_length + response_length], 0 for padding tokens, 1 for other tokens.
-            - position_ids: [bsz, prompt_length + response_length], incremental position ids.
-
-            For multi-turn conversations:
-            responses:     |<- LLM generation ->|<- tool_calls ->|<- LLM generation ->|<- padding ->|
-            response_mask: | 1, 1, 1, ..., 1, 1 | 0, 0, .., 0, 0 | 1, 1, 1, ..., 1, 1 | 0, 0, ..., 0|
-        """
-        config = self.config.actor_rollout_ref.rollout
-        sampling_params = dict(
-            temperature=config.temperature,
-            top_p=config.top_p,
-            repetition_penalty=1.0,
-        )
-
-        # override sampling params for validation
-        if batch.meta_info.get("validate", False):
-            sampling_params["top_p"] = config.val_kwargs.top_p
-            sampling_params["temperature"] = config.val_kwargs.temperature
-
-        # by default, we assume it's a single turn agent
-        if "agent_name" not in batch.non_tensor_batch:
-            batch.non_tensor_batch["agent_name"] = np.array(["single_turn_agent"] * len(batch), dtype=object)
-
-        tasks = []
-        agent_names = batch.non_tensor_batch["agent_name"]
-        raw_prompts = batch.non_tensor_batch["raw_prompt"]
-        if "index" in batch.non_tensor_batch:
-            index = batch.non_tensor_batch["index"]
-        else:
-            index = np.arange(len(raw_prompts))
-
-        trajectory_info = await get_trajectory_info(
-            batch.meta_info.get("global_steps", -1), index, batch.meta_info.get("validate", False)
-        )
-
-        for agent_name, messages, trajectory in zip(agent_names, raw_prompts, trajectory_info, strict=True):
-            tasks.append(
-                asyncio.create_task(self._run_agent_loop(agent_name, messages.tolist(), sampling_params, trajectory))
-            )
-        outputs = await asyncio.gather(*tasks)
-
-        output = postprocess_agent_loop_outputs(outputs, self.tokenizer, self.config)
-        return output
-
-    async def generate_sequences_no_post(
-        self, batch: DataProto, partial_output_list: Optional[list[AgentLoopOutput]]
-    ) -> list[AgentLoopOutput]:
-        """Generate sequences from agent loop.
-
-        Args:
-            batch (DataProto): Input batch.
-            partial_output_list: Optional[List[AgentLoopOutput]]: already rollout result.
-
-        Returns:
-            list[AgentLoopOutput]: List of agent loop outputs, one per sample in the batch.
-            Each AgentLoopOutput contains:
-            - prompt_ids: prompt token ids
-            - response_ids: response token ids including LLM generated and tool response tokens
-            - response_mask: 1 for LLM generated tokens, 0 for tool response tokens
-            - num_turns: number of chat turns
-            - metrics: performance metrics
-        """
-        config = self.config.actor_rollout_ref.rollout
-        sampling_params = dict(
-            temperature=config.temperature,
-            top_p=config.top_p,
-            repetition_penalty=1.0,
-        )
-
-        # override sampling params for validation
-        if batch.meta_info.get("validate", False):
-            sampling_params["top_p"] = config.val_kwargs.top_p
-            sampling_params["temperature"] = config.val_kwargs.temperature
-
-        # by default, we assume it's a single turn agent
-        if "agent_name" not in batch.non_tensor_batch:
-            batch.non_tensor_batch["agent_name"] = np.array(["single_turn_agent"] * len(batch), dtype=object)
-
-        tasks = []
-        agent_names = batch.non_tensor_batch["agent_name"]
-        raw_prompts = batch.non_tensor_batch["raw_prompt"]
-        if "index" in batch.non_tensor_batch:
-            index = batch.non_tensor_batch["index"]
-        else:
-            index = np.arange(len(raw_prompts))
-
-        trajectory_info = await get_trajectory_info(
-            batch.meta_info.get("global_steps", -1), index, batch.meta_info.get("validate", False)
-        )
-        if not partial_output_list:
-            partial_output_list = [None] * len(batch)
-
-        for agent_name, messages, trajectory, partial_output in zip(
-            agent_names, raw_prompts, trajectory_info, partial_output_list, strict=True
-        ):
-            tasks.append(
-                asyncio.create_task(
-                    self._run_agent_loop(agent_name, messages.tolist(), sampling_params, trajectory, partial_output)
-                )
-            )
-        outputs = await asyncio.gather(*tasks)
-
-        return outputs
-
-    async def _run_agent_loop(
-        self,
-        agent_name: str,
-        messages: list[dict[str, Any]],
-        sampling_params: dict[str, Any],
-        trajectory: dict[str, Any],
-        partial_output: Optional[AgentLoopOutput] = None,
-    ) -> AgentLoopOutput:
-        with rollout_trace_attr(
-            step=trajectory["step"],
-            sample_index=trajectory["sample_index"],
-            rollout_n=trajectory["rollout_n"],
-            validate=trajectory["validate"],
-            name="agent_loop",
-        ):
-            assert agent_name in _agent_loop_registry, (
-                f"Agent loop {agent_name} not registered, registered agent loops: {_agent_loop_registry.keys()}"
-            )
-            agent_loop_config = _agent_loop_registry[agent_name]
-            agent_loop = hydra.utils.instantiate(
-                config=agent_loop_config,
-                trainer_config=_DummyConfig(config=self.config),
-                server_manager=self.server_manager,
-                tokenizer=self.tokenizer,
-            )
-            output = await agent_loop.run(messages, sampling_params, partial_output)
-            return output
-
-
-class AgentLoopManager:
-    """Agent loop manager that manages a group of agent loop workers."""
-
-    def __init__(self, config: DictConfig, worker_group: RayWorkerGroup):
-        """Initialize agent loop manager.
-
-        Args:
-            config (DictConfig): trainer config.
-            worker_group (RayWorkerGroup): AsyncActorRolloutRefWorker worker group.
-        """
-        self.config = config
-        self.worker_group = worker_group
-
-        self._initialize_llm_servers()
-        self._init_agent_loop_workers()
-
-        # Initially we're in sleep mode.
-        self.sleep()
-
-    def _initialize_llm_servers(self):
-        self.rollout_tp_size = self.config.actor_rollout_ref.rollout.tensor_model_parallel_size
-        self.rollout_dp_size = self.worker_group.world_size // self.rollout_tp_size
-
-        register_center = ray.get_actor(f"{self.worker_group.name_prefix}_register_center")
-        workers_info = ray.get(register_center.get_worker_info.remote())
-        assert len(workers_info) == self.worker_group.world_size
-
-        self.async_llm_servers = [None] * self.rollout_dp_size
-        self.server_addresses = [None] * self.rollout_dp_size
-
-        if self.config.actor_rollout_ref.rollout.agent.custom_async_server:
-            server_class = async_server_class(
-                rollout_backend=self.config.actor_rollout_ref.rollout.name,
-                rollout_backend_module=self.config.actor_rollout_ref.rollout.agent.custom_async_server.path,
-                rollout_backend_class=self.config.actor_rollout_ref.rollout.agent.custom_async_server.name,
-            )
-        else:
-            server_class = async_server_class(rollout_backend=self.config.actor_rollout_ref.rollout.name)
-
-        # Start all server instances, restart if address already in use.
-        unready_dp_ranks = set(range(self.rollout_dp_size))
-        while len(unready_dp_ranks) > 0:
-            servers = {
-                rollout_dp_rank: server_class.options(
-                    # make sure AsyncvLLMServer colocates with its corresponding workers
-                    scheduling_strategy=ray.util.scheduling_strategies.NodeAffinitySchedulingStrategy(
-                        node_id=workers_info[rollout_dp_rank * self.rollout_tp_size],
-                        soft=False,
-                    ),
-                    name=f"async_llm_server_{rollout_dp_rank}",
-                ).remote(self.config, self.rollout_dp_size, rollout_dp_rank, self.worker_group.name_prefix)
-                for rollout_dp_rank in unready_dp_ranks
-            }
-
-            for rollout_dp_rank, server in servers.items():
-                try:
-                    address = ray.get(server.get_server_address.remote())
-                    self.server_addresses[rollout_dp_rank] = address
-                    self.async_llm_servers[rollout_dp_rank] = server
-                    unready_dp_ranks.remove(rollout_dp_rank)
-                except Exception:
-                    ray.kill(server)
-                    print(f"rollout server {rollout_dp_rank} failed, maybe address already in use, restarting...")
-
-        # All server instances are ready, init AsyncLLM engine.
-        ray.get([server.init_engine.remote() for server in self.async_llm_servers])
-
-    def _init_agent_loop_workers(self):
-        self.agent_loop_workers = []
-        # 获取建议的资源配置
-        agent_config = self.config.actor_rollout_ref.rollout.agent
-        max_concurrency = agent_config.get("max_concurrency", 10)
-        num_cpus = agent_config.get("num_cpus", 2)  # 默认2个CPU核心
-
-        for i in range(agent_config.num_workers):
-            self.agent_loop_workers.append(
-                AgentLoopWorker.options(
-                    name=f"agent_loop_worker_{i}",
-                    max_concurrency=max_concurrency,  # 设置最大并发数
-                    num_cpus=num_cpus,  # 设置CPU资源需求
-                ).remote(self.config, self.async_llm_servers)
-            )
-
-    def generate_sequences(self, prompts: DataProto) -> DataProto:
-        """Split input batch and dispatch to agent loop workers.
-
-        Args:
-            prompts (DataProto): Input batch.
-
-        Returns:
-            DataProto: Output batch.
-        """
-        if self.config.actor_rollout_ref.rollout.free_cache_engine:
-            self.wake_up()
-        chunkes = prompts.chunk(len(self.agent_loop_workers))
-        outputs = ray.get(
-            [
-                worker.generate_sequences.remote(chunk)
-                for worker, chunk in zip(self.agent_loop_workers, chunkes, strict=True)
-            ]
-        )
-        output = DataProto.concat(outputs)
-        if self.config.actor_rollout_ref.rollout.free_cache_engine:
-            self.sleep()
-
-        # calculate performance metrics
-        metrics = [output.meta_info["metrics"] for output in outputs]  # List[List[Dict[str, str]]]
-        timing = self._performance_metrics(metrics, output)
-
-        output.meta_info = {"timing": timing}
-        return output
-
-    async def generate_single_sample_async(
-        self,
-        sample: DataProto,
-        partial_output_list: Optional[list[AgentLoopOutput]],
-    ) -> list[AgentLoopOutput]:
-        """
-        异步处理单个样本, 需要复制n次
-
-        Args:
-            sample: 单个样本数据
-            partial_output_list: Optional[List[AgentLoopOutput]]: already rollout result.
-
-        Returns:
-            tuple[AgentLoopOutput, float]: 处理结果和处理时间
-        """
-        # 使用负载均衡选择 worker
-        worker = self._select_best_worker()
-        # 异步处理单个样本 - 使用无后处理版本获取原始AgentLoopOutput
-        output_future = worker.generate_sequences_no_post.remote(sample, partial_output_list)
-        return await asyncio.wrap_future(output_future.future())
-
-    def _select_best_worker(self):
-        """选择最佳的 worker（简单的轮询负载均衡）"""
-        if not hasattr(self, "_worker_index"):
-            self._worker_index = 0
-
-        worker = self.agent_loop_workers[self._worker_index]
-        self._worker_index = (self._worker_index + 1) % len(self.agent_loop_workers)
-        return worker
-
-    def _performance_metrics(self, metrics: list[list[dict[str, str]]], output: DataProto) -> dict[str, float]:
-        timing = {}
-        t_generate_sequences = np.array([metric["generate_sequences"] for chunk in metrics for metric in chunk])
-        t_tool_calls = np.array([metric["tool_calls"] for chunk in metrics for metric in chunk])
-        timing["agent_loop/generate_sequences/min"] = t_generate_sequences.min()
-        timing["agent_loop/generate_sequences/max"] = t_generate_sequences.max()
-        timing["agent_loop/generate_sequences/mean"] = t_generate_sequences.mean()
-        timing["agent_loop/tool_calls/min"] = t_tool_calls.min()
-        timing["agent_loop/tool_calls/max"] = t_tool_calls.max()
-        timing["agent_loop/tool_calls/mean"] = t_tool_calls.mean()
-
-        # batch sequence generation is bounded by the slowest sample
-        slowest = np.argmax(t_generate_sequences + t_tool_calls)
-        attention_mask = output.batch["attention_mask"][slowest]
-        prompt_length = output.batch["prompts"].shape[1]
-        timing["agent_loop/slowest/generate_sequences"] = t_generate_sequences[slowest]
-        timing["agent_loop/slowest/tool_calls"] = t_tool_calls[slowest]
-        timing["agent_loop/slowest/prompt_length"] = attention_mask[:prompt_length].sum().item()
-        timing["agent_loop/slowest/response_length"] = attention_mask[prompt_length:].sum().item()
-
-        return timing
-
-    def wake_up(self):
-        """Wake up all rollout server instances."""
-        ray.get([server.wake_up.remote() for server in self.async_llm_servers])
-
-    def sleep(self):
-        """Sleep all rollout server instances."""
-        ray.get([server.sleep.remote() for server in self.async_llm_servers])
-
-    async def cancel_async(self):
-        """Cancel all rollout tasks asynchronously."""
-        futures = [server.cancel.remote() for server in self.async_llm_servers]
-        await asyncio.gather(*[asyncio.wrap_future(future.future()) for future in futures], return_exceptions=True)
-
-    async def resume_async(self):
-        """Cancel all rollout tasks asynchronously."""
-        futures = [server.resume.remote() for server in self.async_llm_servers]
-        await asyncio.gather(*[asyncio.wrap_future(future.future()) for future in futures], return_exceptions=True)
diff --git a/recipe/fully_async_policy/agent_loop/partial_single_turn_agent_loop.py b/recipe/fully_async_policy/agent_loop/partial_single_turn_agent_loop.py
deleted file mode 100644
index ccdb9084238..00000000000
--- a/recipe/fully_async_policy/agent_loop/partial_single_turn_agent_loop.py
+++ /dev/null
@@ -1,74 +0,0 @@
-# Copyright 2025 Meituan Ltd. and/or its affiliates
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import logging
-import os
-from typing import Any, Optional
-from uuid import uuid4
-
-from verl.experimental.agent_loop.agent_loop import AgentLoopBase, AgentLoopOutput, register
-from verl.utils.profiler import simple_timer
-
-logger = logging.getLogger(__file__)
-logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN"))
-
-
-@register("partial_single_turn_agent")
-class PartialSingleTurnAgentLoop(AgentLoopBase):
-    """Naive agent loop that only do single turn chat completion."""
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.prompt_length = self.config.actor_rollout_ref.rollout.prompt_length
-        self.response_length = self.config.actor_rollout_ref.rollout.response_length
-
-    async def run(
-        self, messages: list[dict[str, Any]], sampling_params: dict[str, Any], output: Optional[AgentLoopOutput]
-    ) -> AgentLoopOutput:
-        if not output:
-            prompt_ids = await self.loop.run_in_executor(
-                None, lambda: self.tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=True)
-            )
-        else:
-            if output.is_cancel:
-                # 恢复暂停的样本，结果直接添加到 prompt_ids 后面
-                prompt_ids = output.prompt_ids + output.response_ids
-            else:
-                # 同一批样本，部分cancel，部分没有cancel， 没有cancel的样本直接返回
-                return output
-
-        metrics = {}
-        request_id = uuid4().hex
-        with simple_timer("generate_sequences", metrics):
-            response_ids, log_probs, is_cancel = await self.server_manager.generate_for_partial(
-                request_id=request_id, prompt_ids=prompt_ids, sampling_params=sampling_params
-            )
-
-        if not output:
-            response_mask = [1] * len(response_ids)
-        # 暂停待恢复样本, 把输出结果加到 response_ids 后，并重置 response_mask
-        else:
-            prompt_ids = output.prompt_ids
-            log_probs = output.log_probs + log_probs
-            response_ids = output.response_ids + response_ids
-            response_mask = [1] * len(response_ids)
-
-        return AgentLoopOutput(
-            prompt_ids=prompt_ids,
-            response_ids=response_ids[: self.response_length],
-            response_mask=response_mask[: self.response_length],
-            num_turns=2,
-            metrics=metrics,
-            is_cancel=is_cancel,
-            log_probs=log_probs,
-        )

From fa9e103cf36359425222d19471d73c5a71cec09c Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Tue, 9 Sep 2025 22:11:10 +0800
Subject: [PATCH 119/182] refactor agent_loop

---
 recipe/fully_async_policy/detach_utils.py     | 147 +++++++++---------
 recipe/fully_async_policy/fully_async_main.py |   5 +-
 .../fully_async_rollouter.py                  |  25 ++-
 .../fully_async_policy/fully_async_trainer.py |  42 ++---
 recipe/fully_async_policy/message_queue.py    |   2 +-
 recipe/fully_async_policy/param_sync.py       |   7 +-
 verl/experimental/agent_loop/__init__.py      |   2 +-
 verl/experimental/agent_loop/agent_loop.py    |   5 +-
 .../agent_loop/single_turn_agent_loop.py      |   6 +-
 .../agent_loop/tool_agent_loop.py             |   6 +-
 verl/trainer/ppo/ray_trainer.py               |   1 -
 11 files changed, 124 insertions(+), 124 deletions(-)

diff --git a/recipe/fully_async_policy/detach_utils.py b/recipe/fully_async_policy/detach_utils.py
index 127afca6881..18e45d50a16 100644
--- a/recipe/fully_async_policy/detach_utils.py
+++ b/recipe/fully_async_policy/detach_utils.py
@@ -12,9 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import time
-from dataclasses import dataclass
-from typing import Any, Optional, Dict, List
 from collections import defaultdict
+from dataclasses import dataclass
+from typing import Any, Optional
 
 import numpy as np
 import torch
@@ -231,157 +231,160 @@ def assemble_batch_from_rollout_samples(
 
     return final_batch
 
+
 class MetricsAggregator:
     """Metrics aggregator, used to combine metrics from multiple training steps"""
-    
+
     def __init__(self, total_gpus: int):
         # Store all values ​​for each metric
-        self.metric_values: Dict[str, List[float]] = defaultdict(list)
+        self.metric_values: dict[str, list[float]] = defaultdict(list)
         # Store the number of samples at each step for weighted averaging
-        self.sample_counts: List[int] = []
+        self.sample_counts: list[int] = []
         # Store the timestamp of each step for time-related calculations
-        self.timestamps: List[float] = []
+        self.timestamps: list[float] = []
         # Step Count
         self.step_count = 0
         # total num gpus used
         self.total_gpus = total_gpus
-        
+
         # Metric aggregation rule configuration
         self.aggregation_rules = self._init_aggregation_rules()
-    
-    def _init_aggregation_rules(self) -> Dict[str, Dict[str, List[str]]]:
+
+    def _init_aggregation_rules(self) -> dict[str, dict[str, list[str]]]:
         """Initialize metrics aggregation rules"""
         return {
             # Time-Based metrics, can add metrics here
-            'time_sum': [
-                'perf/time_per_step'
-            ],
+            "time_sum": ["perf/time_per_step"],
         }
-    
-    def add_step_metrics(self, metrics: Dict[str, Any], sample_count: int, timestamp: float = None):
+
+    def add_step_metrics(self, metrics: dict[str, Any], sample_count: int, timestamp: float = None):
         """Adding a single-step metrics"""
         if timestamp is None:
             timestamp = time.time()
-            
+
         self.sample_counts.append(sample_count)
         self.timestamps.append(timestamp)
         self.step_count += 1
-        
+
         # Store all metrics values
         for key, value in metrics.items():
             if isinstance(value, (int, float, np.number)):
                 self.metric_values[key].append(float(value))
             elif isinstance(value, torch.Tensor):
                 self.metric_values[key].append(float(value.item()))
-    
+
     def _get_aggregation_type(self, metric_name: str) -> str:
         """Determine the aggregation type based on the metric name"""
         for agg_type, metric_list in self.aggregation_rules.items():
             if metric_name in metric_list:
                 return agg_type
-                
+
         metric_lower = metric_name.lower()
-        if any(keyword in metric_lower for keyword in ['timing_s/']):
-            return 'time_sum'
-        if any(keyword in metric_lower for keyword in ['mean', 'avg', 'average']):
-            return 'avg'
-        if any(keyword in metric_lower for keyword in ['max', 'maximum']):
-            return 'max'
-        if any(keyword in metric_lower for keyword in ['min', 'minimum']):
-            return 'min'
-        if any(keyword in metric_lower for keyword in ['sum', 'total']):
-           return 'sum'
-        if any(keyword in metric_lower for keyword in ['weighted_avg']):
-            return 'weighted_avg'
-        
+        if any(keyword in metric_lower for keyword in ["timing_s/"]):
+            return "time_sum"
+        if any(keyword in metric_lower for keyword in ["mean", "avg", "average"]):
+            return "avg"
+        if any(keyword in metric_lower for keyword in ["max", "maximum"]):
+            return "max"
+        if any(keyword in metric_lower for keyword in ["min", "minimum"]):
+            return "min"
+        if any(keyword in metric_lower for keyword in ["sum", "total"]):
+            return "sum"
+        if any(keyword in metric_lower for keyword in ["weighted_avg"]):
+            return "weighted_avg"
+
         import warnings
-        warnings.warn(f"No aggregation rule is matched in init_aggregation_rules. \
-                      For metric {metric_name}, the 'avg' method is used")
-        return 'avg'
 
-    def _aggregate_single_metric(self, metric_name: str, values: List[float]) -> float:
+        warnings.warn(
+            f"No aggregation rule is matched in init_aggregation_rules. \
+                      For metric {metric_name}, the 'avg' method is used"
+        )
+        return "avg"
+
+    def _aggregate_single_metric(self, metric_name: str, values: list[float]) -> float:
         """Aggregating a single metric"""
         if not values:
             return 0.0
-            
+
         agg_type = self._get_aggregation_type(metric_name)
-        
-        if agg_type == 'last':
+
+        if agg_type == "last":
             return values[-1]
-        
-        elif agg_type == 'weighted_avg':
+
+        elif agg_type == "weighted_avg":
             # Weighted average
             if len(values) != len(self.sample_counts):
                 # If the lengths do not match, use a simple average
                 return sum(values) / len(values)
-            
+
             total_samples = sum(self.sample_counts)
             if total_samples == 0:
                 return sum(values) / len(values)
-            
-            weighted_sum = sum(v * c for v, c in zip(values, self.sample_counts))
+
+            weighted_sum = sum(v * c for v, c in zip(values, self.sample_counts, strict=False))
             return weighted_sum / total_samples
-        
-        elif agg_type == 'sum' or agg_type == 'time_sum':
+
+        elif agg_type == "sum" or agg_type == "time_sum":
             return sum(values)
-        
-        elif agg_type == 'avg':
+
+        elif agg_type == "avg":
             return sum(values) / len(values)
-        
-        elif agg_type == 'max':
+
+        elif agg_type == "max":
             return max(values)
-        
-        elif agg_type == 'min':
+
+        elif agg_type == "min":
             return min(values)
-        
+
         else:
             # Default average
             return sum(values) / len(values)
-    
-    def get_aggregated_metrics(self) -> Dict[str, Any]:
+
+    def get_aggregated_metrics(self) -> dict[str, Any]:
         """aggregated metrics"""
         t = time.time()
         if self.step_count == 0:
             return {}
-        
+
         aggregated = {}
-        
+
         # Aggregate all metrics
         for metric_name, values in self.metric_values.items():
             aggregated[metric_name] = self._aggregate_single_metric(metric_name, values)
-        
-        # Aggregate special metrics  
+
+        # Aggregate special metrics
         aggregated = self._special_metrics_aggergate(aggregated)
 
         print(f"aggregated metrics done. cost {time.time() - t}")
-        
+
         return aggregated
-    
-    def _special_metrics_aggergate(self, aggregated: Dict[str, Any]) -> Dict[str, Any]:
+
+    def _special_metrics_aggergate(self, aggregated: dict[str, Any]) -> dict[str, Any]:
         """calculate special metrics"""
 
         if "global_seqlen/minmax_diff" in aggregated.keys():
             aggregated["global_seqlen/minmax_diff"] = aggregated["global_seqlen/max"] - aggregated["global_seqlen/min"]
-        
+
         REQUIRED_PERF_KEYS = {"perf/throughput", "perf/total_num_tokens", "perf/time_per_step"}
         if REQUIRED_PERF_KEYS.issubset(aggregated):
-            aggregated["perf/throughput"] = aggregated['perf/total_num_tokens'] / \
-                (aggregated["perf/time_per_step"] * self.total_gpus)
-            
+            aggregated["perf/throughput"] = aggregated["perf/total_num_tokens"] / (
+                aggregated["perf/time_per_step"] * self.total_gpus
+            )
+
         return aggregated
-    
+
     def reset(self):
         """Reset Aggregator"""
         self.metric_values.clear()
         self.sample_counts.clear()
         self.timestamps.clear()
         self.step_count = 0
-    
-    def get_current_stats(self) -> Dict[str, Any]:
+
+    def get_current_stats(self) -> dict[str, Any]:
         """Get statistics about the current aggregation state (for debugging)"""
         return {
-            'step_count': self.step_count,
-            'metric_count': len(self.metric_values),
-            'total_samples': sum(self.sample_counts),
-            'metric_names': list(self.metric_values.keys()),
+            "step_count": self.step_count,
+            "metric_count": len(self.metric_values),
+            "total_samples": sum(self.sample_counts),
+            "metric_names": list(self.metric_values.keys()),
         }
diff --git a/recipe/fully_async_policy/fully_async_main.py b/recipe/fully_async_policy/fully_async_main.py
index 78fc1784b82..a588679991c 100644
--- a/recipe/fully_async_policy/fully_async_main.py
+++ b/recipe/fully_async_policy/fully_async_main.py
@@ -218,9 +218,8 @@ def _initialize_components(self, config) -> None:
 
         # load checkpoint and sync parameter before doing anything
         val_before_train = val_reward_fn is not None and config.trainer.get("val_before_train", True)
-        ray.get(self.components["trainer"].load_checkpoint.remote()) 
-        ray.get(param_synchronizer.sync_weights.remote(version=0,
-                                                       validate=val_before_train))
+        ray.get(self.components["trainer"].load_checkpoint.remote())
+        ray.get(param_synchronizer.sync_weights.remote(version=0, validate=val_before_train))
 
         self.components["param_synchronizer"] = param_synchronizer
         print("[ASYNC MAIN] All components initialized successfully")
diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py
index c25a52abbe0..4a2a7d7200c 100644
--- a/recipe/fully_async_policy/fully_async_rollouter.py
+++ b/recipe/fully_async_policy/fully_async_rollouter.py
@@ -13,10 +13,9 @@
 # limitations under the License.
 import asyncio
 import time
-from pprint import pformat, pprint
+from pprint import pformat
 
 import ray
-from omegaconf import OmegaConf
 
 from recipe.fully_async_policy.detach_utils import (
     RolloutSample,
@@ -165,10 +164,11 @@ async def set_required_samples(self, required_samples: int):
 
             # 单次最多扔一次更新需要的样本
             self.max_concurrent_samples = int(
-                self.config.actor_rollout_ref.actor.ppo_mini_batch_size 
-                / self.config.actor_rollout_ref.rollout.n 
-                * self.async_rollout_manager.rollout_dp_size * 8
-                )
+                self.config.actor_rollout_ref.actor.ppo_mini_batch_size
+                / self.config.actor_rollout_ref.rollout.n
+                * self.async_rollout_manager.rollout_dp_size
+                * 8
+            )
             self.max_concurrent_samples = min(self.max_concurrent_samples, self.max_required_samples)
             self.max_queue_size = self.max_required_samples
 
@@ -207,16 +207,13 @@ async def update_param_version(self, version: int, validate: bool = False, globa
                 self.val_reward_fn is not None
                 and self.config.rollout.test_freq > 0
                 and self.current_param_version % self.config.rollout.test_freq == 0
-                and self.current_param_version > 0 # don't test here in the initial parameter sync
-            ) or (
-                validate and self.val_reward_fn is not None
-            ):
+                and self.current_param_version > 0  # don't test here in the initial parameter sync
+            ) or (validate and self.val_reward_fn is not None):
                 with marked_timer("testing", timing_raw, color="green"):
                     val_metrics: dict = self._validate()
-                data = ValidateMetrics(timing_raw=timing_raw,
-                                       metrics=val_metrics,
-                                       global_steps=global_steps,
-                                       param_version=version)
+                data = ValidateMetrics(
+                    timing_raw=timing_raw, metrics=val_metrics, global_steps=global_steps, param_version=version
+                )
                 await self.message_queue_client.put_validate(ray.cloudpickle.dumps(data))
 
     def _validate_config(self):
diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py
index 2b549c0b621..0d83a00ba4a 100644
--- a/recipe/fully_async_policy/fully_async_trainer.py
+++ b/recipe/fully_async_policy/fully_async_trainer.py
@@ -15,6 +15,7 @@
 import time
 import warnings
 from datetime import datetime
+from pprint import pprint
 from typing import Any
 
 import ray
@@ -22,9 +23,9 @@
 from tqdm import tqdm
 
 from recipe.fully_async_policy.detach_utils import (
+    MetricsAggregator,
     ValidateMetrics,
     assemble_batch_from_rollout_samples,
-    MetricsAggregator,
 )
 from recipe.fully_async_policy.message_queue import MessageQueueClient
 from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup
@@ -37,7 +38,7 @@
     WorkerType,
 )
 from verl.utils.debug import marked_timer
-from pprint import pprint
+
 
 @ray.remote(num_cpus=10)
 class FullyAsyncTrainer(RayPPOTrainer):
@@ -121,8 +122,10 @@ def __init__(
         self.required_samples = int(
             self.minimal_bsz * config.actor_rollout_ref.actor.ppo_mini_batch_size / config.actor_rollout_ref.rollout.n
         )
-        total_gpus = config.trainer.nnodes * config.trainer.n_gpus_per_node + \
-            config.rollout.nnodes * config.rollout.n_gpus_per_node
+        total_gpus = (
+            config.trainer.nnodes * config.trainer.n_gpus_per_node
+            + config.rollout.nnodes * config.rollout.n_gpus_per_node
+        )
         self.metrics_aggregator = MetricsAggregator(total_gpus=total_gpus)
 
     def set_message_queue_client(self, message_queue_client: MessageQueueClient):
@@ -303,9 +306,7 @@ def fit(self):
 
             self._collect_metrics(batch, 0, metrics, timing_raw)
             self.metrics_aggregator.add_step_metrics(
-                metrics=metrics, 
-                sample_count=self.required_samples,
-                timestamp=time.time()
+                metrics=metrics, sample_count=self.required_samples, timestamp=time.time()
             )
             # Trigger parameter synchronization after training step
             time_str = datetime.now().strftime("%H:%M:%S.%f")[:-3]
@@ -321,23 +322,25 @@ def fit(self):
                 val_data: ValidateMetrics = ray.cloudpickle.loads(val_data)
                 self.logger.log(data=val_data.metrics, step=val_data.param_version)
                 self.logger.log(data=val_data.timing_raw, step=val_data.param_version)
-                pprint(f"[FullyAsyncTrainer] parameter version: {val_data.param_version} \
-                      Validation metrics: {val_data.metrics}")
+                pprint(
+                    f"[FullyAsyncTrainer] parameter version: {val_data.param_version} \
+                      Validation metrics: {val_data.metrics}"
+                )
             self.global_steps += 1
 
         # final parameter sync and validate
         if val_data is None:
-            self._trigger_parameter_sync_after_step(validate=True, global_steps=self.global_steps-1)
+            self._trigger_parameter_sync_after_step(validate=True, global_steps=self.global_steps - 1)
             ray.get(self.param_synchronizer.wait_last_sync.remote())
             val_data = self.message_queue_client.get_validate_sync()
             if val_data:
                 val_data: ValidateMetrics = ray.cloudpickle.loads(val_data)
                 self.logger.log(data=val_data.metrics, step=val_data.param_version)
-                self.logger.log(data=val_data.timing_raw, step=val_data.param_version)     
+                self.logger.log(data=val_data.timing_raw, step=val_data.param_version)
         pprint(f"[FullyAsyncTrainer] Final validation metrics: {val_data.metrics}")
         self.progress_bar.close()
 
-        self._check_save_checkpoint(True, timing_raw) # TODO: 检查checkpoint
+        self._check_save_checkpoint(True, timing_raw)  # TODO: 检查checkpoint
 
     def load_checkpoint(self):
         return self._load_checkpoint()
@@ -347,20 +350,21 @@ def _trigger_parameter_sync_after_step(self, validate: bool = False, global_step
         Trigger parameter synchronization after training step
         This ensures rollouter always uses the latest trained parameters
         """
-        if self.local_trigger_step < self.trigger_parameter_sync_step  and not validate:
+        if self.local_trigger_step < self.trigger_parameter_sync_step and not validate:
             self.local_trigger_step += 1
             return
 
-        self.current_param_version += 1 
+        self.current_param_version += 1
         self.local_trigger_step = 1
         self.logger.log(
             data=self.metrics_aggregator.get_aggregated_metrics(),
             step=self.current_param_version,
-            )
+        )
         self.progress_bar.update(1)
         self.metrics_aggregator.reset()
         ray.get(self.param_synchronizer.wait_last_sync.remote())
-        ray.get(self.param_synchronizer.sync_weights.remote(self.current_param_version, 
-                                                            validate=validate,
-                                                            global_steps=global_steps)
-                                                            )
\ No newline at end of file
+        ray.get(
+            self.param_synchronizer.sync_weights.remote(
+                self.current_param_version, validate=validate, global_steps=global_steps
+            )
+        )
diff --git a/recipe/fully_async_policy/message_queue.py b/recipe/fully_async_policy/message_queue.py
index da1780deb47..22573f4b9d5 100644
--- a/recipe/fully_async_policy/message_queue.py
+++ b/recipe/fully_async_policy/message_queue.py
@@ -261,4 +261,4 @@ def get_statistics_sync(self) -> dict[str, Any]:
 
     def update_param_version_sync(self, version: int):
         """Update parameter version (async)"""
-        return ray.get(self.queue_actor.update_param_version.remote(version))
\ No newline at end of file
+        return ray.get(self.queue_actor.update_param_version.remote(version))
diff --git a/recipe/fully_async_policy/param_sync.py b/recipe/fully_async_policy/param_sync.py
index 34fbca1c3e3..35efdd9c950 100644
--- a/recipe/fully_async_policy/param_sync.py
+++ b/recipe/fully_async_policy/param_sync.py
@@ -72,7 +72,7 @@ def _init_sync_group(self):
             group_name=self.sync_group_name,
         )
 
-    def sync_weights(self, version, validate = False, global_steps = 0):
+    def sync_weights(self, version, validate=False, global_steps=0):
         start_time = time.time()
 
         self.current_version = version
@@ -94,9 +94,8 @@ def sync_weights(self, version, validate = False, global_steps = 0):
         self.wait_last = self.rollouter.resume.remote()
 
     def wait_last_sync(self):
-        print(f"[ParameterSynchronizer] waiting last parameter sync and validate...")
-        start_time =  time.time()
+        print("[ParameterSynchronizer] waiting last parameter sync and validate...")
+        start_time = time.time()
         if self.wait_last:
             ray.get(self.wait_last)
         print(f"[ParameterSynchronizer], cost: {time.time() - start_time:.2f} seconds")
-
diff --git a/verl/experimental/agent_loop/__init__.py b/verl/experimental/agent_loop/__init__.py
index 0d131dd1d3a..67dcb16047e 100644
--- a/verl/experimental/agent_loop/__init__.py
+++ b/verl/experimental/agent_loop/__init__.py
@@ -13,8 +13,8 @@
 # limitations under the License.
 
 from .agent_loop import AgentLoopBase, AgentLoopManager
-from .single_turn_agent_loop import SingleTurnAgentLoop
 from .partial_single_turn_agent_loop import PartialSingleTurnAgentLoop
+from .single_turn_agent_loop import SingleTurnAgentLoop
 from .tool_agent_loop import ToolAgentLoop
 
 _ = [SingleTurnAgentLoop, ToolAgentLoop, PartialSingleTurnAgentLoop]
diff --git a/verl/experimental/agent_loop/agent_loop.py b/verl/experimental/agent_loop/agent_loop.py
index 8c49390f456..117ca13a7a7 100644
--- a/verl/experimental/agent_loop/agent_loop.py
+++ b/verl/experimental/agent_loop/agent_loop.py
@@ -469,7 +469,10 @@ async def _run_agent_loop(
                 server_manager=self.server_manager,
                 tokenizer=self.tokenizer,
             )
-            output = await agent_loop.run(messages, sampling_params, partial_output)
+            if agent_name == "partial_single_turn_agent":
+                output = await agent_loop.run(messages, sampling_params, partial_output)
+            else:
+                output = await agent_loop.run(messages, sampling_params)
             return output
 
 
diff --git a/verl/experimental/agent_loop/single_turn_agent_loop.py b/verl/experimental/agent_loop/single_turn_agent_loop.py
index df6e1991888..411388e7321 100644
--- a/verl/experimental/agent_loop/single_turn_agent_loop.py
+++ b/verl/experimental/agent_loop/single_turn_agent_loop.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 import logging
 import os
-from typing import Any, Optional
+from typing import Any
 from uuid import uuid4
 
 from verl.experimental.agent_loop.agent_loop import AgentLoopBase, AgentLoopOutput, register
@@ -32,9 +32,7 @@ def __init__(self, *args, **kwargs):
         self.prompt_length = self.config.actor_rollout_ref.rollout.prompt_length
         self.response_length = self.config.actor_rollout_ref.rollout.response_length
 
-    async def run(
-        self, messages: list[dict[str, Any]], sampling_params: dict[str, Any], output: Optional[AgentLoopOutput]
-    ) -> AgentLoopOutput:
+    async def run(self, messages: list[dict[str, Any]], sampling_params: dict[str, Any]) -> AgentLoopOutput:
         metrics = {}
         request_id = uuid4().hex
         prompt_ids = await self.loop.run_in_executor(
diff --git a/verl/experimental/agent_loop/tool_agent_loop.py b/verl/experimental/agent_loop/tool_agent_loop.py
index 7c945b7d4c9..3437c0be5ab 100644
--- a/verl/experimental/agent_loop/tool_agent_loop.py
+++ b/verl/experimental/agent_loop/tool_agent_loop.py
@@ -15,7 +15,7 @@
 import json
 import logging
 import os
-from typing import Any, Optional
+from typing import Any
 from uuid import uuid4
 
 from verl.experimental.agent_loop.agent_loop import AgentLoopBase, AgentLoopOutput, register
@@ -56,9 +56,7 @@ def init_class(cls, config, tokenizer, **kwargs):
         cls.system_prompt = tokenizer.apply_chat_template([{}], add_generation_prompt=False, tokenize=True)
 
     @rollout_trace_op
-    async def run(
-        self, messages: list[dict[str, Any]], sampling_params: dict[str, Any], output: Optional[AgentLoopOutput]
-    ) -> AgentLoopOutput:
+    async def run(self, messages: list[dict[str, Any]], sampling_params: dict[str, Any]) -> AgentLoopOutput:
         metrics = {}
         request_id = uuid4().hex
         prompt_ids = await self.loop.run_in_executor(
diff --git a/verl/trainer/ppo/ray_trainer.py b/verl/trainer/ppo/ray_trainer.py
index 8d2c19d3364..56a1e5bcab1 100644
--- a/verl/trainer/ppo/ray_trainer.py
+++ b/verl/trainer/ppo/ray_trainer.py
@@ -1287,7 +1287,6 @@ def _process_batch_common(self, batch, metrics, timing_raw):
                         }
                     )
 
-
         if self.use_reference_policy:
             # compute reference log_prob
             with marked_timer("ref", timing_raw, color="olive"):

From 7bd48597adc7a48ab77a1af34fe7265ed3c2e37b Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Tue, 9 Sep 2025 22:25:12 +0800
Subject: [PATCH 120/182] refactor vllm async

---
 verl/experimental/agent_loop/agent_loop.py        | 15 ++-------------
 .../rollout/vllm_rollout/vllm_async_server.py     |  3 ++-
 .../rollout/vllm_rollout/vllm_rollout_spmd.py     |  2 --
 3 files changed, 4 insertions(+), 16 deletions(-)

diff --git a/verl/experimental/agent_loop/agent_loop.py b/verl/experimental/agent_loop/agent_loop.py
index 117ca13a7a7..d00b7176380 100644
--- a/verl/experimental/agent_loop/agent_loop.py
+++ b/verl/experimental/agent_loop/agent_loop.py
@@ -182,16 +182,12 @@ def init_class(cls, config: DictConfig, tokenizer: AutoTokenizer, **kwargs):
         cls._class_initialized = True
 
     @abstractmethod
-    async def run(
-        self, messages: list[dict[str, Any]], sampling_params: dict[str, Any], partial_output: Optional[AgentLoopOutput]
-    ) -> AgentLoopOutput:
+    async def run(self, messages: list[dict[str, Any]], sampling_params: dict[str, Any]) -> AgentLoopOutput:
         """Run agent loop to interact with LLM server and environment.
 
         Args:
             messages (List[Dict[str, Any]]): Input messages.
             sampling_params (Dict[str, Any]): LLM sampling params.
-            partial_output: Optional[AgentLoopOutput]: already rollout result.
-
         Returns:
             AgentLoopOutput: Agent loop output.
         """
@@ -567,17 +563,10 @@ def _initialize_llm_servers(self):
 
     def _init_agent_loop_workers(self):
         self.agent_loop_workers = []
-        # 获取建议的资源配置
-        agent_config = self.config.actor_rollout_ref.rollout.agent
-        max_concurrency = agent_config.get("max_concurrency", 10)
-        num_cpus = agent_config.get("num_cpus", 2)  # 默认2个CPU核心
-
-        for i in range(agent_config.num_workers):
+        for i in range(self.config.actor_rollout_ref.rollout.agent.num_workers):
             self.agent_loop_workers.append(
                 AgentLoopWorker.options(
                     name=f"agent_loop_worker_{i}",
-                    max_concurrency=max_concurrency,  # 设置最大并发数
-                    num_cpus=num_cpus,  # 设置CPU资源需求
                 ).remote(self.config, self.async_llm_servers)
             )
 
diff --git a/verl/workers/rollout/vllm_rollout/vllm_async_server.py b/verl/workers/rollout/vllm_rollout/vllm_async_server.py
index 3b3e9542252..2dc386e76fa 100644
--- a/verl/workers/rollout/vllm_rollout/vllm_async_server.py
+++ b/verl/workers/rollout/vllm_rollout/vllm_async_server.py
@@ -74,6 +74,7 @@ def get_pg_index_and_local_rank(actor_name) -> tuple[int, int]:
     actor_names = sorted(actor_names, key=get_pg_index_and_local_rank)
     actor_names = actor_names[vllm_dp_rank * vllm_tp_size : (vllm_dp_rank + 1) * vllm_tp_size]
     workers: list[WorkerWrapperBase] = [ray.get_actor(actor_name) for actor_name in actor_names]
+    print(f"instance_id: {vllm_config.instance_id} initializes with external actors: {actor_names}")
 
     return workers
 
@@ -205,8 +206,8 @@ def __init__(self, config: DictConfig, vllm_dp_size: int, vllm_dp_rank: int, wg_
         self.vllm_dp_rank = vllm_dp_rank
         self.wg_prefix = wg_prefix
         self.engine: AsyncLLM = None
-        # for cancel
 
+        # for cancel LLMServer
         self.paused = False
         self.lock = asyncio.Lock()
         self.cancel_event: dict[str, asyncio.Event] = {}
diff --git a/verl/workers/rollout/vllm_rollout/vllm_rollout_spmd.py b/verl/workers/rollout/vllm_rollout/vllm_rollout_spmd.py
index 071dd917119..5bd571016ac 100644
--- a/verl/workers/rollout/vllm_rollout/vllm_rollout_spmd.py
+++ b/verl/workers/rollout/vllm_rollout/vllm_rollout_spmd.py
@@ -58,7 +58,6 @@
 logger = logging.getLogger(__file__)
 logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN"))
 
-
 # TODO
 # 1. support pp in vllm
 # 2. passing tokenizer is not necessary? no encoding/decoding is happending here
@@ -459,7 +458,6 @@ def get_zeromq_address(self):
 
     def init_worker(self, all_kwargs: list[dict[str, Any]]):
         """Initialize worker engine."""
-
         all_kwargs[0]["rank"] = int(os.environ["RANK"])
         all_kwargs[0]["local_rank"] = 0
 

From ec3f0c52fb445968add8201e08f8f89467a31d80 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Tue, 9 Sep 2025 22:30:22 +0800
Subject: [PATCH 121/182] refactor logs

---
 verl/experimental/agent_loop/agent_loop.py             | 1 +
 verl/workers/rollout/vllm_rollout/vllm_async_server.py | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/verl/experimental/agent_loop/agent_loop.py b/verl/experimental/agent_loop/agent_loop.py
index d00b7176380..ddcad093326 100644
--- a/verl/experimental/agent_loop/agent_loop.py
+++ b/verl/experimental/agent_loop/agent_loop.py
@@ -188,6 +188,7 @@ async def run(self, messages: list[dict[str, Any]], sampling_params: dict[str, A
         Args:
             messages (List[Dict[str, Any]]): Input messages.
             sampling_params (Dict[str, Any]): LLM sampling params.
+
         Returns:
             AgentLoopOutput: Agent loop output.
         """
diff --git a/verl/workers/rollout/vllm_rollout/vllm_async_server.py b/verl/workers/rollout/vllm_rollout/vllm_async_server.py
index 2dc386e76fa..4826ebaa1d0 100644
--- a/verl/workers/rollout/vllm_rollout/vllm_async_server.py
+++ b/verl/workers/rollout/vllm_rollout/vllm_async_server.py
@@ -370,7 +370,7 @@ async def generate_for_partial(
             token_ids = self.req_output[request_id].outputs[0].token_ids
             log_probs: list[float] = []
             for i, x in enumerate(self.req_output[request_id].outputs[0].logprobs):
-                # sampling_params 中 logprobs 设置为1，只返回1个
+                # sampling_params 中 logprobs 设置为1，应该返回1个, 但是实测会有多个，取token_id所对应的log_prob
                 token_id = self.req_output[request_id].outputs[0].token_ids[i]
                 log_probs.append(x[token_id].logprob)
             is_cancel = generation_handle not in done

From 547d68f8572c68626be7ae60766bdad9c39d65d2 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Wed, 10 Sep 2025 15:46:44 +0800
Subject: [PATCH 122/182] qwen3 A3B

---
 .../fsdp2_fully-async_64-64_stal0.1/run.sh    | 168 ++++++++++++++++++
 .../runtime_env.yaml                          |   4 +
 .../exp/qwen3-30BA3B_64/fsdp2_colocate/run.sh | 125 +++++++++++++
 .../fsdp2_colocate/runtime_env.yaml           |   5 +
 .../fsdp2_fully-async_32-32/run.sh            | 150 ++++++++++++++++
 .../fsdp2_fully-async_32-32/runtime_env.yaml  |   4 +
 .../megatron_colocate/runtime_env.yaml        |   5 +-
 7 files changed, 457 insertions(+), 4 deletions(-)
 create mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64_stal0.1/run.sh
 create mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64_stal0.1/runtime_env.yaml
 create mode 100644 recipe/fully_async_policy/exp/qwen3-30BA3B_64/fsdp2_colocate/run.sh
 create mode 100644 recipe/fully_async_policy/exp/qwen3-30BA3B_64/fsdp2_colocate/runtime_env.yaml
 create mode 100644 recipe/fully_async_policy/exp/qwen3-30BA3B_64/fsdp2_fully-async_32-32/run.sh
 create mode 100644 recipe/fully_async_policy/exp/qwen3-30BA3B_64/fsdp2_fully-async_32-32/runtime_env.yaml

diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64_stal0.1/run.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64_stal0.1/run.sh
new file mode 100644
index 00000000000..e9133e50eac
--- /dev/null
+++ b/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64_stal0.1/run.sh
@@ -0,0 +1,168 @@
+#!/usr/bin/env bash
+set -xeuo pipefail
+
+project_name='DAPO'
+exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_64-64_mbs32_tpf4_stal0.1'
+
+# Ray
+# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
+# WORKING_DIR=${WORKING_DIR:-"${PWD}"}
+# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
+# Paths
+RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
+# very important! please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface
+MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen2.5-Math-7B"}
+CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"}
+TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"}
+TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"}
+
+# MODEL_PATH=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/modelscope/hub/models/Qwen/Qwen2___5-Math-7B
+MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B
+CKPTS_DIR=./ckpts/${project_name}/${exp_name}
+# TRAIN_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/dapo-math-17k.parquet
+# TEST_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/aime-2024.parquet
+TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet
+TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet
+
+rollout_mode="async"
+rollout_name="vllm" # sglang or vllm
+if [ "$rollout_mode" = "async" ]; then
+    export VLLM_USE_V1=1
+    return_raw_chat="True"
+fi
+
+# Algorithm parameters
+adv_estimator=grpo
+
+use_kl_in_reward=False
+kl_coef=0.0
+use_kl_loss=False
+kl_loss_coef=0.0
+
+clip_ratio_low=0.2
+clip_ratio_high=0.28
+
+# Response length parameters
+max_prompt_length=$((1024 * 2))
+max_response_length=$((1024 * 28))
+enable_overlong_buffer=True
+overlong_buffer_len=$((1024 * 4))
+overlong_penalty_factor=1.0
+
+# Training parameters
+loss_agg_mode="token-mean"
+
+# Algorithm
+temperature=1.0
+top_p=1.0
+top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
+val_top_p=0.7
+
+# Performance Related Parameter
+use_dynamic_bsz=True
+actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
+infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
+ref_offload=True
+actor_offload=False
+gen_tp=4
+sp_size=4
+fsdp_size=2
+
+# Fully async specific parameters
+NNODES_ROLLOUT=${NNODES_ROLLOUT:-8}
+NNODES_TRAIN=${NNODES_TRAIN:-8}
+NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
+
+
+train_prompt_bsz=0
+gen_prompt_bsz=1
+n_resp_per_prompt=16
+train_prompt_mini_bsz=32
+total_rollout_steps=$(((512*400)))
+test_freq=20
+staleness_threshold=0.1
+trigger_parameter_sync_step=4
+partial_rollout=True
+
+python -m recipe.fully_async_policy.fully_async_main \
+    data.train_files="${TRAIN_FILE}" \
+    data.val_files="${TEST_FILE}" \
+    data.prompt_key=prompt \
+    data.truncation='left' \
+    data.max_prompt_length=${max_prompt_length} \
+    data.max_response_length=${max_response_length} \
+    data.train_batch_size=${train_prompt_bsz} \
+    data.gen_batch_size=${gen_prompt_bsz} \
+    data.return_raw_chat=${return_raw_chat} \
+    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
+    algorithm.adv_estimator=${adv_estimator} \
+    algorithm.use_kl_in_reward=${use_kl_in_reward} \
+    algorithm.kl_ctrl.kl_coef=${kl_coef} \
+    actor_rollout_ref.actor.strategy=fsdp2 \
+    critic.strategy=fsdp2 \
+    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
+    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
+    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
+    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
+    actor_rollout_ref.actor.clip_ratio_c=10.0 \
+    actor_rollout_ref.model.use_remove_padding=True \
+    actor_rollout_ref.hybrid_engine=False \
+    +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \
+    actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
+    actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+    actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+    actor_rollout_ref.model.path="${MODEL_PATH}" \
+    actor_rollout_ref.actor.optim.lr=1e-6 \
+    actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
+    actor_rollout_ref.actor.optim.weight_decay=0.1 \
+    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
+    actor_rollout_ref.actor.fsdp_config.param_offload=${actor_offload} \
+    actor_rollout_ref.actor.fsdp_config.optimizer_offload=${actor_offload} \
+    actor_rollout_ref.actor.entropy_coeff=0 \
+    actor_rollout_ref.actor.grad_clip=1.0 \
+    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
+    actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \
+    actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
+    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
+    actor_rollout_ref.rollout.enable_chunked_prefill=True \
+    actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
+    actor_rollout_ref.rollout.temperature=${temperature} \
+    actor_rollout_ref.rollout.top_p=${top_p} \
+    actor_rollout_ref.rollout.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
+    actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
+    actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
+    actor_rollout_ref.rollout.val_kwargs.n=1 \
+    actor_rollout_ref.ref.fsdp_config.param_offload=${ref_offload} \
+    actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \
+    actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \
+    actor_rollout_ref.rollout.name=${rollout_name} \
+    actor_rollout_ref.rollout.mode=${rollout_mode} \
+    actor_rollout_ref.rollout.calculate_log_probs=True \
+    reward_model.reward_manager=dapo \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
+    +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
+    trainer.logger=['console','tensorboard'] \
+    trainer.project_name="${project_name}" \
+    trainer.experiment_name="${exp_name}" \
+    trainer.val_before_train=True \
+    trainer.save_freq=-1 \
+    trainer.default_local_dir="${CKPTS_DIR}" \
+    trainer.resume_mode=auto \
+    trainer.nnodes="${NNODES_TRAIN}" \
+    trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
+    rollout.nnodes="${NNODES_ROLLOUT}" \
+    rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \
+    rollout.total_rollout_steps="${total_rollout_steps}" \
+    rollout.total_epochs=10 \
+    rollout.test_freq="${test_freq}" \
+    async_training.staleness_threshold="${staleness_threshold}" \
+    async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \
+    async_training.partial_rollout="${partial_rollout}"
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64_stal0.1/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64_stal0.1/runtime_env.yaml
new file mode 100644
index 00000000000..92bacbdd204
--- /dev/null
+++ b/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64_stal0.1/runtime_env.yaml
@@ -0,0 +1,4 @@
+env_vars:
+  VLLM_USE_V1: "1"
+  TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-7B-math-128/dapo_qwen2-7B-math_28k_fsdp2_fully-async_64-64_mbs32_tpf4_stal0.1"
+  HYDRA_FULL_ERROR: "1"
\ No newline at end of file
diff --git a/recipe/fully_async_policy/exp/qwen3-30BA3B_64/fsdp2_colocate/run.sh b/recipe/fully_async_policy/exp/qwen3-30BA3B_64/fsdp2_colocate/run.sh
new file mode 100644
index 00000000000..5a0ca29d2a5
--- /dev/null
+++ b/recipe/fully_async_policy/exp/qwen3-30BA3B_64/fsdp2_colocate/run.sh
@@ -0,0 +1,125 @@
+#!/usr/bin/env bash
+set -xeuo pipefail
+
+project_name='DAPO'
+exp_name='dapo_qwen3-30BA3B_8k_fsdp2_colocate_64_mbs32'
+
+adv_estimator=grpo
+
+use_kl_in_reward=False
+kl_coef=0.0
+use_kl_loss=False
+kl_loss_coef=0.0
+
+clip_ratio_low=0.2
+clip_ratio_high=0.28
+
+max_prompt_length=$((1024 * 2))
+max_response_length=$((1024 * 8))
+enable_overlong_buffer=True
+overlong_buffer_len=$((1024 * 4))
+overlong_penalty_factor=1.0
+
+loss_agg_mode="token-mean"
+
+train_prompt_bsz=512
+n_resp_per_prompt=16
+train_prompt_mini_bsz=32
+
+# Ray
+# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
+# WORKING_DIR=${WORKING_DIR:-"${PWD}"}
+# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
+NNODES=${NNODES:-8}
+NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
+# Paths
+MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/.friday/models/basemodel/Qwen/Qwen3-30B-A3B
+CKPTS_DIR=./ckpts/${project_name}/${exp_name}
+TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet
+TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet
+
+# Algorithm
+temperature=1.0
+top_p=1.0
+top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
+val_top_p=0.7
+
+# Performance Related Parameter
+sp_size=4
+use_dynamic_bsz=True
+actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
+infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
+offload=True
+gen_tp=4
+fsdp_size=32
+
+python3 -m verl.trainer.main_ppo \
+    data.train_files="${TRAIN_FILE}" \
+    data.val_files="${TEST_FILE}" \
+    data.prompt_key=prompt \
+    data.truncation='left' \
+    data.max_prompt_length=${max_prompt_length} \
+    data.max_response_length=${max_response_length} \
+    data.train_batch_size=${train_prompt_bsz} \
+    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
+    algorithm.adv_estimator=${adv_estimator} \
+    algorithm.use_kl_in_reward=${use_kl_in_reward} \
+    algorithm.kl_ctrl.kl_coef=${kl_coef} \
+    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
+    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
+    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
+    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
+    actor_rollout_ref.actor.clip_ratio_c=10.0 \
+    actor_rollout_ref.model.use_remove_padding=True \
+    actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
+    actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+    actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+    actor_rollout_ref.model.path="${MODEL_PATH}" \
+    actor_rollout_ref.model.enable_gradient_checkpointing=True \
+    actor_rollout_ref.actor.optim.lr=1e-6 \
+    actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
+    actor_rollout_ref.actor.optim.weight_decay=0.1 \
+    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
+    actor_rollout_ref.actor.fsdp_config.param_offload=${offload} \
+    actor_rollout_ref.actor.fsdp_config.optimizer_offload=${offload} \
+    actor_rollout_ref.actor.entropy_coeff=0 \
+    actor_rollout_ref.actor.grad_clip=1.0 \
+    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
+    actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \
+    actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
+    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
+    actor_rollout_ref.rollout.enable_chunked_prefill=True \
+    actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
+    actor_rollout_ref.rollout.temperature=${temperature} \
+    actor_rollout_ref.rollout.top_p=${top_p} \
+    actor_rollout_ref.rollout.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
+    actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
+    actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
+    actor_rollout_ref.rollout.val_kwargs.n=1 \
+    actor_rollout_ref.ref.fsdp_config.param_offload=${offload} \
+    actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \
+    actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \
+    reward_model.reward_manager=dapo \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
+    +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
+    trainer.logger='["console","tensorboard"]' \
+    trainer.project_name="${project_name}" \
+    trainer.experiment_name="${exp_name}" \
+    trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
+    trainer.nnodes="${NNODES}" \
+    trainer.val_before_train=True \
+    trainer.test_freq=20 \
+    trainer.save_freq=-1 \
+    trainer.total_epochs=10 \
+    trainer.total_training_steps=400 \
+    trainer.default_local_dir="${CKPTS_DIR}" \
+    trainer.resume_mode=auto \
+    trainer.log_val_generations=10
diff --git a/recipe/fully_async_policy/exp/qwen3-30BA3B_64/fsdp2_colocate/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen3-30BA3B_64/fsdp2_colocate/runtime_env.yaml
new file mode 100644
index 00000000000..b2333e66179
--- /dev/null
+++ b/recipe/fully_async_policy/exp/qwen3-30BA3B_64/fsdp2_colocate/runtime_env.yaml
@@ -0,0 +1,5 @@
+env_vars:
+  TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen3-30BA3B-64/dapo_qwen3-30BA3B_8k_fsdp2_colocate_64_mbs32"
+  HYDRA_FULL_ERROR: "1"
+  TORCH_NCCL_AVOID_RECORD_STREAMS: "1"
+  CUDA_DEVICE_MAX_CONNECTIONS: "1"
\ No newline at end of file
diff --git a/recipe/fully_async_policy/exp/qwen3-30BA3B_64/fsdp2_fully-async_32-32/run.sh b/recipe/fully_async_policy/exp/qwen3-30BA3B_64/fsdp2_fully-async_32-32/run.sh
new file mode 100644
index 00000000000..1cee5cce560
--- /dev/null
+++ b/recipe/fully_async_policy/exp/qwen3-30BA3B_64/fsdp2_fully-async_32-32/run.sh
@@ -0,0 +1,150 @@
+#!/usr/bin/env bash
+set -xeuo pipefail
+
+project_name='DAPO'
+exp_name='dapo_qwen3-30BA3B_8k_fsdp2_async_32-32_mbs32_tpf8'
+
+adv_estimator=grpo
+
+use_kl_in_reward=False
+kl_coef=0.0
+use_kl_loss=False
+kl_loss_coef=0.0
+
+clip_ratio_low=0.2
+clip_ratio_high=0.28
+
+max_prompt_length=$((1024 * 2))
+max_response_length=$((1024 * 8))
+enable_overlong_buffer=True
+overlong_buffer_len=$((1024 * 4))
+overlong_penalty_factor=1.0
+
+loss_agg_mode="token-mean"
+
+# Paths
+MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/.friday/models/basemodel/Qwen/Qwen3-30B-A3B
+CKPTS_DIR=./ckpts/${project_name}/${exp_name}
+TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet
+TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet
+
+# Algorithm
+temperature=1.0
+top_p=1.0
+top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
+val_top_p=0.7
+
+# Performance Related Parameter
+sp_size=4
+use_dynamic_bsz=True
+actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
+infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
+ref_offload=True
+actor_offload=False
+gen_tp=4
+fsdp_size=32
+
+# Fully async specific parameters
+NNODES_ROLLOUT=${NNODES_ROLLOUT:-4}
+NNODES_TRAIN=${NNODES_TRAIN:-4}
+NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
+
+train_prompt_bsz=0
+gen_prompt_bsz=1
+n_resp_per_prompt=16
+train_prompt_mini_bsz=32
+total_rollout_steps=$(((512*400)))
+test_freq=20
+staleness_threshold=0.1
+trigger_parameter_sync_step=8
+partial_rollout=True
+
+rollout_mode="async"
+rollout_name="vllm" # sglang or vllm
+if [ "$rollout_mode" = "async" ]; then
+    export VLLM_USE_V1=1
+    return_raw_chat="True"
+fi
+
+python -m recipe.fully_async_policy.fully_async_main \
+    data.train_files="${TRAIN_FILE}" \
+    data.val_files="${TEST_FILE}" \
+    data.prompt_key=prompt \
+    data.truncation='left' \
+    data.max_prompt_length=${max_prompt_length} \
+    data.max_response_length=${max_response_length} \
+    data.train_batch_size=${train_prompt_bsz} \
+    data.gen_batch_size=${gen_prompt_bsz} \
+    data.return_raw_chat=${return_raw_chat} \
+    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
+    algorithm.adv_estimator=${adv_estimator} \
+    algorithm.use_kl_in_reward=${use_kl_in_reward} \
+    algorithm.kl_ctrl.kl_coef=${kl_coef} \
+    actor_rollout_ref.actor.strategy=fsdp2 \
+    critic.strategy=fsdp2 \
+    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
+    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
+    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
+    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
+    actor_rollout_ref.actor.clip_ratio_c=10.0 \
+    actor_rollout_ref.model.use_remove_padding=True \
+    actor_rollout_ref.hybrid_engine=False \
+    +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \
+    actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
+    actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+    actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+    actor_rollout_ref.model.path="${MODEL_PATH}" \
+    actor_rollout_ref.actor.optim.lr=1e-6 \
+    actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
+    actor_rollout_ref.actor.optim.weight_decay=0.1 \
+    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
+    actor_rollout_ref.actor.fsdp_config.param_offload=${actor_offload} \
+    actor_rollout_ref.actor.fsdp_config.optimizer_offload=${actor_offload} \
+    actor_rollout_ref.actor.entropy_coeff=0 \
+    actor_rollout_ref.actor.grad_clip=1.0 \
+    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
+    actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \
+    actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
+    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
+    actor_rollout_ref.rollout.enable_chunked_prefill=True \
+    actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
+    actor_rollout_ref.rollout.temperature=${temperature} \
+    actor_rollout_ref.rollout.top_p=${top_p} \
+    actor_rollout_ref.rollout.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
+    actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
+    actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
+    actor_rollout_ref.rollout.val_kwargs.n=1 \
+    actor_rollout_ref.ref.fsdp_config.param_offload=${ref_offload} \
+    actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \
+    actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \
+    actor_rollout_ref.rollout.name=${rollout_name} \
+    actor_rollout_ref.rollout.mode=${rollout_mode} \
+    actor_rollout_ref.rollout.calculate_log_probs=True \
+    reward_model.reward_manager=dapo \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
+    +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
+    trainer.logger=['console','tensorboard'] \
+    trainer.project_name="${project_name}" \
+    trainer.experiment_name="${exp_name}" \
+    trainer.val_before_train=True \
+    trainer.save_freq=-1 \
+    trainer.default_local_dir="${CKPTS_DIR}" \
+    trainer.resume_mode=auto \
+    trainer.nnodes="${NNODES_TRAIN}" \
+    trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
+    rollout.nnodes="${NNODES_ROLLOUT}" \
+    rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \
+    rollout.total_rollout_steps="${total_rollout_steps}" \
+    rollout.total_epochs=10 \
+    rollout.test_freq="${test_freq}" \
+    async_training.staleness_threshold="${staleness_threshold}" \
+    async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \
+    async_training.partial_rollout="${partial_rollout}"
diff --git a/recipe/fully_async_policy/exp/qwen3-30BA3B_64/fsdp2_fully-async_32-32/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen3-30BA3B_64/fsdp2_fully-async_32-32/runtime_env.yaml
new file mode 100644
index 00000000000..817cea30d09
--- /dev/null
+++ b/recipe/fully_async_policy/exp/qwen3-30BA3B_64/fsdp2_fully-async_32-32/runtime_env.yaml
@@ -0,0 +1,4 @@
+env_vars:
+  TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen3-30BA3B-64/dapo_qwen3-30BA3B_8k_fsdp2_async_32-32_mbs32_tpf8"
+  HYDRA_FULL_ERROR: "1"
+  VLLM_USE_V1: "1"
\ No newline at end of file
diff --git a/recipe/fully_async_policy/exp/qwen3-30BA3B_64/megatron_colocate/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen3-30BA3B_64/megatron_colocate/runtime_env.yaml
index 3a497e90dd0..3fa60a48917 100644
--- a/recipe/fully_async_policy/exp/qwen3-30BA3B_64/megatron_colocate/runtime_env.yaml
+++ b/recipe/fully_async_policy/exp/qwen3-30BA3B_64/megatron_colocate/runtime_env.yaml
@@ -1,5 +1,2 @@
 env_vars:
-  TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen3-30BA3B-64/dapo_qwen3-30BA3B_32k_megatron_colocate_64_mbs32"
-  HYDRA_FULL_ERROR: "1"
-  TORCH_NCCL_AVOID_RECORD_STREAMS: "1"
-  CUDA_DEVICE_MAX_CONNECTIONS: "1"
\ No newline at end of file
+  TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen3-30BA3B-64/dapo_qwen3-30BA3B_8k_fsdp2_colocate_64_mbs32"

From 1fc52bb5087d50858dbb9127d0d684d0d195f2a6 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Wed, 10 Sep 2025 19:45:07 +0800
Subject: [PATCH 123/182] staleness_threshold=0.1

---
 .../exp/qwen2-32B_128/fsdp2_colocate/run.sh   | 133 +++++++++++++++
 .../fsdp2_colocate}/runtime_env.yaml          |   4 +-
 .../fsdp2_fully-async_64-64/run.sh            | 153 +++++++++++++++++
 .../fsdp2_fully-async_64-64/runtime_env.yaml  |   4 +
 ...{dapo_7b_math_fsdp2_colocate.sh => run.sh} |   0
 ...fully-async_64-64_mbs32_tfq4.sh => run.sh} |   2 +-
 ...po_7b_math_megatron_colocate.sh => run.sh} |   0
 ...{dapo_7b_math_fsdp2_colocate.sh => run.sh} |   0
 ...ully-async_16-16_mbs32_tfq16.sh => run.sh} |   2 +-
 ...ully-async_16-16_mbs32_tfq16.sh => run.sh} |   0
 ...fully-async_24-8_mbs32_tfq32.sh => run.sh} |   2 +-
 ...fully-async_8-24_mbs32_tfq11.sh => run.sh} |   2 +-
 ...po_7b_math_megatron_colocate.sh => run.sh} |   0
 ...{dapo_7b_math_fsdp2_colocate.sh => run.sh} |   0
 ...fully-async_24-40_mbs32_tfq6.sh => run.sh} |   2 +-
 ...fully-async_32-32_mbs32_tfq8.sh => run.sh} |   2 +-
 ...ully-async_40-24_mbs32_tfq11.sh => run.sh} |   2 +-
 ...po_7b_math_megatron_colocate.sh => run.sh} |   0
 .../{test_dapo_qwen3_30b_math.sh => run.sh}   |   0
 .../{early_megatron_colocate.sh => run.sh}    |   0
 .../early_megatron_colocate.sh                | 161 ------------------
 .../megatron_colocate/runtime_env.yaml        |   5 -
 .../early_megatron_colocate.sh                | 161 ------------------
 .../megatron_colocate/runtime_env.yaml        |   5 -
 .../fsdp2_colocate/{fsdp2.sh => run.sh}       |   0
 .../{early_megatron_colocate.sh => run.sh}    |   0
 .../early_megatron_colocate.sh                | 156 -----------------
 .../early_megatron_colocate.sh                | 156 -----------------
 .../megatron_colocate/runtime_env.yaml        |   5 -
 .../shell/dapo_7b_math_fsdp2_2_6.sh           |   2 +-
 .../shell/dapo_7b_math_fsdp2_4_12.sh          |   2 +-
 .../shell/dapo_7b_math_fsdp2_8_8.sh           |   2 +-
 tests/special_e2e/run_fully_async_policy.sh   |   2 +-
 33 files changed, 303 insertions(+), 662 deletions(-)
 create mode 100644 recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_colocate/run.sh
 rename recipe/fully_async_policy/exp/{qwen3-32B_32/megatron_colocate => qwen2-32B_128/fsdp2_colocate}/runtime_env.yaml (50%)
 create mode 100644 recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_64-64/run.sh
 create mode 100644 recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_64-64/runtime_env.yaml
 rename recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_colocate/{dapo_7b_math_fsdp2_colocate.sh => run.sh} (100%)
 rename recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64/{dapo_qwen2-7B-math_28k_fsdp2_fully-async_64-64_mbs32_tfq4.sh => run.sh} (99%)
 rename recipe/fully_async_policy/exp/qwen2-7B-math_128/megatron_colocate/{dapo_7b_math_megatron_colocate.sh => run.sh} (100%)
 rename recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_colocate/{dapo_7b_math_fsdp2_colocate.sh => run.sh} (100%)
 rename recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/{dapo_qwen2-7B-math_28k_fsdp2_fully-async_16-16_mbs32_tfq16.sh => run.sh} (99%)
 rename recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16_stal0.1/{dapo_qwen2-7B-math_28k_fsdp2_fully-async_16-16_mbs32_tfq16.sh => run.sh} (100%)
 rename recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_24-8/{dapo_qwen2-7B-math_28k_fsdp2_fully-async_24-8_mbs32_tfq32.sh => run.sh} (99%)
 rename recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_8-24/{dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tfq11.sh => run.sh} (99%)
 rename recipe/fully_async_policy/exp/qwen2-7B-math_32/megatron_colocate/{dapo_7b_math_megatron_colocate.sh => run.sh} (100%)
 rename recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_colocate/{dapo_7b_math_fsdp2_colocate.sh => run.sh} (100%)
 rename recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_24-40/{dapo_qwen2-7B-math_28k_fsdp2_fully-async_24-40_mbs32_tfq6.sh => run.sh} (99%)
 rename recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_32-32/{dapo_qwen2-7B-math_28k_fsdp2_fully-async_32-32_mbs32_tfq8.sh => run.sh} (99%)
 rename recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_40-24/{dapo_qwen2-7B-math_28k_fsdp2_fully-async_40-24_mbs32_tfq11.sh => run.sh} (99%)
 rename recipe/fully_async_policy/exp/qwen2-7B-math_64/megatron_colocate/{dapo_7b_math_megatron_colocate.sh => run.sh} (100%)
 rename recipe/fully_async_policy/exp/qwen3-30BA3B_128/fsdp2_colocate/{test_dapo_qwen3_30b_math.sh => run.sh} (100%)
 rename recipe/fully_async_policy/exp/qwen3-30BA3B_128/megatron_colocate/{early_megatron_colocate.sh => run.sh} (100%)
 delete mode 100644 recipe/fully_async_policy/exp/qwen3-30BA3B_32/megatron_colocate/early_megatron_colocate.sh
 delete mode 100644 recipe/fully_async_policy/exp/qwen3-30BA3B_32/megatron_colocate/runtime_env.yaml
 delete mode 100644 recipe/fully_async_policy/exp/qwen3-30BA3B_64/megatron_colocate/early_megatron_colocate.sh
 delete mode 100644 recipe/fully_async_policy/exp/qwen3-30BA3B_64/megatron_colocate/runtime_env.yaml
 rename recipe/fully_async_policy/exp/qwen3-32B_128/fsdp2_colocate/{fsdp2.sh => run.sh} (100%)
 rename recipe/fully_async_policy/exp/qwen3-32B_128/megatron_colocate/{early_megatron_colocate.sh => run.sh} (100%)
 delete mode 100644 recipe/fully_async_policy/exp/qwen3-32B_32/megatron_colocate/early_megatron_colocate.sh
 delete mode 100644 recipe/fully_async_policy/exp/qwen3-32B_64/megatron_colocate/early_megatron_colocate.sh
 delete mode 100644 recipe/fully_async_policy/exp/qwen3-32B_64/megatron_colocate/runtime_env.yaml

diff --git a/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_colocate/run.sh b/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_colocate/run.sh
new file mode 100644
index 00000000000..92203a7d87a
--- /dev/null
+++ b/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_colocate/run.sh
@@ -0,0 +1,133 @@
+#!/usr/bin/env bash
+set -xeuo pipefail
+
+project_name='DAPO'
+exp_name='dapo_qwen2-32B_20k_fsdp2_colocate_128'
+
+adv_estimator=grpo
+
+use_kl_in_reward=False
+kl_coef=0.0
+use_kl_loss=False
+kl_loss_coef=0.0
+
+clip_ratio_low=0.2
+clip_ratio_high=0.28
+
+max_prompt_length=$((1024 * 2))
+max_response_length=$((1024 * 20))
+enable_overlong_buffer=True
+overlong_buffer_len=$((1024 * 4))
+overlong_penalty_factor=1.0
+
+loss_agg_mode="token-mean"
+
+train_prompt_bsz=512
+n_resp_per_prompt=16
+train_prompt_mini_bsz=32
+
+# Ray
+# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
+# WORKING_DIR=${WORKING_DIR:-"${PWD}"}
+# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
+NNODES=${NNODES:-16}
+NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
+# Paths
+RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
+# very important! please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface
+
+MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/.friday/models/Qwen2.5-32B
+CKPTS_DIR=./ckpts/${project_name}/${exp_name}
+TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet
+TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet
+# Algorithm
+temperature=1.0
+top_p=1.0
+top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
+val_top_p=0.7
+
+# Performance Related Parameter
+use_dynamic_bsz=True
+actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
+infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
+offload=True
+gen_tp=4
+sp_size=8
+fsdp_size=-1
+
+# reference run wandb: https://wandb.ai/verl-org/DAPO%20Reproduction%20on%20verl/runs/ow47vvon?nw=nwusertongyuxuan361
+
+
+python -m verl.trainer.main_ppo \
+    data.train_files="${TRAIN_FILE}" \
+    data.val_files="${TEST_FILE}" \
+    data.prompt_key=prompt \
+    data.truncation='left' \
+    data.max_prompt_length=${max_prompt_length} \
+    data.max_response_length=${max_response_length} \
+    data.train_batch_size=${train_prompt_bsz} \
+    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
+    algorithm.adv_estimator=${adv_estimator} \
+    algorithm.use_kl_in_reward=${use_kl_in_reward} \
+    algorithm.kl_ctrl.kl_coef=${kl_coef} \
+    actor_rollout_ref.actor.strategy=fsdp2 \
+    critic.strategy=fsdp2 \
+    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
+    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
+    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
+    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
+    actor_rollout_ref.actor.clip_ratio_c=10.0 \
+    actor_rollout_ref.model.use_remove_padding=True \
+    +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \
+    actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
+    actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+    actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+    actor_rollout_ref.model.path="${MODEL_PATH}" \
+    actor_rollout_ref.model.enable_gradient_checkpointing=True \
+    actor_rollout_ref.actor.optim.lr=1e-6 \
+    actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
+    actor_rollout_ref.actor.optim.weight_decay=0.1 \
+    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
+    actor_rollout_ref.actor.fsdp_config.param_offload=${offload} \
+    actor_rollout_ref.actor.fsdp_config.optimizer_offload=${offload} \
+    actor_rollout_ref.actor.entropy_coeff=0 \
+    actor_rollout_ref.actor.grad_clip=1.0 \
+    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
+    actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \
+    actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
+    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
+    actor_rollout_ref.rollout.enable_chunked_prefill=True \
+    actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
+    actor_rollout_ref.rollout.temperature=${temperature} \
+    actor_rollout_ref.rollout.top_p=${top_p} \
+    actor_rollout_ref.rollout.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
+    actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
+    actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
+    actor_rollout_ref.rollout.val_kwargs.n=1 \
+    actor_rollout_ref.ref.fsdp_config.param_offload=${offload} \
+    actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \
+    actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \
+    reward_model.reward_manager=dapo \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
+    +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
+    trainer.logger=['console','tensorboard'] \
+    trainer.project_name="${project_name}" \
+    trainer.experiment_name="${exp_name}" \
+    trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
+    trainer.nnodes="${NNODES}" \
+    trainer.val_before_train=True \
+    trainer.test_freq=20 \
+    trainer.save_freq=-1 \
+    trainer.total_epochs=10 \
+    trainer.total_training_steps=400 \
+    trainer.default_local_dir="${CKPTS_DIR}" \
+    trainer.resume_mode=auto \
+    trainer.log_val_generations=10
diff --git a/recipe/fully_async_policy/exp/qwen3-32B_32/megatron_colocate/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_colocate/runtime_env.yaml
similarity index 50%
rename from recipe/fully_async_policy/exp/qwen3-32B_32/megatron_colocate/runtime_env.yaml
rename to recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_colocate/runtime_env.yaml
index 2d0930d13ab..e33cfd681ca 100644
--- a/recipe/fully_async_policy/exp/qwen3-32B_32/megatron_colocate/runtime_env.yaml
+++ b/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_colocate/runtime_env.yaml
@@ -1,5 +1,5 @@
 env_vars:
-  TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen3-32B-32/dapo_qwen3-32B_32k_megatron_colocate_32_mbs32"
+  TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-32B-128/dapo_qwen2-32B_20k_fsdp2_colocate_128"
   HYDRA_FULL_ERROR: "1"
   TORCH_NCCL_AVOID_RECORD_STREAMS: "1"
-  CUDA_DEVICE_MAX_CONNECTIONS: "1"
\ No newline at end of file
+  VLLM_USE_V1: "1"
\ No newline at end of file
diff --git a/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_64-64/run.sh b/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_64-64/run.sh
new file mode 100644
index 00000000000..270533a84c4
--- /dev/null
+++ b/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_64-64/run.sh
@@ -0,0 +1,153 @@
+#!/usr/bin/env bash
+set -xeuo pipefail
+
+project_name='DAPO'
+exp_name='dapo_qwen2-32B_20k_fsdp2_fully-async_64-64'
+
+# Paths
+MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/.friday/models/Qwen2.5-32B
+CKPTS_DIR=./ckpts/${project_name}/${exp_name}
+TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet
+TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet
+
+rollout_mode="async"
+rollout_name="vllm" # sglang or vllm
+if [ "$rollout_mode" = "async" ]; then
+    export VLLM_USE_V1=1
+    return_raw_chat="True"
+fi
+
+# Algorithm parameters
+adv_estimator=grpo
+
+use_kl_in_reward=False
+kl_coef=0.0
+use_kl_loss=False
+kl_loss_coef=0.0
+
+clip_ratio_low=0.2
+clip_ratio_high=0.28
+
+# Response length parameters
+max_prompt_length=$((1024 * 2))
+max_response_length=$((1024 * 20))
+enable_overlong_buffer=True
+overlong_buffer_len=$((1024 * 4))
+overlong_penalty_factor=1.0
+
+# Training parameters
+loss_agg_mode="token-mean"
+
+# Algorithm
+temperature=1.0
+top_p=1.0
+top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
+val_top_p=0.7
+
+# Performance Related Parameter
+use_dynamic_bsz=True
+actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
+infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
+ref_offload=True
+actor_offload=False
+gen_tp=4
+sp_size=8
+fsdp_size=-1
+
+# Fully async specific parameters
+NNODES_ROLLOUT=${NNODES_ROLLOUT:-8}
+NNODES_TRAIN=${NNODES_TRAIN:-8}
+NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
+
+train_prompt_bsz=0
+gen_prompt_bsz=1
+n_resp_per_prompt=16
+train_prompt_mini_bsz=32
+total_rollout_steps=$(((512*400)))
+test_freq=20
+staleness_threshold=0.1
+trigger_parameter_sync_step=4
+partial_rollout=True
+
+python -m recipe.fully_async_policy.fully_async_main \
+    data.train_files="${TRAIN_FILE}" \
+    data.val_files="${TEST_FILE}" \
+    data.prompt_key=prompt \
+    data.truncation='left' \
+    data.max_prompt_length=${max_prompt_length} \
+    data.max_response_length=${max_response_length} \
+    data.train_batch_size=${train_prompt_bsz} \
+    data.gen_batch_size=${gen_prompt_bsz} \
+    data.return_raw_chat=${return_raw_chat} \
+    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
+    algorithm.adv_estimator=${adv_estimator} \
+    algorithm.use_kl_in_reward=${use_kl_in_reward} \
+    algorithm.kl_ctrl.kl_coef=${kl_coef} \
+    actor_rollout_ref.actor.strategy=fsdp2 \
+    critic.strategy=fsdp2 \
+    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
+    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
+    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
+    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
+    actor_rollout_ref.actor.clip_ratio_c=10.0 \
+    actor_rollout_ref.model.use_remove_padding=True \
+    actor_rollout_ref.hybrid_engine=False \
+    +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \
+    actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
+    actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+    actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+    actor_rollout_ref.model.path="${MODEL_PATH}" \
+    actor_rollout_ref.actor.optim.lr=1e-6 \
+    actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
+    actor_rollout_ref.actor.optim.weight_decay=0.1 \
+    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
+    actor_rollout_ref.actor.fsdp_config.param_offload=${actor_offload} \
+    actor_rollout_ref.actor.fsdp_config.optimizer_offload=${actor_offload} \
+    actor_rollout_ref.actor.entropy_coeff=0 \
+    actor_rollout_ref.actor.grad_clip=1.0 \
+    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
+    actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \
+    actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
+    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
+    actor_rollout_ref.rollout.enable_chunked_prefill=True \
+    actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
+    actor_rollout_ref.rollout.temperature=${temperature} \
+    actor_rollout_ref.rollout.top_p=${top_p} \
+    actor_rollout_ref.rollout.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
+    actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
+    actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
+    actor_rollout_ref.rollout.val_kwargs.n=1 \
+    actor_rollout_ref.ref.fsdp_config.param_offload=${ref_offload} \
+    actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \
+    actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \
+    actor_rollout_ref.rollout.name=${rollout_name} \
+    actor_rollout_ref.rollout.mode=${rollout_mode} \
+    actor_rollout_ref.rollout.calculate_log_probs=True \
+    reward_model.reward_manager=dapo \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
+    +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
+    trainer.logger=['console','tensorboard'] \
+    trainer.project_name="${project_name}" \
+    trainer.experiment_name="${exp_name}" \
+    trainer.val_before_train=True \
+    trainer.save_freq=-1 \
+    trainer.default_local_dir="${CKPTS_DIR}" \
+    trainer.resume_mode=auto \
+    trainer.nnodes="${NNODES_TRAIN}" \
+    trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
+    rollout.nnodes="${NNODES_ROLLOUT}" \
+    rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \
+    rollout.total_rollout_steps="${total_rollout_steps}" \
+    rollout.total_epochs=10 \
+    rollout.test_freq="${test_freq}" \
+    async_training.staleness_threshold="${staleness_threshold}" \
+    async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \
+    async_training.partial_rollout="${partial_rollout}"
diff --git a/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_64-64/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_64-64/runtime_env.yaml
new file mode 100644
index 00000000000..77590fb2709
--- /dev/null
+++ b/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_64-64/runtime_env.yaml
@@ -0,0 +1,4 @@
+env_vars:
+  VLLM_USE_V1: "1"
+  TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-32B-128/dapo_qwen2-32B_20k_fsdp2_fully-async_64-64"
+  HYDRA_FULL_ERROR: "1"
\ No newline at end of file
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_colocate/dapo_7b_math_fsdp2_colocate.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_colocate/run.sh
similarity index 100%
rename from recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_colocate/dapo_7b_math_fsdp2_colocate.sh
rename to recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_colocate/run.sh
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64/dapo_qwen2-7B-math_28k_fsdp2_fully-async_64-64_mbs32_tfq4.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64/run.sh
similarity index 99%
rename from recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64/dapo_qwen2-7B-math_28k_fsdp2_fully-async_64-64_mbs32_tfq4.sh
rename to recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64/run.sh
index 9f410f95c6c..03ebab25cea 100644
--- a/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64/dapo_qwen2-7B-math_28k_fsdp2_fully-async_64-64_mbs32_tfq4.sh
+++ b/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64/run.sh
@@ -80,7 +80,7 @@ n_resp_per_prompt=16
 train_prompt_mini_bsz=32
 total_rollout_steps=$(((512*400)))
 test_freq=20
-staleness_threshold=1
+staleness_threshold=0.1
 trigger_parameter_sync_step=4
 partial_rollout=True
 
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_128/megatron_colocate/dapo_7b_math_megatron_colocate.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_128/megatron_colocate/run.sh
similarity index 100%
rename from recipe/fully_async_policy/exp/qwen2-7B-math_128/megatron_colocate/dapo_7b_math_megatron_colocate.sh
rename to recipe/fully_async_policy/exp/qwen2-7B-math_128/megatron_colocate/run.sh
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_colocate/dapo_7b_math_fsdp2_colocate.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_colocate/run.sh
similarity index 100%
rename from recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_colocate/dapo_7b_math_fsdp2_colocate.sh
rename to recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_colocate/run.sh
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/dapo_qwen2-7B-math_28k_fsdp2_fully-async_16-16_mbs32_tfq16.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/run.sh
similarity index 99%
rename from recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/dapo_qwen2-7B-math_28k_fsdp2_fully-async_16-16_mbs32_tfq16.sh
rename to recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/run.sh
index fcc5f472d8c..cdefd5a4b57 100644
--- a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/dapo_qwen2-7B-math_28k_fsdp2_fully-async_16-16_mbs32_tfq16.sh
+++ b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/run.sh
@@ -80,7 +80,7 @@ n_resp_per_prompt=16
 train_prompt_mini_bsz=32
 total_rollout_steps=$(((512*400)))
 test_freq=20
-staleness_threshold=1
+staleness_threshold=0.1
 trigger_parameter_sync_step=16
 partial_rollout=True
 
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16_stal0.1/dapo_qwen2-7B-math_28k_fsdp2_fully-async_16-16_mbs32_tfq16.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16_stal0.1/run.sh
similarity index 100%
rename from recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16_stal0.1/dapo_qwen2-7B-math_28k_fsdp2_fully-async_16-16_mbs32_tfq16.sh
rename to recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16_stal0.1/run.sh
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_24-8/dapo_qwen2-7B-math_28k_fsdp2_fully-async_24-8_mbs32_tfq32.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_24-8/run.sh
similarity index 99%
rename from recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_24-8/dapo_qwen2-7B-math_28k_fsdp2_fully-async_24-8_mbs32_tfq32.sh
rename to recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_24-8/run.sh
index 6c6cb13cf45..3de9279a9bc 100644
--- a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_24-8/dapo_qwen2-7B-math_28k_fsdp2_fully-async_24-8_mbs32_tfq32.sh
+++ b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_24-8/run.sh
@@ -80,7 +80,7 @@ n_resp_per_prompt=16
 train_prompt_mini_bsz=32
 total_rollout_steps=$(((512*400)))
 test_freq=20
-staleness_threshold=1
+staleness_threshold=0.1
 trigger_parameter_sync_step=32
 partial_rollout=True
 
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_8-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tfq11.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_8-24/run.sh
similarity index 99%
rename from recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_8-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tfq11.sh
rename to recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_8-24/run.sh
index 9add4e0e8bb..4ba49146329 100644
--- a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_8-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tfq11.sh
+++ b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_8-24/run.sh
@@ -80,7 +80,7 @@ n_resp_per_prompt=16
 train_prompt_mini_bsz=32
 total_rollout_steps=$(((512*400)))
 test_freq=20
-staleness_threshold=1
+staleness_threshold=0.1
 trigger_parameter_sync_step=11
 partial_rollout=True
 
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_32/megatron_colocate/dapo_7b_math_megatron_colocate.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_32/megatron_colocate/run.sh
similarity index 100%
rename from recipe/fully_async_policy/exp/qwen2-7B-math_32/megatron_colocate/dapo_7b_math_megatron_colocate.sh
rename to recipe/fully_async_policy/exp/qwen2-7B-math_32/megatron_colocate/run.sh
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_colocate/dapo_7b_math_fsdp2_colocate.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_colocate/run.sh
similarity index 100%
rename from recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_colocate/dapo_7b_math_fsdp2_colocate.sh
rename to recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_colocate/run.sh
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_24-40/dapo_qwen2-7B-math_28k_fsdp2_fully-async_24-40_mbs32_tfq6.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_24-40/run.sh
similarity index 99%
rename from recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_24-40/dapo_qwen2-7B-math_28k_fsdp2_fully-async_24-40_mbs32_tfq6.sh
rename to recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_24-40/run.sh
index 5da2116ef80..3d56ea8b403 100644
--- a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_24-40/dapo_qwen2-7B-math_28k_fsdp2_fully-async_24-40_mbs32_tfq6.sh
+++ b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_24-40/run.sh
@@ -80,7 +80,7 @@ n_resp_per_prompt=16
 train_prompt_mini_bsz=32
 total_rollout_steps=$(((512*400)))
 test_freq=20
-staleness_threshold=1
+staleness_threshold=0.1
 trigger_parameter_sync_step=6
 partial_rollout=True
 
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_32-32/dapo_qwen2-7B-math_28k_fsdp2_fully-async_32-32_mbs32_tfq8.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_32-32/run.sh
similarity index 99%
rename from recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_32-32/dapo_qwen2-7B-math_28k_fsdp2_fully-async_32-32_mbs32_tfq8.sh
rename to recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_32-32/run.sh
index 221d3c4d5a6..cc26be4f100 100644
--- a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_32-32/dapo_qwen2-7B-math_28k_fsdp2_fully-async_32-32_mbs32_tfq8.sh
+++ b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_32-32/run.sh
@@ -80,7 +80,7 @@ n_resp_per_prompt=16
 train_prompt_mini_bsz=32
 total_rollout_steps=$(((512*400)))
 test_freq=20
-staleness_threshold=1
+staleness_threshold=0.1
 trigger_parameter_sync_step=8
 partial_rollout=True
 
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_40-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_40-24_mbs32_tfq11.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_40-24/run.sh
similarity index 99%
rename from recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_40-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_40-24_mbs32_tfq11.sh
rename to recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_40-24/run.sh
index a15cf990bd1..0a67a563819 100644
--- a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_40-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_40-24_mbs32_tfq11.sh
+++ b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_40-24/run.sh
@@ -80,7 +80,7 @@ n_resp_per_prompt=16
 train_prompt_mini_bsz=32
 total_rollout_steps=$(((512*400)))
 test_freq=20
-staleness_threshold=1
+staleness_threshold=0.1
 trigger_parameter_sync_step=11
 partial_rollout=True
 
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_64/megatron_colocate/dapo_7b_math_megatron_colocate.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_64/megatron_colocate/run.sh
similarity index 100%
rename from recipe/fully_async_policy/exp/qwen2-7B-math_64/megatron_colocate/dapo_7b_math_megatron_colocate.sh
rename to recipe/fully_async_policy/exp/qwen2-7B-math_64/megatron_colocate/run.sh
diff --git a/recipe/fully_async_policy/exp/qwen3-30BA3B_128/fsdp2_colocate/test_dapo_qwen3_30b_math.sh b/recipe/fully_async_policy/exp/qwen3-30BA3B_128/fsdp2_colocate/run.sh
similarity index 100%
rename from recipe/fully_async_policy/exp/qwen3-30BA3B_128/fsdp2_colocate/test_dapo_qwen3_30b_math.sh
rename to recipe/fully_async_policy/exp/qwen3-30BA3B_128/fsdp2_colocate/run.sh
diff --git a/recipe/fully_async_policy/exp/qwen3-30BA3B_128/megatron_colocate/early_megatron_colocate.sh b/recipe/fully_async_policy/exp/qwen3-30BA3B_128/megatron_colocate/run.sh
similarity index 100%
rename from recipe/fully_async_policy/exp/qwen3-30BA3B_128/megatron_colocate/early_megatron_colocate.sh
rename to recipe/fully_async_policy/exp/qwen3-30BA3B_128/megatron_colocate/run.sh
diff --git a/recipe/fully_async_policy/exp/qwen3-30BA3B_32/megatron_colocate/early_megatron_colocate.sh b/recipe/fully_async_policy/exp/qwen3-30BA3B_32/megatron_colocate/early_megatron_colocate.sh
deleted file mode 100644
index b2d735f8704..00000000000
--- a/recipe/fully_async_policy/exp/qwen3-30BA3B_32/megatron_colocate/early_megatron_colocate.sh
+++ /dev/null
@@ -1,161 +0,0 @@
-#!/usr/bin/env bash
-set -xeuo pipefail
-
-project_name='DAPO'
-exp_name='dapo_qwen3-30BA3B_32k_megatron_colocate_32_mbs32'
-
-adv_estimator=grpo
-
-use_kl_in_reward=False
-kl_coef=0.0
-use_kl_loss=False
-kl_loss_coef=0.0
-
-clip_ratio_low=0.2
-clip_ratio_high=0.28
-
-max_prompt_length=$((1024 * 2))
-max_response_length=$((1024 * 32))
-enable_overlong_buffer=True
-overlong_buffer_len=$((1024 * 4))
-overlong_penalty_factor=1.0
-
-loss_agg_mode="token-mean"
-
-train_prompt_bsz=512
-train_prompt_mini_bsz=32
-n_resp_per_prompt=16
-
-NNODES=${NNODES:-4}
-NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
-# Paths
-RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
-MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/.friday/models/basemodel/Qwen/Qwen3-30B-A3B
-CKPTS_DIR=./ckpts/${project_name}/${exp_name}
-TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet
-TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet
-
-# Algorithm
-temperature=1.0
-top_p=1.0
-top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
-val_top_p=0.7
-
-
-# Performance Related Parameter
-use_dynamic_bsz=True
-actor_ppo_max_token_len=$((max_prompt_length + max_response_length))
-infer_ppo_max_token_len=$((max_prompt_length + max_response_length))
-offload=True
-gen_tp=4
-train_tp=1
-train_pp=1
-EP=8
-ETP=1
-CP=1
-
-python3 -m verl.trainer.main_ppo \
-    --config-path=config \
-    --config-name='ppo_megatron_trainer.yaml' \
-    data.train_files="${TRAIN_FILE}" \
-    data.val_files="${TEST_FILE}" \
-    data.prompt_key=prompt \
-    data.truncation='left' \
-    data.max_prompt_length=${max_prompt_length} \
-    data.max_response_length=${max_response_length} \
-    data.train_batch_size=${train_prompt_bsz} \
-    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
-    algorithm.adv_estimator=${adv_estimator} \
-    algorithm.use_kl_in_reward=${use_kl_in_reward} \
-    algorithm.kl_ctrl.kl_coef=${kl_coef} \
-    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
-    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
-    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
-    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
-    actor_rollout_ref.actor.clip_ratio_c=10.0 \
-    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \
-    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
-    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
-    actor_rollout_ref.model.path="${MODEL_PATH}" \
-    actor_rollout_ref.actor.optim.lr=1e-6 \
-    actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
-    actor_rollout_ref.actor.optim.weight_decay=0.1 \
-    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
-    actor_rollout_ref.actor.entropy_coeff=0 \
-    actor_rollout_ref.actor.optim.clip_grad=1.0 \
-    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
-    actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
-    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
-    actor_rollout_ref.rollout.enable_chunked_prefill=True \
-    actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
-    actor_rollout_ref.rollout.temperature=${temperature} \
-    actor_rollout_ref.rollout.top_p=${top_p} \
-    actor_rollout_ref.rollout.top_k=${top_k} \
-    actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
-    actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
-    actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
-    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
-    actor_rollout_ref.rollout.val_kwargs.n=1 \
-    reward_model.reward_manager=dapo \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
-    +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
-    trainer.logger='["console","tensorboard"]' \
-    trainer.project_name="${project_name}" \
-    trainer.experiment_name="${exp_name}" \
-    trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
-    trainer.nnodes="${NNODES}" \
-    trainer.val_before_train=True \
-    trainer.test_freq=20 \
-    trainer.save_freq=-1 \
-    trainer.total_epochs=10 \
-    trainer.total_training_steps=400 \
-    trainer.default_local_dir="${CKPTS_DIR}" \
-    trainer.resume_mode=auto \
-    trainer.log_val_generations=10 \
-    critic.strategy=megatron \
-    actor_rollout_ref.actor.strategy=megatron \
-    actor_rollout_ref.actor.megatron.param_offload=${offload} \
-    actor_rollout_ref.actor.megatron.optimizer_offload=${offload} \
-    actor_rollout_ref.actor.megatron.grad_offload=${offload} \
-    actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${train_pp} \
-    actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${train_tp} \
-    actor_rollout_ref.actor.megatron.expert_model_parallel_size=${EP} \
-    actor_rollout_ref.actor.megatron.expert_tensor_parallel_size=${ETP} \
-    actor_rollout_ref.actor.megatron.context_parallel_size=${CP} \
-    +actor_rollout_ref.actor.megatron.override_transformer_config.apply_rope_fusion=True \
-    +actor_rollout_ref.actor.megatron.override_transformer_config.moe_router_dtype=fp32 \
-    +actor_rollout_ref.actor.megatron.override_transformer_config.moe_shared_expert_overlap=False \
-    +actor_rollout_ref.actor.megatron.override_transformer_config.moe_enable_deepep=True \
-    +actor_rollout_ref.actor.megatron.override_transformer_config.moe_token_dispatcher_type=flex \
-    +actor_rollout_ref.actor.megatron.override_transformer_config.moe_permute_fusion=True \
-    +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=selective \
-    +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_modules=["core_attn","moe_act","layernorm","mlp","moe"] \
-    +actor_rollout_ref.actor.megatron.override_transformer_config.gradient_accumulation_fusion=True \
-    +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_embedding_in_pipeline_split=False \
-    +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_loss_in_pipeline_split=False \
-    actor_rollout_ref.ref.megatron.param_offload=${offload} \
-    actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=${train_pp} \
-    actor_rollout_ref.ref.megatron.tensor_model_parallel_size=${train_tp} \
-    actor_rollout_ref.ref.megatron.expert_model_parallel_size=${EP} \
-    actor_rollout_ref.ref.megatron.expert_tensor_parallel_size=${ETP} \
-    actor_rollout_ref.ref.megatron.context_parallel_size=${CP} \
-    actor_rollout_ref.actor.megatron.use_mbridge=True
-
-    # +actor_rollout_ref.actor.megatron.override_transformer_config.apply_rope_fusion=False \
-    # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_router_dtype=fp32 \
-    # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_shared_expert_overlap=False \
-    # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_enable_deepep=False \
-    # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_token_dispatcher_type=alltoall \
-    # +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_method=uniform \
-    # +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=full \
-    # actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity="selective" \
-    # actor_rollout_ref.actor.megatron.override_transformer_config.recompute_modules=["core_attn","moe_act","layernorm","mlp","moe"] \
-    # +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_num_layers=1 \
-    # +actor_rollout_ref.actor.megatron.override_transformer_config.gradient_accumulation_fusion=True \
-    # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_permute_fusion=True \
-    # +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_embedding_in_pipeline_split=False \
-    # +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_loss_in_pipeline_split=False \
-    # +actor_rollout_ref.actor.megatron.override_transformer_config.num_layers_in_last_pipeline_stage=${last_layer} \
\ No newline at end of file
diff --git a/recipe/fully_async_policy/exp/qwen3-30BA3B_32/megatron_colocate/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen3-30BA3B_32/megatron_colocate/runtime_env.yaml
deleted file mode 100644
index 052557120ad..00000000000
--- a/recipe/fully_async_policy/exp/qwen3-30BA3B_32/megatron_colocate/runtime_env.yaml
+++ /dev/null
@@ -1,5 +0,0 @@
-env_vars:
-  TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen3-30BA3B-32/dapo_qwen3-30BA3B_32k_megatron_colocate_32_mbs32"
-  HYDRA_FULL_ERROR: "1"
-  TORCH_NCCL_AVOID_RECORD_STREAMS: "1"
-  CUDA_DEVICE_MAX_CONNECTIONS: "1"
\ No newline at end of file
diff --git a/recipe/fully_async_policy/exp/qwen3-30BA3B_64/megatron_colocate/early_megatron_colocate.sh b/recipe/fully_async_policy/exp/qwen3-30BA3B_64/megatron_colocate/early_megatron_colocate.sh
deleted file mode 100644
index 336d105cc5c..00000000000
--- a/recipe/fully_async_policy/exp/qwen3-30BA3B_64/megatron_colocate/early_megatron_colocate.sh
+++ /dev/null
@@ -1,161 +0,0 @@
-#!/usr/bin/env bash
-set -xeuo pipefail
-
-project_name='DAPO'
-exp_name='dapo_qwen3-30BA3B_32k_megatron_colocate_64_mbs32'
-
-adv_estimator=grpo
-
-use_kl_in_reward=False
-kl_coef=0.0
-use_kl_loss=False
-kl_loss_coef=0.0
-
-clip_ratio_low=0.2
-clip_ratio_high=0.28
-
-max_prompt_length=$((1024 * 2))
-max_response_length=$((1024 * 32))
-enable_overlong_buffer=True
-overlong_buffer_len=$((1024 * 4))
-overlong_penalty_factor=1.0
-
-loss_agg_mode="token-mean"
-
-train_prompt_bsz=512
-train_prompt_mini_bsz=32
-n_resp_per_prompt=16
-
-NNODES=${NNODES:-8}
-NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
-# Paths
-RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
-MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/.friday/models/basemodel/Qwen/Qwen3-30B-A3B
-CKPTS_DIR=./ckpts/${project_name}/${exp_name}
-TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet
-TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet
-
-# Algorithm
-temperature=1.0
-top_p=1.0
-top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
-val_top_p=0.7
-
-
-# Performance Related Parameter
-use_dynamic_bsz=True
-actor_ppo_max_token_len=$((max_prompt_length + max_response_length))
-infer_ppo_max_token_len=$((max_prompt_length + max_response_length))
-offload=True
-gen_tp=4
-train_tp=1
-train_pp=1
-EP=8
-ETP=1
-CP=1
-
-python3 -m verl.trainer.main_ppo \
-    --config-path=config \
-    --config-name='ppo_megatron_trainer.yaml' \
-    data.train_files="${TRAIN_FILE}" \
-    data.val_files="${TEST_FILE}" \
-    data.prompt_key=prompt \
-    data.truncation='left' \
-    data.max_prompt_length=${max_prompt_length} \
-    data.max_response_length=${max_response_length} \
-    data.train_batch_size=${train_prompt_bsz} \
-    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
-    algorithm.adv_estimator=${adv_estimator} \
-    algorithm.use_kl_in_reward=${use_kl_in_reward} \
-    algorithm.kl_ctrl.kl_coef=${kl_coef} \
-    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
-    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
-    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
-    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
-    actor_rollout_ref.actor.clip_ratio_c=10.0 \
-    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \
-    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
-    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
-    actor_rollout_ref.model.path="${MODEL_PATH}" \
-    actor_rollout_ref.actor.optim.lr=1e-6 \
-    actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
-    actor_rollout_ref.actor.optim.weight_decay=0.1 \
-    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
-    actor_rollout_ref.actor.entropy_coeff=0 \
-    actor_rollout_ref.actor.optim.clip_grad=1.0 \
-    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
-    actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
-    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
-    actor_rollout_ref.rollout.enable_chunked_prefill=True \
-    actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
-    actor_rollout_ref.rollout.temperature=${temperature} \
-    actor_rollout_ref.rollout.top_p=${top_p} \
-    actor_rollout_ref.rollout.top_k=${top_k} \
-    actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
-    actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
-    actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
-    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
-    actor_rollout_ref.rollout.val_kwargs.n=1 \
-    reward_model.reward_manager=dapo \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
-    +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
-    trainer.logger='["console","tensorboard"]' \
-    trainer.project_name="${project_name}" \
-    trainer.experiment_name="${exp_name}" \
-    trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
-    trainer.nnodes="${NNODES}" \
-    trainer.val_before_train=True \
-    trainer.test_freq=20 \
-    trainer.save_freq=-1 \
-    trainer.total_epochs=10 \
-    trainer.total_training_steps=400 \
-    trainer.default_local_dir="${CKPTS_DIR}" \
-    trainer.resume_mode=auto \
-    trainer.log_val_generations=10 \
-    critic.strategy=megatron \
-    actor_rollout_ref.actor.strategy=megatron \
-    actor_rollout_ref.actor.megatron.param_offload=${offload} \
-    actor_rollout_ref.actor.megatron.optimizer_offload=${offload} \
-    actor_rollout_ref.actor.megatron.grad_offload=${offload} \
-    actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${train_pp} \
-    actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${train_tp} \
-    actor_rollout_ref.actor.megatron.expert_model_parallel_size=${EP} \
-    actor_rollout_ref.actor.megatron.expert_tensor_parallel_size=${ETP} \
-    actor_rollout_ref.actor.megatron.context_parallel_size=${CP} \
-    +actor_rollout_ref.actor.megatron.override_transformer_config.apply_rope_fusion=True \
-    +actor_rollout_ref.actor.megatron.override_transformer_config.moe_router_dtype=fp32 \
-    +actor_rollout_ref.actor.megatron.override_transformer_config.moe_shared_expert_overlap=False \
-    +actor_rollout_ref.actor.megatron.override_transformer_config.moe_enable_deepep=True \
-    +actor_rollout_ref.actor.megatron.override_transformer_config.moe_token_dispatcher_type=flex \
-    +actor_rollout_ref.actor.megatron.override_transformer_config.moe_permute_fusion=True \
-    +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=selective \
-    +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_modules=["core_attn","moe_act","layernorm","mlp","moe"] \
-    +actor_rollout_ref.actor.megatron.override_transformer_config.gradient_accumulation_fusion=True \
-    +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_embedding_in_pipeline_split=False \
-    +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_loss_in_pipeline_split=False \
-    actor_rollout_ref.ref.megatron.param_offload=${offload} \
-    actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=${train_pp} \
-    actor_rollout_ref.ref.megatron.tensor_model_parallel_size=${train_tp} \
-    actor_rollout_ref.ref.megatron.expert_model_parallel_size=${EP} \
-    actor_rollout_ref.ref.megatron.expert_tensor_parallel_size=${ETP} \
-    actor_rollout_ref.ref.megatron.context_parallel_size=${CP} \
-    actor_rollout_ref.actor.megatron.use_mbridge=True
-
-    # +actor_rollout_ref.actor.megatron.override_transformer_config.apply_rope_fusion=False \
-    # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_router_dtype=fp32 \
-    # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_shared_expert_overlap=False \
-    # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_enable_deepep=False \
-    # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_token_dispatcher_type=alltoall \
-    # +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_method=uniform \
-    # +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=full \
-    # actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity="selective" \
-    # actor_rollout_ref.actor.megatron.override_transformer_config.recompute_modules=["core_attn","moe_act","layernorm","mlp","moe"] \
-    # +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_num_layers=1 \
-    # +actor_rollout_ref.actor.megatron.override_transformer_config.gradient_accumulation_fusion=True \
-    # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_permute_fusion=True \
-    # +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_embedding_in_pipeline_split=False \
-    # +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_loss_in_pipeline_split=False \
-    # +actor_rollout_ref.actor.megatron.override_transformer_config.num_layers_in_last_pipeline_stage=${last_layer} \
\ No newline at end of file
diff --git a/recipe/fully_async_policy/exp/qwen3-30BA3B_64/megatron_colocate/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen3-30BA3B_64/megatron_colocate/runtime_env.yaml
deleted file mode 100644
index 3a497e90dd0..00000000000
--- a/recipe/fully_async_policy/exp/qwen3-30BA3B_64/megatron_colocate/runtime_env.yaml
+++ /dev/null
@@ -1,5 +0,0 @@
-env_vars:
-  TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen3-30BA3B-64/dapo_qwen3-30BA3B_32k_megatron_colocate_64_mbs32"
-  HYDRA_FULL_ERROR: "1"
-  TORCH_NCCL_AVOID_RECORD_STREAMS: "1"
-  CUDA_DEVICE_MAX_CONNECTIONS: "1"
\ No newline at end of file
diff --git a/recipe/fully_async_policy/exp/qwen3-32B_128/fsdp2_colocate/fsdp2.sh b/recipe/fully_async_policy/exp/qwen3-32B_128/fsdp2_colocate/run.sh
similarity index 100%
rename from recipe/fully_async_policy/exp/qwen3-32B_128/fsdp2_colocate/fsdp2.sh
rename to recipe/fully_async_policy/exp/qwen3-32B_128/fsdp2_colocate/run.sh
diff --git a/recipe/fully_async_policy/exp/qwen3-32B_128/megatron_colocate/early_megatron_colocate.sh b/recipe/fully_async_policy/exp/qwen3-32B_128/megatron_colocate/run.sh
similarity index 100%
rename from recipe/fully_async_policy/exp/qwen3-32B_128/megatron_colocate/early_megatron_colocate.sh
rename to recipe/fully_async_policy/exp/qwen3-32B_128/megatron_colocate/run.sh
diff --git a/recipe/fully_async_policy/exp/qwen3-32B_32/megatron_colocate/early_megatron_colocate.sh b/recipe/fully_async_policy/exp/qwen3-32B_32/megatron_colocate/early_megatron_colocate.sh
deleted file mode 100644
index 085c7231c59..00000000000
--- a/recipe/fully_async_policy/exp/qwen3-32B_32/megatron_colocate/early_megatron_colocate.sh
+++ /dev/null
@@ -1,156 +0,0 @@
-#!/usr/bin/env bash
-set -xeuo pipefail
-
-project_name='DAPO'
-exp_name='dapo_qwen3-32B_32k_megatron_colocate_32_mbs32'
-
-adv_estimator=grpo
-
-use_kl_in_reward=False
-kl_coef=0.0
-use_kl_loss=False
-kl_loss_coef=0.0
-
-clip_ratio_low=0.2
-clip_ratio_high=0.28
-
-max_prompt_length=$((1024 * 2))
-max_response_length=$((1024 * 32))
-enable_overlong_buffer=True
-overlong_buffer_len=$((1024 * 4))
-overlong_penalty_factor=1.0
-
-loss_agg_mode="token-mean"
-
-train_prompt_bsz=512
-train_prompt_mini_bsz=32
-n_resp_per_prompt=16
-
-NNODES=${NNODES:-4}
-NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
-# Paths
-RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
-MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/.friday/models/basemodel/Qwen/Qwen3-32B
-CKPTS_DIR=./ckpts/${project_name}/${exp_name}
-TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet
-TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet
-
-# Algorithm
-temperature=1.0
-top_p=1.0
-top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
-val_top_p=0.7
-
-
-# Performance Related Parameter
-use_dynamic_bsz=True
-actor_ppo_max_token_len=$((max_prompt_length + max_response_length))
-infer_ppo_max_token_len=$((max_prompt_length + max_response_length))
-offload=True
-gen_tp=4
-train_tp=4
-train_pp=2
-EP=1
-ETP=1
-CP=1
-
-python3 -m verl.trainer.main_ppo \
-    --config-path=config \
-    --config-name='ppo_megatron_trainer.yaml' \
-    data.train_files="${TRAIN_FILE}" \
-    data.val_files="${TEST_FILE}" \
-    data.prompt_key=prompt \
-    data.truncation='left' \
-    data.max_prompt_length=${max_prompt_length} \
-    data.max_response_length=${max_response_length} \
-    data.train_batch_size=${train_prompt_bsz} \
-    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
-    algorithm.adv_estimator=${adv_estimator} \
-    algorithm.use_kl_in_reward=${use_kl_in_reward} \
-    algorithm.kl_ctrl.kl_coef=${kl_coef} \
-    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
-    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
-    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
-    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
-    actor_rollout_ref.actor.clip_ratio_c=10.0 \
-    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \
-    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
-    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
-    actor_rollout_ref.model.path="${MODEL_PATH}" \
-    actor_rollout_ref.actor.optim.lr=1e-6 \
-    actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
-    actor_rollout_ref.actor.optim.weight_decay=0.1 \
-    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
-    actor_rollout_ref.actor.entropy_coeff=0 \
-    actor_rollout_ref.actor.optim.clip_grad=1.0 \
-    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
-    actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
-    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
-    actor_rollout_ref.rollout.enable_chunked_prefill=True \
-    actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
-    actor_rollout_ref.rollout.temperature=${temperature} \
-    actor_rollout_ref.rollout.top_p=${top_p} \
-    actor_rollout_ref.rollout.top_k=${top_k} \
-    actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
-    actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
-    actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
-    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
-    actor_rollout_ref.rollout.val_kwargs.n=1 \
-    reward_model.reward_manager=dapo \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
-    +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
-    trainer.logger='["console","tensorboard"]' \
-    trainer.project_name="${project_name}" \
-    trainer.experiment_name="${exp_name}" \
-    trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
-    trainer.nnodes="${NNODES}" \
-    trainer.val_before_train=True \
-    trainer.test_freq=20 \
-    trainer.save_freq=-1 \
-    trainer.total_epochs=10 \
-    trainer.total_training_steps=400 \
-    trainer.default_local_dir="${CKPTS_DIR}" \
-    trainer.resume_mode=auto \
-    trainer.log_val_generations=10 \
-    critic.strategy=megatron \
-    actor_rollout_ref.actor.strategy=megatron \
-    actor_rollout_ref.actor.megatron.param_offload=${offload} \
-    actor_rollout_ref.actor.megatron.optimizer_offload=${offload} \
-    actor_rollout_ref.actor.megatron.grad_offload=${offload} \
-    actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${train_pp} \
-    actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${train_tp} \
-    actor_rollout_ref.actor.megatron.expert_model_parallel_size=${EP} \
-    actor_rollout_ref.actor.megatron.expert_tensor_parallel_size=${ETP} \
-    actor_rollout_ref.actor.megatron.context_parallel_size=${CP} \
-    +actor_rollout_ref.actor.megatron.override_transformer_config.apply_rope_fusion=True \
-    +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=selective \
-    +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_modules=["core_attn","layernorm","mlp"] \
-    +actor_rollout_ref.actor.megatron.override_transformer_config.gradient_accumulation_fusion=True \
-    +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_embedding_in_pipeline_split=False \
-    +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_loss_in_pipeline_split=False \
-    actor_rollout_ref.ref.megatron.param_offload=${offload} \
-    actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=${train_pp} \
-    actor_rollout_ref.ref.megatron.tensor_model_parallel_size=${train_tp} \
-    actor_rollout_ref.ref.megatron.expert_model_parallel_size=${EP} \
-    actor_rollout_ref.ref.megatron.expert_tensor_parallel_size=${ETP} \
-    actor_rollout_ref.ref.megatron.context_parallel_size=${CP} \
-    actor_rollout_ref.actor.megatron.use_mbridge=True
-
-    # +actor_rollout_ref.actor.megatron.override_transformer_config.apply_rope_fusion=False \
-    # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_router_dtype=fp32 \
-    # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_shared_expert_overlap=False \
-    # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_enable_deepep=False \
-    # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_token_dispatcher_type=alltoall \
-    # +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_method=uniform \
-    # +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=full \
-    # actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity="selective" \
-    # actor_rollout_ref.actor.megatron.override_transformer_config.recompute_modules=["core_attn","moe_act","layernorm","mlp","moe"] \
-    # +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_num_layers=1 \
-    # +actor_rollout_ref.actor.megatron.override_transformer_config.gradient_accumulation_fusion=True \
-    # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_permute_fusion=True \
-    # +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_embedding_in_pipeline_split=False \
-    # +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_loss_in_pipeline_split=False \
-    # +actor_rollout_ref.actor.megatron.override_transformer_config.num_layers_in_last_pipeline_stage=${last_layer} \
\ No newline at end of file
diff --git a/recipe/fully_async_policy/exp/qwen3-32B_64/megatron_colocate/early_megatron_colocate.sh b/recipe/fully_async_policy/exp/qwen3-32B_64/megatron_colocate/early_megatron_colocate.sh
deleted file mode 100644
index 145ea3dbec9..00000000000
--- a/recipe/fully_async_policy/exp/qwen3-32B_64/megatron_colocate/early_megatron_colocate.sh
+++ /dev/null
@@ -1,156 +0,0 @@
-#!/usr/bin/env bash
-set -xeuo pipefail
-
-project_name='DAPO'
-exp_name='dapo_qwen3-32B_32k_megatron_colocate_64_mbs32'
-
-adv_estimator=grpo
-
-use_kl_in_reward=False
-kl_coef=0.0
-use_kl_loss=False
-kl_loss_coef=0.0
-
-clip_ratio_low=0.2
-clip_ratio_high=0.28
-
-max_prompt_length=$((1024 * 2))
-max_response_length=$((1024 * 32))
-enable_overlong_buffer=True
-overlong_buffer_len=$((1024 * 4))
-overlong_penalty_factor=1.0
-
-loss_agg_mode="token-mean"
-
-train_prompt_bsz=512
-train_prompt_mini_bsz=32
-n_resp_per_prompt=16
-
-NNODES=${NNODES:-8}
-NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
-# Paths
-RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
-MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/.friday/models/basemodel/Qwen/Qwen3-32B
-CKPTS_DIR=./ckpts/${project_name}/${exp_name}
-TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet
-TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet
-
-# Algorithm
-temperature=1.0
-top_p=1.0
-top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
-val_top_p=0.7
-
-
-# Performance Related Parameter
-use_dynamic_bsz=True
-actor_ppo_max_token_len=$((max_prompt_length + max_response_length))
-infer_ppo_max_token_len=$((max_prompt_length + max_response_length))
-offload=True
-gen_tp=4
-train_tp=4
-train_pp=2
-EP=1
-ETP=1
-CP=1
-
-python3 -m verl.trainer.main_ppo \
-    --config-path=config \
-    --config-name='ppo_megatron_trainer.yaml' \
-    data.train_files="${TRAIN_FILE}" \
-    data.val_files="${TEST_FILE}" \
-    data.prompt_key=prompt \
-    data.truncation='left' \
-    data.max_prompt_length=${max_prompt_length} \
-    data.max_response_length=${max_response_length} \
-    data.train_batch_size=${train_prompt_bsz} \
-    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
-    algorithm.adv_estimator=${adv_estimator} \
-    algorithm.use_kl_in_reward=${use_kl_in_reward} \
-    algorithm.kl_ctrl.kl_coef=${kl_coef} \
-    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
-    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
-    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
-    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
-    actor_rollout_ref.actor.clip_ratio_c=10.0 \
-    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \
-    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
-    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
-    actor_rollout_ref.model.path="${MODEL_PATH}" \
-    actor_rollout_ref.actor.optim.lr=1e-6 \
-    actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
-    actor_rollout_ref.actor.optim.weight_decay=0.1 \
-    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
-    actor_rollout_ref.actor.entropy_coeff=0 \
-    actor_rollout_ref.actor.optim.clip_grad=1.0 \
-    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
-    actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
-    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
-    actor_rollout_ref.rollout.enable_chunked_prefill=True \
-    actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
-    actor_rollout_ref.rollout.temperature=${temperature} \
-    actor_rollout_ref.rollout.top_p=${top_p} \
-    actor_rollout_ref.rollout.top_k=${top_k} \
-    actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
-    actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
-    actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
-    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
-    actor_rollout_ref.rollout.val_kwargs.n=1 \
-    reward_model.reward_manager=dapo \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
-    +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
-    trainer.logger='["console","tensorboard"]' \
-    trainer.project_name="${project_name}" \
-    trainer.experiment_name="${exp_name}" \
-    trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
-    trainer.nnodes="${NNODES}" \
-    trainer.val_before_train=True \
-    trainer.test_freq=20 \
-    trainer.save_freq=-1 \
-    trainer.total_epochs=10 \
-    trainer.total_training_steps=400 \
-    trainer.default_local_dir="${CKPTS_DIR}" \
-    trainer.resume_mode=auto \
-    trainer.log_val_generations=10 \
-    critic.strategy=megatron \
-    actor_rollout_ref.actor.strategy=megatron \
-    actor_rollout_ref.actor.megatron.param_offload=${offload} \
-    actor_rollout_ref.actor.megatron.optimizer_offload=${offload} \
-    actor_rollout_ref.actor.megatron.grad_offload=${offload} \
-    actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${train_pp} \
-    actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${train_tp} \
-    actor_rollout_ref.actor.megatron.expert_model_parallel_size=${EP} \
-    actor_rollout_ref.actor.megatron.expert_tensor_parallel_size=${ETP} \
-    actor_rollout_ref.actor.megatron.context_parallel_size=${CP} \
-    +actor_rollout_ref.actor.megatron.override_transformer_config.apply_rope_fusion=True \
-    +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=selective \
-    +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_modules=["core_attn","layernorm","mlp"] \
-    +actor_rollout_ref.actor.megatron.override_transformer_config.gradient_accumulation_fusion=True \
-    +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_embedding_in_pipeline_split=False \
-    +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_loss_in_pipeline_split=False \
-    actor_rollout_ref.ref.megatron.param_offload=${offload} \
-    actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=${train_pp} \
-    actor_rollout_ref.ref.megatron.tensor_model_parallel_size=${train_tp} \
-    actor_rollout_ref.ref.megatron.expert_model_parallel_size=${EP} \
-    actor_rollout_ref.ref.megatron.expert_tensor_parallel_size=${ETP} \
-    actor_rollout_ref.ref.megatron.context_parallel_size=${CP} \
-    actor_rollout_ref.actor.megatron.use_mbridge=True
-
-    # +actor_rollout_ref.actor.megatron.override_transformer_config.apply_rope_fusion=False \
-    # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_router_dtype=fp32 \
-    # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_shared_expert_overlap=False \
-    # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_enable_deepep=False \
-    # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_token_dispatcher_type=alltoall \
-    # +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_method=uniform \
-    # +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=full \
-    # actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity="selective" \
-    # actor_rollout_ref.actor.megatron.override_transformer_config.recompute_modules=["core_attn","moe_act","layernorm","mlp","moe"] \
-    # +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_num_layers=1 \
-    # +actor_rollout_ref.actor.megatron.override_transformer_config.gradient_accumulation_fusion=True \
-    # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_permute_fusion=True \
-    # +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_embedding_in_pipeline_split=False \
-    # +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_loss_in_pipeline_split=False \
-    # +actor_rollout_ref.actor.megatron.override_transformer_config.num_layers_in_last_pipeline_stage=${last_layer} \
\ No newline at end of file
diff --git a/recipe/fully_async_policy/exp/qwen3-32B_64/megatron_colocate/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen3-32B_64/megatron_colocate/runtime_env.yaml
deleted file mode 100644
index d3dc7176f0a..00000000000
--- a/recipe/fully_async_policy/exp/qwen3-32B_64/megatron_colocate/runtime_env.yaml
+++ /dev/null
@@ -1,5 +0,0 @@
-env_vars:
-  TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen3-32B-64/dapo_qwen3-32B_32k_megatron_colocate_64_mbs32"
-  HYDRA_FULL_ERROR: "1"
-  TORCH_NCCL_AVOID_RECORD_STREAMS: "1"
-  CUDA_DEVICE_MAX_CONNECTIONS: "1"
\ No newline at end of file
diff --git a/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_2_6.sh b/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_2_6.sh
index 0d303bdde87..10563218878 100644
--- a/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_2_6.sh
+++ b/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_2_6.sh
@@ -78,7 +78,7 @@ n_resp_per_prompt=16
 train_prompt_mini_bsz=64
 total_rollout_steps=$(((512*100)))
 test_freq=2
-staleness_threshold=1
+staleness_threshold=0.1
 trigger_parameter_sync_step=16
 partial_rollout=True
 
diff --git a/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_4_12.sh b/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_4_12.sh
index 2b4bf9c31fe..fc9b2ad6607 100644
--- a/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_4_12.sh
+++ b/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_4_12.sh
@@ -78,7 +78,7 @@ n_resp_per_prompt=16
 train_prompt_mini_bsz=64
 total_rollout_steps=$(((512*100)))
 test_freq=10
-staleness_threshold=1
+staleness_threshold=0.1
 trigger_parameter_sync_step=16
 partial_rollout=True
 
diff --git a/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_8_8.sh b/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_8_8.sh
index 688a87fab92..c59877d97f9 100644
--- a/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_8_8.sh
+++ b/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_8_8.sh
@@ -78,7 +78,7 @@ n_resp_per_prompt=16
 train_prompt_mini_bsz=64
 total_rollout_steps=$(((512*100)))
 test_freq=10
-staleness_threshold=1
+staleness_threshold=0.1
 trigger_parameter_sync_step=16
 partial_rollout=True
 
diff --git a/tests/special_e2e/run_fully_async_policy.sh b/tests/special_e2e/run_fully_async_policy.sh
index 63cfcf622a6..4813f159696 100644
--- a/tests/special_e2e/run_fully_async_policy.sh
+++ b/tests/special_e2e/run_fully_async_policy.sh
@@ -58,7 +58,7 @@ n_resp_per_prompt=16
 train_prompt_mini_bsz=32
 total_rollout_steps=$(((128*2)))
 test_freq=10
-staleness_threshold=1
+staleness_threshold=0.1
 trigger_parameter_sync_step=16
 partial_rollout=True
 

From d471890b404dd4ee470ccb8da6ab1fd047422b3e Mon Sep 17 00:00:00 2001
From: wangshulin02 <953550366@qq.com>
Date: Wed, 10 Sep 2025 22:04:16 +0800
Subject: [PATCH 124/182] fix last_valid bug, fix staleness_samples reset

---
 recipe/fully_async_policy/fully_async_rollouter.py | 12 +++++++++---
 recipe/fully_async_policy/fully_async_trainer.py   |  4 +++-
 recipe/fully_async_policy/message_queue.py         |  5 ++++-
 recipe/fully_async_policy/param_sync.py            | 12 +++++++-----
 4 files changed, 23 insertions(+), 10 deletions(-)

diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py
index 4a2a7d7200c..162836a00f6 100644
--- a/recipe/fully_async_policy/fully_async_rollouter.py
+++ b/recipe/fully_async_policy/fully_async_rollouter.py
@@ -197,10 +197,16 @@ async def update_param_version(self, version: int, validate: bool = False, globa
             old_version = self.current_param_version
             self.current_param_version = version
             # every time param change, reset staleness_samples
-            self.staleness_samples = 0
+            self.staleness_samples = (
+                len(self.active_tasks)
+                + self.result_queue.qsize()
+                + self.cancel_queue.qsize()
+                + await self.message_queue_client.get_queue_size()
+            )
             print(
                 f"[FullyAsyncRollouter][Public][update_param_version] "
-                f"Parameter version updated from {old_version} to {version}"
+                f"Parameter version updated from {old_version} to {version} "
+                f",reset staleness_samples to: {self.staleness_samples}"
             )
             timing_raw = {}
             if (
@@ -412,7 +418,7 @@ async def _consumer_worker(self):
             rollout_sample = await self.result_queue.get()
             rollout_sample = merge_rollout_sample(self.config, self.tokenizer, rollout_sample)
 
-            # 直接将 RolloutSample 放入消息队列
+            # 将 RolloutSample 放入消息队列
             success = await self.message_queue_client.put_sample(
                 sample=ray.cloudpickle.dumps(rollout_sample),
                 param_version=rollout_sample.param_version,
diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py
index 0d83a00ba4a..5d2a2c794e8 100644
--- a/recipe/fully_async_policy/fully_async_trainer.py
+++ b/recipe/fully_async_policy/fully_async_trainer.py
@@ -337,7 +337,9 @@ def fit(self):
                 val_data: ValidateMetrics = ray.cloudpickle.loads(val_data)
                 self.logger.log(data=val_data.metrics, step=val_data.param_version)
                 self.logger.log(data=val_data.timing_raw, step=val_data.param_version)
-        pprint(f"[FullyAsyncTrainer] Final validation metrics: {val_data.metrics}")
+                pprint(f"[FullyAsyncTrainer] Final validation metrics: {val_data.metrics}")
+        else:
+            pprint(f"[FullyAsyncTrainer] Final validation metrics: {val_data.metrics}")
         self.progress_bar.close()
 
         self._check_save_checkpoint(True, timing_raw)  # TODO: 检查checkpoint
diff --git a/recipe/fully_async_policy/message_queue.py b/recipe/fully_async_policy/message_queue.py
index 22573f4b9d5..5094f9ab90a 100644
--- a/recipe/fully_async_policy/message_queue.py
+++ b/recipe/fully_async_policy/message_queue.py
@@ -79,9 +79,11 @@ async def put_sample(self, sample: Any, param_version: int) -> bool:
         """
         async with self._lock:
             # If queue is full, remove the oldest sample (rarely happens)
+            is_drop = False
             if len(self.queue) >= self.max_queue_size:
                 self.queue.popleft()
                 self.dropped_samples += 1
+                is_drop = True
                 logger.warning("Queue full, dropped sample")
             self.queue.append(sample)
             self.total_produced += 1
@@ -91,7 +93,8 @@ async def put_sample(self, sample: Any, param_version: int) -> bool:
 
             if self.total_produced % 100 == 0:
                 print(f"MessageQueue stats: produced={self.total_produced}, queue_size={len(self.queue)}")
-
+            if is_drop:
+                return False
             return True
 
     async def get_sample(self) -> Any | None:
diff --git a/recipe/fully_async_policy/param_sync.py b/recipe/fully_async_policy/param_sync.py
index 35efdd9c950..2e11327afec 100644
--- a/recipe/fully_async_policy/param_sync.py
+++ b/recipe/fully_async_policy/param_sync.py
@@ -41,7 +41,8 @@ def __init__(self, config, trainer, rollouter, mq):
         self.weights_info = None
         self.sync_group_initialized = False
         self.sync_group_name = "actor_rollout"
-        self.wait_last = None
+        self.wait_last0 = None
+        self.wait_last1 = None
 
         # Statistics
         self.current_version = 0
@@ -90,12 +91,13 @@ def sync_weights(self, version, validate=False, global_steps=0):
         print(f"[ParameterSynchronizer] sync_weights success. cost {end_time - start_time:.2f} seconds")
 
         # Async Update rollout version & validation
-        self.rollouter.update_param_version.remote(version, validate, global_steps)
-        self.wait_last = self.rollouter.resume.remote()
+        self.wait_last0 = self.rollouter.update_param_version.remote(version, validate, global_steps)
+        self.wait_last1 = self.rollouter.resume.remote()
 
     def wait_last_sync(self):
         print("[ParameterSynchronizer] waiting last parameter sync and validate...")
         start_time = time.time()
-        if self.wait_last:
-            ray.get(self.wait_last)
+        if self.wait_last0 or self.wait_last1 :
+            ray.get(self.wait_last0)
+            ray.get(self.wait_last1)
         print(f"[ParameterSynchronizer], cost: {time.time() - start_time:.2f} seconds")

From 840cc73ae87418601f3ab54d91056f4fd7008287 Mon Sep 17 00:00:00 2001
From: wangshulin02 <953550366@qq.com>
Date: Wed, 10 Sep 2025 22:07:23 +0800
Subject: [PATCH 125/182] fix wait_last

---
 recipe/fully_async_policy/param_sync.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/recipe/fully_async_policy/param_sync.py b/recipe/fully_async_policy/param_sync.py
index 2e11327afec..89deecad962 100644
--- a/recipe/fully_async_policy/param_sync.py
+++ b/recipe/fully_async_policy/param_sync.py
@@ -97,7 +97,8 @@ def sync_weights(self, version, validate=False, global_steps=0):
     def wait_last_sync(self):
         print("[ParameterSynchronizer] waiting last parameter sync and validate...")
         start_time = time.time()
-        if self.wait_last0 or self.wait_last1 :
+        if self.wait_last0:
             ray.get(self.wait_last0)
+        if self.wait_last1:
             ray.get(self.wait_last1)
         print(f"[ParameterSynchronizer], cost: {time.time() - start_time:.2f} seconds")

From fd48a9a50cc53bf987415b6f25ae3eb9eef22e51 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Thu, 11 Sep 2025 14:27:26 +0800
Subject: [PATCH 126/182] qwen2.5 32B

---
 .../fsdp2_fully-async_48-80/run.sh            | 153 ++++++++++++++++++
 .../fsdp2_fully-async_48-80/runtime_env.yaml  |   4 +
 2 files changed, 157 insertions(+)
 create mode 100644 recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_48-80/run.sh
 create mode 100644 recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_48-80/runtime_env.yaml

diff --git a/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_48-80/run.sh b/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_48-80/run.sh
new file mode 100644
index 00000000000..8ab8f9be2d9
--- /dev/null
+++ b/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_48-80/run.sh
@@ -0,0 +1,153 @@
+#!/usr/bin/env bash
+set -xeuo pipefail
+
+project_name='DAPO'
+exp_name='dapo_qwen2-32B_20k_fsdp2_fully-async_48-80'
+
+# Paths
+MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/.friday/models/Qwen2.5-32B
+CKPTS_DIR=./ckpts/${project_name}/${exp_name}
+TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet
+TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet
+
+rollout_mode="async"
+rollout_name="vllm" # sglang or vllm
+if [ "$rollout_mode" = "async" ]; then
+    export VLLM_USE_V1=1
+    return_raw_chat="True"
+fi
+
+# Algorithm parameters
+adv_estimator=grpo
+
+use_kl_in_reward=False
+kl_coef=0.0
+use_kl_loss=False
+kl_loss_coef=0.0
+
+clip_ratio_low=0.2
+clip_ratio_high=0.28
+
+# Response length parameters
+max_prompt_length=$((1024 * 2))
+max_response_length=$((1024 * 20))
+enable_overlong_buffer=True
+overlong_buffer_len=$((1024 * 4))
+overlong_penalty_factor=1.0
+
+# Training parameters
+loss_agg_mode="token-mean"
+
+# Algorithm
+temperature=1.0
+top_p=1.0
+top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
+val_top_p=0.7
+
+# Performance Related Parameter
+use_dynamic_bsz=True
+actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
+infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
+ref_offload=True
+actor_offload=False
+gen_tp=4
+sp_size=8
+fsdp_size=-1
+
+# Fully async specific parameters
+NNODES_ROLLOUT=${NNODES_ROLLOUT:-6}
+NNODES_TRAIN=${NNODES_TRAIN:-10}
+NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
+
+train_prompt_bsz=0
+gen_prompt_bsz=1
+n_resp_per_prompt=16
+train_prompt_mini_bsz=32
+total_rollout_steps=$(((512*400)))
+test_freq=20
+staleness_threshold=0.1
+trigger_parameter_sync_step=3
+partial_rollout=True
+
+python -m recipe.fully_async_policy.fully_async_main \
+    data.train_files="${TRAIN_FILE}" \
+    data.val_files="${TEST_FILE}" \
+    data.prompt_key=prompt \
+    data.truncation='left' \
+    data.max_prompt_length=${max_prompt_length} \
+    data.max_response_length=${max_response_length} \
+    data.train_batch_size=${train_prompt_bsz} \
+    data.gen_batch_size=${gen_prompt_bsz} \
+    data.return_raw_chat=${return_raw_chat} \
+    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
+    algorithm.adv_estimator=${adv_estimator} \
+    algorithm.use_kl_in_reward=${use_kl_in_reward} \
+    algorithm.kl_ctrl.kl_coef=${kl_coef} \
+    actor_rollout_ref.actor.strategy=fsdp2 \
+    critic.strategy=fsdp2 \
+    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
+    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
+    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
+    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
+    actor_rollout_ref.actor.clip_ratio_c=10.0 \
+    actor_rollout_ref.model.use_remove_padding=True \
+    actor_rollout_ref.hybrid_engine=False \
+    +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \
+    actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
+    actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+    actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+    actor_rollout_ref.model.path="${MODEL_PATH}" \
+    actor_rollout_ref.actor.optim.lr=1e-6 \
+    actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
+    actor_rollout_ref.actor.optim.weight_decay=0.1 \
+    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
+    actor_rollout_ref.actor.fsdp_config.param_offload=${actor_offload} \
+    actor_rollout_ref.actor.fsdp_config.optimizer_offload=${actor_offload} \
+    actor_rollout_ref.actor.entropy_coeff=0 \
+    actor_rollout_ref.actor.grad_clip=1.0 \
+    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
+    actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \
+    actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
+    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
+    actor_rollout_ref.rollout.enable_chunked_prefill=True \
+    actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
+    actor_rollout_ref.rollout.temperature=${temperature} \
+    actor_rollout_ref.rollout.top_p=${top_p} \
+    actor_rollout_ref.rollout.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
+    actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
+    actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
+    actor_rollout_ref.rollout.val_kwargs.n=1 \
+    actor_rollout_ref.ref.fsdp_config.param_offload=${ref_offload} \
+    actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \
+    actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \
+    actor_rollout_ref.rollout.name=${rollout_name} \
+    actor_rollout_ref.rollout.mode=${rollout_mode} \
+    actor_rollout_ref.rollout.calculate_log_probs=True \
+    reward_model.reward_manager=dapo \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
+    +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
+    trainer.logger=['console','tensorboard'] \
+    trainer.project_name="${project_name}" \
+    trainer.experiment_name="${exp_name}" \
+    trainer.val_before_train=True \
+    trainer.save_freq=-1 \
+    trainer.default_local_dir="${CKPTS_DIR}" \
+    trainer.resume_mode=auto \
+    trainer.nnodes="${NNODES_TRAIN}" \
+    trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
+    rollout.nnodes="${NNODES_ROLLOUT}" \
+    rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \
+    rollout.total_rollout_steps="${total_rollout_steps}" \
+    rollout.total_epochs=10 \
+    rollout.test_freq="${test_freq}" \
+    async_training.staleness_threshold="${staleness_threshold}" \
+    async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \
+    async_training.partial_rollout="${partial_rollout}"
diff --git a/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_48-80/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_48-80/runtime_env.yaml
new file mode 100644
index 00000000000..0caf9804ebc
--- /dev/null
+++ b/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_48-80/runtime_env.yaml
@@ -0,0 +1,4 @@
+env_vars:
+  VLLM_USE_V1: "1"
+  TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-32B-128/dapo_qwen2-32B_20k_fsdp2_fully-async_48-80"
+  HYDRA_FULL_ERROR: "1"
\ No newline at end of file

From 8f445b04b1e78424f75aa66c013c77de020d0c64 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Thu, 11 Sep 2025 14:58:25 +0800
Subject: [PATCH 127/182] fsdp2_fully-async_16-16

---
 .../fsdp2_fully-async_16-16/run.sh            |  14 --
 .../fsdp2_fully-async_16-16_stal0.1/run.sh    | 168 ------------------
 .../runtime_env.yaml                          |   4 -
 3 files changed, 186 deletions(-)
 delete mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16_stal0.1/run.sh
 delete mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16_stal0.1/runtime_env.yaml

diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/run.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/run.sh
index cdefd5a4b57..2a22fd97d08 100644
--- a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/run.sh
+++ b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/run.sh
@@ -5,22 +5,8 @@ project_name='DAPO'
 exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_16-16_mbs32_tpf16_fixmcs'
 
 # Ray
-# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
-# WORKING_DIR=${WORKING_DIR:-"${PWD}"}
-# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
-# Paths
-RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
-# very important! please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface
-MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen2.5-Math-7B"}
-CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"}
-TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"}
-TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"}
-
-# MODEL_PATH=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/modelscope/hub/models/Qwen/Qwen2___5-Math-7B
 MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B
 CKPTS_DIR=./ckpts/${project_name}/${exp_name}
-# TRAIN_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/dapo-math-17k.parquet
-# TEST_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/aime-2024.parquet
 TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet
 TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet
 
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16_stal0.1/run.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16_stal0.1/run.sh
deleted file mode 100644
index 2217661dd33..00000000000
--- a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16_stal0.1/run.sh
+++ /dev/null
@@ -1,168 +0,0 @@
-#!/usr/bin/env bash
-set -xeuo pipefail
-
-project_name='DAPO'
-exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_16-16_mbs32_tpf16_stal0.1'
-
-# Ray
-# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
-# WORKING_DIR=${WORKING_DIR:-"${PWD}"}
-# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
-# Paths
-RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
-# very important! please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface
-MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen2.5-Math-7B"}
-CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"}
-TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"}
-TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"}
-
-# MODEL_PATH=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/modelscope/hub/models/Qwen/Qwen2___5-Math-7B
-MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B
-CKPTS_DIR=./ckpts/${project_name}/${exp_name}
-# TRAIN_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/dapo-math-17k.parquet
-# TEST_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/aime-2024.parquet
-TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet
-TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet
-
-rollout_mode="async"
-rollout_name="vllm" # sglang or vllm
-if [ "$rollout_mode" = "async" ]; then
-    export VLLM_USE_V1=1
-    return_raw_chat="True"
-fi
-
-# Algorithm parameters
-adv_estimator=grpo
-
-use_kl_in_reward=False
-kl_coef=0.0
-use_kl_loss=False
-kl_loss_coef=0.0
-
-clip_ratio_low=0.2
-clip_ratio_high=0.28
-
-# Response length parameters
-max_prompt_length=$((1024 * 2))
-max_response_length=$((1024 * 28))
-enable_overlong_buffer=True
-overlong_buffer_len=$((1024 * 4))
-overlong_penalty_factor=1.0
-
-# Training parameters
-loss_agg_mode="token-mean"
-
-# Algorithm
-temperature=1.0
-top_p=1.0
-top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
-val_top_p=0.7
-
-# Performance Related Parameter
-use_dynamic_bsz=True
-actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
-infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
-ref_offload=True
-actor_offload=False
-gen_tp=4
-sp_size=4
-fsdp_size=2
-
-# Fully async specific parameters
-NNODES_ROLLOUT=${NNODES_ROLLOUT:-2}
-NNODES_TRAIN=${NNODES_TRAIN:-2}
-NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
-
-
-train_prompt_bsz=0
-gen_prompt_bsz=1
-n_resp_per_prompt=16
-train_prompt_mini_bsz=32
-total_rollout_steps=$(((512*400)))
-test_freq=20
-staleness_threshold=0.1
-trigger_parameter_sync_step=16
-partial_rollout=True
-
-python -m recipe.fully_async_policy.fully_async_main \
-    data.train_files="${TRAIN_FILE}" \
-    data.val_files="${TEST_FILE}" \
-    data.prompt_key=prompt \
-    data.truncation='left' \
-    data.max_prompt_length=${max_prompt_length} \
-    data.max_response_length=${max_response_length} \
-    data.train_batch_size=${train_prompt_bsz} \
-    data.gen_batch_size=${gen_prompt_bsz} \
-    data.return_raw_chat=${return_raw_chat} \
-    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
-    algorithm.adv_estimator=${adv_estimator} \
-    algorithm.use_kl_in_reward=${use_kl_in_reward} \
-    algorithm.kl_ctrl.kl_coef=${kl_coef} \
-    actor_rollout_ref.actor.strategy=fsdp2 \
-    critic.strategy=fsdp2 \
-    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
-    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
-    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
-    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
-    actor_rollout_ref.actor.clip_ratio_c=10.0 \
-    actor_rollout_ref.model.use_remove_padding=True \
-    actor_rollout_ref.hybrid_engine=False \
-    +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \
-    actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
-    actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
-    actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
-    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
-    actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
-    actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
-    actor_rollout_ref.model.path="${MODEL_PATH}" \
-    actor_rollout_ref.actor.optim.lr=1e-6 \
-    actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
-    actor_rollout_ref.actor.optim.weight_decay=0.1 \
-    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
-    actor_rollout_ref.actor.fsdp_config.param_offload=${actor_offload} \
-    actor_rollout_ref.actor.fsdp_config.optimizer_offload=${actor_offload} \
-    actor_rollout_ref.actor.entropy_coeff=0 \
-    actor_rollout_ref.actor.grad_clip=1.0 \
-    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
-    actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \
-    actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
-    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
-    actor_rollout_ref.rollout.enable_chunked_prefill=True \
-    actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
-    actor_rollout_ref.rollout.temperature=${temperature} \
-    actor_rollout_ref.rollout.top_p=${top_p} \
-    actor_rollout_ref.rollout.top_k=${top_k} \
-    actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
-    actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
-    actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
-    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
-    actor_rollout_ref.rollout.val_kwargs.n=1 \
-    actor_rollout_ref.ref.fsdp_config.param_offload=${ref_offload} \
-    actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \
-    actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \
-    actor_rollout_ref.rollout.name=${rollout_name} \
-    actor_rollout_ref.rollout.mode=${rollout_mode} \
-    actor_rollout_ref.rollout.calculate_log_probs=True \
-    reward_model.reward_manager=dapo \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
-    +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
-    trainer.logger=['console','tensorboard'] \
-    trainer.project_name="${project_name}" \
-    trainer.experiment_name="${exp_name}" \
-    trainer.val_before_train=True \
-    trainer.save_freq=-1 \
-    trainer.default_local_dir="${CKPTS_DIR}" \
-    trainer.resume_mode=auto \
-    trainer.nnodes="${NNODES_TRAIN}" \
-    trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
-    rollout.nnodes="${NNODES_ROLLOUT}" \
-    rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \
-    rollout.total_rollout_steps="${total_rollout_steps}" \
-    rollout.total_epochs=10 \
-    rollout.test_freq="${test_freq}" \
-    async_training.staleness_threshold="${staleness_threshold}" \
-    async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \
-    async_training.partial_rollout="${partial_rollout}"
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16_stal0.1/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16_stal0.1/runtime_env.yaml
deleted file mode 100644
index 0b188206127..00000000000
--- a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16_stal0.1/runtime_env.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-env_vars:
-  VLLM_USE_V1: "1"
-  TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-7B-math/dapo_qwen2-7B-math_28k_fsdp2_fully-async_16-16_mbs32_tpf16_stal0.1"
-  HYDRA_FULL_ERROR: "1"
\ No newline at end of file

From f85c1381217f8b03c0bd6f49e7a1fd95dd757215 Mon Sep 17 00:00:00 2001
From: wangshulin02 <953550366@qq.com>
Date: Fri, 12 Sep 2025 19:30:44 +0800
Subject: [PATCH 128/182] update note in config, param_sync, mq

---
 .../config/fully_async_ppo_trainer.yaml       | 45 +++++++++++++------
 recipe/fully_async_policy/message_queue.py    |  6 +--
 recipe/fully_async_policy/param_sync.py       | 17 +++----
 3 files changed, 43 insertions(+), 25 deletions(-)

diff --git a/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml b/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml
index c1f94b56b6b..0d6c05a9a5c 100644
--- a/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml
+++ b/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml
@@ -6,23 +6,42 @@ defaults:
   - ppo_trainer
   - _self_
 
-# ============= 完全异步训练配置 (Fully Async Training Config) =============
-
 async_training:
-  # 新鲜度控制 (Freshness Control)
-  staleness_threshold: 1              # 样本新鲜度阈值
-  trigger_parameter_sync_step: 4     # >=1 train 每次训练一个batch, 迭代多少次后触发更新
-  partial_rollout: True               # 同步参数时，是否中断 rollout
-  use_rollout_log_probs: True
 
-# Rollout配置
+  # Maximum samples staleness threshold
+  staleness_threshold: 0.1
+       
+  # Frequency of parameter synchronization between rollouter and trainer, 
+  # One step means trainer obtains a batch of required samples
+  trigger_parameter_sync_step: 4
+  
+  # When synchronizing parameters, whether to interrupt rollouter and perform partial rollout
+  partial_rollout: True
+
+  # Whether to use rollout log probs for training
+  use_rollout_log_probs: True         
+
+# Rollout config
 rollout:
-  nnodes: 1                          # Number of nodes used in the rollout
-  n_gpus_per_node: 8                 # Number of GPUs per node
-  n: 4                               # 每个prompt生成的响应数量
+
+  # Number of nodes used in the rollout
+  nnodes: 1
+
+  # Number of GPUs per node                     
+  n_gpus_per_node: 8
+
+  # number of responses (i.e. num sample times). > 1 for grpo
+  n: 4                                
+
+  # Number of epochs in training 
   total_rollout_steps: 100
+
+  # 
   total_epochs: 10
-  test_freq: 1                       # 测试频率, 每多少次参数更新后进行一次测试
+
+  # Test frequency, how many times a parameter update triggers a validation
+  test_freq: 1                   
 
 data:
-  gen_batch_size: 32
+  # Number of samples generated, currently only support 1
+  gen_batch_size: 1
diff --git a/recipe/fully_async_policy/message_queue.py b/recipe/fully_async_policy/message_queue.py
index 5094f9ab90a..85860c6f2a0 100644
--- a/recipe/fully_async_policy/message_queue.py
+++ b/recipe/fully_async_policy/message_queue.py
@@ -27,12 +27,10 @@
 class MessageQueue:
     """
     Simplified Ray-based asynchronous message queue for communication between Rollouter and Trainer
-    使用 asyncio 实现异步消息队列
     """
 
     def __init__(self, config: DictConfig, max_queue_size: int = 1000):
         self.config = config
-        # 确保 max_queue_size 不为 None
         if max_queue_size is None:
             raise ValueError(f"max_queue_size cannot be None, got: {max_queue_size}")
         self.max_queue_size = int(max_queue_size)
@@ -52,7 +50,7 @@ def __init__(self, config: DictConfig, max_queue_size: int = 1000):
         # Asyncio for message handling
         self.running = True
 
-        # async safe - 在第一次使用时初始化
+        # async safe
         self._lock = asyncio.Lock()
         self._consumer_condition = asyncio.Condition(self._lock)
 
@@ -249,7 +247,7 @@ async def get_memory_usage(self) -> dict:
         future = self.queue_actor.get_memory_usage.remote()
         return await asyncio.wrap_future(future.future())
 
-    # 为了兼容性，保留同步版本的方法（但标记为deprecated）
+    # Synchronous version of the method (deprecated)
     def put_sample_sync(self, sample: Any, param_version: int) -> bool:
         """Put batch into queue (sync - deprecated, use put_sample instead)"""
         return ray.get(self.queue_actor.put_sample.remote(sample, param_version))
diff --git a/recipe/fully_async_policy/param_sync.py b/recipe/fully_async_policy/param_sync.py
index 89deecad962..2a58292ff78 100644
--- a/recipe/fully_async_policy/param_sync.py
+++ b/recipe/fully_async_policy/param_sync.py
@@ -41,8 +41,8 @@ def __init__(self, config, trainer, rollouter, mq):
         self.weights_info = None
         self.sync_group_initialized = False
         self.sync_group_name = "actor_rollout"
-        self.wait_last0 = None
-        self.wait_last1 = None
+        self.wait_last_update = None
+        self.wait_last_resume = None
 
         # Statistics
         self.current_version = 0
@@ -74,6 +74,7 @@ def _init_sync_group(self):
         )
 
     def sync_weights(self, version, validate=False, global_steps=0):
+        """Sync weights between trainer and rollouter, and update parameter version"""
         start_time = time.time()
 
         self.current_version = version
@@ -91,14 +92,14 @@ def sync_weights(self, version, validate=False, global_steps=0):
         print(f"[ParameterSynchronizer] sync_weights success. cost {end_time - start_time:.2f} seconds")
 
         # Async Update rollout version & validation
-        self.wait_last0 = self.rollouter.update_param_version.remote(version, validate, global_steps)
-        self.wait_last1 = self.rollouter.resume.remote()
+        self.wait_last_update = self.rollouter.update_param_version.remote(version, validate, global_steps)
+        self.wait_last_resume = self.rollouter.resume.remote()
 
     def wait_last_sync(self):
         print("[ParameterSynchronizer] waiting last parameter sync and validate...")
         start_time = time.time()
-        if self.wait_last0:
-            ray.get(self.wait_last0)
-        if self.wait_last1:
-            ray.get(self.wait_last1)
+        if self.wait_last_update:
+            ray.get(self.wait_last_update)
+        if self.wait_last_resume:
+            ray.get(self.wait_last_resume)
         print(f"[ParameterSynchronizer], cost: {time.time() - start_time:.2f} seconds")

From 56f853b38b794ce261e43ed6f5e4b1c201932b6d Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Sat, 13 Sep 2025 23:57:26 +0800
Subject: [PATCH 129/182] qwen2-32B

---
 .../exp/qwen2-32B_128/fsdp2_fully-async_64-64/run.sh   |  6 +++---
 .../fsdp2_fully-async_64-64/runtime_env.yaml           |  2 +-
 .../run.sh                                             | 10 +++++-----
 .../runtime_env.yaml                                   |  2 +-
 4 files changed, 10 insertions(+), 10 deletions(-)
 rename recipe/fully_async_policy/exp/qwen2-32B_128/{fsdp2_fully-async_48-80 => fsdp2_fully-async_80-48}/run.sh (97%)
 rename recipe/fully_async_policy/exp/qwen2-32B_128/{fsdp2_fully-async_48-80 => fsdp2_fully-async_80-48}/runtime_env.yaml (81%)

diff --git a/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_64-64/run.sh b/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_64-64/run.sh
index 270533a84c4..8427547d161 100644
--- a/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_64-64/run.sh
+++ b/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_64-64/run.sh
@@ -2,7 +2,7 @@
 set -xeuo pipefail
 
 project_name='DAPO'
-exp_name='dapo_qwen2-32B_20k_fsdp2_fully-async_64-64'
+exp_name='dapo_qwen2-32B_20k_fsdp2_fully-async_64-64-tps1'
 
 # Paths
 MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/.friday/models/Qwen2.5-32B
@@ -62,11 +62,11 @@ NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
 train_prompt_bsz=0
 gen_prompt_bsz=1
 n_resp_per_prompt=16
-train_prompt_mini_bsz=32
+train_prompt_mini_bsz=128
 total_rollout_steps=$(((512*400)))
 test_freq=20
 staleness_threshold=0.1
-trigger_parameter_sync_step=4
+trigger_parameter_sync_step=1
 partial_rollout=True
 
 python -m recipe.fully_async_policy.fully_async_main \
diff --git a/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_64-64/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_64-64/runtime_env.yaml
index 77590fb2709..ea506be787e 100644
--- a/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_64-64/runtime_env.yaml
+++ b/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_64-64/runtime_env.yaml
@@ -1,4 +1,4 @@
 env_vars:
   VLLM_USE_V1: "1"
-  TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-32B-128/dapo_qwen2-32B_20k_fsdp2_fully-async_64-64"
+  TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-32B-128/dapo_qwen2-32B_20k_fsdp2_fully-async_64-64-tps1"
   HYDRA_FULL_ERROR: "1"
\ No newline at end of file
diff --git a/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_48-80/run.sh b/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_80-48/run.sh
similarity index 97%
rename from recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_48-80/run.sh
rename to recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_80-48/run.sh
index 8ab8f9be2d9..fd2874d0f98 100644
--- a/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_48-80/run.sh
+++ b/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_80-48/run.sh
@@ -2,7 +2,7 @@
 set -xeuo pipefail
 
 project_name='DAPO'
-exp_name='dapo_qwen2-32B_20k_fsdp2_fully-async_48-80'
+exp_name='dapo_qwen2-32B_20k_fsdp2_fully-async_80-48-tps1'
 
 # Paths
 MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/.friday/models/Qwen2.5-32B
@@ -55,18 +55,18 @@ sp_size=8
 fsdp_size=-1
 
 # Fully async specific parameters
-NNODES_ROLLOUT=${NNODES_ROLLOUT:-6}
-NNODES_TRAIN=${NNODES_TRAIN:-10}
+NNODES_ROLLOUT=${NNODES_ROLLOUT:-10}
+NNODES_TRAIN=${NNODES_TRAIN:-6}
 NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
 
 train_prompt_bsz=0
 gen_prompt_bsz=1
 n_resp_per_prompt=16
-train_prompt_mini_bsz=32
+train_prompt_mini_bsz=128
 total_rollout_steps=$(((512*400)))
 test_freq=20
 staleness_threshold=0.1
-trigger_parameter_sync_step=3
+trigger_parameter_sync_step=1
 partial_rollout=True
 
 python -m recipe.fully_async_policy.fully_async_main \
diff --git a/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_48-80/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_80-48/runtime_env.yaml
similarity index 81%
rename from recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_48-80/runtime_env.yaml
rename to recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_80-48/runtime_env.yaml
index 0caf9804ebc..9997c4130f2 100644
--- a/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_48-80/runtime_env.yaml
+++ b/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_80-48/runtime_env.yaml
@@ -1,4 +1,4 @@
 env_vars:
   VLLM_USE_V1: "1"
-  TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-32B-128/dapo_qwen2-32B_20k_fsdp2_fully-async_48-80"
+  TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-32B-128/dapo_qwen2-32B_20k_fsdp2_fully-async_80-48-tps1"
   HYDRA_FULL_ERROR: "1"
\ No newline at end of file

From 9f53cd71d571c2649300ab3602916488c2dc38b8 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Mon, 15 Sep 2025 14:06:20 +0800
Subject: [PATCH 130/182] update 32 workers

---
 .../exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/run.sh         | 2 +-
 .../qwen2-7B-math_32/fsdp2_fully-async_16-16/runtime_env.yaml   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/run.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/run.sh
index 2a22fd97d08..9fca6da9878 100644
--- a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/run.sh
+++ b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/run.sh
@@ -52,7 +52,7 @@ ref_offload=True
 actor_offload=False
 gen_tp=4
 sp_size=4
-fsdp_size=2
+fsdp_size=8
 
 # Fully async specific parameters
 NNODES_ROLLOUT=${NNODES_ROLLOUT:-2}
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/runtime_env.yaml
index b3063ebc7f1..5f0292d2c0d 100644
--- a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/runtime_env.yaml
+++ b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/runtime_env.yaml
@@ -1,4 +1,4 @@
 env_vars:
   VLLM_USE_V1: "1"
-  TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-7B-math/dapo_qwen2-7B-math_28k_fsdp2_fully-async_16-16_mbs32_tpf16_fixmcs"
+  TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-7B-math/dapo_qwen2-7B-math_28k_fsdp2_fully-async_16-16_mbs32_tpf16-fsdpsize_8"
   HYDRA_FULL_ERROR: "1"
\ No newline at end of file

From d0a5142c7547e759829f5b0d1938d844f7a25600 Mon Sep 17 00:00:00 2001
From: wangshulin02 <953550366@qq.com>
Date: Mon, 15 Sep 2025 15:10:28 +0800
Subject: [PATCH 131/182] extract modified files in verl/

---
 .../fully_async_policy/agent_loop/__init__.py |   21 +
 .../agent_loop/agent_loop.py                  |  704 ++++++++
 .../partial_single_turn_agent_loop.py         |   74 +
 .../agent_loop/single_turn_agent_loop.py      |   55 +
 .../agent_loop/vllm_async_server.py           |  401 +++++
 .../config/fully_async_ppo_trainer.yaml       |    4 +-
 recipe/fully_async_policy/fsdp_workers.py     |    7 +-
 recipe/fully_async_policy/fully_async_main.py |   10 +-
 .../fully_async_rollouter.py                  |    4 +-
 .../fully_async_policy/fully_async_trainer.py |    2 +-
 recipe/fully_async_policy/main_ppo.py         |  344 ++++
 recipe/fully_async_policy/ray_trainer.py      | 1434 +++++++++++++++++
 12 files changed, 3045 insertions(+), 15 deletions(-)
 create mode 100644 recipe/fully_async_policy/agent_loop/__init__.py
 create mode 100644 recipe/fully_async_policy/agent_loop/agent_loop.py
 create mode 100644 recipe/fully_async_policy/agent_loop/partial_single_turn_agent_loop.py
 create mode 100644 recipe/fully_async_policy/agent_loop/single_turn_agent_loop.py
 create mode 100644 recipe/fully_async_policy/agent_loop/vllm_async_server.py
 create mode 100644 recipe/fully_async_policy/main_ppo.py
 create mode 100644 recipe/fully_async_policy/ray_trainer.py

diff --git a/recipe/fully_async_policy/agent_loop/__init__.py b/recipe/fully_async_policy/agent_loop/__init__.py
new file mode 100644
index 00000000000..7e583cb220d
--- /dev/null
+++ b/recipe/fully_async_policy/agent_loop/__init__.py
@@ -0,0 +1,21 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .agent_loop import AgentLoopBase, AgentLoopManager
+from .partial_single_turn_agent_loop import PartialSingleTurnAgentLoop
+from .single_turn_agent_loop import SingleTurnAgentLoop
+
+_ = [SingleTurnAgentLoop, PartialSingleTurnAgentLoop]
+
+__all__ = ["AgentLoopBase", "AgentLoopManager"]
\ No newline at end of file
diff --git a/recipe/fully_async_policy/agent_loop/agent_loop.py b/recipe/fully_async_policy/agent_loop/agent_loop.py
new file mode 100644
index 00000000000..4e6c9ff9285
--- /dev/null
+++ b/recipe/fully_async_policy/agent_loop/agent_loop.py
@@ -0,0 +1,704 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import asyncio
+import heapq
+import logging
+import os
+import random
+from abc import ABC, abstractmethod
+from typing import Any, Optional
+
+import hydra
+import numpy as np
+import ray
+import torch
+from cachetools import LRUCache
+from omegaconf import DictConfig, OmegaConf
+from pydantic import BaseModel
+from tensordict import TensorDict
+from transformers import AutoTokenizer
+
+from verl.protocol import DataProto
+from verl.single_controller.ray.base import RayWorkerGroup
+from verl.utils import hf_tokenizer
+from verl.utils.fs import copy_to_local
+from verl.utils.rollout_trace import RolloutTraceConfig, rollout_trace_attr, rollout_trace_op
+
+logger = logging.getLogger(__file__)
+logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN"))
+
+
+class AsyncLLMServerManager:
+    """
+    A class to manage multiple OpenAI compatible LLM servers. This class provides
+    - Load balance: least requests load balancing
+    - Sticky session: send multi-turn chat completions to same server for automatic prefix caching
+    """
+
+    def __init__(self, config: DictConfig, server_handles: list[ray.actor.ActorHandle], max_cache_size: int = 10000):
+        """Initialize the AsyncLLMServerManager.
+
+        Args:
+            config (DictConfig): YAML config.
+            server_handles (List[ray.actor.ActorHandle]): OpenAI compatible LLM server actor handles.
+            max_cache_size (int, optional): max cache size for request_id to server mapping. Defaults to 10000.
+        """
+        self.config = config
+        self.server_handles = server_handles
+        random.shuffle(self.server_handles)
+
+        # Least requests load balancing
+        self.weighted_serveres = [[0, (hash(server), server)] for server in server_handles]
+        heapq.heapify(self.weighted_serveres)
+
+        # LRU cache to map request_id to server
+        self.request_id_to_server = LRUCache(maxsize=max_cache_size)
+
+    def _choose_server(self, request_id: str) -> ray.actor.ActorHandle:
+        # TODO: implement server pressure awareness load balancing
+        if request_id in self.request_id_to_server:
+            return self.request_id_to_server[request_id]
+
+        server = self.weighted_serveres[0][1][1]
+        self.weighted_serveres[0][0] += 1
+        heapq.heapreplace(self.weighted_serveres, self.weighted_serveres[0])
+        self.request_id_to_server[request_id] = server
+        return server
+
+    @rollout_trace_op
+    async def generate(
+        self,
+        request_id,
+        *,
+        prompt_ids: list[int],
+        sampling_params: dict[str, Any],
+    ) -> list[int]:
+        """Generate tokens from prompt ids.
+
+        Args:
+            request_id (str): request id for sticky session.
+            prompt_ids (List[int]): List of prompt token ids.
+            sampling_params (Dict[str, Any]): Sampling parameters for the chat completion.
+
+        Returns:
+            List[int]: List of generated token ids.
+        """
+        server = self._choose_server(request_id)
+        output = await server.generate.remote(
+            request_id=request_id,
+            prompt_ids=prompt_ids,
+            sampling_params=sampling_params,
+        )
+        return output
+
+    async def generate_for_partial(self, request_id, prompt_ids, sampling_params):
+        """Generate tokens from prompt ids. with partial rollout function"""
+        server = self._choose_server(request_id)
+        output = await server.generate_for_partial.remote(
+            request_id=request_id,
+            prompt_ids=prompt_ids,
+            sampling_params=sampling_params,
+        )
+        return output
+
+
+class AgentLoopMetrics(BaseModel):
+    """Agent loop performance metrics."""
+
+    generate_sequences: float = 0.0
+    tool_calls: float = 0.0
+
+
+class AgentLoopOutput(BaseModel):
+    """Agent loop output."""
+
+    prompt_ids: list[int]
+    """Prompt token ids."""
+    response_ids: list[int]
+    """Response token ids including LLM generated token, tool response token."""
+    response_mask: list[int]
+    """Response mask, 1 for LLM generated token, 0 for tool response token."""
+    num_turns: int = 0
+    """Number of chat turns, including user, assistant, tool."""
+    metrics: AgentLoopMetrics
+    """Auxiliary performance metrics"""
+    is_cancel: bool = False
+    """Indicates whether the request was interrupted"""
+    log_probs: list[float] = None
+    """Response token log probs including LLM generated token, tool response token."""
+
+
+# make hydra.utils.instantiate happy
+class _DummyConfig:
+    def __init__(self, config: DictConfig) -> None:
+        self.config = config
+
+
+class AgentLoopBase(ABC):
+    """An agent loop takes a input message, chat with OpenAI compatible LLM server and interact with various
+    environments."""
+
+    _class_initialized = False
+
+    def __init__(
+        self, trainer_config: _DummyConfig, server_manager: AsyncLLMServerManager, tokenizer: AutoTokenizer, **kwargs
+    ):
+        """Initialize agent loop, each sample will have its own loop instance.
+
+        Args:
+            trainer_config (_DummyConfig): trainer config.
+            server_manager (AsyncLLMServerManager): OpenAI compatible LLM server manager.
+            tokenizer (AutoTokenizer): Tokenizer for tokenize messages.
+        """
+        self.init_class(trainer_config.config, tokenizer, **kwargs)
+        self.config = trainer_config.config
+        self.server_manager = server_manager
+        self.tokenizer = tokenizer
+        self.loop = asyncio.get_running_loop()
+
+    @classmethod
+    def init_class(cls, config: DictConfig, tokenizer: AutoTokenizer, **kwargs):
+        """This is used to do heavy initialization work that should shared across all instances. It's only called once.
+
+        Args:
+            config (DictConfig): trainer config.
+            tokenizer (AutoTokenizer): Tokenizer for tokenize messages.
+            **kwargs: extra kwargs from config file passed in by `hydra.utils.instantiate`.
+        """
+        if cls._class_initialized:
+            return
+        cls._class_initialized = True
+
+    @abstractmethod
+    async def run(self, messages: list[dict[str, Any]], sampling_params: dict[str, Any]) -> AgentLoopOutput:
+        """Run agent loop to interact with LLM server and environment.
+
+        Args:
+            messages (List[Dict[str, Any]]): Input messages.
+            sampling_params (Dict[str, Any]): LLM sampling params.
+
+        Returns:
+            AgentLoopOutput: Agent loop output.
+        """
+        raise NotImplementedError
+
+
+"""Agent loop registry: key is agent_name, value is a dict of agent loop config
+used by hydra.utils.instantiate to initialize agent loop instance.
+
+https://hydra.cc/docs/advanced/instantiate_objects/overview/
+"""
+_agent_loop_registry: dict[str, dict] = {}
+
+
+def register(agent_name: str):
+    """Register agent loop class."""
+
+    def decorator(subclass: type[AgentLoopBase]) -> type[AgentLoopBase]:
+        fqdn = f"{subclass.__module__}.{subclass.__qualname__}"
+        _agent_loop_registry[agent_name] = {"_target_": fqdn}
+        return subclass
+
+    return decorator
+
+
+def postprocess_agent_loop_outputs(inputs: list[AgentLoopOutput], tokenizer, config) -> DataProto:
+    """Static method to postprocess a list of AgentLoopOutput into DataProto
+
+    Args:
+        inputs: List of AgentLoopOutput
+        tokenizer: Tokenizer instance
+        config: Configuration object
+
+    Returns:
+        DataProto: Processed batch data
+    """
+    # NOTE: consistent with batch version of generate_sequences in vllm_rollout_spmd.py
+    # prompts: left pad
+    # responses: right pad
+    # input_ids: prompt + response
+    # attention_mask: [0,0,0,0,1,1,1,1, | 1,1,1,0,0,0,0,0]
+    # position_ids:   [0,0,0,0,0,1,2,3, | 4,5,6,7,8,9,10,11]
+
+    # prompts
+    tokenizer.padding_side = "left"
+    outputs = tokenizer.pad(
+        [{"input_ids": input.prompt_ids} for input in inputs],
+        padding="max_length",
+        max_length=config.actor_rollout_ref.rollout.prompt_length,
+        return_tensors="pt",
+        return_attention_mask=True,
+    )
+    prompt_ids, prompt_attention_mask = outputs["input_ids"], outputs["attention_mask"]
+
+    # responses
+    tokenizer.padding_side = "right"
+    outputs = tokenizer.pad(
+        [{"input_ids": input.response_ids} for input in inputs],
+        padding="max_length",
+        max_length=config.actor_rollout_ref.rollout.response_length,
+        return_tensors="pt",
+        return_attention_mask=True,
+    )
+    response_ids, response_attention_mask = outputs["input_ids"], outputs["attention_mask"]
+
+    # response_mask
+    outputs = tokenizer.pad(
+        [{"input_ids": input.response_mask} for input in inputs],
+        padding="max_length",
+        max_length=config.actor_rollout_ref.rollout.response_length,
+        return_tensors="pt",
+        return_attention_mask=False,
+    )
+    response_mask = outputs["input_ids"]
+    assert response_ids.shape == response_mask.shape, (
+        f"mismatch in response_ids and response_mask shape: {response_ids.shape} vs {response_mask.shape}"
+    )
+    response_mask = response_mask * response_attention_mask
+
+    input_ids = torch.cat([prompt_ids, response_ids], dim=1)
+    attention_mask = torch.cat([prompt_attention_mask, response_attention_mask], dim=1)
+    position_ids = (attention_mask.cumsum(dim=1) - 1) * attention_mask
+
+    batch = TensorDict(
+        {
+            "prompts": prompt_ids,  # [bsz, prompt_length]
+            "responses": response_ids,  # [bsz, response_length]
+            "response_mask": response_mask,  # [bsz, response_length]
+            "input_ids": input_ids,  # [bsz, prompt_length + response_length]
+            "attention_mask": attention_mask,  # [bsz, prompt_length + response_length]
+            "position_ids": position_ids,  # [bsz, prompt_length + response_length]
+        },
+        batch_size=len(input_ids),
+    )
+
+    num_turns = np.array([input.num_turns for input in inputs], dtype=np.int32)
+    metrics = [input.metrics.model_dump() for input in inputs]
+    return DataProto(batch=batch, non_tensor_batch={"__num_turns__": num_turns}, meta_info={"metrics": metrics})
+
+
+@ray.remote
+class AgentLoopWorker:
+    """Agent loop worker takes a batch of messages and run each message in an agent loop."""
+
+    def __init__(self, config: DictConfig, server_handles: list[ray.actor.ActorHandle]):
+        """Initialize agent loop manager.
+
+        Args:
+            config (DictConfig): YAML config.
+            server_handles (List[ray.actor.ActorHandle]): OpenAI compatible LLM server actor handles.
+        """
+        self.config = config
+        self.server_manager = AsyncLLMServerManager(config, server_handles)
+
+        model_path = config.actor_rollout_ref.model.path
+        self.model_name = "/".join(model_path.split("/")[-2:])
+        local_path = copy_to_local(config.actor_rollout_ref.model.path)
+        self.tokenizer = hf_tokenizer(local_path, trust_remote_code=True)
+
+        agent_loop_config_path = config.actor_rollout_ref.rollout.agent.agent_loop_config_path
+        if agent_loop_config_path:
+            agent_loop_configs = OmegaConf.load(agent_loop_config_path)
+            for agent_loop_config in agent_loop_configs:
+                _agent_loop_registry[agent_loop_config.name] = agent_loop_config
+
+        trace_config = config.trainer.get("rollout_trace", {})
+        trace_config = self.config.actor_rollout_ref.rollout.get("trace", {})
+        RolloutTraceConfig.init(
+            self.config.trainer.project_name,
+            self.config.trainer.experiment_name,
+            trace_config.get("backend"),
+            trace_config.get("token2text", False),
+        )
+
+    async def generate_sequences(self, batch: DataProto) -> DataProto:
+        """Generate sequences from agent loop.
+
+        Args:
+            batch (DataProto): Input batch.
+
+        Returns:
+            DataProto: Output batch.
+            - prompts: [bsz, prompt_length], prompt token ids from dataset.
+            - responses: [bsz, response_length], output token ids include response tokens
+              from LLM generation and observation tokens from tool_calls.
+            - response_mask: [bsz, response_length], 1 for LLM generated tokens, 0 for observation/padding tokens.
+            - input_ids: [bsz, prompt_length + response_length], whole sequence token ids, including prompt tokens
+              and response tokens.
+            - attention_mask: [bsz, prompt_length + response_length], 0 for padding tokens, 1 for other tokens.
+            - position_ids: [bsz, prompt_length + response_length], incremental position ids.
+
+            For multi-turn conversations:
+            responses:     |<- LLM generation ->|<- tool_calls ->|<- LLM generation ->|<- padding ->|
+            response_mask: | 1, 1, 1, ..., 1, 1 | 0, 0, .., 0, 0 | 1, 1, 1, ..., 1, 1 | 0, 0, ..., 0|
+        """
+        config = self.config.actor_rollout_ref.rollout
+        sampling_params = dict(
+            temperature=config.temperature,
+            top_p=config.top_p,
+            repetition_penalty=1.0,
+        )
+
+        # override sampling params for validation
+        if batch.meta_info.get("validate", False):
+            sampling_params["top_p"] = config.val_kwargs.top_p
+            sampling_params["temperature"] = config.val_kwargs.temperature
+
+        # by default, we assume it's a single turn agent
+        if "agent_name" not in batch.non_tensor_batch:
+            batch.non_tensor_batch["agent_name"] = np.array(["single_turn_agent"] * len(batch), dtype=object)
+
+        tasks = []
+        agent_names = batch.non_tensor_batch["agent_name"]
+        raw_prompts = batch.non_tensor_batch["raw_prompt"]
+        if "index" in batch.non_tensor_batch:
+            index = batch.non_tensor_batch["index"]
+        else:
+            index = np.arange(len(raw_prompts))
+
+        trajectory_info = await get_trajectory_info(
+            batch.meta_info.get("global_steps", -1), index, batch.meta_info.get("validate", False)
+        )
+
+        for agent_name, messages, trajectory in zip(agent_names, raw_prompts, trajectory_info, strict=True):
+            tasks.append(
+                asyncio.create_task(self._run_agent_loop(agent_name, messages.tolist(), sampling_params, trajectory))
+            )
+        outputs = await asyncio.gather(*tasks)
+
+        output = postprocess_agent_loop_outputs(outputs, self.tokenizer, self.config)
+        return output
+
+    async def generate_sequences_no_post(
+        self, batch: DataProto, partial_output_list: Optional[list[AgentLoopOutput]]
+    ) -> list[AgentLoopOutput]:
+        """Generate sequences from agent loop.
+
+        Args:
+            batch (DataProto): Input batch.
+            partial_output_list: Optional[List[AgentLoopOutput]]: already rollout result.
+
+        Returns:
+            list[AgentLoopOutput]: List of agent loop outputs, one per sample in the batch.
+            Each AgentLoopOutput contains:
+            - prompt_ids: prompt token ids
+            - response_ids: response token ids including LLM generated and tool response tokens
+            - response_mask: 1 for LLM generated tokens, 0 for tool response tokens
+            - num_turns: number of chat turns
+            - metrics: performance metrics
+        """
+        config = self.config.actor_rollout_ref.rollout
+        sampling_params = dict(
+            temperature=config.temperature,
+            top_p=config.top_p,
+            repetition_penalty=1.0,
+        )
+
+        # override sampling params for validation
+        if batch.meta_info.get("validate", False):
+            sampling_params["top_p"] = config.val_kwargs.top_p
+            sampling_params["temperature"] = config.val_kwargs.temperature
+
+        # by default, we assume it's a single turn agent
+        if "agent_name" not in batch.non_tensor_batch:
+            batch.non_tensor_batch["agent_name"] = np.array(["single_turn_agent"] * len(batch), dtype=object)
+
+        tasks = []
+        agent_names = batch.non_tensor_batch["agent_name"]
+        raw_prompts = batch.non_tensor_batch["raw_prompt"]
+        if "index" in batch.non_tensor_batch:
+            index = batch.non_tensor_batch["index"]
+        else:
+            index = np.arange(len(raw_prompts))
+
+        trajectory_info = await get_trajectory_info(
+            batch.meta_info.get("global_steps", -1), index, batch.meta_info.get("validate", False)
+        )
+        if not partial_output_list:
+            partial_output_list = [None] * len(batch)
+
+        for agent_name, messages, trajectory, partial_output in zip(
+            agent_names, raw_prompts, trajectory_info, partial_output_list, strict=True
+        ):
+            tasks.append(
+                asyncio.create_task(
+                    self._run_agent_loop(agent_name, messages.tolist(), sampling_params, trajectory, partial_output)
+                )
+            )
+        outputs = await asyncio.gather(*tasks)
+
+        return outputs
+
+    async def _run_agent_loop(
+        self,
+        agent_name: str,
+        messages: list[dict[str, Any]],
+        sampling_params: dict[str, Any],
+        trajectory: dict[str, Any],
+        partial_output: Optional[AgentLoopOutput] = None,
+    ) -> AgentLoopOutput:
+        with rollout_trace_attr(
+            step=trajectory["step"],
+            sample_index=trajectory["sample_index"],
+            rollout_n=trajectory["rollout_n"],
+            validate=trajectory["validate"],
+            name="agent_loop",
+        ):
+            assert agent_name in _agent_loop_registry, (
+                f"Agent loop {agent_name} not registered, registered agent loops: {_agent_loop_registry.keys()}"
+            )
+            agent_loop_config = _agent_loop_registry[agent_name]
+            agent_loop = hydra.utils.instantiate(
+                config=agent_loop_config,
+                trainer_config=_DummyConfig(config=self.config),
+                server_manager=self.server_manager,
+                tokenizer=self.tokenizer,
+            )
+            if agent_name == "partial_single_turn_agent":
+                output = await agent_loop.run(messages, sampling_params, partial_output)
+            else:
+                output = await agent_loop.run(messages, sampling_params)
+            return output
+
+
+async def get_trajectory_info(step, index, validate):
+    """Get trajectory info.
+
+    Args:
+        step (int): global steps in the trainer.
+        index (list): form datastore extra_info.index column.
+        validate (bool): whether is a validate step.
+
+    Returns:
+        list: trajectory.
+    """
+    trajectory_info = []
+    rollout_n = 0
+    for i in range(len(index)):
+        if i > 0 and index[i - 1] == index[i]:
+            rollout_n += 1
+        else:
+            rollout_n = 0
+        trajectory_info.append({"step": step, "sample_index": index[i], "rollout_n": rollout_n, "validate": validate})
+    return trajectory_info
+
+
+class AgentLoopManager:
+    """Agent loop manager that manages a group of agent loop workers."""
+
+    def __init__(self, config: DictConfig, worker_group: RayWorkerGroup):
+        """Initialize agent loop manager.
+
+        Args:
+            config (DictConfig): trainer config.
+            worker_group (RayWorkerGroup): AsyncActorRolloutRefWorker worker group.
+        """
+        self.config = config
+        self.worker_group = worker_group
+
+        self._initialize_llm_servers()
+        self._init_agent_loop_workers()
+
+        # Initially we're in sleep mode.
+        self.sleep()
+
+    def _initialize_llm_servers(self):
+        self.rollout_tp_size = self.config.actor_rollout_ref.rollout.tensor_model_parallel_size
+        self.rollout_dp_size = self.worker_group.world_size // self.rollout_tp_size
+
+        register_center = ray.get_actor(f"{self.worker_group.name_prefix}_register_center")
+        workers_info = ray.get(register_center.get_worker_info.remote())
+        assert len(workers_info) == self.worker_group.world_size
+
+        self.async_llm_servers = [None] * self.rollout_dp_size
+        self.server_addresses = [None] * self.rollout_dp_size
+
+        if self.config.actor_rollout_ref.rollout.agent.custom_async_server:
+            server_class = async_server_class(
+                rollout_backend=self.config.actor_rollout_ref.rollout.name,
+                rollout_backend_module=self.config.actor_rollout_ref.rollout.agent.custom_async_server.path,
+                rollout_backend_class=self.config.actor_rollout_ref.rollout.agent.custom_async_server.name,
+            )
+        else:
+            server_class = async_server_class(rollout_backend=self.config.actor_rollout_ref.rollout.name)
+
+        # Start all server instances, restart if address already in use.
+        unready_dp_ranks = set(range(self.rollout_dp_size))
+        while len(unready_dp_ranks) > 0:
+            servers = {
+                rollout_dp_rank: server_class.options(
+                    # make sure AsyncvLLMServer colocates with its corresponding workers
+                    scheduling_strategy=ray.util.scheduling_strategies.NodeAffinitySchedulingStrategy(
+                        node_id=workers_info[rollout_dp_rank * self.rollout_tp_size],
+                        soft=False,
+                    ),
+                    name=f"async_llm_server_{rollout_dp_rank}",
+                ).remote(self.config, self.rollout_dp_size, rollout_dp_rank, self.worker_group.name_prefix)
+                for rollout_dp_rank in unready_dp_ranks
+            }
+
+            for rollout_dp_rank, server in servers.items():
+                try:
+                    address = ray.get(server.get_server_address.remote())
+                    self.server_addresses[rollout_dp_rank] = address
+                    self.async_llm_servers[rollout_dp_rank] = server
+                    unready_dp_ranks.remove(rollout_dp_rank)
+                except Exception:
+                    ray.kill(server)
+                    print(f"rollout server {rollout_dp_rank} failed, maybe address already in use, restarting...")
+
+        # All server instances are ready, init AsyncLLM engine.
+        ray.get([server.init_engine.remote() for server in self.async_llm_servers])
+
+    def _init_agent_loop_workers(self):
+        self.agent_loop_workers = []
+        for i in range(self.config.actor_rollout_ref.rollout.agent.num_workers):
+            self.agent_loop_workers.append(
+                AgentLoopWorker.options(
+                    name=f"agent_loop_worker_{i}",
+                ).remote(self.config, self.async_llm_servers)
+            )
+
+    def generate_sequences(self, prompts: DataProto) -> DataProto:
+        """Split input batch and dispatch to agent loop workers.
+
+        Args:
+            prompts (DataProto): Input batch.
+
+        Returns:
+            DataProto: Output batch.
+        """
+        if self.config.actor_rollout_ref.rollout.free_cache_engine:
+            self.wake_up()
+        chunkes = prompts.chunk(len(self.agent_loop_workers))
+        outputs = ray.get(
+            [
+                worker.generate_sequences.remote(chunk)
+                for worker, chunk in zip(self.agent_loop_workers, chunkes, strict=True)
+            ]
+        )
+        output = DataProto.concat(outputs)
+        if self.config.actor_rollout_ref.rollout.free_cache_engine:
+            self.sleep()
+
+        # calculate performance metrics
+        metrics = [output.meta_info["metrics"] for output in outputs]  # List[List[Dict[str, str]]]
+        timing = self._performance_metrics(metrics, output)
+
+        output.meta_info = {"timing": timing}
+        return output
+
+    async def generate_single_sample_async(
+        self,
+        sample: DataProto,
+        partial_output_list: Optional[list[AgentLoopOutput]],
+    ) -> list[AgentLoopOutput]:
+        """
+        异步处理单个样本, 需要复制n次
+
+        Args:
+            sample: 单个样本数据
+            partial_output_list: Optional[List[AgentLoopOutput]]: already rollout result.
+
+        Returns:
+            tuple[AgentLoopOutput, float]: 处理结果和处理时间
+        """
+        # 使用负载均衡选择 worker
+        worker = self._select_best_worker()
+        # 异步处理单个样本 - 使用无后处理版本获取原始AgentLoopOutput
+        output_future = worker.generate_sequences_no_post.remote(sample, partial_output_list)
+        return await asyncio.wrap_future(output_future.future())
+
+    def _select_best_worker(self):
+        """选择最佳的 worker（简单的轮询负载均衡）"""
+        if not hasattr(self, "_worker_index"):
+            self._worker_index = 0
+
+        worker = self.agent_loop_workers[self._worker_index]
+        self._worker_index = (self._worker_index + 1) % len(self.agent_loop_workers)
+        return worker
+
+    def _performance_metrics(self, metrics: list[list[dict[str, str]]], output: DataProto) -> dict[str, float]:
+        timing = {}
+        t_generate_sequences = np.array([metric["generate_sequences"] for chunk in metrics for metric in chunk])
+        t_tool_calls = np.array([metric["tool_calls"] for chunk in metrics for metric in chunk])
+        timing["agent_loop/generate_sequences/min"] = t_generate_sequences.min()
+        timing["agent_loop/generate_sequences/max"] = t_generate_sequences.max()
+        timing["agent_loop/generate_sequences/mean"] = t_generate_sequences.mean()
+        timing["agent_loop/tool_calls/min"] = t_tool_calls.min()
+        timing["agent_loop/tool_calls/max"] = t_tool_calls.max()
+        timing["agent_loop/tool_calls/mean"] = t_tool_calls.mean()
+
+        # batch sequence generation is bounded by the slowest sample
+        slowest = np.argmax(t_generate_sequences + t_tool_calls)
+        attention_mask = output.batch["attention_mask"][slowest]
+        prompt_length = output.batch["prompts"].shape[1]
+        timing["agent_loop/slowest/generate_sequences"] = t_generate_sequences[slowest]
+        timing["agent_loop/slowest/tool_calls"] = t_tool_calls[slowest]
+        timing["agent_loop/slowest/prompt_length"] = attention_mask[:prompt_length].sum().item()
+        timing["agent_loop/slowest/response_length"] = attention_mask[prompt_length:].sum().item()
+
+        return timing
+
+    def wake_up(self):
+        """Wake up all rollout server instances."""
+        ray.get([server.wake_up.remote() for server in self.async_llm_servers])
+
+    def sleep(self):
+        """Sleep all rollout server instances."""
+        ray.get([server.sleep.remote() for server in self.async_llm_servers])
+
+    async def cancel_async(self):
+        """Cancel all rollout tasks asynchronously."""
+        futures = [server.cancel.remote() for server in self.async_llm_servers]
+        await asyncio.gather(*[asyncio.wrap_future(future.future()) for future in futures], return_exceptions=True)
+
+    async def resume_async(self):
+        """Cancel all rollout tasks asynchronously."""
+        futures = [server.resume.remote() for server in self.async_llm_servers]
+        await asyncio.gather(*[asyncio.wrap_future(future.future()) for future in futures], return_exceptions=True)
+
+
+
+from verl.workers.rollout.async_server import AsyncServerBase
+def async_server_class(
+    rollout_backend: str, rollout_backend_module: Optional[str] = None, rollout_backend_class: Optional[str] = None
+) -> type[AsyncServerBase]:
+    """Get async server class.
+
+    Args:
+        rollout_backend: str, rollout backend type (alias), should be "vllm".
+        rollout_backend_module: Optional[str], import path of the rollout backend.
+        rollout_backend_class: Optional[str], class name of the rollout backend.
+
+    Returns:
+        Type[AsyncServerBase]: async server class.
+    """
+    if rollout_backend_class is None and rollout_backend_module is None:
+        # If both are None, use the default backend class
+        # Do not change the original import behavior
+        # importlib.import_module and from ... import ... have subtle differences in ray
+
+        if rollout_backend == "vllm":
+            from recipe.fully_async_policy.agent_loop.vllm_async_server import AsyncvLLMServer
+            return AsyncvLLMServer
+        else:
+            raise NotImplementedError(f"rollout backend {rollout_backend} is not supported")
+
+    if rollout_backend_module is None or rollout_backend_class is None:
+        raise ValueError("rollout_backend_module and rollout_backend_class must be both provided for customization")
+
+    from verl.utils.import_utils import load_extern_type
+
+    return load_extern_type(rollout_backend_module, rollout_backend_class)
\ No newline at end of file
diff --git a/recipe/fully_async_policy/agent_loop/partial_single_turn_agent_loop.py b/recipe/fully_async_policy/agent_loop/partial_single_turn_agent_loop.py
new file mode 100644
index 00000000000..cf95c1eb965
--- /dev/null
+++ b/recipe/fully_async_policy/agent_loop/partial_single_turn_agent_loop.py
@@ -0,0 +1,74 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+import os
+from typing import Any, Optional
+from uuid import uuid4
+
+from recipe.fully_async_policy.agent_loop.agent_loop import AgentLoopBase, AgentLoopOutput, register
+from verl.utils.profiler import simple_timer
+
+logger = logging.getLogger(__file__)
+logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN"))
+
+
+@register("partial_single_turn_agent")
+class PartialSingleTurnAgentLoop(AgentLoopBase):
+    """Naive agent loop that only do single turn chat completion."""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.prompt_length = self.config.actor_rollout_ref.rollout.prompt_length
+        self.response_length = self.config.actor_rollout_ref.rollout.response_length
+
+    async def run(
+        self, messages: list[dict[str, Any]], sampling_params: dict[str, Any], output: Optional[AgentLoopOutput]
+    ) -> AgentLoopOutput:
+        if not output:
+            prompt_ids = await self.loop.run_in_executor(
+                None, lambda: self.tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=True)
+            )
+        else:
+            if output.is_cancel:
+                # 恢复暂停的样本，结果直接添加到 prompt_ids 后面
+                prompt_ids = output.prompt_ids + output.response_ids
+            else:
+                # 同一批样本，部分cancel，部分没有cancel， 没有cancel的样本直接返回
+                return output
+
+        metrics = {}
+        request_id = uuid4().hex
+        with simple_timer("generate_sequences", metrics):
+            response_ids, log_probs, is_cancel = await self.server_manager.generate_for_partial(
+                request_id=request_id, prompt_ids=prompt_ids, sampling_params=sampling_params
+            )
+
+        if not output:
+            response_mask = [1] * len(response_ids)
+        # 暂停待恢复样本, 把输出结果加到 response_ids 后，并重置 response_mask
+        else:
+            prompt_ids = output.prompt_ids
+            log_probs = output.log_probs + log_probs
+            response_ids = output.response_ids + response_ids
+            response_mask = [1] * len(response_ids)
+
+        return AgentLoopOutput(
+            prompt_ids=prompt_ids,
+            response_ids=response_ids[: self.response_length],
+            response_mask=response_mask[: self.response_length],
+            num_turns=2,
+            metrics=metrics,
+            is_cancel=is_cancel,
+            log_probs=log_probs,
+        )
diff --git a/recipe/fully_async_policy/agent_loop/single_turn_agent_loop.py b/recipe/fully_async_policy/agent_loop/single_turn_agent_loop.py
new file mode 100644
index 00000000000..6dcdf327b09
--- /dev/null
+++ b/recipe/fully_async_policy/agent_loop/single_turn_agent_loop.py
@@ -0,0 +1,55 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+import os
+from typing import Any
+from uuid import uuid4
+
+from recipe.fully_async_policy.agent_loop.agent_loop import AgentLoopBase, AgentLoopOutput, register
+from verl.utils.profiler import simple_timer
+
+logger = logging.getLogger(__file__)
+logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN"))
+
+
+@register("single_turn_agent")
+class SingleTurnAgentLoop(AgentLoopBase):
+    """Naive agent loop that only do single turn chat completion."""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.prompt_length = self.config.actor_rollout_ref.rollout.prompt_length
+        self.response_length = self.config.actor_rollout_ref.rollout.response_length
+
+    async def run(self, messages: list[dict[str, Any]], sampling_params: dict[str, Any]) -> AgentLoopOutput:
+        metrics = {}
+        request_id = uuid4().hex
+        prompt_ids = await self.loop.run_in_executor(
+            None, lambda: self.tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=True)
+        )
+
+        with simple_timer("generate_sequences", metrics):
+            response_ids = await self.server_manager.generate(
+                request_id=request_id, prompt_ids=prompt_ids, sampling_params=sampling_params
+            )
+        response_mask = [1] * len(response_ids)
+
+        output = AgentLoopOutput(
+            prompt_ids=prompt_ids,
+            response_ids=response_ids[: self.response_length],
+            response_mask=response_mask[: self.response_length],
+            num_turns=2,
+            metrics=metrics,
+        )
+        return output
diff --git a/recipe/fully_async_policy/agent_loop/vllm_async_server.py b/recipe/fully_async_policy/agent_loop/vllm_async_server.py
new file mode 100644
index 00000000000..03fc28c8549
--- /dev/null
+++ b/recipe/fully_async_policy/agent_loop/vllm_async_server.py
@@ -0,0 +1,401 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import asyncio
+import logging
+import os
+import pickle
+from typing import Any, Callable, Optional, Sequence
+
+import ray
+import zmq
+from omegaconf import DictConfig
+from starlette.requests import Request
+from starlette.responses import JSONResponse, StreamingResponse
+from vllm import SamplingParams
+from vllm.engine.arg_utils import AsyncEngineArgs
+from vllm.entrypoints.logger import RequestLogger
+from vllm.entrypoints.openai.protocol import ChatCompletionRequest, ChatCompletionResponse, ErrorResponse
+from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
+from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
+from vllm.inputs import TokensPrompt
+from vllm.outputs import RequestOutput
+from vllm.v1.engine.async_llm import AsyncLLM
+from vllm.v1.executor.abstract import Executor
+from vllm.worker.worker_base import WorkerWrapperBase
+
+from verl.utils.fs import copy_to_local
+from verl.workers.rollout.async_server import AsyncServerBase
+
+logger = logging.getLogger(__file__)
+
+
+def _get_model_runner_workers(vllm_config, init_ray: bool = True):
+    assert vllm_config.instance_id is not None, "instance_id must be set for external ray actors."
+
+    fields = vllm_config.instance_id.split(":")
+    assert len(fields) == 4, (
+        f"instance_id: {vllm_config.instance_id} must be in the format of "
+        f"<namespace>:<wg_prefix>:<vllm_dp_size>:<vllm_dp_rank>."
+    )
+    namespace, wg_prefix, vllm_dp_size, vllm_dp_rank = fields[0], fields[1], int(fields[2]), int(fields[3])
+
+    # Make sure subprocess in same namespace as parent actor.
+    # actor name format: {name_prefix}WorkerDict_{pg_idx}:{local_rank}
+    if init_ray:
+        ray.init(namespace=namespace)
+    actor_names = [
+        actor_name for actor_name in ray.util.list_named_actors() if actor_name.startswith(f"{wg_prefix}WorkerDict")
+    ]
+
+    vllm_tp_size = vllm_config.parallel_config.tensor_parallel_size
+    assert len(actor_names) == vllm_dp_size * vllm_tp_size, (
+        f"instance_id: {vllm_config.instance_id} has {len(actor_names)} actors, but vllm_dp_size: "
+        f"{vllm_dp_size} * vllm_tp_size: {vllm_tp_size} = {vllm_dp_size * vllm_tp_size} is expected."
+    )
+
+    def get_pg_index_and_local_rank(actor_name) -> tuple[int, int]:
+        fields = actor_name.split(":")
+        assert len(fields) == 2, f"invalid actor name: {actor_name}"
+        pg_index, local_rank = int(fields[0].split("_")[-1]), int(fields[1])
+        return pg_index, local_rank
+
+    # sort actor names by pg_index and local_rank
+    actor_names = sorted(actor_names, key=get_pg_index_and_local_rank)
+    actor_names = actor_names[vllm_dp_rank * vllm_tp_size : (vllm_dp_rank + 1) * vllm_tp_size]
+    workers: list[WorkerWrapperBase] = [ray.get_actor(actor_name) for actor_name in actor_names]
+    print(f"instance_id: {vllm_config.instance_id} initializes with external actors: {actor_names}")
+
+    return workers
+
+
+class ExternalRayDistributedExecutor(Executor):
+    """An executor that engines are launched by external ray actors."""
+
+    uses_ray: bool = False
+
+    def _init_executor(self) -> None:
+        self.workers = _get_model_runner_workers(vllm_config=self.vllm_config, init_ray=True)
+
+        kwargs = dict(
+            vllm_config=self.vllm_config,
+            local_rank=None,
+            rank=None,
+            distributed_init_method="env://",
+            is_driver_worker=True,
+        )
+        self.collective_rpc("init_worker", args=([kwargs],))
+        self.collective_rpc("init_device")
+        self.collective_rpc("load_model")
+        print(f"instance_id: {self.vllm_config.instance_id} initializes finished.")
+
+    def collective_rpc(
+        self,
+        method: str | Callable,
+        timeout: Optional[float] = None,
+        args: tuple = (),
+        kwargs: Optional[dict[str, Any]] = None,
+    ) -> list[Any]:
+        # TODO(wuxibin): support ray compiled graph
+        if isinstance(method, str):
+            sent_method = method
+        else:
+            sent_method = pickle.dumps(method)
+        del method
+
+        # ~3ms overhead per schedule step due to SchedulerOutput/ModelRunnerOutput serialization/deserialization.
+        outputs = ray.get(
+            [worker.execute_method.remote(sent_method, *args, **(kwargs or {})) for worker in self.workers]
+        )
+        return outputs
+
+    def check_health(self):
+        return
+
+
+class ExternalZeroMQDistributedExecutor(Executor):
+    """An executor that engines are launched by external ray actors."""
+
+    uses_ray: bool = False
+
+    def _init_executor(self) -> None:
+        addresses = os.environ["VERL_VLLM_ZMQ_ADDRESSES"].split(",")
+        self.context = zmq.Context()
+        self.sockets = []
+        for address in addresses:
+            socket = self.context.socket(zmq.REQ)
+            socket.connect(address)
+            self.sockets.append(socket)
+
+        kwargs = dict(
+            vllm_config=self.vllm_config,
+            local_rank=None,
+            rank=None,
+            distributed_init_method="env://",
+            is_driver_worker=True,
+        )
+        self.collective_rpc("init_worker", args=([kwargs],))
+        self.collective_rpc("init_device")
+        self.collective_rpc("load_model")
+
+    def collective_rpc(
+        self,
+        method: str | Callable,
+        timeout: Optional[float] = None,
+        args: tuple = (),
+        kwargs: Optional[dict[str, Any]] = None,
+    ) -> list[Any]:
+        if isinstance(method, str):
+            sent_method = method
+        else:
+            sent_method = pickle.dumps(method)
+        del method
+
+        message = pickle.dumps((sent_method, args, kwargs or {}))
+        for socket in self.sockets:
+            socket.send(message, zmq.DONTWAIT)
+
+        outputs = []
+        for socket in self.sockets:
+            outputs.append(pickle.loads(socket.recv()))
+        return outputs
+
+    def check_health(self):
+        return
+
+
+@ray.remote(num_cpus=1)
+class AsyncvLLMServer(AsyncServerBase):
+    """
+    AsyncvLLMServer is a wrapper for AsyncLLM, it uses ExternalRayDistributedExecutor to launch engines
+    in hybrid rollout workers, i.e AsyncActorRolloutRefWorker.
+
+    AsyncvLLMServer works as follows:
+    1. Start FastAPI server first.
+    2. Initialize AsyncLLM with ExternalRayDistributedExecutor.
+    3. AsyncLLM spawn EngineCore in subprocess.
+    4. EngineCore initialize ExternalRayDistributedExecutor.
+    5. ExternalRayDistributedExecutor lookup its corresponding actors by name.
+    6. ExternalRayDistributedExecutor init executor: init_worker, init_device, load_model.
+
+    For vLLM AsyncLLM design, see: https://github.com/vllm-project/vllm/pull/9826
+    """
+
+    def __init__(self, config: DictConfig, vllm_dp_size: int, vllm_dp_rank: int, wg_prefix: str):
+        """
+        Args:
+            config: DictConfig.
+            vllm_dp_size: int, vllm data parallel size.
+            vllm_dp_rank: int, vllm data parallel rank.
+            wg_prefix: str, worker group prefix, used to lookup actors.
+        """
+        super().__init__()
+
+        self.config = config.actor_rollout_ref
+        self.vllm_dp_size = vllm_dp_size
+        self.vllm_dp_rank = vllm_dp_rank
+        self.wg_prefix = wg_prefix
+        self.engine: AsyncLLM = None
+
+        # for cancel LLMServer
+        self.paused = False
+        self.lock = asyncio.Lock()
+        self.cancel_event: dict[str, asyncio.Event] = {}
+        self.req_output: dict[str, Optional[RequestOutput]] = {}
+
+    async def init_engine(self):
+        """Init vLLM AsyncLLM engine."""
+        config = self.config
+        model_path = config.model.path
+        model_name = "/".join(model_path.split("/")[-2:])
+        local_path = copy_to_local(model_path)
+        trust_remote_code = config.model.get("trust_remote_code", False)
+        config = config.rollout
+
+        tensor_parallel_size = config.get("tensor_model_parallel_size", 1)
+        max_num_batched_tokens = config.get("max_num_batched_tokens", 8192)
+        max_model_len = config.max_model_len if config.max_model_len else config.prompt_length + config.response_length
+        self.max_model_len = int(max_model_len)
+
+        # Override default generation config from hugging face model config,
+        # user can still override them by passing kwargs in each request.
+        kwargs = dict(
+            n=1,
+            logprobs=0,
+            repetition_penalty=1.0,
+            max_new_tokens=config.response_length,
+        )
+        for k in config.keys():
+            if hasattr(SamplingParams(), str(k)):
+                kwargs[k] = config.get(k)
+        print(f"override_generation_config: {kwargs}")
+
+        backend = os.environ.get("VERL_VLLM_DISTRIBUTED_BACKEND", "zeromq")
+        if backend == "zeromq":
+            distributed_executor_backend = ExternalZeroMQDistributedExecutor
+        elif backend == "ray":
+            distributed_executor_backend = ExternalRayDistributedExecutor
+        else:
+            distributed_executor_backend = None
+
+        engine_args = AsyncEngineArgs(
+            model=local_path,
+            enable_sleep_mode=config.free_cache_engine,
+            override_generation_config=kwargs,
+            tensor_parallel_size=tensor_parallel_size,
+            distributed_executor_backend=distributed_executor_backend,
+            dtype=config.dtype,
+            enforce_eager=config.enforce_eager,
+            gpu_memory_utilization=config.gpu_memory_utilization,
+            disable_custom_all_reduce=True,
+            skip_tokenizer_init=False,
+            max_model_len=self.max_model_len,
+            load_format="auto",
+            disable_log_stats=config.disable_log_stats,
+            max_num_batched_tokens=max_num_batched_tokens,
+            enable_chunked_prefill=config.enable_chunked_prefill,
+            enable_prefix_caching=True,
+            trust_remote_code=trust_remote_code,
+            seed=config.get("seed", 0),
+        )
+
+        # init async llm engine
+        vllm_config = self._create_engine_config(engine_args)
+        self.engine = AsyncLLM.from_vllm_config(vllm_config)
+
+        # build serving chat
+        model_config = self.engine.model_config
+        BASE_MODEL_PATHS = [BaseModelPath(name=model_name, model_path=model_path)]
+        models = OpenAIServingModels(self.engine, model_config, BASE_MODEL_PATHS)
+        self.openai_serving_chat = OpenAIServingChat(
+            self.engine,
+            model_config,
+            models,
+            "assistant",
+            request_logger=RequestLogger(max_log_len=4096),
+            chat_template=None,
+            chat_template_content_format="auto",
+            enable_auto_tools=config.multi_turn.tool_config_path is not None,
+            tool_parser=config.multi_turn.format,  # hermes, llama3_json, ...
+        )
+
+    def _create_engine_config(self, engine_args: AsyncEngineArgs):
+        vllm_config = engine_args.create_engine_config()
+        namespace = ray.get_runtime_context().namespace
+        vllm_config.instance_id = f"{namespace}:{self.wg_prefix}:{self.vllm_dp_size}:{self.vllm_dp_rank}"
+
+        # VERL_VLLM_ZMQ_ADDRESSES
+        if engine_args.distributed_executor_backend == ExternalZeroMQDistributedExecutor:
+            workers = _get_model_runner_workers(vllm_config=vllm_config, init_ray=False)
+            zmq_addresses = ray.get([worker.get_zeromq_address.remote() for worker in workers])
+            print(f"VERL_VLLM_ZMQ_ADDRESSES: {zmq_addresses}")
+            os.environ["VERL_VLLM_ZMQ_ADDRESSES"] = ",".join(zmq_addresses)
+
+        return vllm_config
+
+    async def chat_completion(self, raw_request: Request):
+        """OpenAI-compatible HTTP endpoint.
+
+        API reference: https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html
+        """
+        request_json = await raw_request.json()
+        request = ChatCompletionRequest(**request_json)
+        generator = await self.openai_serving_chat.create_chat_completion(request, raw_request)
+
+        if isinstance(generator, ErrorResponse):
+            return JSONResponse(content=generator.model_dump(), status_code=generator.code)
+        if request.stream:
+            return StreamingResponse(content=generator, media_type="text/event-stream")
+        else:
+            assert isinstance(generator, ChatCompletionResponse)
+            return JSONResponse(content=generator.model_dump())
+
+    async def generate(self, prompt_ids: list[int], sampling_params: dict[str, Any], request_id: str) -> list[int]:
+        max_tokens = self.max_model_len - len(prompt_ids)
+        sampling_params = SamplingParams(max_tokens=max_tokens, **sampling_params)
+        prompt = TokensPrompt(prompt_token_ids=prompt_ids)
+        generator = self.engine.generate(prompt=prompt, sampling_params=sampling_params, request_id=request_id)
+
+        # Get final response
+        final_res: Optional[RequestOutput] = None
+        async for output in generator:
+            final_res = output
+        assert final_res is not None
+
+        return final_res.outputs[0].token_ids
+
+    async def _generate_step(self, prompt_ids: list[int], sampling_params: dict[str, Any], request_id: str):
+        max_tokens = self.max_model_len - len(prompt_ids)
+        sampling_params = SamplingParams(max_tokens=max_tokens, logprobs=1, **sampling_params)
+        prompt = TokensPrompt(prompt_token_ids=prompt_ids)
+        generator = self.engine.generate(prompt=prompt, sampling_params=sampling_params, request_id=request_id)
+
+        # Get final response
+        self.req_output[request_id]: Optional[RequestOutput] = None
+        async for output in generator:
+            self.req_output[request_id] = output
+        assert self.req_output[request_id] is not None
+
+    async def generate_for_partial(
+        self, prompt_ids: list[int], sampling_params: dict[str, Any], request_id: str
+    ) -> tuple[list[Any], list[Any], bool] | tuple[Sequence[int], list[float], Any]:
+        # 设置中断标志
+        async with self.lock:
+            if self.paused:
+                # cancel 后， 所有任务直接返回，等待下次提交
+                return [], [], True
+            self.cancel_event[request_id] = asyncio.Event()
+            cancel_handle = asyncio.create_task(self.cancel_event[request_id].wait())
+            generation_handle = asyncio.create_task(self._generate_step(prompt_ids, sampling_params, request_id))
+
+        done, pend = await asyncio.wait([generation_handle, cancel_handle], return_when=asyncio.FIRST_COMPLETED)
+
+        for task in done:
+            await task
+
+        for task in pend:
+            task.cancel()
+
+        async with self.lock:
+            token_ids = self.req_output[request_id].outputs[0].token_ids
+            log_probs: list[float] = []
+            for i, x in enumerate(self.req_output[request_id].outputs[0].logprobs):
+                # sampling_params 中 logprobs 设置为1，应该返回1个, 但是实测会有多个，取token_id所对应的log_prob
+                token_id = self.req_output[request_id].outputs[0].token_ids[i]
+                log_probs.append(x[token_id].logprob)
+            is_cancel = generation_handle not in done
+            self.cancel_event.pop(request_id, None)
+            self.req_output.pop(request_id, None)
+        return token_ids, log_probs, is_cancel
+
+    async def cancel(self):
+        async with self.lock:
+            self.paused = True
+            for request_id in self.cancel_event:
+                self.cancel_event[request_id].set()
+
+    async def resume(self):
+        async with self.lock:
+            self.paused = False
+
+    async def wake_up(self):
+        if self.config.rollout.free_cache_engine:
+            await self.engine.wake_up()
+
+    async def sleep(self):
+        # TODO: https://github.com/vllm-project/vllm/issues/17103
+        await self.engine.reset_prefix_cache()
+        if self.config.rollout.free_cache_engine:
+            await self.engine.sleep()
+
+
diff --git a/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml b/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml
index 0d6c05a9a5c..c2708b975be 100644
--- a/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml
+++ b/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml
@@ -33,10 +33,10 @@ rollout:
   # number of responses (i.e. num sample times). > 1 for grpo
   n: 4                                
 
-  # Number of epochs in training 
+  # total rollout samples # TODO rename to total_rollout_samples
   total_rollout_steps: 100
 
-  # 
+  # Number of epochs in training 
   total_epochs: 10
 
   # Test frequency, how many times a parameter update triggers a validation
diff --git a/recipe/fully_async_policy/fsdp_workers.py b/recipe/fully_async_policy/fsdp_workers.py
index dd941c26684..41fa3a55eec 100644
--- a/recipe/fully_async_policy/fsdp_workers.py
+++ b/recipe/fully_async_policy/fsdp_workers.py
@@ -51,19 +51,16 @@
 
 def get_inference_model(rollout):
     """
-    根据不同类型的inference_engine获取模型对象
+    get models according to different types of inference_engine
     Args:
-        rollout: rollout对象，包含inference_engine
+        rollout: rollout object
     Returns:
         model: 模型对象
     """
     inference_engine = rollout.inference_engine
-    # 判断inference_engine的类型
     if hasattr(inference_engine, "llm_engine"):
-        # LLM类型 - vLLMRollout
         inference_model = inference_engine.llm_engine.model_executor.driver_worker.worker.model_runner.model
     elif hasattr(inference_engine, "worker"):
-        # WorkerWrapperBase类型 - vLLMAsyncRollout
         inference_model = inference_engine.worker.model_runner.model
     else:
         raise AttributeError(
diff --git a/recipe/fully_async_policy/fully_async_main.py b/recipe/fully_async_policy/fully_async_main.py
index a588679991c..699222f350a 100644
--- a/recipe/fully_async_policy/fully_async_main.py
+++ b/recipe/fully_async_policy/fully_async_main.py
@@ -24,7 +24,7 @@
 from recipe.fully_async_policy.fully_async_rollouter import FullyAsyncRollouter
 from recipe.fully_async_policy.fully_async_trainer import FullyAsyncTrainer
 from recipe.fully_async_policy.message_queue import MessageQueue, MessageQueueClient
-from verl.trainer.ppo.ray_trainer import ResourcePoolManager, Role
+from recipe.fully_async_policy.ray_trainer import ResourcePoolManager, Role
 from verl.trainer.ppo.reward import load_reward_manager
 from verl.utils.fs import copy_to_local
 
@@ -185,16 +185,16 @@ def _initialize_components(self, config) -> None:
         print("[ASYNC MAIN] Creating FullyAsyncTrainer...")
         self._create_trainer(config)
 
-        # 同步require samples
+        # sync require samples between rollouter and trainer
         required_samples = ray.get(self.components["trainer"].get_required_samples.remote())
         ray.get(self.components["rollouter"].set_required_samples.remote(required_samples))
 
-        # 同步total_train_steps
+        # sync total_train_steps between rollouter and trainer
         total_train_steps = ray.get(self.components["rollouter"].get_total_train_steps.remote())
         print(f"total_train_steps {total_train_steps}")
         ray.get(self.components["trainer"].set_total_train_steps.remote(total_train_steps))
 
-        # 获取 max_queue_size (使用同步方法避免异步返回值问题)
+        # max_queue_size 
         max_queue_size = ray.get(self.components["rollouter"].get_max_queue_size.remote())
         print(f"[ASYNC MAIN] Creating MessageQueue... max_queue_size {max_queue_size}")
         message_queue = MessageQueue.remote(config, max_queue_size)
@@ -280,7 +280,7 @@ def _run_training_loop(self):
 
 @hydra.main(config_path="config", config_name="fully_async_ppo_trainer", version_base=None)
 def main(config):
-    from verl.trainer.main_ppo import run_ppo
+    from recipe.fully_async_policy.main_ppo import run_ppo
 
     # Ensure async training config exists
     if not hasattr(config, "async_training"):
diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py
index 162836a00f6..919314ba1b5 100644
--- a/recipe/fully_async_policy/fully_async_rollouter.py
+++ b/recipe/fully_async_policy/fully_async_rollouter.py
@@ -25,7 +25,7 @@
 )
 from recipe.fully_async_policy.message_queue import MessageQueueClient
 from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup
-from verl.trainer.ppo.ray_trainer import RayPPOTrainer, ResourcePoolManager, Role, WorkerType
+from recipe.fully_async_policy.ray_trainer import RayPPOTrainer, ResourcePoolManager, Role, WorkerType
 from verl.utils.profiler import marked_timer
 from verl.utils.tracking import ValidationGenerationsLogger
 
@@ -257,7 +257,7 @@ def _create_continuous_iterator(self):
     def _init_async_rollout_manager(self):
         # create async rollout manager and request scheduler
         assert self.config.actor_rollout_ref.rollout.mode == "async"
-        from verl.experimental.agent_loop import AgentLoopManager
+        from recipe.fully_async_policy.agent_loop import AgentLoopManager
 
         self.async_rollout_mode = True
         self.async_rollout_manager = AgentLoopManager(
diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py
index 5d2a2c794e8..0c1501cbf89 100644
--- a/recipe/fully_async_policy/fully_async_trainer.py
+++ b/recipe/fully_async_policy/fully_async_trainer.py
@@ -31,7 +31,7 @@
 from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup
 from verl.trainer.ppo import core_algos
 from verl.trainer.ppo.core_algos import AdvantageEstimator
-from verl.trainer.ppo.ray_trainer import (
+from recipe.fully_async_policy.ray_trainer import (
     RayPPOTrainer,
     ResourcePoolManager,
     Role,
diff --git a/recipe/fully_async_policy/main_ppo.py b/recipe/fully_async_policy/main_ppo.py
new file mode 100644
index 00000000000..4b240c6ffbf
--- /dev/null
+++ b/recipe/fully_async_policy/main_ppo.py
@@ -0,0 +1,344 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Note that we don't combine the main with ray_trainer as ray_trainer is used by other main.
+"""
+
+import os
+import socket
+
+import hydra
+import ray
+from omegaconf import OmegaConf
+
+from verl.experimental.dataset.sampler import AbstractSampler
+from verl.trainer.constants_ppo import get_ppo_ray_runtime_env
+from verl.trainer.ppo.ray_trainer import RayPPOTrainer
+from verl.trainer.ppo.reward import load_reward_manager
+from verl.utils.device import is_cuda_available
+from verl.utils.import_utils import load_extern_type
+
+
+@hydra.main(config_path="config", config_name="ppo_trainer", version_base=None)
+def main(config):
+    """Main entry point for PPO training with Hydra configuration management.
+
+    Args:
+        config_dict: Hydra configuration dictionary containing training parameters.
+    """
+    from time import time
+
+    start_time = time()
+    run_ppo(config)
+    print(f"total time: {time() - start_time:.2f} seconds")
+
+
+# Define a function to run the PPO-like training process
+def run_ppo(config, task_runner_class=None) -> None:
+    """Initialize Ray cluster and run distributed PPO training process.
+
+    Args:
+        config: Training configuration object containing all necessary parameters
+                for distributed PPO training including Ray initialization settings,
+                model paths, and training hyperparameters.
+    """
+    # Check if Ray is not initialized
+    if not ray.is_initialized():
+        # Initialize Ray with a local cluster configuration
+        # Set environment variables in the runtime environment to control tokenizer parallelism,
+        # NCCL debug level, VLLM logging level, and allow runtime LoRA updating
+        # `num_cpus` specifies the number of CPU cores Ray can use, obtained from the configuration
+        ray.init(
+            runtime_env=get_ppo_ray_runtime_env(),
+            num_cpus=config.ray_init.num_cpus,
+        )
+    # for recipe to change TaskRunner
+    if task_runner_class is None:
+        task_runner_class = TaskRunner
+
+    # Create a remote instance of the TaskRunner class, and
+    # Execute the `run` method of the TaskRunner instance remotely and wait for it to complete
+    if (
+        is_cuda_available
+        and config.trainer.get("profile_steps") is not None
+        and len(config.trainer.get("profile_steps", [])) > 0
+    ):
+        nsight_options = OmegaConf.to_container(config.trainer.controller_nsight_options)
+        runner = task_runner_class.options(runtime_env={"nsight": nsight_options}).remote()
+    else:
+        runner = task_runner_class.remote()
+    ray.get(runner.run.remote(config))
+
+    # [Optional] get the path of the timeline trace file from the configuration, default to None
+    # This file is used for performance analysis
+    timeline_json_file = config.ray_init.get("timeline_json_file", None)
+    if timeline_json_file:
+        ray.timeline(filename=timeline_json_file)
+
+
+@ray.remote(num_cpus=1)  # please make sure main_task is not scheduled on head
+class TaskRunner:
+    """Ray remote class for executing distributed PPO training tasks.
+
+    This class encapsulates the main training logic and runs as a Ray remote actor
+    to enable distributed execution across multiple nodes and GPUs.
+    """
+
+    def run(self, config):
+        """Execute the main PPO training workflow.
+
+        This method sets up the distributed training environment, initializes
+        workers, datasets, and reward functions, then starts the training process.
+
+        Args:
+            config: Training configuration object containing all parameters needed
+                   for setting up and running the PPO training process.
+        """
+        # Print the initial configuration. `resolve=True` will evaluate symbolic values.
+        from pprint import pprint
+
+        from omegaconf import OmegaConf
+
+        from verl.utils.fs import copy_to_local
+
+        print(f"TaskRunner hostname: {socket.gethostname()}, PID: {os.getpid()}")
+        pprint(OmegaConf.to_container(config, resolve=True))
+        OmegaConf.resolve(config)
+
+        # Download the checkpoint from HDFS to the local machine.
+        # `use_shm` determines whether to use shared memory, which could lead to faster model loading if turned on
+        local_path = copy_to_local(
+            config.actor_rollout_ref.model.path, use_shm=config.actor_rollout_ref.model.get("use_shm", False)
+        )
+
+        # Instantiate the tokenizer and processor.
+        from verl.utils import hf_processor, hf_tokenizer
+
+        trust_remote_code = config.data.get("trust_remote_code", False)
+        tokenizer = hf_tokenizer(local_path, trust_remote_code=trust_remote_code)
+        # Used for multimodal LLM, could be None
+        processor = hf_processor(local_path, trust_remote_code=trust_remote_code, use_fast=True)
+
+        # Define worker classes based on the actor strategy.
+        if config.actor_rollout_ref.actor.strategy in {"fsdp", "fsdp2"}:
+            assert config.critic.strategy in {"fsdp", "fsdp2"}
+            from verl.single_controller.ray import RayWorkerGroup
+            from verl.workers.fsdp_workers import ActorRolloutRefWorker, AsyncActorRolloutRefWorker
+
+            use_legacy_worker_impl = config.trainer.get("use_legacy_worker_impl", "auto")
+            if use_legacy_worker_impl in ["auto", "enable"]:
+                # import warnings
+                # warnings.warn(f"Legacy worker impl is going to be deprecated, will be removed in the future. \
+                #   Please set trainer.use_legacy_worker_impl = false to switch to the new worker implementation.")
+                from verl.workers.fsdp_workers import CriticWorker
+            elif use_legacy_worker_impl == "disable":
+                from verl.workers.roles import CriticWorker
+
+                print("Using new worker implementation")
+            else:
+                raise ValueError(f"Invalid use_legacy_worker_impl: {use_legacy_worker_impl}")
+
+            actor_rollout_cls = (
+                AsyncActorRolloutRefWorker
+                if config.actor_rollout_ref.rollout.mode == "async"
+                else ActorRolloutRefWorker
+            )
+            ray_worker_group_cls = RayWorkerGroup
+
+        elif config.actor_rollout_ref.actor.strategy == "megatron":
+            assert config.actor_rollout_ref.actor.strategy == config.critic.strategy
+            from verl.single_controller.ray.megatron import NVMegatronRayWorkerGroup
+            from verl.workers.megatron_workers import ActorRolloutRefWorker, AsyncActorRolloutRefWorker, CriticWorker
+
+            actor_rollout_cls = (
+                AsyncActorRolloutRefWorker
+                if config.actor_rollout_ref.rollout.mode == "async"
+                else ActorRolloutRefWorker
+            )
+            ray_worker_group_cls = NVMegatronRayWorkerGroup
+
+        else:
+            raise NotImplementedError
+
+        from verl.trainer.ppo.ray_trainer import ResourcePoolManager, Role
+
+        # Map roles to their corresponding remote worker classes.
+        role_worker_mapping = {
+            Role.ActorRollout: ray.remote(actor_rollout_cls),
+            Role.Critic: ray.remote(CriticWorker),
+        }
+
+        # Define the resource pool specification.
+        # Map roles to the resource pool.
+        global_pool_id = "global_pool"
+        resource_pool_spec = {
+            global_pool_id: [config.trainer.n_gpus_per_node] * config.trainer.nnodes,
+        }
+        mapping = {
+            Role.ActorRollout: global_pool_id,
+            Role.Critic: global_pool_id,
+        }
+
+        # We should adopt a multi-source reward function here:
+        # - for rule-based rm, we directly call a reward score
+        # - for model-based rm, we call a model
+        # - for code related prompt, we send to a sandbox if there are test cases
+        # finally, we combine all the rewards together
+        # The reward type depends on the tag of the data
+        if config.reward_model.enable:
+            if config.reward_model.strategy in {"fsdp", "fsdp2"}:
+                from verl.workers.fsdp_workers import RewardModelWorker
+            elif config.reward_model.strategy == "megatron":
+                from verl.workers.megatron_workers import RewardModelWorker
+            else:
+                raise NotImplementedError
+            role_worker_mapping[Role.RewardModel] = ray.remote(RewardModelWorker)
+            mapping[Role.RewardModel] = global_pool_id
+
+        # Add a reference policy worker if KL loss or KL reward is used.
+        if config.algorithm.use_kl_in_reward or config.actor_rollout_ref.actor.use_kl_loss:
+            role_worker_mapping[Role.RefPolicy] = ray.remote(ActorRolloutRefWorker)
+            mapping[Role.RefPolicy] = global_pool_id
+
+        # Load the reward manager for training and validation.
+        reward_fn = load_reward_manager(
+            config, tokenizer, num_examine=0, **config.reward_model.get("reward_kwargs", {})
+        )
+        val_reward_fn = load_reward_manager(
+            config, tokenizer, num_examine=1, **config.reward_model.get("reward_kwargs", {})
+        )
+        resource_pool_manager = ResourcePoolManager(resource_pool_spec=resource_pool_spec, mapping=mapping)
+
+        from verl.utils.dataset.rl_dataset import collate_fn
+
+        # Create training and validation datasets.
+        train_dataset = create_rl_dataset(config.data.train_files, config.data, tokenizer, processor, is_train=True)
+        val_dataset = create_rl_dataset(config.data.val_files, config.data, tokenizer, processor, is_train=False)
+        train_sampler = create_rl_sampler(config.data, train_dataset)
+
+        # Initialize the PPO trainer.
+        trainer = RayPPOTrainer(
+            config=config,
+            tokenizer=tokenizer,
+            processor=processor,
+            role_worker_mapping=role_worker_mapping,
+            resource_pool_manager=resource_pool_manager,
+            ray_worker_group_cls=ray_worker_group_cls,
+            reward_fn=reward_fn,
+            val_reward_fn=val_reward_fn,
+            train_dataset=train_dataset,
+            val_dataset=val_dataset,
+            collate_fn=collate_fn,
+            train_sampler=train_sampler,
+        )
+        # Initialize the workers of the trainer.
+        trainer.init_workers()
+        # Start the training process.
+        trainer.fit()
+
+
+def create_rl_dataset(data_paths, data_config, tokenizer, processor, is_train=True):
+    """Create a dataset.
+
+    Arguments:
+        data_paths: List of paths to data files.
+        data_config: The data config.
+        tokenizer (Tokenizer): The tokenizer.
+        processor (Processor): The processor.
+
+    Returns:
+        dataset (Dataset): The dataset.
+    """
+    from torch.utils.data import Dataset
+
+    from verl.utils.dataset.rl_dataset import RLHFDataset
+
+    # Check if a custom dataset class is specified in the data configuration
+    # and if the path to the custom class is provided
+    if "custom_cls" in data_config and data_config.custom_cls.get("path", None) is not None:
+        # Dynamically load the custom dataset class
+        dataset_cls = load_extern_type(data_config.custom_cls.path, data_config.custom_cls.name)
+        # Verify that the custom dataset class inherits from torch.utils.data.Dataset
+        if not issubclass(dataset_cls, Dataset):
+            raise TypeError(
+                f"The custom dataset class '{data_config.custom_cls.name}' from "
+                f"'{data_config.custom_cls.path}' must inherit from torch.utils.data.Dataset"
+            )
+    elif "datagen" in data_config and data_config.datagen.get("path", None) is not None and is_train:
+        # If a data generation strategy is specified, use the DynamicGenDataset class
+        from verl.utils.dataset.dynamicgen_dataset import DynamicGenDataset
+
+        dataset_cls = DynamicGenDataset
+        print("Using DynamicGenDataset for data generation.")
+
+    else:
+        # Use the default RLHFDataset class if no custom class is specified
+        dataset_cls = RLHFDataset
+    print(f"Using dataset class: {dataset_cls.__name__}")
+
+    # Instantiate the dataset using the determined dataset class
+    dataset = dataset_cls(
+        data_files=data_paths,
+        tokenizer=tokenizer,
+        processor=processor,
+        config=data_config,
+    )
+
+    return dataset
+
+
+def create_rl_sampler(data_config, dataset):
+    """Create a sampler for the dataset.
+
+    Arguments:
+        data_config: The data config.
+        dataset (Dataset): The dataset.
+
+    Returns:
+        sampler (Sampler): The sampler.
+    """
+    import torch
+    from torch.utils.data import RandomSampler, SequentialSampler
+
+    if data_config.sampler is not None and data_config.sampler.get("class_path", None) is not None:
+        curriculum_class = load_extern_type(
+            data_config.sampler.class_path,
+            data_config.sampler.class_name,
+        )
+        sampler = curriculum_class(
+            data_source=dataset,
+            data_config=data_config,
+        )
+        assert isinstance(sampler, AbstractSampler)
+        assert data_config.get("dataloader_num_workers", 8) == 0, (
+            "If using curriculum, num_workers must be 0 to prevent data caching. "
+            "If the dataloader caches data before the batch is done the "
+            "curriculum sampler won't have the opportunity to reorder it. "
+        )
+
+    # Use a sampler to facilitate checkpoint resumption.
+    # If shuffling is enabled in the data configuration, create a random sampler.
+    elif data_config.shuffle:
+        train_dataloader_generator = torch.Generator()
+        train_dataloader_generator.manual_seed(data_config.get("seed", 1))
+        sampler = RandomSampler(data_source=dataset, generator=train_dataloader_generator)
+    else:
+        # If shuffling is disabled, use a sequential sampler to iterate through the dataset in order.
+        sampler = SequentialSampler(data_source=dataset)
+
+    return sampler
+
+
+if __name__ == "__main__":
+    main()
diff --git a/recipe/fully_async_policy/ray_trainer.py b/recipe/fully_async_policy/ray_trainer.py
new file mode 100644
index 00000000000..56a1e5bcab1
--- /dev/null
+++ b/recipe/fully_async_policy/ray_trainer.py
@@ -0,0 +1,1434 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+# Copyright 2023-2024 SGLang Team
+# Copyright 2025 ModelBest Inc. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+PPO Trainer with Ray-based single controller.
+This trainer supports model-agonistic model initialization with huggingface
+"""
+
+import json
+import os
+import uuid
+import warnings
+from collections import defaultdict
+from copy import deepcopy
+from dataclasses import dataclass, field
+from enum import Enum
+from pprint import pprint
+from typing import Optional
+
+import numpy as np
+import ray
+import torch
+from omegaconf import OmegaConf, open_dict
+from torch.utils.data import Dataset, Sampler
+from torchdata.stateful_dataloader import StatefulDataLoader
+from tqdm import tqdm
+
+from verl import DataProto
+from verl.experimental.dataset.sampler import AbstractCurriculumSampler
+from verl.protocol import pad_dataproto_to_divisor, unpad_dataproto
+from verl.single_controller.base import Worker
+from verl.single_controller.ray import RayClassWithInitArgs, RayResourcePool, RayWorkerGroup
+from verl.single_controller.ray.base import create_colocated_worker_cls
+from verl.trainer.config import AlgoConfig
+from verl.trainer.ppo import core_algos
+from verl.trainer.ppo.core_algos import AdvantageEstimator, agg_loss
+from verl.trainer.ppo.metric_utils import (
+    compute_data_metrics,
+    compute_throughout_metrics,
+    compute_timing_metrics,
+    process_validation_metrics,
+)
+from verl.trainer.ppo.reward import compute_reward, compute_reward_async
+from verl.utils.checkpoint.checkpoint_manager import find_latest_ckpt_path, should_save_ckpt_esi
+from verl.utils.config import omega_conf_to_dataclass
+from verl.utils.debug import marked_timer
+from verl.utils.metric import (
+    reduce_metrics,
+)
+from verl.utils.seqlen_balancing import get_seqlen_balanced_partitions, log_seqlen_unbalance
+from verl.utils.torch_functional import masked_mean
+from verl.utils.tracking import ValidationGenerationsLogger
+
+WorkerType = type[Worker]
+
+
+class Role(Enum):
+    """
+    To create more roles dynamically, you can subclass Role and add new members
+    """
+
+    Actor = 0
+    Rollout = 1
+    ActorRollout = 2
+    Critic = 3
+    RefPolicy = 4
+    RewardModel = 5
+    ActorRolloutRef = 6
+
+    def __str__(self):
+        """返回与代码中一致的字符串表示"""
+        return self._get_role_string()
+
+    def _get_role_string(self):
+        """获取角色对应的字符串名称"""
+        role_mapping = {
+            Role.Actor: "actor",
+            Role.Rollout: "rollout",
+            Role.ActorRollout: "actor_rollout",
+            Role.Critic: "critic",
+            Role.RefPolicy: "ref",
+            Role.RewardModel: "rm",
+            Role.ActorRolloutRef: "actor_rollout_ref",
+        }
+        return role_mapping.get(self, self.name.lower())
+
+    @classmethod
+    def from_string(cls, name: str):
+        """从字符串创建Role实例"""
+        string_mapping = {
+            "actor": cls.Actor,
+            "rollout": cls.Rollout,
+            "actor_rollout": cls.ActorRollout,
+            "critic": cls.Critic,
+            "ref": cls.RefPolicy,
+            "rm": cls.RewardModel,
+            "actor_rollout_ref": cls.ActorRolloutRef,
+        }
+        role = string_mapping.get(name.lower())
+        if role is None:
+            raise ValueError(f"No Role found for string: {name}")
+        return role
+
+
+@dataclass
+class ResourcePoolManager:
+    """
+    Define a resource pool specification. Resource pool will be initialized first.
+    """
+
+    resource_pool_spec: dict[str, list[int]]
+    mapping: dict[Role, str]
+    resource_pool_dict: dict[str, RayResourcePool] = field(default_factory=dict)
+
+    def create_resource_pool(self):
+        """Create Ray resource pools for distributed training.
+
+        Initializes resource pools based on the resource pool specification,
+        with each pool managing GPU resources across multiple nodes.
+        For FSDP backend, uses max_colocate_count=1 to merge WorkerGroups.
+        For Megatron backend, uses max_colocate_count>1 for different models.
+        """
+        for resource_pool_name, process_on_nodes in self.resource_pool_spec.items():
+            # max_colocate_count means the number of WorkerGroups (i.e. processes) in each RayResourcePool
+            # For FSDP backend, we recommend using max_colocate_count=1 that merge all WorkerGroups into one.
+            # For Megatron backend, we recommend using max_colocate_count>1
+            # that can utilize different WorkerGroup for differnt models
+            resource_pool = RayResourcePool(
+                process_on_nodes=process_on_nodes, use_gpu=True, max_colocate_count=1, name_prefix=resource_pool_name
+            )
+            self.resource_pool_dict[resource_pool_name] = resource_pool
+
+        self._check_resource_available()
+
+    def get_resource_pool(self, role: Role) -> RayResourcePool:
+        """Get the resource pool of the worker_cls"""
+        return self.resource_pool_dict[self.mapping[role]]
+
+    def get_n_gpus(self) -> int:
+        """Get the number of gpus in this cluster."""
+        return sum([n_gpus for process_on_nodes in self.resource_pool_spec.values() for n_gpus in process_on_nodes])
+
+    def _check_resource_available(self):
+        """Check if the resource pool can be satisfied in this ray cluster."""
+        node_available_resources = ray.state.available_resources_per_node()
+        node_available_gpus = {
+            node: node_info.get("GPU", 0) if "GPU" in node_info else node_info.get("NPU", 0)
+            for node, node_info in node_available_resources.items()
+        }
+
+        # check total required gpus can be satisfied
+        total_available_gpus = sum(node_available_gpus.values())
+        total_required_gpus = sum(
+            [n_gpus for process_on_nodes in self.resource_pool_spec.values() for n_gpus in process_on_nodes]
+        )
+        if total_available_gpus < total_required_gpus:
+            raise ValueError(
+                f"Total available GPUs {total_available_gpus} is less than total desired GPUs {total_required_gpus}"
+            )
+
+        # check each resource pool can be satisfied, O(#resource_pools * #nodes)
+        for resource_pool_name, process_on_nodes in self.resource_pool_spec.items():
+            num_gpus, num_nodes = process_on_nodes[0], len(process_on_nodes)
+            for node, available_gpus in node_available_gpus.items():
+                if available_gpus >= num_gpus:
+                    node_available_gpus[node] -= num_gpus
+                    num_nodes -= 1
+                    if num_nodes == 0:
+                        break
+            if num_nodes > 0:
+                raise ValueError(
+                    f"Resource pool {resource_pool_name}: {num_gpus}*{num_nodes}"
+                    + "cannot be satisfied in this ray cluster"
+                )
+
+
+def apply_kl_penalty(data: DataProto, kl_ctrl: core_algos.AdaptiveKLController, kl_penalty="kl"):
+    """Apply KL penalty to the token-level rewards.
+
+    This function computes the KL divergence between the reference policy and current policy,
+    then applies a penalty to the token-level rewards based on this divergence.
+
+    Args:
+        data (DataProto): The data containing batched model outputs and inputs.
+        kl_ctrl (core_algos.AdaptiveKLController): Controller for adaptive KL penalty.
+        kl_penalty (str, optional): Type of KL penalty to apply. Defaults to "kl".
+        multi_turn (bool, optional): Whether the data is from a multi-turn conversation. Defaults to False.
+
+    Returns:
+        tuple: A tuple containing:
+            - The updated data with token-level rewards adjusted by KL penalty
+            - A dictionary of metrics related to the KL penalty
+    """
+    response_mask = data.batch["response_mask"]
+    token_level_scores = data.batch["token_level_scores"]
+    batch_size = data.batch.batch_size[0]
+
+    # compute kl between ref_policy and current policy
+    # When apply_kl_penalty, algorithm.use_kl_in_reward=True, so the reference model has been enabled.
+    kld = core_algos.kl_penalty(
+        data.batch["old_log_probs"], data.batch["ref_log_prob"], kl_penalty=kl_penalty
+    )  # (batch_size, response_length)
+    kld = kld * response_mask
+    beta = kl_ctrl.value
+
+    token_level_rewards = token_level_scores - beta * kld
+
+    current_kl = masked_mean(kld, mask=response_mask, axis=-1)  # average over sequence
+    current_kl = torch.mean(current_kl, dim=0).item()
+
+    # according to https://github.com/huggingface/trl/blob/951ca1841f29114b969b57b26c7d3e80a39f75a0/trl/trainer/ppo_trainer.py#L837
+    kl_ctrl.update(current_kl=current_kl, n_steps=batch_size)
+    data.batch["token_level_rewards"] = token_level_rewards
+
+    metrics = {"actor/reward_kl_penalty": current_kl, "actor/reward_kl_penalty_coeff": beta}
+
+    return data, metrics
+
+
+def compute_response_mask(data: DataProto):
+    """Compute the attention mask for the response part of the sequence.
+
+    This function extracts the portion of the attention mask that corresponds to the model's response,
+    which is used for masking computations that should only apply to response tokens.
+
+    Args:
+        data (DataProto): The data containing batched model outputs and inputs.
+
+    Returns:
+        torch.Tensor: The attention mask for the response tokens.
+    """
+    responses = data.batch["responses"]
+    response_length = responses.size(1)
+    attention_mask = data.batch["attention_mask"]
+    return attention_mask[:, -response_length:]
+
+
+def compute_advantage(
+    data: DataProto,
+    adv_estimator: AdvantageEstimator,
+    gamma: float = 1.0,
+    lam: float = 1.0,
+    num_repeat: int = 1,
+    norm_adv_by_std_in_grpo: bool = True,
+    config: Optional[AlgoConfig] = None,
+) -> DataProto:
+    """Compute advantage estimates for policy optimization.
+
+    This function computes advantage estimates using various estimators like GAE, GRPO, REINFORCE++, etc.
+    The advantage estimates are used to guide policy optimization in RL algorithms.
+
+    Args:
+        data (DataProto): The data containing batched model outputs and inputs.
+        adv_estimator (AdvantageEstimator): The advantage estimator to use (e.g., GAE, GRPO, REINFORCE++).
+        gamma (float, optional): Discount factor for future rewards. Defaults to 1.0.
+        lam (float, optional): Lambda parameter for GAE. Defaults to 1.0.
+        num_repeat (int, optional): Number of times to repeat the computation. Defaults to 1.
+        norm_adv_by_std_in_grpo (bool, optional): Whether to normalize advantages by standard deviation in
+            GRPO. Defaults to True.
+        config (dict, optional): Configuration dictionary for algorithm settings. Defaults to None.
+
+    Returns:
+        DataProto: The updated data with computed advantages and returns.
+    """
+    # Back-compatible with trainers that do not compute response mask in fit
+    if "response_mask" not in data.batch.keys():
+        data.batch["response_mask"] = compute_response_mask(data)
+    # prepare response group
+    if adv_estimator == AdvantageEstimator.GAE:
+        # Compute advantages and returns using Generalized Advantage Estimation (GAE)
+        advantages, returns = core_algos.compute_gae_advantage_return(
+            token_level_rewards=data.batch["token_level_rewards"],
+            values=data.batch["values"],
+            response_mask=data.batch["response_mask"],
+            gamma=gamma,
+            lam=lam,
+        )
+        data.batch["advantages"] = advantages
+        data.batch["returns"] = returns
+        if config.get("use_pf_ppo", False):
+            data = core_algos.compute_pf_ppo_reweight_data(
+                data,
+                config.pf_ppo.get("reweight_method"),
+                config.pf_ppo.get("weight_pow"),
+            )
+    elif adv_estimator == AdvantageEstimator.GRPO:
+        # Initialize the mask for GRPO calculation
+        grpo_calculation_mask = data.batch["response_mask"]
+        # Call compute_grpo_outcome_advantage with parameters matching its definition
+        advantages, returns = core_algos.compute_grpo_outcome_advantage(
+            token_level_rewards=data.batch["token_level_rewards"],
+            response_mask=grpo_calculation_mask,
+            index=data.non_tensor_batch["uid"],
+            norm_adv_by_std_in_grpo=norm_adv_by_std_in_grpo,
+        )
+        data.batch["advantages"] = advantages
+        data.batch["returns"] = returns
+    else:
+        # handle all other adv estimator type other than GAE and GRPO
+        adv_estimator_fn = core_algos.get_adv_estimator_fn(adv_estimator)
+        adv_kwargs = {
+            "token_level_rewards": data.batch["token_level_rewards"],
+            "response_mask": data.batch["response_mask"],
+            "config": config,
+        }
+        if "uid" in data.non_tensor_batch:  # optional
+            adv_kwargs["index"] = data.non_tensor_batch["uid"]
+        if "reward_baselines" in data.batch:  # optional
+            adv_kwargs["reward_baselines"] = data.batch["reward_baselines"]
+
+        # calculate advantage estimator
+        advantages, returns = adv_estimator_fn(**adv_kwargs)
+        data.batch["advantages"] = advantages
+        data.batch["returns"] = returns
+    return data
+
+
+class RayPPOTrainer:
+    """Distributed PPO trainer using Ray for scalable reinforcement learning.
+
+    This trainer orchestrates distributed PPO training across multiple nodes and GPUs,
+    managing actor rollouts, critic training, and reward computation with Ray backend.
+    Supports various model architectures including FSDP, Megatron, and vLLM integration.
+    """
+
+    # TODO: support each role have individual ray_worker_group_cls,
+    # i.e., support different backend of different role
+    def __init__(
+        self,
+        config,
+        tokenizer,
+        role_worker_mapping: dict[Role, WorkerType],
+        resource_pool_manager: ResourcePoolManager,
+        ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup,
+        processor=None,
+        reward_fn=None,
+        val_reward_fn=None,
+        train_dataset: Optional[Dataset] = None,
+        val_dataset: Optional[Dataset] = None,
+        collate_fn=None,
+        train_sampler: Optional[Sampler] = None,
+        device_name=None,
+    ):
+        """
+        Initialize distributed PPO trainer with Ray backend.
+        Note that this trainer runs on the driver process on a single CPU/GPU node.
+
+        Args:
+            config: Configuration object containing training parameters.
+            tokenizer: Tokenizer used for encoding and decoding text.
+            role_worker_mapping (dict[Role, WorkerType]): Mapping from roles to worker classes.
+            resource_pool_manager (ResourcePoolManager): Manager for Ray resource pools.
+            ray_worker_group_cls (RayWorkerGroup, optional): Class for Ray worker groups. Defaults to RayWorkerGroup.
+            processor: Optional data processor, used for multimodal data
+            reward_fn: Function for computing rewards during training.
+            val_reward_fn: Function for computing rewards during validation.
+            train_dataset (Optional[Dataset], optional): Training dataset. Defaults to None.
+            val_dataset (Optional[Dataset], optional): Validation dataset. Defaults to None.
+            collate_fn: Function to collate data samples into batches.
+            train_sampler (Optional[Sampler], optional): Sampler for the training dataset. Defaults to None.
+            device_name (str, optional): Device name for training (e.g., "cuda", "cpu"). Defaults to None.
+        """
+
+        # Store the tokenizer for text processing
+        self.tokenizer = tokenizer
+        self.processor = processor
+        self.config = config
+        self.reward_fn = reward_fn
+        self.val_reward_fn = val_reward_fn
+
+        self.hybrid_engine = config.actor_rollout_ref.hybrid_engine
+        assert self.hybrid_engine, "Currently, only support hybrid engine"
+
+        if self.hybrid_engine:
+            assert Role.ActorRollout in role_worker_mapping, f"{role_worker_mapping.keys()=}"
+
+        self.role_worker_mapping = role_worker_mapping
+        self.resource_pool_manager = resource_pool_manager
+        self.use_reference_policy = Role.RefPolicy in role_worker_mapping
+        self.use_rm = Role.RewardModel in role_worker_mapping
+        self.ray_worker_group_cls = ray_worker_group_cls
+        self.device_name = device_name if device_name else self.config.trainer.device
+        self.validation_generations_logger = ValidationGenerationsLogger(
+            project_name=self.config.trainer.project_name,
+            experiment_name=self.config.trainer.experiment_name,
+        )
+
+        # if ref_in_actor is True, the reference policy will be actor without lora applied
+        self.ref_in_actor = config.actor_rollout_ref.model.get("lora_rank", 0) > 0
+
+        # define in-reward KL control
+        # kl loss control currently not suppoorted
+        if self.config.algorithm.use_kl_in_reward:
+            self.kl_ctrl_in_reward = core_algos.get_kl_controller(self.config.algorithm.kl_ctrl)
+
+        if config.critic.enable is not None:
+            self.use_critic = bool(config.critic.enable)
+        elif self.config.algorithm.adv_estimator == AdvantageEstimator.GAE:
+            self.use_critic = True
+        else:
+            warnings.warn(
+                "Disabled critic as algorithm.adv_estimator != gae. "
+                "If it is not intended, please set critic.enable=True",
+                stacklevel=2,
+            )
+            self.use_critic = False
+
+        self._validate_config()
+        self._create_dataloader(train_dataset, val_dataset, collate_fn, train_sampler)
+
+    def _validate_config(self):
+        config = self.config
+        # number of GPUs total
+        n_gpus = config.trainer.n_gpus_per_node * config.trainer.nnodes
+        if config.actor_rollout_ref.actor.strategy == "megatron":
+            model_parallel_size = (
+                config.actor_rollout_ref.actor.megatron.tensor_model_parallel_size
+                * config.actor_rollout_ref.actor.megatron.pipeline_model_parallel_size
+            )
+            assert (
+                n_gpus % (model_parallel_size * config.actor_rollout_ref.actor.megatron.context_parallel_size) == 0
+            ), (
+                f"n_gpus ({n_gpus}) must be divisible by model_parallel_size ({model_parallel_size}) times "
+                f"context_parallel_size ({config.actor_rollout_ref.actor.megatron.context_parallel_size})"
+            )
+            megatron_dp = n_gpus // (
+                model_parallel_size * config.actor_rollout_ref.actor.megatron.context_parallel_size
+            )
+            self.minimal_bsz = megatron_dp * config.actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu
+        else:
+            self.minimal_bsz = n_gpus
+
+        # 1. Check total batch size for data correctness
+        real_train_batch_size = config.data.train_batch_size * config.actor_rollout_ref.rollout.n
+        assert real_train_batch_size % self.minimal_bsz == 0, (
+            f"real_train_batch_size ({real_train_batch_size}) must be divisible by minimal possible batch size "
+            f"({self.minimal_bsz})"
+        )
+
+        # A helper function to check "micro_batch_size" vs "micro_batch_size_per_gpu"
+        # We throw an error if the user sets both. The new convention is "..._micro_batch_size_per_gpu".
+        def check_mutually_exclusive(mbs, mbs_per_gpu, name: str):
+            """Validate mutually exclusive micro batch size configuration options.
+
+            Ensures that users don't set both deprecated micro_batch_size and
+            the new micro_batch_size_per_gpu parameters simultaneously.
+
+            Args:
+                mbs: Deprecated micro batch size parameter value.
+                mbs_per_gpu: New micro batch size per GPU parameter value.
+                name (str): Configuration section name for error messages.
+
+            Raises:
+                ValueError: If both parameters are set or neither is set.
+            """
+            settings = {
+                "reward_model": "micro_batch_size",
+                "actor_rollout_ref.ref": "log_prob_micro_batch_size",
+                "actor_rollout_ref.rollout": "log_prob_micro_batch_size",
+            }
+
+            if name in settings:
+                param = settings[name]
+                param_per_gpu = f"{param}_per_gpu"
+
+                if mbs is None and mbs_per_gpu is None:
+                    raise ValueError(
+                        f"[{name}] Please set at least one of '{name}.{param}' or '{name}.{param_per_gpu}'."
+                    )
+
+                if mbs is not None and mbs_per_gpu is not None:
+                    raise ValueError(
+                        f"[{name}] You have set both '{name}.{param}' AND '{name}.{param_per_gpu}'. Please remove "
+                        f"'{name}.{param}' because only '*_{param_per_gpu}' is supported (the former is deprecated)."
+                    )
+
+        # Actor validation done in ActorConfig.__post_init__ and validate()
+        actor_config = omega_conf_to_dataclass(config.actor_rollout_ref.actor)
+        actor_config.validate(n_gpus, config.data.train_batch_size, config.actor_rollout_ref.model)
+
+        if not config.actor_rollout_ref.actor.use_dynamic_bsz:
+            if self.use_reference_policy:
+                # reference: log_prob_micro_batch_size vs. log_prob_micro_batch_size_per_gpu
+                check_mutually_exclusive(
+                    config.actor_rollout_ref.ref.log_prob_micro_batch_size,
+                    config.actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu,
+                    "actor_rollout_ref.ref",
+                )
+
+            #  The rollout section also has log_prob_micro_batch_size vs. log_prob_micro_batch_size_per_gpu
+            check_mutually_exclusive(
+                config.actor_rollout_ref.rollout.log_prob_micro_batch_size,
+                config.actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu,
+                "actor_rollout_ref.rollout",
+            )
+
+        # Check for reward model micro-batch size conflicts
+        if config.reward_model.enable and not config.reward_model.use_dynamic_bsz:
+            check_mutually_exclusive(
+                config.reward_model.micro_batch_size, config.reward_model.micro_batch_size_per_gpu, "reward_model"
+            )
+
+        if self.config.algorithm.use_kl_in_reward and config.actor_rollout_ref.actor.use_kl_loss:
+            print("NOTICE: You have both enabled in-reward kl and kl loss.")
+
+        # critic
+        if self.use_critic:
+            critic_config = omega_conf_to_dataclass(config.critic)
+            critic_config.validate(n_gpus, config.data.train_batch_size)
+
+        if config.data.get("val_batch_size", None) is not None:
+            print(
+                "WARNING: val_batch_size is deprecated."
+                + " Validation datasets are sent to inference engines as a whole batch,"
+                + " which will schedule the memory themselves."
+            )
+
+        # check eval config
+        if config.actor_rollout_ref.rollout.val_kwargs.do_sample:
+            assert config.actor_rollout_ref.rollout.temperature > 0, (
+                "validation gen temperature should be greater than 0 when enabling do_sample"
+            )
+
+        print("[validate_config] All configuration checks passed successfully!")
+
+    def _create_dataloader(self, train_dataset, val_dataset, collate_fn, train_sampler: Optional[Sampler]):
+        """
+        Creates the train and validation dataloaders.
+        """
+        # TODO: we have to make sure the batch size is divisible by the dp size
+        from verl.trainer.main_ppo import create_rl_dataset, create_rl_sampler
+
+        if train_dataset is None:
+            train_dataset = create_rl_dataset(
+                self.config.data.train_files, self.config.data, self.tokenizer, self.processor
+            )
+        if val_dataset is None:
+            val_dataset = create_rl_dataset(
+                self.config.data.val_files, self.config.data, self.tokenizer, self.processor
+            )
+        self.train_dataset, self.val_dataset = train_dataset, val_dataset
+
+        if train_sampler is None:
+            train_sampler = create_rl_sampler(self.config.data, self.train_dataset)
+        if collate_fn is None:
+            from verl.utils.dataset.rl_dataset import collate_fn as default_collate_fn
+
+            collate_fn = default_collate_fn
+
+        num_workers = self.config.data["dataloader_num_workers"]
+
+        self.train_dataloader = StatefulDataLoader(
+            dataset=self.train_dataset,
+            batch_size=self.config.data.get("gen_batch_size", self.config.data.train_batch_size),
+            num_workers=num_workers,
+            drop_last=True,
+            collate_fn=collate_fn,
+            sampler=train_sampler,
+        )
+
+        val_batch_size = self.config.data.val_batch_size  # Prefer config value if set
+        if val_batch_size is None:
+            val_batch_size = len(self.val_dataset)
+
+        self.val_dataloader = StatefulDataLoader(
+            dataset=self.val_dataset,
+            batch_size=val_batch_size,
+            num_workers=num_workers,
+            shuffle=self.config.data.get("validation_shuffle", True),
+            drop_last=False,
+            collate_fn=collate_fn,
+        )
+
+        assert len(self.train_dataloader) >= 1, "Train dataloader is empty!"
+        assert len(self.val_dataloader) >= 1, "Validation dataloader is empty!"
+
+        print(
+            f"Size of train dataloader: {len(self.train_dataloader)}, Size of val dataloader: "
+            f"{len(self.val_dataloader)}"
+        )
+
+        total_training_steps = len(self.train_dataloader) * self.config.trainer.total_epochs
+
+        if self.config.trainer.total_training_steps is not None:
+            total_training_steps = self.config.trainer.total_training_steps
+
+        self.total_training_steps = total_training_steps
+        print(f"Total training steps: {self.total_training_steps}")
+
+        try:
+            OmegaConf.set_struct(self.config, True)
+            with open_dict(self.config):
+                if OmegaConf.select(self.config, "actor_rollout_ref.actor.optim"):
+                    self.config.actor_rollout_ref.actor.optim.total_training_steps = total_training_steps
+                if OmegaConf.select(self.config, "critic.optim"):
+                    self.config.critic.optim.total_training_steps = total_training_steps
+        except Exception as e:
+            print(f"Warning: Could not set total_training_steps in config. Structure missing? Error: {e}")
+
+    def _dump_generations(self, inputs, outputs, scores, reward_extra_infos_dict, dump_path):
+        """Dump rollout/validation samples as JSONL."""
+        os.makedirs(dump_path, exist_ok=True)
+        filename = os.path.join(dump_path, f"{self.global_steps}.jsonl")
+
+        n = len(inputs)
+        base_data = {
+            "input": inputs,
+            "output": outputs,
+            "score": scores,
+            "step": [self.global_steps] * n,
+        }
+
+        for k, v in reward_extra_infos_dict.items():
+            if len(v) == n:
+                base_data[k] = v
+
+        lines = []
+        for i in range(n):
+            entry = {k: v[i] for k, v in base_data.items()}
+            lines.append(json.dumps(entry, ensure_ascii=False))
+
+        with open(filename, "w") as f:
+            f.write("\n".join(lines) + "\n")
+
+        print(f"Dumped generations to {filename}")
+
+    def _maybe_log_val_generations(self, inputs, outputs, scores):
+        """Log a table of validation samples to the configured logger (wandb or swanlab)"""
+
+        generations_to_log = self.config.trainer.log_val_generations
+
+        if generations_to_log == 0:
+            return
+
+        import numpy as np
+
+        # Create tuples of (input, output, score) and sort by input text
+        samples = list(zip(inputs, outputs, scores, strict=True))
+        samples.sort(key=lambda x: x[0])  # Sort by input text
+
+        # Use fixed random seed for deterministic shuffling
+        rng = np.random.RandomState(42)
+        rng.shuffle(samples)
+
+        # Take first N samples after shuffling
+        samples = samples[:generations_to_log]
+
+        # Log to each configured logger
+        self.validation_generations_logger.log(self.config.trainer.logger, samples, self.global_steps)
+
+    def _validate(self):
+        data_source_lst = []
+        reward_extra_infos_dict: dict[str, list] = defaultdict(list)
+
+        # Lists to collect samples for the table
+        sample_inputs = []
+        sample_outputs = []
+        sample_scores = []
+        sample_turns = []
+
+        for test_data in self.val_dataloader:
+            test_batch = DataProto.from_single_dict(test_data)
+
+            # repeat test batch
+            test_batch = test_batch.repeat(
+                repeat_times=self.config.actor_rollout_ref.rollout.val_kwargs.n, interleave=True
+            )
+
+            # we only do validation on rule-based rm
+            if self.config.reward_model.enable and test_batch[0].non_tensor_batch["reward_model"]["style"] == "model":
+                return {}
+
+            # Store original inputs
+            input_ids = test_batch.batch["input_ids"]
+            # TODO: Can we keep special tokens except for padding tokens?
+            input_texts = [self.tokenizer.decode(ids, skip_special_tokens=True) for ids in input_ids]
+            sample_inputs.extend(input_texts)
+
+            batch_keys_to_pop = ["input_ids", "attention_mask", "position_ids"]
+            non_tensor_batch_keys_to_pop = ["raw_prompt_ids"]
+            if "multi_modal_data" in test_batch.non_tensor_batch:
+                non_tensor_batch_keys_to_pop.append("multi_modal_data")
+            if "raw_prompt" in test_batch.non_tensor_batch:
+                non_tensor_batch_keys_to_pop.append("raw_prompt")
+            if "tools_kwargs" in test_batch.non_tensor_batch:
+                non_tensor_batch_keys_to_pop.append("tools_kwargs")
+            if "interaction_kwargs" in test_batch.non_tensor_batch:
+                non_tensor_batch_keys_to_pop.append("interaction_kwargs")
+            if "agent_name" in test_batch.non_tensor_batch:
+                non_tensor_batch_keys_to_pop.append("agent_name")
+            test_gen_batch = test_batch.pop(
+                batch_keys=batch_keys_to_pop,
+                non_tensor_batch_keys=non_tensor_batch_keys_to_pop,
+            )
+
+            test_gen_batch.meta_info = {
+                "eos_token_id": self.tokenizer.eos_token_id,
+                "pad_token_id": self.tokenizer.pad_token_id,
+                "recompute_log_prob": False,
+                "do_sample": self.config.actor_rollout_ref.rollout.val_kwargs.do_sample,
+                "validate": True,
+                "global_steps": self.global_steps,
+            }
+            print(f"test_gen_batch meta info: {test_gen_batch.meta_info}")
+
+            # pad to be divisible by dp_size
+            size_divisor = (
+                self.actor_rollout_wg.world_size
+                if not self.async_rollout_mode
+                else self.config.actor_rollout_ref.rollout.agent.num_workers
+            )
+            test_gen_batch_padded, pad_size = pad_dataproto_to_divisor(test_gen_batch, size_divisor)
+            if not self.async_rollout_mode:
+                test_output_gen_batch_padded = self.actor_rollout_wg.generate_sequences(test_gen_batch_padded)
+            else:
+                test_output_gen_batch_padded = self.async_rollout_manager.generate_sequences(test_gen_batch_padded)
+
+            # unpad
+            test_output_gen_batch = unpad_dataproto(test_output_gen_batch_padded, pad_size=pad_size)
+
+            print("validation generation end")
+
+            # Store generated outputs
+            output_ids = test_output_gen_batch.batch["responses"]
+            output_texts = [self.tokenizer.decode(ids, skip_special_tokens=True) for ids in output_ids]
+            sample_outputs.extend(output_texts)
+
+            test_batch = test_batch.union(test_output_gen_batch)
+            test_batch.meta_info["validate"] = True
+
+            # evaluate using reward_function
+            result = self.val_reward_fn(test_batch, return_dict=True)
+            reward_tensor = result["reward_tensor"]
+            scores = reward_tensor.sum(-1).cpu().tolist()
+            sample_scores.extend(scores)
+
+            reward_extra_infos_dict["reward"].extend(scores)
+            print(f"len reward_extra_infos_dict['reward']: {len(reward_extra_infos_dict['reward'])}")
+            if "reward_extra_info" in result:
+                for key, lst in result["reward_extra_info"].items():
+                    reward_extra_infos_dict[key].extend(lst)
+                    print(f"len reward_extra_infos_dict['{key}']: {len(reward_extra_infos_dict[key])}")
+
+            # collect num_turns of each prompt
+            if "__num_turns__" in test_batch.non_tensor_batch:
+                sample_turns.append(test_batch.non_tensor_batch["__num_turns__"])
+
+            data_source_lst.append(test_batch.non_tensor_batch.get("data_source", ["unknown"] * reward_tensor.shape[0]))
+
+        self._maybe_log_val_generations(inputs=sample_inputs, outputs=sample_outputs, scores=sample_scores)
+
+        # dump generations
+        val_data_dir = self.config.trainer.get("validation_data_dir", None)
+        if val_data_dir:
+            self._dump_generations(
+                inputs=sample_inputs,
+                outputs=sample_outputs,
+                scores=sample_scores,
+                reward_extra_infos_dict=reward_extra_infos_dict,
+                dump_path=val_data_dir,
+            )
+
+        for key_info, lst in reward_extra_infos_dict.items():
+            assert len(lst) == 0 or len(lst) == len(sample_scores), f"{key_info}: {len(lst)=}, {len(sample_scores)=}"
+
+        data_sources = np.concatenate(data_source_lst, axis=0)
+
+        data_src2var2metric2val = process_validation_metrics(data_sources, sample_inputs, reward_extra_infos_dict)
+        metric_dict = {}
+        for data_source, var2metric2val in data_src2var2metric2val.items():
+            core_var = "acc" if "acc" in var2metric2val else "reward"
+            for var_name, metric2val in var2metric2val.items():
+                n_max = max([int(name.split("@")[-1].split("/")[0]) for name in metric2val.keys()])
+                for metric_name, metric_val in metric2val.items():
+                    if (
+                        (var_name == core_var)
+                        and any(metric_name.startswith(pfx) for pfx in ["mean", "maj", "best"])
+                        and (f"@{n_max}" in metric_name)
+                    ):
+                        metric_sec = "val-core"
+                    else:
+                        metric_sec = "val-aux"
+                    pfx = f"{metric_sec}/{data_source}/{var_name}/{metric_name}"
+                    metric_dict[pfx] = metric_val
+
+        if len(sample_turns) > 0:
+            sample_turns = np.concatenate(sample_turns)
+            metric_dict["val-aux/num_turns/min"] = sample_turns.min()
+            metric_dict["val-aux/num_turns/max"] = sample_turns.max()
+            metric_dict["val-aux/num_turns/mean"] = sample_turns.mean()
+
+        return metric_dict
+
+    def init_workers(self):
+        """Initialize distributed training workers using Ray backend.
+
+        Creates:
+        1. Ray resource pools from configuration
+        2. Worker groups for each role (actor, critic, etc.)
+        """
+        self._init_resource_pools()
+        self._create_worker_classes()
+        self._init_worker_groups()
+        self._init_models()
+        self._init_async_rollout_manager()
+
+    def _init_resource_pools(self):
+        self.resource_pool_manager.create_resource_pool()
+        self.resource_pool_to_cls = {pool: {} for pool in self.resource_pool_manager.resource_pool_dict.values()}
+
+    def _create_worker_classes(self):
+        self._create_actor_rollout_classes()
+        self._create_critic_class()
+        self._create_reference_policy_class()
+        self._create_reward_model_class()
+
+    def _create_actor_rollout_classes(self):
+        # create actor and rollout
+        if self.hybrid_engine:
+            resource_pool = self.resource_pool_manager.get_resource_pool(Role.ActorRollout)
+            actor_rollout_cls = RayClassWithInitArgs(
+                cls=self.role_worker_mapping[Role.ActorRollout],
+                config=self.config.actor_rollout_ref,
+                role=str(Role.ActorRollout),
+                profile_option=self.config.trainer.npu_profile.options,
+            )
+            self.resource_pool_to_cls[resource_pool][str(Role.ActorRollout)] = actor_rollout_cls
+        else:
+            raise NotImplementedError
+
+    def _create_critic_class(self):
+        # create critic
+        if self.use_critic:
+            resource_pool = self.resource_pool_manager.get_resource_pool(Role.Critic)
+            critic_cfg = omega_conf_to_dataclass(self.config.critic)
+            critic_cls = RayClassWithInitArgs(cls=self.role_worker_mapping[Role.Critic], config=critic_cfg)
+            self.resource_pool_to_cls[resource_pool][str(Role.Critic)] = critic_cls
+
+    def _create_reference_policy_class(self):
+        # create reference policy if needed
+        if self.use_reference_policy:
+            resource_pool = self.resource_pool_manager.get_resource_pool(Role.RefPolicy)
+            ref_policy_cls = RayClassWithInitArgs(
+                self.role_worker_mapping[Role.RefPolicy],
+                config=self.config.actor_rollout_ref,
+                role=str(Role.RefPolicy),
+                profile_option=self.config.trainer.npu_profile.options,
+            )
+            self.resource_pool_to_cls[resource_pool][str(Role.RefPolicy)] = ref_policy_cls
+
+    def _create_reward_model_class(self):
+        # create a reward model if reward_fn is None
+        if self.use_rm:
+            # we create a RM here
+            resource_pool = self.resource_pool_manager.get_resource_pool(Role.RewardModel)
+            rm_cls = RayClassWithInitArgs(self.role_worker_mapping[Role.RewardModel], config=self.config.reward_model)
+            self.resource_pool_to_cls[resource_pool][str(Role.RewardModel)] = rm_cls
+
+    def _init_worker_groups(self):
+        # initialize WorkerGroup
+        # NOTE: if you want to use a different resource pool for each role, which can support different parallel size,
+        # you should not use `create_colocated_worker_cls`.
+        # Instead, directly pass different resource pool to different worker groups.
+        # See https://github.com/volcengine/verl/blob/master/examples/ray/tutorial.ipynb for more information.
+        all_wg = {}
+        wg_kwargs = {}  # Setting up kwargs for RayWorkerGroup
+        if OmegaConf.select(self.config.trainer, "ray_wait_register_center_timeout") is not None:
+            wg_kwargs["ray_wait_register_center_timeout"] = self.config.trainer.ray_wait_register_center_timeout
+        if OmegaConf.select(self.config.trainer, "profile_steps") is not None:
+            wg_kwargs["profile_steps"] = OmegaConf.select(self.config.trainer, "profile_steps")
+            assert OmegaConf.select(self.config.trainer, "worker_nsight_options") is not None, (
+                "worker_nsight_options must be set when profile_steps is set"
+            )
+            wg_kwargs["worker_nsight_options"] = OmegaConf.to_container(
+                OmegaConf.select(self.config.trainer, "worker_nsight_options")
+            )
+        wg_kwargs["device_name"] = self.device_name
+
+        for resource_pool, class_dict in self.resource_pool_to_cls.items():
+            worker_dict_cls = create_colocated_worker_cls(class_dict=class_dict)
+            wg_dict = self.ray_worker_group_cls(
+                resource_pool=resource_pool,
+                ray_cls_with_init=worker_dict_cls,
+                **wg_kwargs,
+            )
+            spawn_wg = wg_dict.spawn(prefix_set=class_dict.keys())
+            all_wg.update(spawn_wg)
+        self.all_wg = all_wg
+
+    def _init_models(self):
+        if self.use_critic:
+            self.critic_wg = self.all_wg[str(Role.Critic)]
+            self.critic_wg.init_model()
+
+        if self.use_reference_policy and not self.ref_in_actor:
+            self.ref_policy_wg = self.all_wg[str(Role.RefPolicy)]
+            self.ref_policy_wg.init_model()
+
+        if self.use_rm:
+            self.rm_wg = self.all_wg[str(Role.RewardModel)]
+            self.rm_wg.init_model()
+
+        # we should create rollout at the end so that vllm can have a better estimation of kv cache memory
+        self.actor_rollout_wg = self.all_wg[str(Role.ActorRollout)]
+        self.actor_rollout_wg.init_model()
+
+    def _init_async_rollout_manager(self):
+        # create async rollout manager and request scheduler
+        self.async_rollout_mode = False
+        if self.config.actor_rollout_ref.rollout.mode == "async":
+            from verl.experimental.agent_loop import AgentLoopManager
+
+            self.async_rollout_mode = True
+            self.async_rollout_manager = AgentLoopManager(
+                config=self.config,
+                worker_group=self.actor_rollout_wg,
+            )
+
+    def _save_checkpoint(self):
+        from verl.utils.fs import local_mkdir_safe
+
+        # path: given_path + `/global_step_{global_steps}` + `/actor`
+        local_global_step_folder = os.path.join(
+            self.config.trainer.default_local_dir, f"global_step_{self.global_steps}"
+        )
+
+        print(f"local_global_step_folder: {local_global_step_folder}")
+        actor_local_path = os.path.join(local_global_step_folder, "actor")
+
+        actor_remote_path = (
+            None
+            if self.config.trainer.default_hdfs_dir is None
+            else os.path.join(self.config.trainer.default_hdfs_dir, f"global_step_{self.global_steps}", "actor")
+        )
+
+        remove_previous_ckpt_in_save = self.config.trainer.get("remove_previous_ckpt_in_save", False)
+        if remove_previous_ckpt_in_save:
+            print(
+                "Warning: remove_previous_ckpt_in_save is deprecated,"
+                + " set max_actor_ckpt_to_keep=1 and max_critic_ckpt_to_keep=1 instead"
+            )
+        max_actor_ckpt_to_keep = (
+            self.config.trainer.get("max_actor_ckpt_to_keep", None) if not remove_previous_ckpt_in_save else 1
+        )
+        max_critic_ckpt_to_keep = (
+            self.config.trainer.get("max_critic_ckpt_to_keep", None) if not remove_previous_ckpt_in_save else 1
+        )
+
+        self.actor_rollout_wg.save_checkpoint(
+            actor_local_path, actor_remote_path, self.global_steps, max_ckpt_to_keep=max_actor_ckpt_to_keep
+        )
+
+        if self.use_critic:
+            critic_local_path = os.path.join(local_global_step_folder, "critic")
+            critic_remote_path = (
+                None
+                if self.config.trainer.default_hdfs_dir is None
+                else os.path.join(self.config.trainer.default_hdfs_dir, f"global_step_{self.global_steps}", "critic")
+            )
+            self.critic_wg.save_checkpoint(
+                critic_local_path, critic_remote_path, self.global_steps, max_ckpt_to_keep=max_critic_ckpt_to_keep
+            )
+
+        # save dataloader
+        local_mkdir_safe(local_global_step_folder)
+        dataloader_local_path = os.path.join(local_global_step_folder, "data.pt")
+        dataloader_state_dict = self.train_dataloader.state_dict()
+        torch.save(dataloader_state_dict, dataloader_local_path)
+
+        # latest checkpointed iteration tracker (for atomic usage)
+        local_latest_checkpointed_iteration = os.path.join(
+            self.config.trainer.default_local_dir, "latest_checkpointed_iteration.txt"
+        )
+        with open(local_latest_checkpointed_iteration, "w") as f:
+            f.write(str(self.global_steps))
+
+    def _load_checkpoint(self):
+        if self.config.trainer.resume_mode == "disable":
+            return 0
+
+        # load from hdfs
+        if self.config.trainer.default_hdfs_dir is not None:
+            raise NotImplementedError("load from hdfs is not implemented yet")
+        else:
+            checkpoint_folder = self.config.trainer.default_local_dir  # TODO: check path
+            if not os.path.isabs(checkpoint_folder):
+                working_dir = os.getcwd()
+                checkpoint_folder = os.path.join(working_dir, checkpoint_folder)
+            global_step_folder = find_latest_ckpt_path(checkpoint_folder)  # None if no latest
+
+        # find global_step_folder
+        if self.config.trainer.resume_mode == "auto":
+            if global_step_folder is None:
+                print("Training from scratch")
+                return 0
+        else:
+            if self.config.trainer.resume_mode == "resume_path":
+                assert isinstance(self.config.trainer.resume_from_path, str), "resume ckpt must be str type"
+                assert "global_step_" in self.config.trainer.resume_from_path, (
+                    "resume ckpt must specify the global_steps"
+                )
+                global_step_folder = self.config.trainer.resume_from_path
+                if not os.path.isabs(global_step_folder):
+                    working_dir = os.getcwd()
+                    global_step_folder = os.path.join(working_dir, global_step_folder)
+        print(f"Load from checkpoint folder: {global_step_folder}")
+        # set global step
+        self.global_steps = int(global_step_folder.split("global_step_")[-1])
+
+        print(f"Setting global step to {self.global_steps}")
+        print(f"Resuming from {global_step_folder}")
+
+        actor_path = os.path.join(global_step_folder, "actor")
+        critic_path = os.path.join(global_step_folder, "critic")
+        # load actor
+        self.actor_rollout_wg.load_checkpoint(
+            actor_path, del_local_after_load=self.config.trainer.del_local_ckpt_after_load
+        )
+        # load critic
+        if self.use_critic:
+            self.critic_wg.load_checkpoint(
+                critic_path, del_local_after_load=self.config.trainer.del_local_ckpt_after_load
+            )
+
+        # load dataloader,
+        # TODO: from remote not implemented yet
+        dataloader_local_path = os.path.join(global_step_folder, "data.pt")
+        if os.path.exists(dataloader_local_path):
+            dataloader_state_dict = torch.load(dataloader_local_path, weights_only=False)
+            self.train_dataloader.load_state_dict(dataloader_state_dict)
+        else:
+            print(f"Warning: No dataloader state found at {dataloader_local_path}, will start from scratch")
+
+    def _start_profiling(self, do_profile: bool, timing_raw) -> None:
+        """Start profiling for all worker groups if profiling is enabled."""
+        with marked_timer("start_profile", timing_raw):
+            if do_profile:
+                self.actor_rollout_wg.start_profile(role="e2e", profile_step=self.global_steps)
+                if self.use_reference_policy:
+                    self.ref_policy_wg.start_profile()
+                if self.use_critic:
+                    self.critic_wg.start_profile()
+                if self.use_rm:
+                    self.rm_wg.start_profile()
+
+    def _stop_profiling(self, do_profile: bool, timing_raw) -> None:
+        """Stop profiling for all worker groups if profiling is enabled."""
+        with marked_timer("stop_profile", timing_raw):
+            if do_profile:
+                self.actor_rollout_wg.stop_profile()
+                if self.use_reference_policy:
+                    self.ref_policy_wg.stop_profile()
+                if self.use_critic:
+                    self.critic_wg.stop_profile()
+                if self.use_rm:
+                    self.rm_wg.stop_profile()
+
+    def _balance_batch(self, batch: DataProto, metrics, logging_prefix="global_seqlen"):
+        """Reorder the data on single controller such that each dp rank gets similar total tokens"""
+        attention_mask = batch.batch["attention_mask"]
+        batch_size = attention_mask.shape[0]
+        global_seqlen_lst = batch.batch["attention_mask"].view(batch_size, -1).sum(-1).tolist()  # (train_batch_size,)
+        world_size = self.actor_rollout_wg.world_size
+        global_partition_lst = get_seqlen_balanced_partitions(
+            global_seqlen_lst, k_partitions=world_size, equal_size=True
+        )
+        # reorder based on index. The data will be automatically equally partitioned by dispatch function
+        global_idx = torch.tensor([j for partition in global_partition_lst for j in partition])
+        batch.reorder(global_idx)
+        global_balance_stats = log_seqlen_unbalance(
+            seqlen_list=global_seqlen_lst, partitions=global_partition_lst, prefix=logging_prefix
+        )
+        metrics.update(global_balance_stats)
+
+    def fit(self):
+        """
+        The training loop of PPO.
+        The driver process only need to call the compute functions of the worker group through RPC
+        to construct the PPO dataflow.
+        The light-weight advantage computation is done on the driver process.
+        """
+        from omegaconf import OmegaConf
+
+        from verl.utils.tracking import Tracking
+
+        logger = Tracking(
+            project_name=self.config.trainer.project_name,
+            experiment_name=self.config.trainer.experiment_name,
+            default_backend=self.config.trainer.logger,
+            config=OmegaConf.to_container(self.config, resolve=True),
+        )
+
+        self.global_steps = 0
+
+        # load checkpoint before doing anything
+        self._load_checkpoint()
+
+        # perform validation before training
+        # currently, we only support validation using the reward_function.
+        if self.val_reward_fn is not None and self.config.trainer.get("val_before_train", True):
+            val_metrics = self._validate()
+            assert val_metrics, f"{val_metrics=}"
+            pprint(f"Initial validation metrics: {val_metrics}")
+            logger.log(data=val_metrics, step=self.global_steps)
+            if self.config.trainer.get("val_only", False):
+                return
+
+        # add tqdm
+        progress_bar = tqdm(total=self.total_training_steps, initial=self.global_steps, desc="Training Progress")
+
+        # we start from step 1
+        self.global_steps += 1
+        last_val_metrics = None
+        self.max_steps_duration = 0
+
+        for epoch in range(self.config.trainer.total_epochs):
+            for batch_dict in self.train_dataloader:
+                metrics = {}
+                timing_raw = {}
+
+                do_profile = (
+                    self.global_steps in self.config.trainer.profile_steps
+                    if self.config.trainer.profile_steps is not None
+                    else False
+                )
+                self._start_profiling(do_profile, timing_raw)
+
+                batch, gen_batch = self._prepare_generate_batch(batch_dict)
+
+                is_last_step = self.global_steps >= self.total_training_steps
+
+                with marked_timer("step", timing_raw):
+                    # generate a batch
+                    with marked_timer("gen", timing_raw, color="red"):
+                        if not self.async_rollout_mode:
+                            gen_batch_output = self.actor_rollout_wg.generate_sequences(gen_batch)
+                        else:
+                            gen_batch_output = self.async_rollout_manager.generate_sequences(gen_batch)
+                        timing_raw.update(gen_batch_output.meta_info["timing"])
+                        gen_batch_output.meta_info.pop("timing", None)
+
+                    if self.config.algorithm.adv_estimator == AdvantageEstimator.REMAX:
+                        with marked_timer("gen_max", timing_raw, color="purple"):
+                            gen_baseline_batch = deepcopy(gen_batch)
+                            gen_baseline_batch.meta_info["do_sample"] = False
+                            if not self.async_rollout_mode:
+                                gen_baseline_output = self.actor_rollout_wg.generate_sequences(gen_baseline_batch)
+                            else:
+                                gen_baseline_output = self.async_rollout_manager.generate_sequences(gen_baseline_batch)
+                            batch = batch.union(gen_baseline_output)
+                            reward_baseline_tensor = self.reward_fn(batch)
+                            reward_baseline_tensor = reward_baseline_tensor.sum(dim=-1)
+
+                            batch.pop(batch_keys=list(gen_baseline_output.batch.keys()))
+
+                            batch.batch["reward_baselines"] = reward_baseline_tensor
+
+                            del gen_baseline_batch, gen_baseline_output
+
+                    batch = self._post_generate_batch(batch, gen_batch_output, metrics)
+                    batch, reward_extra_infos_dict = self._process_batch_common(batch, metrics, timing_raw)
+                    self._log_rollout(batch, reward_extra_infos_dict, timing_raw)
+                    last_val_metrics = self._validate_metrics(is_last_step, last_val_metrics, metrics, timing_raw)
+                    self._check_save_checkpoint(is_last_step, timing_raw)
+
+                self._stop_profiling(do_profile, timing_raw)
+                self._collect_metrics(batch, epoch, metrics, timing_raw)
+                self._post_batch_processing(batch)
+
+                # TODO: make a canonical logger that supports various backend
+                logger.log(data=metrics, step=self.global_steps)
+
+                progress_bar.update(1)
+                self.global_steps += 1
+
+                if is_last_step:
+                    pprint(f"Final validation metrics: {last_val_metrics}")
+                    progress_bar.close()
+                    return
+
+    def _prepare_generate_batch(self, batch_dict):
+        batch: DataProto = DataProto.from_single_dict(batch_dict)
+        # pop those keys for generation
+        batch_keys_to_pop = ["input_ids", "attention_mask", "position_ids"]
+        non_tensor_batch_keys_to_pop = ["raw_prompt_ids"]
+        if "multi_modal_data" in batch.non_tensor_batch:
+            non_tensor_batch_keys_to_pop.append("multi_modal_data")
+        if "raw_prompt" in batch.non_tensor_batch:
+            non_tensor_batch_keys_to_pop.append("raw_prompt")
+        if "tools_kwargs" in batch.non_tensor_batch:
+            non_tensor_batch_keys_to_pop.append("tools_kwargs")
+        if "interaction_kwargs" in batch.non_tensor_batch:
+            non_tensor_batch_keys_to_pop.append("interaction_kwargs")
+        if "index" in batch.non_tensor_batch:
+            non_tensor_batch_keys_to_pop.append("index")
+        if "agent_name" in batch.non_tensor_batch:
+            non_tensor_batch_keys_to_pop.append("agent_name")
+        gen_batch = batch.pop(
+            batch_keys=batch_keys_to_pop,
+            non_tensor_batch_keys=non_tensor_batch_keys_to_pop,
+        )
+        # pass global_steps to trace
+        gen_batch.meta_info["global_steps"] = self.global_steps
+        gen_batch = gen_batch.repeat(repeat_times=self.config.actor_rollout_ref.rollout.n, interleave=True)
+        return batch, gen_batch
+
+    def _post_generate_batch(self, batch, gen_batch_output, metrics):
+        batch.non_tensor_batch["uid"] = np.array([str(uuid.uuid4()) for _ in range(len(batch.batch))], dtype=object)
+        # repeat to align with repeated responses in rollout
+        batch = batch.repeat(repeat_times=self.config.actor_rollout_ref.rollout.n, interleave=True)
+        batch = batch.union(gen_batch_output)
+        if "response_mask" not in batch.batch.keys():
+            batch.batch["response_mask"] = compute_response_mask(batch)
+        # Balance the number of valid tokens across DP ranks.
+        # NOTE: This usually changes the order of data in the `batch`,
+        # which won't affect the advantage calculation (since it's based on uid),
+        # but might affect the loss calculation (due to the change of mini-batching).
+        # TODO: Decouple the DP balancing and mini-batching.
+        if self.config.trainer.balance_batch:
+            self._balance_batch(batch, metrics=metrics)
+        # compute global_valid tokens
+        batch.meta_info["global_token_num"] = torch.sum(batch.batch["attention_mask"], dim=-1).tolist()
+        return batch
+
+    def _process_batch_common(self, batch, metrics, timing_raw):
+        with marked_timer("reward", timing_raw, color="yellow"):
+            # compute reward model score
+            if self.use_rm:
+                reward_tensor = self.rm_wg.compute_rm_score(batch)
+                batch = batch.union(reward_tensor)
+
+            if self.config.reward_model.launch_reward_fn_async:
+                future_reward = compute_reward_async.remote(data=batch, reward_fn=self.reward_fn)
+            else:
+                reward_tensor, reward_extra_infos_dict = compute_reward(batch, self.reward_fn)
+        # recompute old_log_probs
+        with marked_timer("old_log_prob", timing_raw, color="blue"):
+            async_training = self.config.get("async_training", None)
+            if async_training and async_training.use_rollout_log_probs:
+                batch.batch["old_log_probs"] = batch.batch["rollout_log_probs"]
+                batch.meta_info["temperature"] = self.config.actor_rollout_ref.rollout.temperature
+
+            else:
+                old_log_prob = self.actor_rollout_wg.compute_log_prob(batch)
+                entropys = old_log_prob.batch["entropys"]
+                response_masks = batch.batch["response_mask"]
+                loss_agg_mode = self.config.actor_rollout_ref.actor.loss_agg_mode
+                entropy_agg = agg_loss(loss_mat=entropys, loss_mask=response_masks, loss_agg_mode=loss_agg_mode)
+                old_log_prob_metrics = {"actor/entropy": entropy_agg.detach().item()}
+                metrics.update(old_log_prob_metrics)
+                old_log_prob.batch.pop("entropys")
+                batch = batch.union(old_log_prob)
+
+                if "rollout_log_probs" in batch.batch.keys():
+                    # TODO: we may want to add diff of probs too.
+                    rollout_old_log_probs = batch.batch["rollout_log_probs"]
+                    actor_old_log_probs = batch.batch["old_log_probs"]
+                    attention_mask = batch.batch["attention_mask"]
+                    responses = batch.batch["responses"]
+                    response_length = responses.size(1)
+                    response_mask = attention_mask[:, -response_length:]
+
+                    rollout_probs = torch.exp(rollout_old_log_probs)
+                    actor_probs = torch.exp(actor_old_log_probs)
+                    rollout_probs_diff = torch.abs(rollout_probs - actor_probs)
+                    rollout_probs_diff = torch.masked_select(rollout_probs_diff, response_mask.bool())
+                    rollout_probs_diff_max = torch.max(rollout_probs_diff)
+                    rollout_probs_diff_mean = torch.mean(rollout_probs_diff)
+                    rollout_probs_diff_std = torch.std(rollout_probs_diff)
+                    metrics.update(
+                        {
+                            "training/rollout_probs_diff_max": rollout_probs_diff_max.detach().item(),
+                            "training/rollout_probs_diff_mean": rollout_probs_diff_mean.detach().item(),
+                            "training/rollout_probs_diff_std": rollout_probs_diff_std.detach().item(),
+                        }
+                    )
+
+        if self.use_reference_policy:
+            # compute reference log_prob
+            with marked_timer("ref", timing_raw, color="olive"):
+                if not self.ref_in_actor:
+                    ref_log_prob = self.ref_policy_wg.compute_ref_log_prob(batch)
+                else:
+                    ref_log_prob = self.actor_rollout_wg.compute_ref_log_prob(batch)
+                batch = batch.union(ref_log_prob)
+        # compute values
+        if self.use_critic:
+            with marked_timer("values", timing_raw, color="cyan"):
+                values = self.critic_wg.compute_values(batch)
+                batch = batch.union(values)
+        with marked_timer("adv", timing_raw, color="brown"):
+            # we combine with rule-based rm
+            reward_extra_infos_dict: dict[str, list]
+            if self.config.reward_model.launch_reward_fn_async:
+                reward_tensor, reward_extra_infos_dict = ray.get(future_reward)
+            batch.batch["token_level_scores"] = reward_tensor
+
+            if reward_extra_infos_dict:
+                batch.non_tensor_batch.update({k: np.array(v) for k, v in reward_extra_infos_dict.items()})
+
+            # compute rewards. apply_kl_penalty if available
+            if self.config.algorithm.use_kl_in_reward:
+                batch, kl_metrics = apply_kl_penalty(
+                    batch, kl_ctrl=self.kl_ctrl_in_reward, kl_penalty=self.config.algorithm.kl_penalty
+                )
+                metrics.update(kl_metrics)
+            else:
+                batch.batch["token_level_rewards"] = batch.batch["token_level_scores"]
+
+            # compute advantages, executed on the driver process
+
+            norm_adv_by_std_in_grpo = self.config.algorithm.get(
+                "norm_adv_by_std_in_grpo", True
+            )  # GRPO adv normalization factor
+
+            batch = compute_advantage(
+                batch,
+                adv_estimator=self.config.algorithm.adv_estimator,
+                gamma=self.config.algorithm.gamma,
+                lam=self.config.algorithm.lam,
+                num_repeat=self.config.actor_rollout_ref.rollout.n,
+                norm_adv_by_std_in_grpo=norm_adv_by_std_in_grpo,
+                config=self.config.algorithm,
+            )
+        # update critic
+        if self.use_critic:
+            with marked_timer("update_critic", timing_raw, color="pink"):
+                critic_output = self.critic_wg.update_critic(batch)
+            critic_output_metrics = reduce_metrics(critic_output.meta_info["metrics"])
+            metrics.update(critic_output_metrics)
+        # implement critic warmup
+        if self.config.trainer.critic_warmup <= self.global_steps:
+            # update actor
+            with marked_timer("update_actor", timing_raw, color="red"):
+                batch.meta_info["multi_turn"] = self.config.actor_rollout_ref.rollout.multi_turn.enable
+                actor_output = self.actor_rollout_wg.update_actor(batch)
+            actor_output_metrics = reduce_metrics(actor_output.meta_info["metrics"])
+            metrics.update(actor_output_metrics)
+        return batch, reward_extra_infos_dict
+
+    def _log_rollout(self, batch, reward_extra_infos_dict, timing_raw):
+        """Log rollout generations if enabled"""
+        rollout_data_dir = self.config.trainer.get("rollout_data_dir", None)
+        if rollout_data_dir:
+            with marked_timer("dump_rollout_generations", timing_raw, color="green"):
+                inputs = self.tokenizer.batch_decode(batch.batch["prompts"], skip_special_tokens=True)
+                outputs = self.tokenizer.batch_decode(batch.batch["responses"], skip_special_tokens=True)
+                scores = batch.batch["token_level_scores"].sum(-1).cpu().tolist()
+                if "request_id" in batch.non_tensor_batch:
+                    reward_extra_infos_dict.setdefault(
+                        "request_id",
+                        batch.non_tensor_batch["request_id"].tolist(),
+                    )
+                self._dump_generations(
+                    inputs=inputs,
+                    outputs=outputs,
+                    scores=scores,
+                    reward_extra_infos_dict=reward_extra_infos_dict,
+                    dump_path=rollout_data_dir,
+                )
+
+    def _validate_metrics(self, is_last_step, last_val_metrics, metrics, timing_raw):
+        if (
+            self.val_reward_fn is not None
+            and self.config.trainer.test_freq > 0
+            and (is_last_step or self.global_steps % self.config.trainer.test_freq == 0)
+        ):
+            with marked_timer("testing", timing_raw, color="green"):
+                val_metrics: dict = self._validate()
+                if is_last_step:
+                    last_val_metrics = val_metrics
+            metrics.update(val_metrics)
+        return last_val_metrics
+
+    def _check_save_checkpoint(self, is_last_step, timing_raw):
+        # Check if the ESI (Elastic Server Instance)/training plan is close to expiration.
+        esi_close_to_expiration = should_save_ckpt_esi(
+            max_steps_duration=self.max_steps_duration,
+            redundant_time=self.config.trainer.esi_redundant_time,
+        )
+        # Check if the conditions for saving a checkpoint are met.
+        # The conditions include a mandatory condition (1) and
+        # one of the following optional conditions (2/3/4):
+        # 1. The save frequency is set to a positive value.
+        # 2. It's the last training step.
+        # 3. The current step number is a multiple of the save frequency.
+        # 4. The ESI(Elastic Server Instance)/training plan is close to expiration.
+        if self.config.trainer.save_freq > 0 and (
+            is_last_step or self.global_steps % self.config.trainer.save_freq == 0 or esi_close_to_expiration
+        ):
+            if esi_close_to_expiration:
+                print("Force saving checkpoint: ESI instance expiration approaching.")
+            with marked_timer("save_checkpoint", timing_raw, color="green"):
+                self._save_checkpoint()
+
+    def _collect_metrics(self, batch, epoch, metrics, timing_raw):
+        steps_duration = timing_raw["step"]
+        self.max_steps_duration = max(self.max_steps_duration, steps_duration)
+        # training metrics
+        metrics.update(
+            {
+                "training/global_step": self.global_steps,
+                "training/epoch": epoch,
+            }
+        )
+        # collect metrics
+        metrics.update(compute_data_metrics(batch=batch, use_critic=self.use_critic))
+        metrics.update(compute_timing_metrics(batch=batch, timing_raw=timing_raw))
+        # TODO: implement actual tflpo and theoretical tflpo
+        n_gpus = self.resource_pool_manager.get_n_gpus()
+        metrics.update(compute_throughout_metrics(batch=batch, timing_raw=timing_raw, n_gpus=n_gpus))
+
+    def _post_batch_processing(self, batch: DataProto):
+        # this is experimental and may be changed/removed in the future in favor of a general-purpose one
+        if isinstance(self.train_dataloader.sampler, AbstractCurriculumSampler):
+            self.train_dataloader.sampler.update(batch=batch)
+
+        # this is experimental and may be changed/removed in the future
+        # in favor of a general-purpose data buffer pool
+        if hasattr(self.train_dataset, "on_batch_end"):
+            # The dataset may be changed after each training batch
+            self.train_dataset.on_batch_end(batch=batch)

From c20666039c4300056dce97bf0af1de4dca5142fc Mon Sep 17 00:00:00 2001
From: wangshulin02 <953550366@qq.com>
Date: Mon, 15 Sep 2025 15:43:29 +0800
Subject: [PATCH 132/182] restore modified files in verl folder

---
 recipe/fully_async_policy/detach_utils.py     |   2 +-
 recipe/fully_async_policy/ray_trainer.py      |   2 +-
 verl/experimental/agent_loop/__init__.py      |   5 +-
 verl/experimental/agent_loop/agent_loop.py    | 268 ++------
 .../partial_single_turn_agent_loop.py         |  74 ---
 verl/trainer/main_ppo.py                      |  15 +-
 verl/trainer/ppo/ray_trainer.py               | 618 ++++++++----------
 .../rollout/vllm_rollout/vllm_async_server.py |  65 +-
 8 files changed, 356 insertions(+), 693 deletions(-)
 delete mode 100644 verl/experimental/agent_loop/partial_single_turn_agent_loop.py

diff --git a/recipe/fully_async_policy/detach_utils.py b/recipe/fully_async_policy/detach_utils.py
index 18e45d50a16..75d67ec1ab1 100644
--- a/recipe/fully_async_policy/detach_utils.py
+++ b/recipe/fully_async_policy/detach_utils.py
@@ -20,7 +20,7 @@
 import torch
 
 from verl import DataProto
-from verl.experimental.agent_loop.agent_loop import postprocess_agent_loop_outputs
+from recipe.fully_async_policy.agent_loop.agent_loop import postprocess_agent_loop_outputs
 from verl.trainer.ppo.ray_trainer import compute_response_mask
 
 
diff --git a/recipe/fully_async_policy/ray_trainer.py b/recipe/fully_async_policy/ray_trainer.py
index 56a1e5bcab1..dea3aa2c26e 100644
--- a/recipe/fully_async_policy/ray_trainer.py
+++ b/recipe/fully_async_policy/ray_trainer.py
@@ -920,7 +920,7 @@ def _init_async_rollout_manager(self):
         # create async rollout manager and request scheduler
         self.async_rollout_mode = False
         if self.config.actor_rollout_ref.rollout.mode == "async":
-            from verl.experimental.agent_loop import AgentLoopManager
+            from recipe.fully_async_policy.agent_loop.agent_loop import AgentLoopManager
 
             self.async_rollout_mode = True
             self.async_rollout_manager = AgentLoopManager(
diff --git a/verl/experimental/agent_loop/__init__.py b/verl/experimental/agent_loop/__init__.py
index 67dcb16047e..c6f58f83c83 100644
--- a/verl/experimental/agent_loop/__init__.py
+++ b/verl/experimental/agent_loop/__init__.py
@@ -13,10 +13,9 @@
 # limitations under the License.
 
 from .agent_loop import AgentLoopBase, AgentLoopManager
-from .partial_single_turn_agent_loop import PartialSingleTurnAgentLoop
 from .single_turn_agent_loop import SingleTurnAgentLoop
 from .tool_agent_loop import ToolAgentLoop
 
-_ = [SingleTurnAgentLoop, ToolAgentLoop, PartialSingleTurnAgentLoop]
+_ = [SingleTurnAgentLoop, ToolAgentLoop]
 
-__all__ = ["AgentLoopBase", "AgentLoopManager"]
+__all__ = ["AgentLoopBase", "AgentLoopManager"]
\ No newline at end of file
diff --git a/verl/experimental/agent_loop/agent_loop.py b/verl/experimental/agent_loop/agent_loop.py
index ddcad093326..4639229a3b0 100644
--- a/verl/experimental/agent_loop/agent_loop.py
+++ b/verl/experimental/agent_loop/agent_loop.py
@@ -17,7 +17,7 @@
 import os
 import random
 from abc import ABC, abstractmethod
-from typing import Any, Optional
+from typing import Any
 
 import hydra
 import numpy as np
@@ -103,16 +103,6 @@ async def generate(
         )
         return output
 
-    async def generate_for_partial(self, request_id, prompt_ids, sampling_params):
-        """Generate tokens from prompt ids. with partial rollout function"""
-        server = self._choose_server(request_id)
-        output = await server.generate_for_partial.remote(
-            request_id=request_id,
-            prompt_ids=prompt_ids,
-            sampling_params=sampling_params,
-        )
-        return output
-
 
 class AgentLoopMetrics(BaseModel):
     """Agent loop performance metrics."""
@@ -134,10 +124,6 @@ class AgentLoopOutput(BaseModel):
     """Number of chat turns, including user, assistant, tool."""
     metrics: AgentLoopMetrics
     """Auxiliary performance metrics"""
-    is_cancel: bool = False
-    """Indicates whether the request was interrupted"""
-    log_probs: list[float] = None
-    """Response token log probs including LLM generated token, tool response token."""
 
 
 # make hydra.utils.instantiate happy
@@ -214,81 +200,6 @@ def decorator(subclass: type[AgentLoopBase]) -> type[AgentLoopBase]:
     return decorator
 
 
-def postprocess_agent_loop_outputs(inputs: list[AgentLoopOutput], tokenizer, config) -> DataProto:
-    """Static method to postprocess a list of AgentLoopOutput into DataProto
-
-    Args:
-        inputs: List of AgentLoopOutput
-        tokenizer: Tokenizer instance
-        config: Configuration object
-
-    Returns:
-        DataProto: Processed batch data
-    """
-    # NOTE: consistent with batch version of generate_sequences in vllm_rollout_spmd.py
-    # prompts: left pad
-    # responses: right pad
-    # input_ids: prompt + response
-    # attention_mask: [0,0,0,0,1,1,1,1, | 1,1,1,0,0,0,0,0]
-    # position_ids:   [0,0,0,0,0,1,2,3, | 4,5,6,7,8,9,10,11]
-
-    # prompts
-    tokenizer.padding_side = "left"
-    outputs = tokenizer.pad(
-        [{"input_ids": input.prompt_ids} for input in inputs],
-        padding="max_length",
-        max_length=config.actor_rollout_ref.rollout.prompt_length,
-        return_tensors="pt",
-        return_attention_mask=True,
-    )
-    prompt_ids, prompt_attention_mask = outputs["input_ids"], outputs["attention_mask"]
-
-    # responses
-    tokenizer.padding_side = "right"
-    outputs = tokenizer.pad(
-        [{"input_ids": input.response_ids} for input in inputs],
-        padding="max_length",
-        max_length=config.actor_rollout_ref.rollout.response_length,
-        return_tensors="pt",
-        return_attention_mask=True,
-    )
-    response_ids, response_attention_mask = outputs["input_ids"], outputs["attention_mask"]
-
-    # response_mask
-    outputs = tokenizer.pad(
-        [{"input_ids": input.response_mask} for input in inputs],
-        padding="max_length",
-        max_length=config.actor_rollout_ref.rollout.response_length,
-        return_tensors="pt",
-        return_attention_mask=False,
-    )
-    response_mask = outputs["input_ids"]
-    assert response_ids.shape == response_mask.shape, (
-        f"mismatch in response_ids and response_mask shape: {response_ids.shape} vs {response_mask.shape}"
-    )
-    response_mask = response_mask * response_attention_mask
-
-    input_ids = torch.cat([prompt_ids, response_ids], dim=1)
-    attention_mask = torch.cat([prompt_attention_mask, response_attention_mask], dim=1)
-    position_ids = (attention_mask.cumsum(dim=1) - 1) * attention_mask
-
-    batch = TensorDict(
-        {
-            "prompts": prompt_ids,  # [bsz, prompt_length]
-            "responses": response_ids,  # [bsz, response_length]
-            "response_mask": response_mask,  # [bsz, response_length]
-            "input_ids": input_ids,  # [bsz, prompt_length + response_length]
-            "attention_mask": attention_mask,  # [bsz, prompt_length + response_length]
-            "position_ids": position_ids,  # [bsz, prompt_length + response_length]
-        },
-        batch_size=len(input_ids),
-    )
-
-    num_turns = np.array([input.num_turns for input in inputs], dtype=np.int32)
-    metrics = [input.metrics.model_dump() for input in inputs]
-    return DataProto(batch=batch, non_tensor_batch={"__num_turns__": num_turns}, meta_info={"metrics": metrics})
-
-
 @ray.remote
 class AgentLoopWorker:
     """Agent loop worker takes a batch of messages and run each message in an agent loop."""
@@ -378,76 +289,15 @@ async def generate_sequences(self, batch: DataProto) -> DataProto:
             )
         outputs = await asyncio.gather(*tasks)
 
-        output = postprocess_agent_loop_outputs(outputs, self.tokenizer, self.config)
+        output = self._postprocess(outputs)
         return output
 
-    async def generate_sequences_no_post(
-        self, batch: DataProto, partial_output_list: Optional[list[AgentLoopOutput]]
-    ) -> list[AgentLoopOutput]:
-        """Generate sequences from agent loop.
-
-        Args:
-            batch (DataProto): Input batch.
-            partial_output_list: Optional[List[AgentLoopOutput]]: already rollout result.
-
-        Returns:
-            list[AgentLoopOutput]: List of agent loop outputs, one per sample in the batch.
-            Each AgentLoopOutput contains:
-            - prompt_ids: prompt token ids
-            - response_ids: response token ids including LLM generated and tool response tokens
-            - response_mask: 1 for LLM generated tokens, 0 for tool response tokens
-            - num_turns: number of chat turns
-            - metrics: performance metrics
-        """
-        config = self.config.actor_rollout_ref.rollout
-        sampling_params = dict(
-            temperature=config.temperature,
-            top_p=config.top_p,
-            repetition_penalty=1.0,
-        )
-
-        # override sampling params for validation
-        if batch.meta_info.get("validate", False):
-            sampling_params["top_p"] = config.val_kwargs.top_p
-            sampling_params["temperature"] = config.val_kwargs.temperature
-
-        # by default, we assume it's a single turn agent
-        if "agent_name" not in batch.non_tensor_batch:
-            batch.non_tensor_batch["agent_name"] = np.array(["single_turn_agent"] * len(batch), dtype=object)
-
-        tasks = []
-        agent_names = batch.non_tensor_batch["agent_name"]
-        raw_prompts = batch.non_tensor_batch["raw_prompt"]
-        if "index" in batch.non_tensor_batch:
-            index = batch.non_tensor_batch["index"]
-        else:
-            index = np.arange(len(raw_prompts))
-
-        trajectory_info = await get_trajectory_info(
-            batch.meta_info.get("global_steps", -1), index, batch.meta_info.get("validate", False)
-        )
-        if not partial_output_list:
-            partial_output_list = [None] * len(batch)
-
-        for agent_name, messages, trajectory, partial_output in zip(
-            agent_names, raw_prompts, trajectory_info, partial_output_list, strict=True
-        ):
-            tasks.append(
-                asyncio.create_task(
-                    self._run_agent_loop(agent_name, messages.tolist(), sampling_params, trajectory, partial_output)
-                )
-            )
-        outputs = await asyncio.gather(*tasks)
-
-        return outputs
-
     async def _run_agent_loop(
         self,
         agent_name: str,
         messages: list[dict[str, Any]],
         sampling_params: dict[str, Any],
         trajectory: dict[str, Any],
-        partial_output: Optional[AgentLoopOutput] = None,
     ) -> AgentLoopOutput:
         with rollout_trace_attr(
             step=trajectory["step"],
@@ -459,6 +309,7 @@ async def _run_agent_loop(
             assert agent_name in _agent_loop_registry, (
                 f"Agent loop {agent_name} not registered, registered agent loops: {_agent_loop_registry.keys()}"
             )
+
             agent_loop_config = _agent_loop_registry[agent_name]
             agent_loop = hydra.utils.instantiate(
                 config=agent_loop_config,
@@ -466,12 +317,73 @@ async def _run_agent_loop(
                 server_manager=self.server_manager,
                 tokenizer=self.tokenizer,
             )
-            if agent_name == "partial_single_turn_agent":
-                output = await agent_loop.run(messages, sampling_params, partial_output)
-            else:
-                output = await agent_loop.run(messages, sampling_params)
+            output = await agent_loop.run(messages, sampling_params)
             return output
 
+    def _postprocess(self, inputs: list[AgentLoopOutput]) -> DataProto:
+        # NOTE: consistent with batch version of generate_sequences in vllm_rollout_spmd.py
+        # prompts: left pad
+        # responses: right pad
+        # input_ids: prompt + response
+        # attention_mask: [0,0,0,0,1,1,1,1, | 1,1,1,0,0,0,0,0]
+        # position_ids:   [0,0,0,0,0,1,2,3, | 4,5,6,7,8,9,10,11]
+
+        # prompts
+        self.tokenizer.padding_side = "left"
+        outputs = self.tokenizer.pad(
+            [{"input_ids": input.prompt_ids} for input in inputs],
+            padding="max_length",
+            max_length=self.config.actor_rollout_ref.rollout.prompt_length,
+            return_tensors="pt",
+            return_attention_mask=True,
+        )
+        prompt_ids, prompt_attention_mask = outputs["input_ids"], outputs["attention_mask"]
+
+        # responses
+        self.tokenizer.padding_side = "right"
+        outputs = self.tokenizer.pad(
+            [{"input_ids": input.response_ids} for input in inputs],
+            padding="max_length",
+            max_length=self.config.actor_rollout_ref.rollout.response_length,
+            return_tensors="pt",
+            return_attention_mask=True,
+        )
+        response_ids, response_attention_mask = outputs["input_ids"], outputs["attention_mask"]
+
+        # response_mask
+        outputs = self.tokenizer.pad(
+            [{"input_ids": input.response_mask} for input in inputs],
+            padding="max_length",
+            max_length=self.config.actor_rollout_ref.rollout.response_length,
+            return_tensors="pt",
+            return_attention_mask=False,
+        )
+        response_mask = outputs["input_ids"]
+        assert response_ids.shape == response_mask.shape, (
+            f"mismatch in response_ids and response_mask shape: {response_ids.shape} vs {response_mask.shape}"
+        )
+        response_mask = response_mask * response_attention_mask
+
+        input_ids = torch.cat([prompt_ids, response_ids], dim=1)
+        attention_mask = torch.cat([prompt_attention_mask, response_attention_mask], dim=1)
+        position_ids = (attention_mask.cumsum(dim=1) - 1) * attention_mask
+
+        batch = TensorDict(
+            {
+                "prompts": prompt_ids,  # [bsz, prompt_length]
+                "responses": response_ids,  # [bsz, response_length]
+                "response_mask": response_mask,  # [bsz, response_length]
+                "input_ids": input_ids,  # [bsz, prompt_length + response_length]
+                "attention_mask": attention_mask,  # [bsz, prompt_length + response_length]
+                "position_ids": position_ids,  # [bsz, prompt_length + response_length]
+            },
+            batch_size=len(input_ids),
+        )
+
+        num_turns = np.array([input.num_turns for input in inputs], dtype=np.int32)
+        metrics = [input.metrics.model_dump() for input in inputs]
+        return DataProto(batch=batch, non_tensor_batch={"__num_turns__": num_turns}, meta_info={"metrics": metrics})
+
 
 async def get_trajectory_info(step, index, validate):
     """Get trajectory info.
@@ -503,7 +415,7 @@ def __init__(self, config: DictConfig, worker_group: RayWorkerGroup):
 
         Args:
             config (DictConfig): trainer config.
-            worker_group (RayWorkerGroup): AsyncActorRolloutRefWorker worker group.
+            worker_group (RayWorkerGroup): ActorRolloutRef worker group.
         """
         self.config = config
         self.worker_group = worker_group
@@ -600,36 +512,6 @@ def generate_sequences(self, prompts: DataProto) -> DataProto:
         output.meta_info = {"timing": timing}
         return output
 
-    async def generate_single_sample_async(
-        self,
-        sample: DataProto,
-        partial_output_list: Optional[list[AgentLoopOutput]],
-    ) -> list[AgentLoopOutput]:
-        """
-        异步处理单个样本, 需要复制n次
-
-        Args:
-            sample: 单个样本数据
-            partial_output_list: Optional[List[AgentLoopOutput]]: already rollout result.
-
-        Returns:
-            tuple[AgentLoopOutput, float]: 处理结果和处理时间
-        """
-        # 使用负载均衡选择 worker
-        worker = self._select_best_worker()
-        # 异步处理单个样本 - 使用无后处理版本获取原始AgentLoopOutput
-        output_future = worker.generate_sequences_no_post.remote(sample, partial_output_list)
-        return await asyncio.wrap_future(output_future.future())
-
-    def _select_best_worker(self):
-        """选择最佳的 worker（简单的轮询负载均衡）"""
-        if not hasattr(self, "_worker_index"):
-            self._worker_index = 0
-
-        worker = self.agent_loop_workers[self._worker_index]
-        self._worker_index = (self._worker_index + 1) % len(self.agent_loop_workers)
-        return worker
-
     def _performance_metrics(self, metrics: list[list[dict[str, str]]], output: DataProto) -> dict[str, float]:
         timing = {}
         t_generate_sequences = np.array([metric["generate_sequences"] for chunk in metrics for metric in chunk])
@@ -658,14 +540,4 @@ def wake_up(self):
 
     def sleep(self):
         """Sleep all rollout server instances."""
-        ray.get([server.sleep.remote() for server in self.async_llm_servers])
-
-    async def cancel_async(self):
-        """Cancel all rollout tasks asynchronously."""
-        futures = [server.cancel.remote() for server in self.async_llm_servers]
-        await asyncio.gather(*[asyncio.wrap_future(future.future()) for future in futures], return_exceptions=True)
-
-    async def resume_async(self):
-        """Cancel all rollout tasks asynchronously."""
-        futures = [server.resume.remote() for server in self.async_llm_servers]
-        await asyncio.gather(*[asyncio.wrap_future(future.future()) for future in futures], return_exceptions=True)
+        ray.get([server.sleep.remote() for server in self.async_llm_servers])
\ No newline at end of file
diff --git a/verl/experimental/agent_loop/partial_single_turn_agent_loop.py b/verl/experimental/agent_loop/partial_single_turn_agent_loop.py
deleted file mode 100644
index df4a4f3350a..00000000000
--- a/verl/experimental/agent_loop/partial_single_turn_agent_loop.py
+++ /dev/null
@@ -1,74 +0,0 @@
-# Copyright 2024 Bytedance Ltd. and/or its affiliates
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import logging
-import os
-from typing import Any, Optional
-from uuid import uuid4
-
-from verl.experimental.agent_loop.agent_loop import AgentLoopBase, AgentLoopOutput, register
-from verl.utils.profiler import simple_timer
-
-logger = logging.getLogger(__file__)
-logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN"))
-
-
-@register("partial_single_turn_agent")
-class PartialSingleTurnAgentLoop(AgentLoopBase):
-    """Naive agent loop that only do single turn chat completion."""
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.prompt_length = self.config.actor_rollout_ref.rollout.prompt_length
-        self.response_length = self.config.actor_rollout_ref.rollout.response_length
-
-    async def run(
-        self, messages: list[dict[str, Any]], sampling_params: dict[str, Any], output: Optional[AgentLoopOutput]
-    ) -> AgentLoopOutput:
-        if not output:
-            prompt_ids = await self.loop.run_in_executor(
-                None, lambda: self.tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=True)
-            )
-        else:
-            if output.is_cancel:
-                # 恢复暂停的样本，结果直接添加到 prompt_ids 后面
-                prompt_ids = output.prompt_ids + output.response_ids
-            else:
-                # 同一批样本，部分cancel，部分没有cancel， 没有cancel的样本直接返回
-                return output
-
-        metrics = {}
-        request_id = uuid4().hex
-        with simple_timer("generate_sequences", metrics):
-            response_ids, log_probs, is_cancel = await self.server_manager.generate_for_partial(
-                request_id=request_id, prompt_ids=prompt_ids, sampling_params=sampling_params
-            )
-
-        if not output:
-            response_mask = [1] * len(response_ids)
-        # 暂停待恢复样本, 把输出结果加到 response_ids 后，并重置 response_mask
-        else:
-            prompt_ids = output.prompt_ids
-            log_probs = output.log_probs + log_probs
-            response_ids = output.response_ids + response_ids
-            response_mask = [1] * len(response_ids)
-
-        return AgentLoopOutput(
-            prompt_ids=prompt_ids,
-            response_ids=response_ids[: self.response_length],
-            response_mask=response_mask[: self.response_length],
-            num_turns=2,
-            metrics=metrics,
-            is_cancel=is_cancel,
-            log_probs=log_probs,
-        )
diff --git a/verl/trainer/main_ppo.py b/verl/trainer/main_ppo.py
index 4b240c6ffbf..8d2b811c733 100644
--- a/verl/trainer/main_ppo.py
+++ b/verl/trainer/main_ppo.py
@@ -37,15 +37,11 @@ def main(config):
     Args:
         config_dict: Hydra configuration dictionary containing training parameters.
     """
-    from time import time
-
-    start_time = time()
     run_ppo(config)
-    print(f"total time: {time() - start_time:.2f} seconds")
 
 
 # Define a function to run the PPO-like training process
-def run_ppo(config, task_runner_class=None) -> None:
+def run_ppo(config) -> None:
     """Initialize Ray cluster and run distributed PPO training process.
 
     Args:
@@ -63,9 +59,6 @@ def run_ppo(config, task_runner_class=None) -> None:
             runtime_env=get_ppo_ray_runtime_env(),
             num_cpus=config.ray_init.num_cpus,
         )
-    # for recipe to change TaskRunner
-    if task_runner_class is None:
-        task_runner_class = TaskRunner
 
     # Create a remote instance of the TaskRunner class, and
     # Execute the `run` method of the TaskRunner instance remotely and wait for it to complete
@@ -75,9 +68,9 @@ def run_ppo(config, task_runner_class=None) -> None:
         and len(config.trainer.get("profile_steps", [])) > 0
     ):
         nsight_options = OmegaConf.to_container(config.trainer.controller_nsight_options)
-        runner = task_runner_class.options(runtime_env={"nsight": nsight_options}).remote()
+        runner = TaskRunner.options(runtime_env={"nsight": nsight_options}).remote()
     else:
-        runner = task_runner_class.remote()
+        runner = TaskRunner.remote()
     ray.get(runner.run.remote(config))
 
     # [Optional] get the path of the timeline trace file from the configuration, default to None
@@ -341,4 +334,4 @@ def create_rl_sampler(data_config, dataset):
 
 
 if __name__ == "__main__":
-    main()
+    main()
\ No newline at end of file
diff --git a/verl/trainer/ppo/ray_trainer.py b/verl/trainer/ppo/ray_trainer.py
index 56a1e5bcab1..05281ebe3f9 100644
--- a/verl/trainer/ppo/ray_trainer.py
+++ b/verl/trainer/ppo/ray_trainer.py
@@ -79,40 +79,6 @@ class Role(Enum):
     RewardModel = 5
     ActorRolloutRef = 6
 
-    def __str__(self):
-        """返回与代码中一致的字符串表示"""
-        return self._get_role_string()
-
-    def _get_role_string(self):
-        """获取角色对应的字符串名称"""
-        role_mapping = {
-            Role.Actor: "actor",
-            Role.Rollout: "rollout",
-            Role.ActorRollout: "actor_rollout",
-            Role.Critic: "critic",
-            Role.RefPolicy: "ref",
-            Role.RewardModel: "rm",
-            Role.ActorRolloutRef: "actor_rollout_ref",
-        }
-        return role_mapping.get(self, self.name.lower())
-
-    @classmethod
-    def from_string(cls, name: str):
-        """从字符串创建Role实例"""
-        string_mapping = {
-            "actor": cls.Actor,
-            "rollout": cls.Rollout,
-            "actor_rollout": cls.ActorRollout,
-            "critic": cls.Critic,
-            "ref": cls.RefPolicy,
-            "rm": cls.RewardModel,
-            "actor_rollout_ref": cls.ActorRolloutRef,
-        }
-        role = string_mapping.get(name.lower())
-        if role is None:
-            raise ValueError(f"No Role found for string: {name}")
-        return role
-
 
 @dataclass
 class ResourcePoolManager:
@@ -438,15 +404,15 @@ def _validate_config(self):
             megatron_dp = n_gpus // (
                 model_parallel_size * config.actor_rollout_ref.actor.megatron.context_parallel_size
             )
-            self.minimal_bsz = megatron_dp * config.actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu
+            minimal_bsz = megatron_dp * config.actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu
         else:
-            self.minimal_bsz = n_gpus
+            minimal_bsz = n_gpus
 
         # 1. Check total batch size for data correctness
         real_train_batch_size = config.data.train_batch_size * config.actor_rollout_ref.rollout.n
-        assert real_train_batch_size % self.minimal_bsz == 0, (
+        assert real_train_batch_size % minimal_bsz == 0, (
             f"real_train_batch_size ({real_train_batch_size}) must be divisible by minimal possible batch size "
-            f"({self.minimal_bsz})"
+            f"({minimal_bsz})"
         )
 
         # A helper function to check "micro_batch_size" vs "micro_batch_size_per_gpu"
@@ -810,65 +776,48 @@ def init_workers(self):
         1. Ray resource pools from configuration
         2. Worker groups for each role (actor, critic, etc.)
         """
-        self._init_resource_pools()
-        self._create_worker_classes()
-        self._init_worker_groups()
-        self._init_models()
-        self._init_async_rollout_manager()
-
-    def _init_resource_pools(self):
         self.resource_pool_manager.create_resource_pool()
-        self.resource_pool_to_cls = {pool: {} for pool in self.resource_pool_manager.resource_pool_dict.values()}
 
-    def _create_worker_classes(self):
-        self._create_actor_rollout_classes()
-        self._create_critic_class()
-        self._create_reference_policy_class()
-        self._create_reward_model_class()
+        self.resource_pool_to_cls = {pool: {} for pool in self.resource_pool_manager.resource_pool_dict.values()}
 
-    def _create_actor_rollout_classes(self):
         # create actor and rollout
         if self.hybrid_engine:
             resource_pool = self.resource_pool_manager.get_resource_pool(Role.ActorRollout)
             actor_rollout_cls = RayClassWithInitArgs(
                 cls=self.role_worker_mapping[Role.ActorRollout],
                 config=self.config.actor_rollout_ref,
-                role=str(Role.ActorRollout),
+                role="actor_rollout",
                 profile_option=self.config.trainer.npu_profile.options,
             )
-            self.resource_pool_to_cls[resource_pool][str(Role.ActorRollout)] = actor_rollout_cls
+            self.resource_pool_to_cls[resource_pool]["actor_rollout"] = actor_rollout_cls
         else:
             raise NotImplementedError
 
-    def _create_critic_class(self):
         # create critic
         if self.use_critic:
             resource_pool = self.resource_pool_manager.get_resource_pool(Role.Critic)
             critic_cfg = omega_conf_to_dataclass(self.config.critic)
             critic_cls = RayClassWithInitArgs(cls=self.role_worker_mapping[Role.Critic], config=critic_cfg)
-            self.resource_pool_to_cls[resource_pool][str(Role.Critic)] = critic_cls
+            self.resource_pool_to_cls[resource_pool]["critic"] = critic_cls
 
-    def _create_reference_policy_class(self):
         # create reference policy if needed
         if self.use_reference_policy:
             resource_pool = self.resource_pool_manager.get_resource_pool(Role.RefPolicy)
             ref_policy_cls = RayClassWithInitArgs(
                 self.role_worker_mapping[Role.RefPolicy],
                 config=self.config.actor_rollout_ref,
-                role=str(Role.RefPolicy),
+                role="ref",
                 profile_option=self.config.trainer.npu_profile.options,
             )
-            self.resource_pool_to_cls[resource_pool][str(Role.RefPolicy)] = ref_policy_cls
+            self.resource_pool_to_cls[resource_pool]["ref"] = ref_policy_cls
 
-    def _create_reward_model_class(self):
         # create a reward model if reward_fn is None
         if self.use_rm:
             # we create a RM here
             resource_pool = self.resource_pool_manager.get_resource_pool(Role.RewardModel)
             rm_cls = RayClassWithInitArgs(self.role_worker_mapping[Role.RewardModel], config=self.config.reward_model)
-            self.resource_pool_to_cls[resource_pool][str(Role.RewardModel)] = rm_cls
+            self.resource_pool_to_cls[resource_pool]["rm"] = rm_cls
 
-    def _init_worker_groups(self):
         # initialize WorkerGroup
         # NOTE: if you want to use a different resource pool for each role, which can support different parallel size,
         # you should not use `create_colocated_worker_cls`.
@@ -897,26 +846,23 @@ def _init_worker_groups(self):
             )
             spawn_wg = wg_dict.spawn(prefix_set=class_dict.keys())
             all_wg.update(spawn_wg)
-        self.all_wg = all_wg
 
-    def _init_models(self):
         if self.use_critic:
-            self.critic_wg = self.all_wg[str(Role.Critic)]
+            self.critic_wg = all_wg["critic"]
             self.critic_wg.init_model()
 
         if self.use_reference_policy and not self.ref_in_actor:
-            self.ref_policy_wg = self.all_wg[str(Role.RefPolicy)]
+            self.ref_policy_wg = all_wg["ref"]
             self.ref_policy_wg.init_model()
 
         if self.use_rm:
-            self.rm_wg = self.all_wg[str(Role.RewardModel)]
+            self.rm_wg = all_wg["rm"]
             self.rm_wg.init_model()
 
         # we should create rollout at the end so that vllm can have a better estimation of kv cache memory
-        self.actor_rollout_wg = self.all_wg[str(Role.ActorRollout)]
+        self.actor_rollout_wg = all_wg["actor_rollout"]
         self.actor_rollout_wg.init_model()
 
-    def _init_async_rollout_manager(self):
         # create async rollout manager and request scheduler
         self.async_rollout_mode = False
         if self.config.actor_rollout_ref.rollout.mode == "async":
@@ -1043,29 +989,27 @@ def _load_checkpoint(self):
         else:
             print(f"Warning: No dataloader state found at {dataloader_local_path}, will start from scratch")
 
-    def _start_profiling(self, do_profile: bool, timing_raw) -> None:
+    def _start_profiling(self, do_profile: bool) -> None:
         """Start profiling for all worker groups if profiling is enabled."""
-        with marked_timer("start_profile", timing_raw):
-            if do_profile:
-                self.actor_rollout_wg.start_profile(role="e2e", profile_step=self.global_steps)
-                if self.use_reference_policy:
-                    self.ref_policy_wg.start_profile()
-                if self.use_critic:
-                    self.critic_wg.start_profile()
-                if self.use_rm:
-                    self.rm_wg.start_profile()
-
-    def _stop_profiling(self, do_profile: bool, timing_raw) -> None:
+        if do_profile:
+            self.actor_rollout_wg.start_profile(role="e2e", profile_step=self.global_steps)
+            if self.use_reference_policy:
+                self.ref_policy_wg.start_profile()
+            if self.use_critic:
+                self.critic_wg.start_profile()
+            if self.use_rm:
+                self.rm_wg.start_profile()
+
+    def _stop_profiling(self, do_profile: bool) -> None:
         """Stop profiling for all worker groups if profiling is enabled."""
-        with marked_timer("stop_profile", timing_raw):
-            if do_profile:
-                self.actor_rollout_wg.stop_profile()
-                if self.use_reference_policy:
-                    self.ref_policy_wg.stop_profile()
-                if self.use_critic:
-                    self.critic_wg.stop_profile()
-                if self.use_rm:
-                    self.rm_wg.stop_profile()
+        if do_profile:
+            self.actor_rollout_wg.stop_profile()
+            if self.use_reference_policy:
+                self.ref_policy_wg.stop_profile()
+            if self.use_critic:
+                self.critic_wg.stop_profile()
+            if self.use_rm:
+                self.rm_wg.stop_profile()
 
     def _balance_batch(self, batch: DataProto, metrics, logging_prefix="global_seqlen"):
         """Reorder the data on single controller such that each dp rank gets similar total tokens"""
@@ -1135,9 +1079,35 @@ def fit(self):
                     if self.config.trainer.profile_steps is not None
                     else False
                 )
-                self._start_profiling(do_profile, timing_raw)
+                with marked_timer("start_profile", timing_raw):
+                    self._start_profiling(do_profile)
+
+                batch: DataProto = DataProto.from_single_dict(batch_dict)
+
+                # pop those keys for generation
+                batch_keys_to_pop = ["input_ids", "attention_mask", "position_ids"]
+                non_tensor_batch_keys_to_pop = ["raw_prompt_ids"]
+                if "multi_modal_data" in batch.non_tensor_batch:
+                    non_tensor_batch_keys_to_pop.append("multi_modal_data")
+                if "raw_prompt" in batch.non_tensor_batch:
+                    non_tensor_batch_keys_to_pop.append("raw_prompt")
+                if "tools_kwargs" in batch.non_tensor_batch:
+                    non_tensor_batch_keys_to_pop.append("tools_kwargs")
+                if "interaction_kwargs" in batch.non_tensor_batch:
+                    non_tensor_batch_keys_to_pop.append("interaction_kwargs")
+                if "index" in batch.non_tensor_batch:
+                    non_tensor_batch_keys_to_pop.append("index")
+                if "agent_name" in batch.non_tensor_batch:
+                    non_tensor_batch_keys_to_pop.append("agent_name")
+
+                gen_batch = batch.pop(
+                    batch_keys=batch_keys_to_pop,
+                    non_tensor_batch_keys=non_tensor_batch_keys_to_pop,
+                )
 
-                batch, gen_batch = self._prepare_generate_batch(batch_dict)
+                # pass global_steps to trace
+                gen_batch.meta_info["global_steps"] = self.global_steps
+                gen_batch = gen_batch.repeat(repeat_times=self.config.actor_rollout_ref.rollout.n, interleave=True)
 
                 is_last_step = self.global_steps >= self.total_training_steps
 
@@ -1169,15 +1139,216 @@ def fit(self):
 
                             del gen_baseline_batch, gen_baseline_output
 
-                    batch = self._post_generate_batch(batch, gen_batch_output, metrics)
-                    batch, reward_extra_infos_dict = self._process_batch_common(batch, metrics, timing_raw)
-                    self._log_rollout(batch, reward_extra_infos_dict, timing_raw)
-                    last_val_metrics = self._validate_metrics(is_last_step, last_val_metrics, metrics, timing_raw)
-                    self._check_save_checkpoint(is_last_step, timing_raw)
+                    batch.non_tensor_batch["uid"] = np.array(
+                        [str(uuid.uuid4()) for _ in range(len(batch.batch))], dtype=object
+                    )
+                    # repeat to align with repeated responses in rollout
+                    batch = batch.repeat(repeat_times=self.config.actor_rollout_ref.rollout.n, interleave=True)
+                    batch = batch.union(gen_batch_output)
+
+                    if "response_mask" not in batch.batch.keys():
+                        batch.batch["response_mask"] = compute_response_mask(batch)
+                    # Balance the number of valid tokens across DP ranks.
+                    # NOTE: This usually changes the order of data in the `batch`,
+                    # which won't affect the advantage calculation (since it's based on uid),
+                    # but might affect the loss calculation (due to the change of mini-batching).
+                    # TODO: Decouple the DP balancing and mini-batching.
+                    if self.config.trainer.balance_batch:
+                        self._balance_batch(batch, metrics=metrics)
+
+                    # compute global_valid tokens
+                    batch.meta_info["global_token_num"] = torch.sum(batch.batch["attention_mask"], dim=-1).tolist()
+
+                    with marked_timer("reward", timing_raw, color="yellow"):
+                        # compute reward model score
+                        if self.use_rm:
+                            reward_tensor = self.rm_wg.compute_rm_score(batch)
+                            batch = batch.union(reward_tensor)
+
+                        if self.config.reward_model.launch_reward_fn_async:
+                            future_reward = compute_reward_async.remote(data=batch, reward_fn=self.reward_fn)
+                        else:
+                            reward_tensor, reward_extra_infos_dict = compute_reward(batch, self.reward_fn)
+
+                    # recompute old_log_probs
+                    with marked_timer("old_log_prob", timing_raw, color="blue"):
+                        old_log_prob = self.actor_rollout_wg.compute_log_prob(batch)
+                        entropys = old_log_prob.batch["entropys"]
+                        response_masks = batch.batch["response_mask"]
+                        loss_agg_mode = self.config.actor_rollout_ref.actor.loss_agg_mode
+                        entropy_agg = agg_loss(loss_mat=entropys, loss_mask=response_masks, loss_agg_mode=loss_agg_mode)
+                        old_log_prob_metrics = {"actor/entropy": entropy_agg.detach().item()}
+                        metrics.update(old_log_prob_metrics)
+                        old_log_prob.batch.pop("entropys")
+                        batch = batch.union(old_log_prob)
+
+                        if "rollout_log_probs" in batch.batch.keys():
+                            # TODO: we may want to add diff of probs too.
+                            rollout_old_log_probs = batch.batch["rollout_log_probs"]
+                            actor_old_log_probs = batch.batch["old_log_probs"]
+                            attention_mask = batch.batch["attention_mask"]
+                            responses = batch.batch["responses"]
+                            response_length = responses.size(1)
+                            response_mask = attention_mask[:, -response_length:]
+
+                            rollout_probs = torch.exp(rollout_old_log_probs)
+                            actor_probs = torch.exp(actor_old_log_probs)
+                            rollout_probs_diff = torch.abs(rollout_probs - actor_probs)
+                            rollout_probs_diff = torch.masked_select(rollout_probs_diff, response_mask.bool())
+                            rollout_probs_diff_max = torch.max(rollout_probs_diff)
+                            rollout_probs_diff_mean = torch.mean(rollout_probs_diff)
+                            rollout_probs_diff_std = torch.std(rollout_probs_diff)
+                            metrics.update(
+                                {
+                                    "training/rollout_probs_diff_max": rollout_probs_diff_max.detach().item(),
+                                    "training/rollout_probs_diff_mean": rollout_probs_diff_mean.detach().item(),
+                                    "training/rollout_probs_diff_std": rollout_probs_diff_std.detach().item(),
+                                }
+                            )
+
+                    if self.use_reference_policy:
+                        # compute reference log_prob
+                        with marked_timer("ref", timing_raw, color="olive"):
+                            if not self.ref_in_actor:
+                                ref_log_prob = self.ref_policy_wg.compute_ref_log_prob(batch)
+                            else:
+                                ref_log_prob = self.actor_rollout_wg.compute_ref_log_prob(batch)
+                            batch = batch.union(ref_log_prob)
+
+                    # compute values
+                    if self.use_critic:
+                        with marked_timer("values", timing_raw, color="cyan"):
+                            values = self.critic_wg.compute_values(batch)
+                            batch = batch.union(values)
+
+                    with marked_timer("adv", timing_raw, color="brown"):
+                        # we combine with rule-based rm
+                        reward_extra_infos_dict: dict[str, list]
+                        if self.config.reward_model.launch_reward_fn_async:
+                            reward_tensor, reward_extra_infos_dict = ray.get(future_reward)
+                        batch.batch["token_level_scores"] = reward_tensor
+
+                        if reward_extra_infos_dict:
+                            batch.non_tensor_batch.update({k: np.array(v) for k, v in reward_extra_infos_dict.items()})
+
+                        # compute rewards. apply_kl_penalty if available
+                        if self.config.algorithm.use_kl_in_reward:
+                            batch, kl_metrics = apply_kl_penalty(
+                                batch, kl_ctrl=self.kl_ctrl_in_reward, kl_penalty=self.config.algorithm.kl_penalty
+                            )
+                            metrics.update(kl_metrics)
+                        else:
+                            batch.batch["token_level_rewards"] = batch.batch["token_level_scores"]
+
+                        # compute advantages, executed on the driver process
+
+                        norm_adv_by_std_in_grpo = self.config.algorithm.get(
+                            "norm_adv_by_std_in_grpo", True
+                        )  # GRPO adv normalization factor
+
+                        batch = compute_advantage(
+                            batch,
+                            adv_estimator=self.config.algorithm.adv_estimator,
+                            gamma=self.config.algorithm.gamma,
+                            lam=self.config.algorithm.lam,
+                            num_repeat=self.config.actor_rollout_ref.rollout.n,
+                            norm_adv_by_std_in_grpo=norm_adv_by_std_in_grpo,
+                            config=self.config.algorithm,
+                        )
+
+                    # update critic
+                    if self.use_critic:
+                        with marked_timer("update_critic", timing_raw, color="pink"):
+                            critic_output = self.critic_wg.update_critic(batch)
+                        critic_output_metrics = reduce_metrics(critic_output.meta_info["metrics"])
+                        metrics.update(critic_output_metrics)
+
+                    # implement critic warmup
+                    if self.config.trainer.critic_warmup <= self.global_steps:
+                        # update actor
+                        with marked_timer("update_actor", timing_raw, color="red"):
+                            batch.meta_info["multi_turn"] = self.config.actor_rollout_ref.rollout.multi_turn.enable
+                            actor_output = self.actor_rollout_wg.update_actor(batch)
+                        actor_output_metrics = reduce_metrics(actor_output.meta_info["metrics"])
+                        metrics.update(actor_output_metrics)
+
+                    # Log rollout generations if enabled
+                    rollout_data_dir = self.config.trainer.get("rollout_data_dir", None)
+                    if rollout_data_dir:
+                        with marked_timer("dump_rollout_generations", timing_raw, color="green"):
+                            inputs = self.tokenizer.batch_decode(batch.batch["prompts"], skip_special_tokens=True)
+                            outputs = self.tokenizer.batch_decode(batch.batch["responses"], skip_special_tokens=True)
+                            scores = batch.batch["token_level_scores"].sum(-1).cpu().tolist()
+                            if "request_id" in batch.non_tensor_batch:
+                                reward_extra_infos_dict.setdefault(
+                                    "request_id",
+                                    batch.non_tensor_batch["request_id"].tolist(),
+                                )
+                            self._dump_generations(
+                                inputs=inputs,
+                                outputs=outputs,
+                                scores=scores,
+                                reward_extra_infos_dict=reward_extra_infos_dict,
+                                dump_path=rollout_data_dir,
+                            )
+
+                    # validate
+                    if (
+                        self.val_reward_fn is not None
+                        and self.config.trainer.test_freq > 0
+                        and (is_last_step or self.global_steps % self.config.trainer.test_freq == 0)
+                    ):
+                        with marked_timer("testing", timing_raw, color="green"):
+                            val_metrics: dict = self._validate()
+                            if is_last_step:
+                                last_val_metrics = val_metrics
+                        metrics.update(val_metrics)
+
+                    # Check if the ESI (Elastic Server Instance)/training plan is close to expiration.
+                    esi_close_to_expiration = should_save_ckpt_esi(
+                        max_steps_duration=self.max_steps_duration,
+                        redundant_time=self.config.trainer.esi_redundant_time,
+                    )
+                    # Check if the conditions for saving a checkpoint are met.
+                    # The conditions include a mandatory condition (1) and
+                    # one of the following optional conditions (2/3/4):
+                    # 1. The save frequency is set to a positive value.
+                    # 2. It's the last training step.
+                    # 3. The current step number is a multiple of the save frequency.
+                    # 4. The ESI(Elastic Server Instance)/training plan is close to expiration.
+                    if self.config.trainer.save_freq > 0 and (
+                        is_last_step
+                        or self.global_steps % self.config.trainer.save_freq == 0
+                        or esi_close_to_expiration
+                    ):
+                        if esi_close_to_expiration:
+                            print("Force saving checkpoint: ESI instance expiration approaching.")
+                        with marked_timer("save_checkpoint", timing_raw, color="green"):
+                            self._save_checkpoint()
+
+                with marked_timer("stop_profile", timing_raw):
+                    self._stop_profiling(do_profile)
+
+                steps_duration = timing_raw["step"]
+                self.max_steps_duration = max(self.max_steps_duration, steps_duration)
+
+                # training metrics
+                metrics.update(
+                    {
+                        "training/global_step": self.global_steps,
+                        "training/epoch": epoch,
+                    }
+                )
+                # collect metrics
+                metrics.update(compute_data_metrics(batch=batch, use_critic=self.use_critic))
+                metrics.update(compute_timing_metrics(batch=batch, timing_raw=timing_raw))
+                # TODO: implement actual tflpo and theoretical tflpo
+                n_gpus = self.resource_pool_manager.get_n_gpus()
+                metrics.update(compute_throughout_metrics(batch=batch, timing_raw=timing_raw, n_gpus=n_gpus))
 
-                self._stop_profiling(do_profile, timing_raw)
-                self._collect_metrics(batch, epoch, metrics, timing_raw)
-                self._post_batch_processing(batch)
+                # this is experimental and may be changed/removed in the future in favor of a general-purpose one
+                if isinstance(self.train_dataloader.sampler, AbstractCurriculumSampler):
+                    self.train_dataloader.sampler.update(batch=batch)
 
                 # TODO: make a canonical logger that supports various backend
                 logger.log(data=metrics, step=self.global_steps)
@@ -1190,245 +1361,8 @@ def fit(self):
                     progress_bar.close()
                     return
 
-    def _prepare_generate_batch(self, batch_dict):
-        batch: DataProto = DataProto.from_single_dict(batch_dict)
-        # pop those keys for generation
-        batch_keys_to_pop = ["input_ids", "attention_mask", "position_ids"]
-        non_tensor_batch_keys_to_pop = ["raw_prompt_ids"]
-        if "multi_modal_data" in batch.non_tensor_batch:
-            non_tensor_batch_keys_to_pop.append("multi_modal_data")
-        if "raw_prompt" in batch.non_tensor_batch:
-            non_tensor_batch_keys_to_pop.append("raw_prompt")
-        if "tools_kwargs" in batch.non_tensor_batch:
-            non_tensor_batch_keys_to_pop.append("tools_kwargs")
-        if "interaction_kwargs" in batch.non_tensor_batch:
-            non_tensor_batch_keys_to_pop.append("interaction_kwargs")
-        if "index" in batch.non_tensor_batch:
-            non_tensor_batch_keys_to_pop.append("index")
-        if "agent_name" in batch.non_tensor_batch:
-            non_tensor_batch_keys_to_pop.append("agent_name")
-        gen_batch = batch.pop(
-            batch_keys=batch_keys_to_pop,
-            non_tensor_batch_keys=non_tensor_batch_keys_to_pop,
-        )
-        # pass global_steps to trace
-        gen_batch.meta_info["global_steps"] = self.global_steps
-        gen_batch = gen_batch.repeat(repeat_times=self.config.actor_rollout_ref.rollout.n, interleave=True)
-        return batch, gen_batch
-
-    def _post_generate_batch(self, batch, gen_batch_output, metrics):
-        batch.non_tensor_batch["uid"] = np.array([str(uuid.uuid4()) for _ in range(len(batch.batch))], dtype=object)
-        # repeat to align with repeated responses in rollout
-        batch = batch.repeat(repeat_times=self.config.actor_rollout_ref.rollout.n, interleave=True)
-        batch = batch.union(gen_batch_output)
-        if "response_mask" not in batch.batch.keys():
-            batch.batch["response_mask"] = compute_response_mask(batch)
-        # Balance the number of valid tokens across DP ranks.
-        # NOTE: This usually changes the order of data in the `batch`,
-        # which won't affect the advantage calculation (since it's based on uid),
-        # but might affect the loss calculation (due to the change of mini-batching).
-        # TODO: Decouple the DP balancing and mini-batching.
-        if self.config.trainer.balance_batch:
-            self._balance_batch(batch, metrics=metrics)
-        # compute global_valid tokens
-        batch.meta_info["global_token_num"] = torch.sum(batch.batch["attention_mask"], dim=-1).tolist()
-        return batch
-
-    def _process_batch_common(self, batch, metrics, timing_raw):
-        with marked_timer("reward", timing_raw, color="yellow"):
-            # compute reward model score
-            if self.use_rm:
-                reward_tensor = self.rm_wg.compute_rm_score(batch)
-                batch = batch.union(reward_tensor)
-
-            if self.config.reward_model.launch_reward_fn_async:
-                future_reward = compute_reward_async.remote(data=batch, reward_fn=self.reward_fn)
-            else:
-                reward_tensor, reward_extra_infos_dict = compute_reward(batch, self.reward_fn)
-        # recompute old_log_probs
-        with marked_timer("old_log_prob", timing_raw, color="blue"):
-            async_training = self.config.get("async_training", None)
-            if async_training and async_training.use_rollout_log_probs:
-                batch.batch["old_log_probs"] = batch.batch["rollout_log_probs"]
-                batch.meta_info["temperature"] = self.config.actor_rollout_ref.rollout.temperature
-
-            else:
-                old_log_prob = self.actor_rollout_wg.compute_log_prob(batch)
-                entropys = old_log_prob.batch["entropys"]
-                response_masks = batch.batch["response_mask"]
-                loss_agg_mode = self.config.actor_rollout_ref.actor.loss_agg_mode
-                entropy_agg = agg_loss(loss_mat=entropys, loss_mask=response_masks, loss_agg_mode=loss_agg_mode)
-                old_log_prob_metrics = {"actor/entropy": entropy_agg.detach().item()}
-                metrics.update(old_log_prob_metrics)
-                old_log_prob.batch.pop("entropys")
-                batch = batch.union(old_log_prob)
-
-                if "rollout_log_probs" in batch.batch.keys():
-                    # TODO: we may want to add diff of probs too.
-                    rollout_old_log_probs = batch.batch["rollout_log_probs"]
-                    actor_old_log_probs = batch.batch["old_log_probs"]
-                    attention_mask = batch.batch["attention_mask"]
-                    responses = batch.batch["responses"]
-                    response_length = responses.size(1)
-                    response_mask = attention_mask[:, -response_length:]
-
-                    rollout_probs = torch.exp(rollout_old_log_probs)
-                    actor_probs = torch.exp(actor_old_log_probs)
-                    rollout_probs_diff = torch.abs(rollout_probs - actor_probs)
-                    rollout_probs_diff = torch.masked_select(rollout_probs_diff, response_mask.bool())
-                    rollout_probs_diff_max = torch.max(rollout_probs_diff)
-                    rollout_probs_diff_mean = torch.mean(rollout_probs_diff)
-                    rollout_probs_diff_std = torch.std(rollout_probs_diff)
-                    metrics.update(
-                        {
-                            "training/rollout_probs_diff_max": rollout_probs_diff_max.detach().item(),
-                            "training/rollout_probs_diff_mean": rollout_probs_diff_mean.detach().item(),
-                            "training/rollout_probs_diff_std": rollout_probs_diff_std.detach().item(),
-                        }
-                    )
-
-        if self.use_reference_policy:
-            # compute reference log_prob
-            with marked_timer("ref", timing_raw, color="olive"):
-                if not self.ref_in_actor:
-                    ref_log_prob = self.ref_policy_wg.compute_ref_log_prob(batch)
-                else:
-                    ref_log_prob = self.actor_rollout_wg.compute_ref_log_prob(batch)
-                batch = batch.union(ref_log_prob)
-        # compute values
-        if self.use_critic:
-            with marked_timer("values", timing_raw, color="cyan"):
-                values = self.critic_wg.compute_values(batch)
-                batch = batch.union(values)
-        with marked_timer("adv", timing_raw, color="brown"):
-            # we combine with rule-based rm
-            reward_extra_infos_dict: dict[str, list]
-            if self.config.reward_model.launch_reward_fn_async:
-                reward_tensor, reward_extra_infos_dict = ray.get(future_reward)
-            batch.batch["token_level_scores"] = reward_tensor
-
-            if reward_extra_infos_dict:
-                batch.non_tensor_batch.update({k: np.array(v) for k, v in reward_extra_infos_dict.items()})
-
-            # compute rewards. apply_kl_penalty if available
-            if self.config.algorithm.use_kl_in_reward:
-                batch, kl_metrics = apply_kl_penalty(
-                    batch, kl_ctrl=self.kl_ctrl_in_reward, kl_penalty=self.config.algorithm.kl_penalty
-                )
-                metrics.update(kl_metrics)
-            else:
-                batch.batch["token_level_rewards"] = batch.batch["token_level_scores"]
-
-            # compute advantages, executed on the driver process
-
-            norm_adv_by_std_in_grpo = self.config.algorithm.get(
-                "norm_adv_by_std_in_grpo", True
-            )  # GRPO adv normalization factor
-
-            batch = compute_advantage(
-                batch,
-                adv_estimator=self.config.algorithm.adv_estimator,
-                gamma=self.config.algorithm.gamma,
-                lam=self.config.algorithm.lam,
-                num_repeat=self.config.actor_rollout_ref.rollout.n,
-                norm_adv_by_std_in_grpo=norm_adv_by_std_in_grpo,
-                config=self.config.algorithm,
-            )
-        # update critic
-        if self.use_critic:
-            with marked_timer("update_critic", timing_raw, color="pink"):
-                critic_output = self.critic_wg.update_critic(batch)
-            critic_output_metrics = reduce_metrics(critic_output.meta_info["metrics"])
-            metrics.update(critic_output_metrics)
-        # implement critic warmup
-        if self.config.trainer.critic_warmup <= self.global_steps:
-            # update actor
-            with marked_timer("update_actor", timing_raw, color="red"):
-                batch.meta_info["multi_turn"] = self.config.actor_rollout_ref.rollout.multi_turn.enable
-                actor_output = self.actor_rollout_wg.update_actor(batch)
-            actor_output_metrics = reduce_metrics(actor_output.meta_info["metrics"])
-            metrics.update(actor_output_metrics)
-        return batch, reward_extra_infos_dict
-
-    def _log_rollout(self, batch, reward_extra_infos_dict, timing_raw):
-        """Log rollout generations if enabled"""
-        rollout_data_dir = self.config.trainer.get("rollout_data_dir", None)
-        if rollout_data_dir:
-            with marked_timer("dump_rollout_generations", timing_raw, color="green"):
-                inputs = self.tokenizer.batch_decode(batch.batch["prompts"], skip_special_tokens=True)
-                outputs = self.tokenizer.batch_decode(batch.batch["responses"], skip_special_tokens=True)
-                scores = batch.batch["token_level_scores"].sum(-1).cpu().tolist()
-                if "request_id" in batch.non_tensor_batch:
-                    reward_extra_infos_dict.setdefault(
-                        "request_id",
-                        batch.non_tensor_batch["request_id"].tolist(),
-                    )
-                self._dump_generations(
-                    inputs=inputs,
-                    outputs=outputs,
-                    scores=scores,
-                    reward_extra_infos_dict=reward_extra_infos_dict,
-                    dump_path=rollout_data_dir,
-                )
-
-    def _validate_metrics(self, is_last_step, last_val_metrics, metrics, timing_raw):
-        if (
-            self.val_reward_fn is not None
-            and self.config.trainer.test_freq > 0
-            and (is_last_step or self.global_steps % self.config.trainer.test_freq == 0)
-        ):
-            with marked_timer("testing", timing_raw, color="green"):
-                val_metrics: dict = self._validate()
-                if is_last_step:
-                    last_val_metrics = val_metrics
-            metrics.update(val_metrics)
-        return last_val_metrics
-
-    def _check_save_checkpoint(self, is_last_step, timing_raw):
-        # Check if the ESI (Elastic Server Instance)/training plan is close to expiration.
-        esi_close_to_expiration = should_save_ckpt_esi(
-            max_steps_duration=self.max_steps_duration,
-            redundant_time=self.config.trainer.esi_redundant_time,
-        )
-        # Check if the conditions for saving a checkpoint are met.
-        # The conditions include a mandatory condition (1) and
-        # one of the following optional conditions (2/3/4):
-        # 1. The save frequency is set to a positive value.
-        # 2. It's the last training step.
-        # 3. The current step number is a multiple of the save frequency.
-        # 4. The ESI(Elastic Server Instance)/training plan is close to expiration.
-        if self.config.trainer.save_freq > 0 and (
-            is_last_step or self.global_steps % self.config.trainer.save_freq == 0 or esi_close_to_expiration
-        ):
-            if esi_close_to_expiration:
-                print("Force saving checkpoint: ESI instance expiration approaching.")
-            with marked_timer("save_checkpoint", timing_raw, color="green"):
-                self._save_checkpoint()
-
-    def _collect_metrics(self, batch, epoch, metrics, timing_raw):
-        steps_duration = timing_raw["step"]
-        self.max_steps_duration = max(self.max_steps_duration, steps_duration)
-        # training metrics
-        metrics.update(
-            {
-                "training/global_step": self.global_steps,
-                "training/epoch": epoch,
-            }
-        )
-        # collect metrics
-        metrics.update(compute_data_metrics(batch=batch, use_critic=self.use_critic))
-        metrics.update(compute_timing_metrics(batch=batch, timing_raw=timing_raw))
-        # TODO: implement actual tflpo and theoretical tflpo
-        n_gpus = self.resource_pool_manager.get_n_gpus()
-        metrics.update(compute_throughout_metrics(batch=batch, timing_raw=timing_raw, n_gpus=n_gpus))
-
-    def _post_batch_processing(self, batch: DataProto):
-        # this is experimental and may be changed/removed in the future in favor of a general-purpose one
-        if isinstance(self.train_dataloader.sampler, AbstractCurriculumSampler):
-            self.train_dataloader.sampler.update(batch=batch)
-
-        # this is experimental and may be changed/removed in the future
-        # in favor of a general-purpose data buffer pool
-        if hasattr(self.train_dataset, "on_batch_end"):
-            # The dataset may be changed after each training batch
-            self.train_dataset.on_batch_end(batch=batch)
+                # this is experimental and may be changed/removed in the future
+                # in favor of a general-purpose data buffer pool
+                if hasattr(self.train_dataset, "on_batch_end"):
+                    # The dataset may be changed after each training batch
+                    self.train_dataset.on_batch_end(batch=batch)
\ No newline at end of file
diff --git a/verl/workers/rollout/vllm_rollout/vllm_async_server.py b/verl/workers/rollout/vllm_rollout/vllm_async_server.py
index 4826ebaa1d0..5125ab41f8b 100644
--- a/verl/workers/rollout/vllm_rollout/vllm_async_server.py
+++ b/verl/workers/rollout/vllm_rollout/vllm_async_server.py
@@ -11,11 +11,10 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import asyncio
 import logging
 import os
 import pickle
-from typing import Any, Callable, Optional, Sequence
+from typing import Any, Callable, Optional
 
 import ray
 import zmq
@@ -207,12 +206,6 @@ def __init__(self, config: DictConfig, vllm_dp_size: int, vllm_dp_rank: int, wg_
         self.wg_prefix = wg_prefix
         self.engine: AsyncLLM = None
 
-        # for cancel LLMServer
-        self.paused = False
-        self.lock = asyncio.Lock()
-        self.cancel_event: dict[str, asyncio.Event] = {}
-        self.req_output: dict[str, Optional[RequestOutput]] = {}
-
     async def init_engine(self):
         """Init vLLM AsyncLLM engine."""
         config = self.config
@@ -334,60 +327,6 @@ async def generate(self, prompt_ids: list[int], sampling_params: dict[str, Any],
 
         return final_res.outputs[0].token_ids
 
-    async def _generate_step(self, prompt_ids: list[int], sampling_params: dict[str, Any], request_id: str):
-        max_tokens = self.max_model_len - len(prompt_ids)
-        sampling_params = SamplingParams(max_tokens=max_tokens, logprobs=1, **sampling_params)
-        prompt = TokensPrompt(prompt_token_ids=prompt_ids)
-        generator = self.engine.generate(prompt=prompt, sampling_params=sampling_params, request_id=request_id)
-
-        # Get final response
-        self.req_output[request_id]: Optional[RequestOutput] = None
-        async for output in generator:
-            self.req_output[request_id] = output
-        assert self.req_output[request_id] is not None
-
-    async def generate_for_partial(
-        self, prompt_ids: list[int], sampling_params: dict[str, Any], request_id: str
-    ) -> tuple[list[Any], list[Any], bool] | tuple[Sequence[int], list[float], Any]:
-        # 设置中断标志
-        async with self.lock:
-            if self.paused:
-                # cancel 后， 所有任务直接返回，等待下次提交
-                return [], [], True
-            self.cancel_event[request_id] = asyncio.Event()
-            cancel_handle = asyncio.create_task(self.cancel_event[request_id].wait())
-            generation_handle = asyncio.create_task(self._generate_step(prompt_ids, sampling_params, request_id))
-
-        done, pend = await asyncio.wait([generation_handle, cancel_handle], return_when=asyncio.FIRST_COMPLETED)
-
-        for task in done:
-            await task
-
-        for task in pend:
-            task.cancel()
-
-        async with self.lock:
-            token_ids = self.req_output[request_id].outputs[0].token_ids
-            log_probs: list[float] = []
-            for i, x in enumerate(self.req_output[request_id].outputs[0].logprobs):
-                # sampling_params 中 logprobs 设置为1，应该返回1个, 但是实测会有多个，取token_id所对应的log_prob
-                token_id = self.req_output[request_id].outputs[0].token_ids[i]
-                log_probs.append(x[token_id].logprob)
-            is_cancel = generation_handle not in done
-            self.cancel_event.pop(request_id, None)
-            self.req_output.pop(request_id, None)
-        return token_ids, log_probs, is_cancel
-
-    async def cancel(self):
-        async with self.lock:
-            self.paused = True
-            for request_id in self.cancel_event:
-                self.cancel_event[request_id].set()
-
-    async def resume(self):
-        async with self.lock:
-            self.paused = False
-
     async def wake_up(self):
         if self.config.rollout.free_cache_engine:
             await self.engine.wake_up()
@@ -396,4 +335,4 @@ async def sleep(self):
         # TODO: https://github.com/vllm-project/vllm/issues/17103
         await self.engine.reset_prefix_cache()
         if self.config.rollout.free_cache_engine:
-            await self.engine.sleep()
+            await self.engine.sleep()
\ No newline at end of file

From 6cf1da1101c8269fadae3416c5f60455b0e4cd57 Mon Sep 17 00:00:00 2001
From: wangshulin02 <953550366@qq.com>
Date: Mon, 15 Sep 2025 19:57:46 +0800
Subject: [PATCH 133/182] ruff format

---
 recipe/fully_async_policy/agent_loop/__init__.py          | 2 +-
 recipe/fully_async_policy/agent_loop/agent_loop.py        | 6 ++++--
 recipe/fully_async_policy/agent_loop/vllm_async_server.py | 2 --
 recipe/fully_async_policy/fully_async_main.py             | 2 +-
 verl/experimental/agent_loop/__init__.py                  | 2 +-
 verl/experimental/agent_loop/agent_loop.py                | 2 +-
 verl/trainer/main_ppo.py                                  | 2 +-
 verl/trainer/ppo/ray_trainer.py                           | 2 +-
 verl/workers/rollout/vllm_rollout/vllm_async_server.py    | 2 +-
 9 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/recipe/fully_async_policy/agent_loop/__init__.py b/recipe/fully_async_policy/agent_loop/__init__.py
index 7e583cb220d..0796a0c3f5e 100644
--- a/recipe/fully_async_policy/agent_loop/__init__.py
+++ b/recipe/fully_async_policy/agent_loop/__init__.py
@@ -18,4 +18,4 @@
 
 _ = [SingleTurnAgentLoop, PartialSingleTurnAgentLoop]
 
-__all__ = ["AgentLoopBase", "AgentLoopManager"]
\ No newline at end of file
+__all__ = ["AgentLoopBase", "AgentLoopManager"]
diff --git a/recipe/fully_async_policy/agent_loop/agent_loop.py b/recipe/fully_async_policy/agent_loop/agent_loop.py
index 4e6c9ff9285..4f4496c8999 100644
--- a/recipe/fully_async_policy/agent_loop/agent_loop.py
+++ b/recipe/fully_async_policy/agent_loop/agent_loop.py
@@ -670,8 +670,9 @@ async def resume_async(self):
         await asyncio.gather(*[asyncio.wrap_future(future.future()) for future in futures], return_exceptions=True)
 
 
-
 from verl.workers.rollout.async_server import AsyncServerBase
+
+
 def async_server_class(
     rollout_backend: str, rollout_backend_module: Optional[str] = None, rollout_backend_class: Optional[str] = None
 ) -> type[AsyncServerBase]:
@@ -692,6 +693,7 @@ def async_server_class(
 
         if rollout_backend == "vllm":
             from recipe.fully_async_policy.agent_loop.vllm_async_server import AsyncvLLMServer
+
             return AsyncvLLMServer
         else:
             raise NotImplementedError(f"rollout backend {rollout_backend} is not supported")
@@ -701,4 +703,4 @@ def async_server_class(
 
     from verl.utils.import_utils import load_extern_type
 
-    return load_extern_type(rollout_backend_module, rollout_backend_class)
\ No newline at end of file
+    return load_extern_type(rollout_backend_module, rollout_backend_class)
diff --git a/recipe/fully_async_policy/agent_loop/vllm_async_server.py b/recipe/fully_async_policy/agent_loop/vllm_async_server.py
index 03fc28c8549..4826ebaa1d0 100644
--- a/recipe/fully_async_policy/agent_loop/vllm_async_server.py
+++ b/recipe/fully_async_policy/agent_loop/vllm_async_server.py
@@ -397,5 +397,3 @@ async def sleep(self):
         await self.engine.reset_prefix_cache()
         if self.config.rollout.free_cache_engine:
             await self.engine.sleep()
-
-
diff --git a/recipe/fully_async_policy/fully_async_main.py b/recipe/fully_async_policy/fully_async_main.py
index 699222f350a..79bdc4114db 100644
--- a/recipe/fully_async_policy/fully_async_main.py
+++ b/recipe/fully_async_policy/fully_async_main.py
@@ -194,7 +194,7 @@ def _initialize_components(self, config) -> None:
         print(f"total_train_steps {total_train_steps}")
         ray.get(self.components["trainer"].set_total_train_steps.remote(total_train_steps))
 
-        # max_queue_size 
+        # max_queue_size
         max_queue_size = ray.get(self.components["rollouter"].get_max_queue_size.remote())
         print(f"[ASYNC MAIN] Creating MessageQueue... max_queue_size {max_queue_size}")
         message_queue = MessageQueue.remote(config, max_queue_size)
diff --git a/verl/experimental/agent_loop/__init__.py b/verl/experimental/agent_loop/__init__.py
index c6f58f83c83..a39171db764 100644
--- a/verl/experimental/agent_loop/__init__.py
+++ b/verl/experimental/agent_loop/__init__.py
@@ -18,4 +18,4 @@
 
 _ = [SingleTurnAgentLoop, ToolAgentLoop]
 
-__all__ = ["AgentLoopBase", "AgentLoopManager"]
\ No newline at end of file
+__all__ = ["AgentLoopBase", "AgentLoopManager"]
diff --git a/verl/experimental/agent_loop/agent_loop.py b/verl/experimental/agent_loop/agent_loop.py
index 4639229a3b0..ef86381020b 100644
--- a/verl/experimental/agent_loop/agent_loop.py
+++ b/verl/experimental/agent_loop/agent_loop.py
@@ -540,4 +540,4 @@ def wake_up(self):
 
     def sleep(self):
         """Sleep all rollout server instances."""
-        ray.get([server.sleep.remote() for server in self.async_llm_servers])
\ No newline at end of file
+        ray.get([server.sleep.remote() for server in self.async_llm_servers])
diff --git a/verl/trainer/main_ppo.py b/verl/trainer/main_ppo.py
index 8d2b811c733..a9ea554687a 100644
--- a/verl/trainer/main_ppo.py
+++ b/verl/trainer/main_ppo.py
@@ -334,4 +334,4 @@ def create_rl_sampler(data_config, dataset):
 
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()
diff --git a/verl/trainer/ppo/ray_trainer.py b/verl/trainer/ppo/ray_trainer.py
index 05281ebe3f9..6a82a4bcf2b 100644
--- a/verl/trainer/ppo/ray_trainer.py
+++ b/verl/trainer/ppo/ray_trainer.py
@@ -1365,4 +1365,4 @@ def fit(self):
                 # in favor of a general-purpose data buffer pool
                 if hasattr(self.train_dataset, "on_batch_end"):
                     # The dataset may be changed after each training batch
-                    self.train_dataset.on_batch_end(batch=batch)
\ No newline at end of file
+                    self.train_dataset.on_batch_end(batch=batch)
diff --git a/verl/workers/rollout/vllm_rollout/vllm_async_server.py b/verl/workers/rollout/vllm_rollout/vllm_async_server.py
index 5125ab41f8b..988dac407d7 100644
--- a/verl/workers/rollout/vllm_rollout/vllm_async_server.py
+++ b/verl/workers/rollout/vllm_rollout/vllm_async_server.py
@@ -335,4 +335,4 @@ async def sleep(self):
         # TODO: https://github.com/vllm-project/vllm/issues/17103
         await self.engine.reset_prefix_cache()
         if self.config.rollout.free_cache_engine:
-            await self.engine.sleep()
\ No newline at end of file
+            await self.engine.sleep()

From aa370b4cea5dfa76bd9a5fb3c751b216b6000da1 Mon Sep 17 00:00:00 2001
From: wangshulin02 <953550366@qq.com>
Date: Mon, 15 Sep 2025 20:33:38 +0800
Subject: [PATCH 134/182] add anomaly detection and exit

---
 recipe/fully_async_policy/fully_async_main.py | 34 +++++++++++++++----
 1 file changed, 28 insertions(+), 6 deletions(-)

diff --git a/recipe/fully_async_policy/fully_async_main.py b/recipe/fully_async_policy/fully_async_main.py
index 79bdc4114db..f41ab2df826 100644
--- a/recipe/fully_async_policy/fully_async_main.py
+++ b/recipe/fully_async_policy/fully_async_main.py
@@ -267,15 +267,37 @@ def _create_trainer(self, config) -> None:
     def _run_training_loop(self):
         self.running = True
 
-        print("[ASYNC MAIN] Starting Rollouter in background...")
+        print("[ASYNC MAIN] Starting Rollouter and Trainer...")
         rollouter_future = self.components["rollouter"].fit.remote()
         trainer_future = self.components["trainer"].fit.remote()
 
-        ray.get(rollouter_future)
-        ray.get(trainer_future)
-
-        self.components["message_queue_client"].clear_queue()
-        print("[ASYNC MAIN] Training completed or interrupted")
+        futures = [rollouter_future, trainer_future]
+
+        try:
+            while futures:
+                # Use ray.wait to monitor all futures and return when any one is completed.
+                done_futures, remaining_futures = ray.wait(futures, num_returns=1, timeout=None)
+
+                for future in done_futures:
+                    try:
+                        ray.get(future)
+                        print(f"[ASYNC MAIN] One component completed successfully")
+                    except Exception as e:
+                        print(f"[ASYNC MAIN] Component failed with error: {e}")
+                        for remaining_future in remaining_futures:
+                            ray.cancel(remaining_future)
+                        raise e
+
+                futures = remaining_futures
+
+        except Exception as e:
+            print(f"[ASYNC MAIN] Training failed: {e}")
+            for future in futures:
+                ray.cancel(future)
+            raise
+        finally:
+            self.components["message_queue_client"].clear_queue()
+            print("[ASYNC MAIN] Training completed or interrupted")
 
 
 @hydra.main(config_path="config", config_name="fully_async_ppo_trainer", version_base=None)

From 0a2763dea59f8cbb1e0566eb0d7a1bfc9346048c Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Tue, 16 Sep 2025 11:13:15 +0800
Subject: [PATCH 135/182] qwen3-32b-96-32

---
 .../fsdp2_fully-async_96-32/run.sh            | 153 ++++++++++++++++++
 .../fsdp2_fully-async_96-32/runtime_env.yaml  |   4 +
 2 files changed, 157 insertions(+)
 create mode 100644 recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_96-32/run.sh
 create mode 100644 recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_96-32/runtime_env.yaml

diff --git a/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_96-32/run.sh b/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_96-32/run.sh
new file mode 100644
index 00000000000..827e9a30e41
--- /dev/null
+++ b/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_96-32/run.sh
@@ -0,0 +1,153 @@
+#!/usr/bin/env bash
+set -xeuo pipefail
+
+project_name='DAPO'
+exp_name='dapo_qwen2-32B_20k_fsdp2_fully-async_96-32-tps1'
+
+# Paths
+MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/.friday/models/Qwen2.5-32B
+CKPTS_DIR=./ckpts/${project_name}/${exp_name}
+TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet
+TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet
+
+rollout_mode="async"
+rollout_name="vllm" # sglang or vllm
+if [ "$rollout_mode" = "async" ]; then
+    export VLLM_USE_V1=1
+    return_raw_chat="True"
+fi
+
+# Algorithm parameters
+adv_estimator=grpo
+
+use_kl_in_reward=False
+kl_coef=0.0
+use_kl_loss=False
+kl_loss_coef=0.0
+
+clip_ratio_low=0.2
+clip_ratio_high=0.28
+
+# Response length parameters
+max_prompt_length=$((1024 * 2))
+max_response_length=$((1024 * 20))
+enable_overlong_buffer=True
+overlong_buffer_len=$((1024 * 4))
+overlong_penalty_factor=1.0
+
+# Training parameters
+loss_agg_mode="token-mean"
+
+# Algorithm
+temperature=1.0
+top_p=1.0
+top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
+val_top_p=0.7
+
+# Performance Related Parameter
+use_dynamic_bsz=True
+actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
+infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
+ref_offload=True
+actor_offload=False
+gen_tp=4
+sp_size=8
+fsdp_size=-1
+
+# Fully async specific parameters
+NNODES_ROLLOUT=${NNODES_ROLLOUT:-12}
+NNODES_TRAIN=${NNODES_TRAIN:-4}
+NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
+
+train_prompt_bsz=0
+gen_prompt_bsz=1
+n_resp_per_prompt=16
+train_prompt_mini_bsz=128
+total_rollout_steps=$(((512*400)))
+test_freq=20
+staleness_threshold=0.1
+trigger_parameter_sync_step=2
+partial_rollout=True
+
+python -m recipe.fully_async_policy.fully_async_main \
+    data.train_files="${TRAIN_FILE}" \
+    data.val_files="${TEST_FILE}" \
+    data.prompt_key=prompt \
+    data.truncation='left' \
+    data.max_prompt_length=${max_prompt_length} \
+    data.max_response_length=${max_response_length} \
+    data.train_batch_size=${train_prompt_bsz} \
+    data.gen_batch_size=${gen_prompt_bsz} \
+    data.return_raw_chat=${return_raw_chat} \
+    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
+    algorithm.adv_estimator=${adv_estimator} \
+    algorithm.use_kl_in_reward=${use_kl_in_reward} \
+    algorithm.kl_ctrl.kl_coef=${kl_coef} \
+    actor_rollout_ref.actor.strategy=fsdp2 \
+    critic.strategy=fsdp2 \
+    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
+    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
+    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
+    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
+    actor_rollout_ref.actor.clip_ratio_c=10.0 \
+    actor_rollout_ref.model.use_remove_padding=True \
+    actor_rollout_ref.hybrid_engine=False \
+    +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \
+    actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
+    actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+    actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+    actor_rollout_ref.model.path="${MODEL_PATH}" \
+    actor_rollout_ref.actor.optim.lr=1e-6 \
+    actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
+    actor_rollout_ref.actor.optim.weight_decay=0.1 \
+    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
+    actor_rollout_ref.actor.fsdp_config.param_offload=${actor_offload} \
+    actor_rollout_ref.actor.fsdp_config.optimizer_offload=${actor_offload} \
+    actor_rollout_ref.actor.entropy_coeff=0 \
+    actor_rollout_ref.actor.grad_clip=1.0 \
+    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
+    actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \
+    actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
+    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
+    actor_rollout_ref.rollout.enable_chunked_prefill=True \
+    actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
+    actor_rollout_ref.rollout.temperature=${temperature} \
+    actor_rollout_ref.rollout.top_p=${top_p} \
+    actor_rollout_ref.rollout.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
+    actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
+    actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
+    actor_rollout_ref.rollout.val_kwargs.n=1 \
+    actor_rollout_ref.ref.fsdp_config.param_offload=${ref_offload} \
+    actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \
+    actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \
+    actor_rollout_ref.rollout.name=${rollout_name} \
+    actor_rollout_ref.rollout.mode=${rollout_mode} \
+    actor_rollout_ref.rollout.calculate_log_probs=True \
+    reward_model.reward_manager=dapo \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
+    +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
+    trainer.logger=['console','tensorboard'] \
+    trainer.project_name="${project_name}" \
+    trainer.experiment_name="${exp_name}" \
+    trainer.val_before_train=True \
+    trainer.save_freq=-1 \
+    trainer.default_local_dir="${CKPTS_DIR}" \
+    trainer.resume_mode=auto \
+    trainer.nnodes="${NNODES_TRAIN}" \
+    trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
+    rollout.nnodes="${NNODES_ROLLOUT}" \
+    rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \
+    rollout.total_rollout_steps="${total_rollout_steps}" \
+    rollout.total_epochs=10 \
+    rollout.test_freq="${test_freq}" \
+    async_training.staleness_threshold="${staleness_threshold}" \
+    async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \
+    async_training.partial_rollout="${partial_rollout}"
diff --git a/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_96-32/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_96-32/runtime_env.yaml
new file mode 100644
index 00000000000..be4ab6a6349
--- /dev/null
+++ b/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_96-32/runtime_env.yaml
@@ -0,0 +1,4 @@
+env_vars:
+  VLLM_USE_V1: "1"
+  TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-32B-128/dapo_qwen2-32B_20k_fsdp2_fully-async_96-32-tps1"
+  HYDRA_FULL_ERROR: "1"
\ No newline at end of file

From dd534e09c6ba09e895c6733b7d6bebc3448b13ce Mon Sep 17 00:00:00 2001
From: wangshulin02 <953550366@qq.com>
Date: Tue, 16 Sep 2025 15:16:04 +0800
Subject: [PATCH 136/182] add rollouter&trainer idle time

---
 recipe/fully_async_policy/detach_utils.py     |  8 ++++-
 .../fully_async_rollouter.py                  | 32 ++++++++++++++-----
 .../fully_async_policy/fully_async_trainer.py | 31 ++++++++++--------
 recipe/fully_async_policy/param_sync.py       |  4 +--
 4 files changed, 50 insertions(+), 25 deletions(-)

diff --git a/recipe/fully_async_policy/detach_utils.py b/recipe/fully_async_policy/detach_utils.py
index 75d67ec1ab1..ad12ef69057 100644
--- a/recipe/fully_async_policy/detach_utils.py
+++ b/recipe/fully_async_policy/detach_utils.py
@@ -52,7 +52,7 @@ class RolloutSample:
 @dataclass
 class ValidateMetrics:
     timing_raw: dict[str, Any]
-    metrics: dict[str, Any]
+    metrics: Optional[dict[str, Any]] = None
     global_steps: Optional[int] = None
     param_version: Optional[int] = None
 
@@ -362,14 +362,20 @@ def get_aggregated_metrics(self) -> dict[str, Any]:
     def _special_metrics_aggergate(self, aggregated: dict[str, Any]) -> dict[str, Any]:
         """calculate special metrics"""
 
+        # global_seqlen/minmax_diff
         if "global_seqlen/minmax_diff" in aggregated.keys():
             aggregated["global_seqlen/minmax_diff"] = aggregated["global_seqlen/max"] - aggregated["global_seqlen/min"]
 
+        # perf/throughput
         REQUIRED_PERF_KEYS = {"perf/throughput", "perf/total_num_tokens", "perf/time_per_step"}
         if REQUIRED_PERF_KEYS.issubset(aggregated):
             aggregated["perf/throughput"] = aggregated["perf/total_num_tokens"] / (
                 aggregated["perf/time_per_step"] * self.total_gpus
             )
+        
+        # trainer/idle_ratio
+        if "timing_s/gen" in aggregated.keys() and "timing_s/step" in aggregated.keys():
+           aggregated["trainer/idle_ratio"] = aggregated["timing_s/gen"] / aggregated["timing_s/step"]
 
         return aggregated
 
diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py
index 919314ba1b5..2134e6d0e38 100644
--- a/recipe/fully_async_policy/fully_async_rollouter.py
+++ b/recipe/fully_async_policy/fully_async_rollouter.py
@@ -127,6 +127,8 @@ def __init__(
         self.dropped_stale_samples = 0
         self.processed_sample_count = 0  # 已处理的样本计数
         self.global_steps = 0
+        self.idle_start_time = None
+        self.version_start_time = None
 
         # Concurrency control
         self.paused = False
@@ -203,24 +205,37 @@ async def update_param_version(self, version: int, validate: bool = False, globa
                 + self.cancel_queue.qsize()
                 + await self.message_queue_client.get_queue_size()
             )
+            timing_raw = {}
+            idle_ratio = None
+            if self.idle_start_time is not None and self.version_start_time is not None:
+               rollout_active_time = self.idle_start_time - self.version_start_time
+               rollout_version_time = time.time() - self.version_start_time
+               idle_ratio = 1 - rollout_active_time / rollout_version_time
+               timing_raw["rollouter/active_time"] = rollout_active_time
+               timing_raw["rollouter/version_time"] = rollout_version_time
+               timing_raw["rollouter/idle_ratio"] = idle_ratio
+               self.idle_start_time = None
             print(
                 f"[FullyAsyncRollouter][Public][update_param_version] "
                 f"Parameter version updated from {old_version} to {version} "
                 f",reset staleness_samples to: {self.staleness_samples}"
+                f",idle_ratio: {idle_ratio}"
             )
-            timing_raw = {}
+            val_metrics = None
             if (
                 self.val_reward_fn is not None
                 and self.config.rollout.test_freq > 0
                 and self.current_param_version % self.config.rollout.test_freq == 0
                 and self.current_param_version > 0  # don't test here in the initial parameter sync
             ) or (validate and self.val_reward_fn is not None):
-                with marked_timer("testing", timing_raw, color="green"):
+                with marked_timer("rollouter/validate_time", timing_raw, color="green"):
                     val_metrics: dict = self._validate()
-                data = ValidateMetrics(
-                    timing_raw=timing_raw, metrics=val_metrics, global_steps=global_steps, param_version=version
-                )
-                await self.message_queue_client.put_validate(ray.cloudpickle.dumps(data))
+            data = ValidateMetrics(
+                timing_raw=timing_raw, metrics=val_metrics, global_steps=global_steps, param_version=version
+            )
+            await self.message_queue_client.put_validate(ray.cloudpickle.dumps(data))
+
+            self.version_start_time = time.time()
 
     def _validate_config(self):
         # Validate asynchronous training configuration
@@ -320,6 +335,8 @@ async def _processor_worker(self):
             # self.paused 由 pause() 和 self._should_pause_generation() 负责修改
             if self.paused or await self._should_pause_generation():
                 print("[FullyAsyncRollouter][Processor] 收到暂停信号，等待剩余任务完成...")
+                async with self.lock:
+                    self.paused = True
                 while self.active_tasks:
                     async with self.lock:
                         # 获取锁后，active_tasks 数量会发生变化，需要再次校验
@@ -329,11 +346,10 @@ async def _processor_worker(self):
                             )
                         for task in done_tasks:
                             await task
-                async with self.lock:
-                    self.paused = True
 
                 async with self.lock:
                     while self.paused:
+                        self.idle_start_time = time.time()
                         await self.condition.wait()
 
             # 获取待处理的部分 RolloutSample
diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py
index 0c1501cbf89..66d96c4b09b 100644
--- a/recipe/fully_async_policy/fully_async_trainer.py
+++ b/recipe/fully_async_policy/fully_async_trainer.py
@@ -263,13 +263,14 @@ def fit(self):
 
         # get validate data before training
         if self.config.trainer.val_before_train and self.reward_fn is not None:
-            ray.get(self.param_synchronizer.wait_last_sync.remote())
+            ray.get(self.param_synchronizer.wait_last_valid.remote())
         val_data = self.message_queue_client.get_validate_sync()
         if val_data:
             val_data: ValidateMetrics = ray.cloudpickle.loads(val_data)
-            self.logger.log(data=val_data.metrics, step=val_data.param_version)
-            self.logger.log(data=val_data.timing_raw, step=val_data.param_version)
-            pprint(f"[FullyAsyncTrainer] Initial validation metrics: {val_data.metrics}")
+            if val_data.metrics:
+                self.logger.log(data=val_data.metrics, step=val_data.param_version)
+                pprint(f"[FullyAsyncTrainer] Initial validation metrics: {val_data.metrics}")
+            self.logger.log(data=val_data.timing_raw, step=val_data.param_version) 
 
         # Use queue mode, no need for traditional dataloader iterator
         # Initialize to get the first batch of data
@@ -320,24 +321,26 @@ def fit(self):
             val_data = self.message_queue_client.get_validate_sync()
             if val_data:
                 val_data: ValidateMetrics = ray.cloudpickle.loads(val_data)
-                self.logger.log(data=val_data.metrics, step=val_data.param_version)
+                if val_data.metrics:
+                    self.logger.log(data=val_data.metrics, step=val_data.param_version)
+                    pprint(
+                        f"[FullyAsyncTrainer] parameter version: {val_data.param_version} \
+                        Validation metrics: {val_data.metrics}"
+                    )
                 self.logger.log(data=val_data.timing_raw, step=val_data.param_version)
-                pprint(
-                    f"[FullyAsyncTrainer] parameter version: {val_data.param_version} \
-                      Validation metrics: {val_data.metrics}"
-                )
             self.global_steps += 1
 
         # final parameter sync and validate
-        if val_data is None:
+        if val_data is None or val_data.metrics is None:
             self._trigger_parameter_sync_after_step(validate=True, global_steps=self.global_steps - 1)
-            ray.get(self.param_synchronizer.wait_last_sync.remote())
+            ray.get(self.param_synchronizer.wait_last_valid.remote())
             val_data = self.message_queue_client.get_validate_sync()
             if val_data:
                 val_data: ValidateMetrics = ray.cloudpickle.loads(val_data)
-                self.logger.log(data=val_data.metrics, step=val_data.param_version)
+                if val_data.metrics:
+                    self.logger.log(data=val_data.metrics, step=val_data.param_version)
+                    pprint(f"[FullyAsyncTrainer] Final validation metrics: {val_data.metrics}")
                 self.logger.log(data=val_data.timing_raw, step=val_data.param_version)
-                pprint(f"[FullyAsyncTrainer] Final validation metrics: {val_data.metrics}")
         else:
             pprint(f"[FullyAsyncTrainer] Final validation metrics: {val_data.metrics}")
         self.progress_bar.close()
@@ -364,7 +367,7 @@ def _trigger_parameter_sync_after_step(self, validate: bool = False, global_step
         )
         self.progress_bar.update(1)
         self.metrics_aggregator.reset()
-        ray.get(self.param_synchronizer.wait_last_sync.remote())
+        ray.get(self.param_synchronizer.wait_last_valid.remote())
         ray.get(
             self.param_synchronizer.sync_weights.remote(
                 self.current_param_version, validate=validate, global_steps=global_steps
diff --git a/recipe/fully_async_policy/param_sync.py b/recipe/fully_async_policy/param_sync.py
index 2a58292ff78..55d11d236c0 100644
--- a/recipe/fully_async_policy/param_sync.py
+++ b/recipe/fully_async_policy/param_sync.py
@@ -95,8 +95,8 @@ def sync_weights(self, version, validate=False, global_steps=0):
         self.wait_last_update = self.rollouter.update_param_version.remote(version, validate, global_steps)
         self.wait_last_resume = self.rollouter.resume.remote()
 
-    def wait_last_sync(self):
-        print("[ParameterSynchronizer] waiting last parameter sync and validate...")
+    def wait_last_valid(self):
+        print("[ParameterSynchronizer] waiting last validate...")
         start_time = time.time()
         if self.wait_last_update:
             ray.get(self.wait_last_update)

From 67de99f199493852390026feec3a34fe28344475 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Tue, 16 Sep 2025 15:37:18 +0800
Subject: [PATCH 137/182] refactor code rm megatron code

---
 hzg_test/name_ip.py                           |  21 ++
 .../agent_loop/agent_loop.py                  |   3 +-
 recipe/fully_async_policy/fsdp_workers.py     |  24 +--
 recipe/fully_async_policy/fully_async_main.py |  15 +-
 recipe/fully_async_policy/megatron_workers.py | 200 ------------------
 .../vllm_rollout/__init__.py                  |   0
 .../vllm_async_server.py                      |   0
 tests/special_e2e/run_fully_async_policy.sh   |   6 +-
 8 files changed, 39 insertions(+), 230 deletions(-)
 create mode 100644 hzg_test/name_ip.py
 delete mode 100644 recipe/fully_async_policy/megatron_workers.py
 create mode 100644 recipe/fully_async_policy/vllm_rollout/__init__.py
 rename recipe/fully_async_policy/{agent_loop => vllm_rollout}/vllm_async_server.py (100%)

diff --git a/hzg_test/name_ip.py b/hzg_test/name_ip.py
new file mode 100644
index 00000000000..d47b0890d91
--- /dev/null
+++ b/hzg_test/name_ip.py
@@ -0,0 +1,21 @@
+import ray
+
+# 初始化Ray
+if not ray.is_initialized():
+    ray.init()
+
+# 获取所有节点的信息
+nodes = ray.nodes()
+
+# 打印表头
+print(f"{'机器名':<20} {'IP地址':<15}")
+print("-" * 40)
+
+# 遍历所有节点并打印信息
+for node in nodes:
+    # 节点地址格式通常为 "IP:端口"，我们只需要IP部分
+    ip_address = node["NodeManagerAddress"].split(":")[0]
+    # 机器名（主机名）
+    node_name = node["NodeManagerHostname"]
+
+    print(f"{node_name:<20} {ip_address:<15}")
\ No newline at end of file
diff --git a/recipe/fully_async_policy/agent_loop/agent_loop.py b/recipe/fully_async_policy/agent_loop/agent_loop.py
index 4f4496c8999..4da6b562c4c 100644
--- a/recipe/fully_async_policy/agent_loop/agent_loop.py
+++ b/recipe/fully_async_policy/agent_loop/agent_loop.py
@@ -692,8 +692,7 @@ def async_server_class(
         # importlib.import_module and from ... import ... have subtle differences in ray
 
         if rollout_backend == "vllm":
-            from recipe.fully_async_policy.agent_loop.vllm_async_server import AsyncvLLMServer
-
+            from recipe.fully_async_policy.vllm_rollout.vllm_async_server import AsyncvLLMServer
             return AsyncvLLMServer
         else:
             raise NotImplementedError(f"rollout backend {rollout_backend} is not supported")
diff --git a/recipe/fully_async_policy/fsdp_workers.py b/recipe/fully_async_policy/fsdp_workers.py
index 41fa3a55eec..7a1b59aa64c 100644
--- a/recipe/fully_async_policy/fsdp_workers.py
+++ b/recipe/fully_async_policy/fsdp_workers.py
@@ -99,6 +99,18 @@ def sync_rollout_weights(self):
                 inference_model.load_weights([(key, tensor)])
         get_torch_device().empty_cache()
 
+
+class DetachActorWorker(DetachNcclSync):
+    def _get_actor_params(self):
+        assert self._is_actor
+        params = self.actor_module_fsdp.state_dict()
+        from verl.utils.model import convert_weight_keys
+
+        params = convert_weight_keys(
+            params, getattr(self.actor_module_fsdp, "_fsdp_wrapped_module", self.actor_module_fsdp)
+        )
+        return params
+
     @register(dispatch_mode=Dispatch.ONE_TO_ALL)
     def get_actor_weights_info(self):
         assert self._is_actor
@@ -120,18 +132,6 @@ def get_actor_weights_info(self):
         return ret
 
 
-class DetachActorWorker(DetachNcclSync):
-    def _get_actor_params(self):
-        assert self._is_actor
-        params = self.actor_module_fsdp.state_dict()
-        from verl.utils.model import convert_weight_keys
-
-        params = convert_weight_keys(
-            params, getattr(self.actor_module_fsdp, "_fsdp_wrapped_module", self.actor_module_fsdp)
-        )
-        return params
-
-
 class DetachRolloutWorker(DetachNcclSync):
     def __init__(self, config: DictConfig, role: str):
         Worker.__init__(self)
diff --git a/recipe/fully_async_policy/fully_async_main.py b/recipe/fully_async_policy/fully_async_main.py
index f41ab2df826..c0f156296a2 100644
--- a/recipe/fully_async_policy/fully_async_main.py
+++ b/recipe/fully_async_policy/fully_async_main.py
@@ -90,17 +90,7 @@ def create_role_worker_mapping(config):
 
         ray_worker_group_cls = RayWorkerGroup
 
-    elif config.actor_rollout_ref.actor.strategy == "megatron":
-        assert config.actor_rollout_ref.actor.strategy == config.critic.strategy
-        from recipe.fully_async_policy.megatron_workers import (
-            CriticWorker,
-            DetachActorWorker,
-            DetachAsyncRolloutWorker,
-        )
-        from verl.single_controller.ray.megatron import NVMegatronRayWorkerGroup
-
-        ray_worker_group_cls = NVMegatronRayWorkerGroup
-
+    # TODO megatron support
     else:
         raise NotImplementedError(f"Unsupported strategy: {config.actor_rollout_ref.actor.strategy}")
 
@@ -113,8 +103,7 @@ def create_role_worker_mapping(config):
     if config.reward_model.enable:
         if config.reward_model.strategy == "fsdp2":
             from verl.workers.fsdp_workers import RewardModelWorker
-        elif config.reward_model.strategy == "megatron":
-            from verl.workers.megatron_workers import RewardModelWorker
+        # TODO megatron support
         else:
             raise NotImplementedError(f"Unsupported reward model strategy: {config.reward_model.strategy}")
 
diff --git a/recipe/fully_async_policy/megatron_workers.py b/recipe/fully_async_policy/megatron_workers.py
deleted file mode 100644
index a9318b8f7b3..00000000000
--- a/recipe/fully_async_policy/megatron_workers.py
+++ /dev/null
@@ -1,200 +0,0 @@
-# Copyright 2025 Bytedance Ltd. and/or its affiliates
-# Copyright 2025 Meituan Ltd. and/or its affiliates
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import logging
-import os
-
-import torch
-import torch.distributed
-from omegaconf import DictConfig, OmegaConf
-
-from verl.single_controller.base.decorator import Dispatch, register
-from verl.utils.debug import (
-    log_gpu_memory_usage,
-)
-from verl.utils.device import get_device_name, get_torch_device
-from verl.utils.fs import copy_to_local
-from verl.utils.vllm_utils import patch_vllm_moe_model_weight_loader
-from verl.workers.megatron_workers import (
-    ActorRolloutRefWorker,
-    AsyncActorRolloutRefWorker,
-    CriticWorker,
-)
-
-logger = logging.getLogger(__file__)
-logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN"))
-
-__all__ = ["DetachActorWorker", "DetachRolloutWorker", "DetachAsyncRolloutWorker", "CriticWorker"]
-
-
-class DetachNcclSync(ActorRolloutRefWorker):
-    def _get_actor_params_generator(self):
-        pass
-
-    @register(dispatch_mode=Dispatch.ONE_TO_ALL, blocking=False)
-    def sync_rollout_weights(self):
-        assert (self._is_actor or self._is_rollout) and not self.config.hybrid_engine
-        assert hasattr(self, "_weights_info") and self._weights_info is not None
-
-        params_generator = self._get_actor_params_generator() if self._is_actor else None
-        if self._is_rollout:
-            inference_model = (
-                self.rollout.inference_engine.llm_engine.model_executor.driver_worker.worker.model_runner.model
-            )
-            patch_vllm_moe_model_weight_loader(inference_model)
-        for key, shape, dtype in self._weights_info:
-            if self._is_actor:
-                weight_key, weight = next(params_generator)
-                assert key == weight_key
-                assert shape == weight.size()
-                assert dtype == weight.dtype
-
-            tensor = torch.empty(shape, dtype=dtype, device=get_torch_device().current_device())
-            if self._is_actor and torch.distributed.get_rank() == 0:
-                tensor.copy_(weight)
-            from ray.util.collective import collective
-
-            collective.broadcast(tensor, src_rank=0, group_name="actor_rollout")
-            if self._is_rollout:
-                inference_model.load_weights([(key, tensor)])
-
-    @register(dispatch_mode=Dispatch.ONE_TO_ALL)
-    def get_actor_weights_info(self):
-        assert self._is_actor
-        if hasattr(self, "_weights_info"):
-            return self._weights_info
-
-        params_generator = self._get_actor_params_generator()
-        ret = []
-        for key, tensor in params_generator:
-            ret.append((key, tensor.size(), tensor.dtype))
-
-        self._weights_info = ret
-        return ret
-
-
-class DetachActorWorker(DetachNcclSync):
-    def _get_actor_params_generator(self):
-        assert self._is_actor
-        from verl.models.mcore import get_mcore_weight_converter
-        from verl.utils.megatron_utils import per_tensor_generator
-
-        layer_name_mapping = {
-            "qkv_layer_name": "self_attention.linear_qkv.",
-            "gate_proj_layer_name": "linear_fc1.",
-        }
-        weight_converter = get_mcore_weight_converter(self.actor_model_config, self.dtype)
-        generator = per_tensor_generator(
-            self.actor.actor_module,
-            self.actor_model_config,
-            weight_converter,
-            self.tf_config,
-            layer_name_mapping,
-        )
-        return generator
-
-
-class DetachRolloutWorker(DetachNcclSync):
-    @register(dispatch_mode=Dispatch.ONE_TO_ALL)
-    def init_model(self):
-        if self.config.model.get("external_lib", None) is not None:
-            # This is used to import external_lib into the huggingface systems
-            import importlib
-
-            importlib.import_module(self.config.model.external_lib)
-
-        from verl.utils.torch_dtypes import PrecisionType
-
-        override_model_config = OmegaConf.to_container(OmegaConf.create(self.config.model.get("override_config", {})))
-        override_transformer_config = {}
-        self.param_dtype = torch.bfloat16
-        self.dtype = PrecisionType.to_dtype(self.param_dtype)
-        trust_remote_code = self.config.model.get("trust_remote_code", False)
-
-        from verl.utils.model import get_generation_config
-
-        self._init_hf_config_and_tf_config(
-            self.config.model.path,
-            self.config.model.path,
-            self.dtype,
-            override_model_config,
-            override_transformer_config,
-            trust_remote_code,
-        )
-        self.generation_config = get_generation_config(self.local_path)
-
-        from torch.distributed.device_mesh import init_device_mesh
-
-        assert self.config.rollout.name == "vllm"
-
-        from verl.workers.rollout.vllm_rollout import vLLMRollout
-
-        # NOTE(sgm): If the QKV and gate_up projection layer are concate together in actor,
-        # we will reorganize their weight format when resharding from actor to rollout.
-
-        infer_tp = self.config.rollout.tensor_model_parallel_size
-        dp = self.world_size // infer_tp
-        assert self.world_size % infer_tp == 0, (
-            f"rollout world_size: {self.world_size} is not divisible by infer_tp: {infer_tp}"
-        )
-        rollout_device_mesh = init_device_mesh(
-            get_device_name(), mesh_shape=(dp, infer_tp), mesh_dim_names=["dp", "infer_tp"]
-        )
-        log_gpu_memory_usage("Before building vllm rollout", logger=None)
-
-        local_path = copy_to_local(self.config.model.path, use_shm=self.config.model.get("use_shm", False))
-        from verl.workers.rollout.vllm_rollout import vLLMAsyncRollout
-
-        vllm_rollout_cls = vLLMRollout if self.config.rollout.mode == "sync" else vLLMAsyncRollout
-        rollout = vllm_rollout_cls(
-            model_path=local_path,
-            config=self.config.rollout,
-            tokenizer=self.tokenizer,
-            model_hf_config=self.hf_config,
-            device_mesh=rollout_device_mesh,
-            trust_remote_code=trust_remote_code,
-        )
-        log_gpu_memory_usage("After building vllm rollout", logger=logger)
-
-        from .detach_sharding_manager import DetachShardingManager
-
-        rollout_sharding_manager = DetachShardingManager(
-            inference_engine=rollout.inference_engine, device_mesh=rollout_device_mesh
-        )
-
-        log_gpu_memory_usage("After building sharding manager", logger=logger)
-
-        self.rollout = rollout
-        self.sharding_manager = rollout_sharding_manager
-        self.rollout.sharding_manager = rollout_sharding_manager
-
-    @register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO, blocking=False)
-    def async_generate_sequences(self, *args, **kwargs):
-        return super().generate_sequences(*args, **kwargs)
-
-    @register(dispatch_mode=Dispatch.ONE_TO_ALL)
-    def set_actor_weights_info(self, weights_info):
-        assert self._is_rollout
-        self._weights_info = weights_info
-
-
-class DetachAsyncRolloutWorker(AsyncActorRolloutRefWorker, DetachRolloutWorker):
-    def __init__(self, config: DictConfig, role: str):
-        print(DetachAsyncRolloutWorker.__mro__)
-        DetachRolloutWorker.__init__(self, config, role)
-
-    @register(dispatch_mode=Dispatch.ONE_TO_ALL)
-    def init_model(self):
-        DetachRolloutWorker.init_model(self)
diff --git a/recipe/fully_async_policy/vllm_rollout/__init__.py b/recipe/fully_async_policy/vllm_rollout/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/recipe/fully_async_policy/agent_loop/vllm_async_server.py b/recipe/fully_async_policy/vllm_rollout/vllm_async_server.py
similarity index 100%
rename from recipe/fully_async_policy/agent_loop/vllm_async_server.py
rename to recipe/fully_async_policy/vllm_rollout/vllm_async_server.py
diff --git a/tests/special_e2e/run_fully_async_policy.sh b/tests/special_e2e/run_fully_async_policy.sh
index 4813f159696..2ddc61910ba 100644
--- a/tests/special_e2e/run_fully_async_policy.sh
+++ b/tests/special_e2e/run_fully_async_policy.sh
@@ -49,8 +49,8 @@ top_k=-1
 val_top_p=0.7
 
 # Fully async specific parameters
-n_gpus_rollout=4
-n_gpus_training=$((NUM_GPUS - n_gpus_rollout))
+n_gpus_rollout=1
+n_gpus_training=1
 
 train_prompt_bsz=0
 gen_prompt_bsz=1
@@ -118,7 +118,7 @@ common_params=(
     trainer.logger=['console']
     trainer.project_name='verl-test-fully-async'
     trainer.experiment_name="${exp_name}"
-    trainer.val_before_train=True
+    trainer.val_before_train=False
     trainer.save_freq=-1
     trainer.resume_mode=disable
     trainer.nnodes=1

From a66c4cf252a42a2a5209b1fffca005f2d23a9dbb Mon Sep 17 00:00:00 2001
From: wangshulin02 <953550366@qq.com>
Date: Tue, 16 Sep 2025 15:37:49 +0800
Subject: [PATCH 138/182] set required_samples=ppo_mini_bs & set
 max_concurrent_samples=rollout_dp_size*16

---
 recipe/fully_async_policy/fully_async_rollouter.py |  7 +------
 recipe/fully_async_policy/fully_async_trainer.py   | 10 +---------
 2 files changed, 2 insertions(+), 15 deletions(-)

diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py
index 2134e6d0e38..8fbed0f0b65 100644
--- a/recipe/fully_async_policy/fully_async_rollouter.py
+++ b/recipe/fully_async_policy/fully_async_rollouter.py
@@ -165,12 +165,7 @@ async def set_required_samples(self, required_samples: int):
             )
 
             # 单次最多扔一次更新需要的样本
-            self.max_concurrent_samples = int(
-                self.config.actor_rollout_ref.actor.ppo_mini_batch_size
-                / self.config.actor_rollout_ref.rollout.n
-                * self.async_rollout_manager.rollout_dp_size
-                * 8
-            )
+            self.max_concurrent_samples = self.async_rollout_manager.rollout_dp_size * 16
             self.max_concurrent_samples = min(self.max_concurrent_samples, self.max_required_samples)
             self.max_queue_size = self.max_required_samples
 
diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py
index 66d96c4b09b..0f0c35d7db5 100644
--- a/recipe/fully_async_policy/fully_async_trainer.py
+++ b/recipe/fully_async_policy/fully_async_trainer.py
@@ -113,15 +113,7 @@ def __init__(
         self.trigger_parameter_sync_step = config.async_training.trigger_parameter_sync_step
 
         # calculate required_samples
-        ppo_mini_batch_size = config.actor_rollout_ref.actor.ppo_mini_batch_size
-        rollout_n = config.actor_rollout_ref.rollout.n
-        if ppo_mini_batch_size % rollout_n != 0:
-            raise ValueError(
-                f"PPO mini batch size ({ppo_mini_batch_size}) must be divisible by rollout n ({rollout_n})"
-            )
-        self.required_samples = int(
-            self.minimal_bsz * config.actor_rollout_ref.actor.ppo_mini_batch_size / config.actor_rollout_ref.rollout.n
-        )
+        self.required_samples = config.actor_rollout_ref.actor.ppo_mini_batch_size
         total_gpus = (
             config.trainer.nnodes * config.trainer.n_gpus_per_node
             + config.rollout.nnodes * config.rollout.n_gpus_per_node

From 0ae200ec7b9a86bf77160118471d7d3b9e1dfa3e Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Tue, 16 Sep 2025 15:39:45 +0800
Subject: [PATCH 139/182] rm code

---
 hzg_test/name_ip.py | 21 ---------------------
 1 file changed, 21 deletions(-)
 delete mode 100644 hzg_test/name_ip.py

diff --git a/hzg_test/name_ip.py b/hzg_test/name_ip.py
deleted file mode 100644
index d47b0890d91..00000000000
--- a/hzg_test/name_ip.py
+++ /dev/null
@@ -1,21 +0,0 @@
-import ray
-
-# 初始化Ray
-if not ray.is_initialized():
-    ray.init()
-
-# 获取所有节点的信息
-nodes = ray.nodes()
-
-# 打印表头
-print(f"{'机器名':<20} {'IP地址':<15}")
-print("-" * 40)
-
-# 遍历所有节点并打印信息
-for node in nodes:
-    # 节点地址格式通常为 "IP:端口"，我们只需要IP部分
-    ip_address = node["NodeManagerAddress"].split(":")[0]
-    # 机器名（主机名）
-    node_name = node["NodeManagerHostname"]
-
-    print(f"{node_name:<20} {ip_address:<15}")
\ No newline at end of file

From 9cfacc2bd32ce1ae03556726351b72cdcd61c042 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Tue, 16 Sep 2025 20:03:11 +0800
Subject: [PATCH 140/182] refactor 1

---
 .../fully_async_policy/agent_loop/__init__.py |    7 +-
 .../agent_loop/agent_loop.py                  |  599 +--------
 recipe/fully_async_policy/detach_utils.py     |   83 +-
 recipe/fully_async_policy/fsdp_workers.py     |   30 +-
 recipe/fully_async_policy/fully_async_main.py |    7 +-
 .../fully_async_rollouter.py                  |   11 +-
 .../fully_async_policy/fully_async_trainer.py |   11 +-
 recipe/fully_async_policy/main_ppo.py         |  344 ------
 recipe/fully_async_policy/ray_trainer.py      | 1069 ++---------------
 verl/experimental/agent_loop/__init__.py      |    4 +-
 verl/experimental/agent_loop/agent_loop.py    |   59 +-
 verl/trainer/main_ppo.py                      |   10 +-
 verl/trainer/ppo/ray_trainer.py               |   30 +-
 verl/trainer/ppo/utils.py                     |   31 +
 14 files changed, 337 insertions(+), 1958 deletions(-)
 delete mode 100644 recipe/fully_async_policy/main_ppo.py

diff --git a/recipe/fully_async_policy/agent_loop/__init__.py b/recipe/fully_async_policy/agent_loop/__init__.py
index 0796a0c3f5e..5f059078964 100644
--- a/recipe/fully_async_policy/agent_loop/__init__.py
+++ b/recipe/fully_async_policy/agent_loop/__init__.py
@@ -12,10 +12,5 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from .agent_loop import AgentLoopBase, AgentLoopManager
 from .partial_single_turn_agent_loop import PartialSingleTurnAgentLoop
-from .single_turn_agent_loop import SingleTurnAgentLoop
-
-_ = [SingleTurnAgentLoop, PartialSingleTurnAgentLoop]
-
-__all__ = ["AgentLoopBase", "AgentLoopManager"]
+_ = [PartialSingleTurnAgentLoop]
diff --git a/recipe/fully_async_policy/agent_loop/agent_loop.py b/recipe/fully_async_policy/agent_loop/agent_loop.py
index 4da6b562c4c..38c461629dc 100644
--- a/recipe/fully_async_policy/agent_loop/agent_loop.py
+++ b/recipe/fully_async_policy/agent_loop/agent_loop.py
@@ -12,97 +12,34 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import asyncio
-import heapq
 import logging
 import os
-import random
-from abc import ABC, abstractmethod
 from typing import Any, Optional
 
 import hydra
 import numpy as np
 import ray
 import torch
-from cachetools import LRUCache
 from omegaconf import DictConfig, OmegaConf
-from pydantic import BaseModel
 from tensordict import TensorDict
-from transformers import AutoTokenizer
 
 from verl.protocol import DataProto
 from verl.single_controller.ray.base import RayWorkerGroup
 from verl.utils import hf_tokenizer
 from verl.utils.fs import copy_to_local
-from verl.utils.rollout_trace import RolloutTraceConfig, rollout_trace_attr, rollout_trace_op
+from verl.utils.rollout_trace import RolloutTraceConfig, rollout_trace_attr
+from verl.workers.rollout.replica import TokenOutput
+
+from verl.experimental.agent_loop.agent_loop import AgentLoopOutput, _agent_loop_registry, _DummyConfig
 
 logger = logging.getLogger(__file__)
 logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN"))
 
+from verl.experimental.agent_loop.agent_loop import *
 
-class AsyncLLMServerManager:
-    """
-    A class to manage multiple OpenAI compatible LLM servers. This class provides
-    - Load balance: least requests load balancing
-    - Sticky session: send multi-turn chat completions to same server for automatic prefix caching
-    """
-
-    def __init__(self, config: DictConfig, server_handles: list[ray.actor.ActorHandle], max_cache_size: int = 10000):
-        """Initialize the AsyncLLMServerManager.
-
-        Args:
-            config (DictConfig): YAML config.
-            server_handles (List[ray.actor.ActorHandle]): OpenAI compatible LLM server actor handles.
-            max_cache_size (int, optional): max cache size for request_id to server mapping. Defaults to 10000.
-        """
-        self.config = config
-        self.server_handles = server_handles
-        random.shuffle(self.server_handles)
-
-        # Least requests load balancing
-        self.weighted_serveres = [[0, (hash(server), server)] for server in server_handles]
-        heapq.heapify(self.weighted_serveres)
-
-        # LRU cache to map request_id to server
-        self.request_id_to_server = LRUCache(maxsize=max_cache_size)
-
-    def _choose_server(self, request_id: str) -> ray.actor.ActorHandle:
-        # TODO: implement server pressure awareness load balancing
-        if request_id in self.request_id_to_server:
-            return self.request_id_to_server[request_id]
-
-        server = self.weighted_serveres[0][1][1]
-        self.weighted_serveres[0][0] += 1
-        heapq.heapreplace(self.weighted_serveres, self.weighted_serveres[0])
-        self.request_id_to_server[request_id] = server
-        return server
-
-    @rollout_trace_op
-    async def generate(
-        self,
-        request_id,
-        *,
-        prompt_ids: list[int],
-        sampling_params: dict[str, Any],
-    ) -> list[int]:
-        """Generate tokens from prompt ids.
-
-        Args:
-            request_id (str): request id for sticky session.
-            prompt_ids (List[int]): List of prompt token ids.
-            sampling_params (Dict[str, Any]): Sampling parameters for the chat completion.
-
-        Returns:
-            List[int]: List of generated token ids.
-        """
-        server = self._choose_server(request_id)
-        output = await server.generate.remote(
-            request_id=request_id,
-            prompt_ids=prompt_ids,
-            sampling_params=sampling_params,
-        )
-        return output
 
-    async def generate_for_partial(self, request_id, prompt_ids, sampling_params):
+class PartialAsyncLLMServerManager(AsyncLLMServerManager):
+    async def generate_for_partial(self, request_id, prompt_ids, sampling_params) -> TokenOutput:
         """Generate tokens from prompt ids. with partial rollout function"""
         server = self._choose_server(request_id)
         output = await server.generate_for_partial.remote(
@@ -113,275 +50,25 @@ async def generate_for_partial(self, request_id, prompt_ids, sampling_params):
         return output
 
 
-class AgentLoopMetrics(BaseModel):
-    """Agent loop performance metrics."""
-
-    generate_sequences: float = 0.0
-    tool_calls: float = 0.0
-
-
-class AgentLoopOutput(BaseModel):
+class PartialAgentLoopOutput(AgentLoopOutput):
     """Agent loop output."""
 
-    prompt_ids: list[int]
-    """Prompt token ids."""
-    response_ids: list[int]
-    """Response token ids including LLM generated token, tool response token."""
-    response_mask: list[int]
-    """Response mask, 1 for LLM generated token, 0 for tool response token."""
-    num_turns: int = 0
-    """Number of chat turns, including user, assistant, tool."""
-    metrics: AgentLoopMetrics
-    """Auxiliary performance metrics"""
     is_cancel: bool = False
     """Indicates whether the request was interrupted"""
     log_probs: list[float] = None
     """Response token log probs including LLM generated token, tool response token."""
 
 
-# make hydra.utils.instantiate happy
-class _DummyConfig:
-    def __init__(self, config: DictConfig) -> None:
-        self.config = config
-
-
-class AgentLoopBase(ABC):
-    """An agent loop takes a input message, chat with OpenAI compatible LLM server and interact with various
-    environments."""
-
-    _class_initialized = False
-
+@ray.remote
+class FullyAgentLoopWorker(AgentLoopWorker):
     def __init__(
-        self, trainer_config: _DummyConfig, server_manager: AsyncLLMServerManager, tokenizer: AutoTokenizer, **kwargs
+            self, config: DictConfig, server_handles: list[ray.actor.ActorHandle], rm_executor: BatchExecutor = None
     ):
-        """Initialize agent loop, each sample will have its own loop instance.
-
-        Args:
-            trainer_config (_DummyConfig): trainer config.
-            server_manager (AsyncLLMServerManager): OpenAI compatible LLM server manager.
-            tokenizer (AutoTokenizer): Tokenizer for tokenize messages.
-        """
-        self.init_class(trainer_config.config, tokenizer, **kwargs)
-        self.config = trainer_config.config
-        self.server_manager = server_manager
-        self.tokenizer = tokenizer
-        self.loop = asyncio.get_running_loop()
-
-    @classmethod
-    def init_class(cls, config: DictConfig, tokenizer: AutoTokenizer, **kwargs):
-        """This is used to do heavy initialization work that should shared across all instances. It's only called once.
-
-        Args:
-            config (DictConfig): trainer config.
-            tokenizer (AutoTokenizer): Tokenizer for tokenize messages.
-            **kwargs: extra kwargs from config file passed in by `hydra.utils.instantiate`.
-        """
-        if cls._class_initialized:
-            return
-        cls._class_initialized = True
-
-    @abstractmethod
-    async def run(self, messages: list[dict[str, Any]], sampling_params: dict[str, Any]) -> AgentLoopOutput:
-        """Run agent loop to interact with LLM server and environment.
-
-        Args:
-            messages (List[Dict[str, Any]]): Input messages.
-            sampling_params (Dict[str, Any]): LLM sampling params.
-
-        Returns:
-            AgentLoopOutput: Agent loop output.
-        """
-        raise NotImplementedError
-
-
-"""Agent loop registry: key is agent_name, value is a dict of agent loop config
-used by hydra.utils.instantiate to initialize agent loop instance.
-
-https://hydra.cc/docs/advanced/instantiate_objects/overview/
-"""
-_agent_loop_registry: dict[str, dict] = {}
-
-
-def register(agent_name: str):
-    """Register agent loop class."""
-
-    def decorator(subclass: type[AgentLoopBase]) -> type[AgentLoopBase]:
-        fqdn = f"{subclass.__module__}.{subclass.__qualname__}"
-        _agent_loop_registry[agent_name] = {"_target_": fqdn}
-        return subclass
-
-    return decorator
-
-
-def postprocess_agent_loop_outputs(inputs: list[AgentLoopOutput], tokenizer, config) -> DataProto:
-    """Static method to postprocess a list of AgentLoopOutput into DataProto
-
-    Args:
-        inputs: List of AgentLoopOutput
-        tokenizer: Tokenizer instance
-        config: Configuration object
-
-    Returns:
-        DataProto: Processed batch data
-    """
-    # NOTE: consistent with batch version of generate_sequences in vllm_rollout_spmd.py
-    # prompts: left pad
-    # responses: right pad
-    # input_ids: prompt + response
-    # attention_mask: [0,0,0,0,1,1,1,1, | 1,1,1,0,0,0,0,0]
-    # position_ids:   [0,0,0,0,0,1,2,3, | 4,5,6,7,8,9,10,11]
-
-    # prompts
-    tokenizer.padding_side = "left"
-    outputs = tokenizer.pad(
-        [{"input_ids": input.prompt_ids} for input in inputs],
-        padding="max_length",
-        max_length=config.actor_rollout_ref.rollout.prompt_length,
-        return_tensors="pt",
-        return_attention_mask=True,
-    )
-    prompt_ids, prompt_attention_mask = outputs["input_ids"], outputs["attention_mask"]
-
-    # responses
-    tokenizer.padding_side = "right"
-    outputs = tokenizer.pad(
-        [{"input_ids": input.response_ids} for input in inputs],
-        padding="max_length",
-        max_length=config.actor_rollout_ref.rollout.response_length,
-        return_tensors="pt",
-        return_attention_mask=True,
-    )
-    response_ids, response_attention_mask = outputs["input_ids"], outputs["attention_mask"]
-
-    # response_mask
-    outputs = tokenizer.pad(
-        [{"input_ids": input.response_mask} for input in inputs],
-        padding="max_length",
-        max_length=config.actor_rollout_ref.rollout.response_length,
-        return_tensors="pt",
-        return_attention_mask=False,
-    )
-    response_mask = outputs["input_ids"]
-    assert response_ids.shape == response_mask.shape, (
-        f"mismatch in response_ids and response_mask shape: {response_ids.shape} vs {response_mask.shape}"
-    )
-    response_mask = response_mask * response_attention_mask
-
-    input_ids = torch.cat([prompt_ids, response_ids], dim=1)
-    attention_mask = torch.cat([prompt_attention_mask, response_attention_mask], dim=1)
-    position_ids = (attention_mask.cumsum(dim=1) - 1) * attention_mask
-
-    batch = TensorDict(
-        {
-            "prompts": prompt_ids,  # [bsz, prompt_length]
-            "responses": response_ids,  # [bsz, response_length]
-            "response_mask": response_mask,  # [bsz, response_length]
-            "input_ids": input_ids,  # [bsz, prompt_length + response_length]
-            "attention_mask": attention_mask,  # [bsz, prompt_length + response_length]
-            "position_ids": position_ids,  # [bsz, prompt_length + response_length]
-        },
-        batch_size=len(input_ids),
-    )
-
-    num_turns = np.array([input.num_turns for input in inputs], dtype=np.int32)
-    metrics = [input.metrics.model_dump() for input in inputs]
-    return DataProto(batch=batch, non_tensor_batch={"__num_turns__": num_turns}, meta_info={"metrics": metrics})
-
-
-@ray.remote
-class AgentLoopWorker:
-    """Agent loop worker takes a batch of messages and run each message in an agent loop."""
-
-    def __init__(self, config: DictConfig, server_handles: list[ray.actor.ActorHandle]):
-        """Initialize agent loop manager.
-
-        Args:
-            config (DictConfig): YAML config.
-            server_handles (List[ray.actor.ActorHandle]): OpenAI compatible LLM server actor handles.
-        """
-        self.config = config
-        self.server_manager = AsyncLLMServerManager(config, server_handles)
-
-        model_path = config.actor_rollout_ref.model.path
-        self.model_name = "/".join(model_path.split("/")[-2:])
-        local_path = copy_to_local(config.actor_rollout_ref.model.path)
-        self.tokenizer = hf_tokenizer(local_path, trust_remote_code=True)
-
-        agent_loop_config_path = config.actor_rollout_ref.rollout.agent.agent_loop_config_path
-        if agent_loop_config_path:
-            agent_loop_configs = OmegaConf.load(agent_loop_config_path)
-            for agent_loop_config in agent_loop_configs:
-                _agent_loop_registry[agent_loop_config.name] = agent_loop_config
-
-        trace_config = config.trainer.get("rollout_trace", {})
-        trace_config = self.config.actor_rollout_ref.rollout.get("trace", {})
-        RolloutTraceConfig.init(
-            self.config.trainer.project_name,
-            self.config.trainer.experiment_name,
-            trace_config.get("backend"),
-            trace_config.get("token2text", False),
-        )
-
-    async def generate_sequences(self, batch: DataProto) -> DataProto:
-        """Generate sequences from agent loop.
-
-        Args:
-            batch (DataProto): Input batch.
-
-        Returns:
-            DataProto: Output batch.
-            - prompts: [bsz, prompt_length], prompt token ids from dataset.
-            - responses: [bsz, response_length], output token ids include response tokens
-              from LLM generation and observation tokens from tool_calls.
-            - response_mask: [bsz, response_length], 1 for LLM generated tokens, 0 for observation/padding tokens.
-            - input_ids: [bsz, prompt_length + response_length], whole sequence token ids, including prompt tokens
-              and response tokens.
-            - attention_mask: [bsz, prompt_length + response_length], 0 for padding tokens, 1 for other tokens.
-            - position_ids: [bsz, prompt_length + response_length], incremental position ids.
-
-            For multi-turn conversations:
-            responses:     |<- LLM generation ->|<- tool_calls ->|<- LLM generation ->|<- padding ->|
-            response_mask: | 1, 1, 1, ..., 1, 1 | 0, 0, .., 0, 0 | 1, 1, 1, ..., 1, 1 | 0, 0, ..., 0|
-        """
-        config = self.config.actor_rollout_ref.rollout
-        sampling_params = dict(
-            temperature=config.temperature,
-            top_p=config.top_p,
-            repetition_penalty=1.0,
-        )
-
-        # override sampling params for validation
-        if batch.meta_info.get("validate", False):
-            sampling_params["top_p"] = config.val_kwargs.top_p
-            sampling_params["temperature"] = config.val_kwargs.temperature
-
-        # by default, we assume it's a single turn agent
-        if "agent_name" not in batch.non_tensor_batch:
-            batch.non_tensor_batch["agent_name"] = np.array(["single_turn_agent"] * len(batch), dtype=object)
-
-        tasks = []
-        agent_names = batch.non_tensor_batch["agent_name"]
-        raw_prompts = batch.non_tensor_batch["raw_prompt"]
-        if "index" in batch.non_tensor_batch:
-            index = batch.non_tensor_batch["index"]
-        else:
-            index = np.arange(len(raw_prompts))
-
-        trajectory_info = await get_trajectory_info(
-            batch.meta_info.get("global_steps", -1), index, batch.meta_info.get("validate", False)
-        )
-
-        for agent_name, messages, trajectory in zip(agent_names, raw_prompts, trajectory_info, strict=True):
-            tasks.append(
-                asyncio.create_task(self._run_agent_loop(agent_name, messages.tolist(), sampling_params, trajectory))
-            )
-        outputs = await asyncio.gather(*tasks)
-
-        output = postprocess_agent_loop_outputs(outputs, self.tokenizer, self.config)
-        return output
+        self.AsyncLLMServerManager = PartialAsyncLLMServerManager
+        super().__init__(config, server_handles, rm_executor)
 
     async def generate_sequences_no_post(
-        self, batch: DataProto, partial_output_list: Optional[list[AgentLoopOutput]]
+            self, batch: DataProto, partial_output_list: Optional[list[AgentLoopOutput]]
     ) -> list[AgentLoopOutput]:
         """Generate sequences from agent loop.
 
@@ -403,6 +90,7 @@ async def generate_sequences_no_post(
             temperature=config.temperature,
             top_p=config.top_p,
             repetition_penalty=1.0,
+            logprobs=config.calculate_log_probs,
         )
 
         # override sampling params for validation
@@ -414,9 +102,6 @@ async def generate_sequences_no_post(
         if "agent_name" not in batch.non_tensor_batch:
             batch.non_tensor_batch["agent_name"] = np.array(["single_turn_agent"] * len(batch), dtype=object)
 
-        tasks = []
-        agent_names = batch.non_tensor_batch["agent_name"]
-        raw_prompts = batch.non_tensor_batch["raw_prompt"]
         if "index" in batch.non_tensor_batch:
             index = batch.non_tensor_batch["index"]
         else:
@@ -425,184 +110,61 @@ async def generate_sequences_no_post(
         trajectory_info = await get_trajectory_info(
             batch.meta_info.get("global_steps", -1), index, batch.meta_info.get("validate", False)
         )
+
         if not partial_output_list:
             partial_output_list = [None] * len(batch)
 
-        for agent_name, messages, trajectory, partial_output in zip(
-            agent_names, raw_prompts, trajectory_info, partial_output_list, strict=True
-        ):
+        tasks = []
+        for i in range(len(batch)):
+            kwargs = {k: v[i] for k, v in batch.non_tensor_batch.items()}
             tasks.append(
                 asyncio.create_task(
-                    self._run_agent_loop(agent_name, messages.tolist(), sampling_params, trajectory, partial_output)
-                )
-            )
-        outputs = await asyncio.gather(*tasks)
-
-        return outputs
-
-    async def _run_agent_loop(
-        self,
-        agent_name: str,
-        messages: list[dict[str, Any]],
-        sampling_params: dict[str, Any],
-        trajectory: dict[str, Any],
-        partial_output: Optional[AgentLoopOutput] = None,
+                    self._partial_run_agent_loop(sampling_params,
+                                                 trajectory_info[i],
+                                                 partial_output_list[i],
+                                                 **kwargs)))
+        return await asyncio.gather(*tasks)
+
+    async def _partial_run_agent_loop(
+            self,
+            sampling_params: dict[str, Any],
+            trajectory: dict[str, Any],
+            partial_output: Optional[AgentLoopOutput] = None,
+            *,
+            agent_name: str,
+            **kwargs,
     ) -> AgentLoopOutput:
         with rollout_trace_attr(
-            step=trajectory["step"],
-            sample_index=trajectory["sample_index"],
-            rollout_n=trajectory["rollout_n"],
-            validate=trajectory["validate"],
-            name="agent_loop",
+                step=trajectory["step"],
+                sample_index=trajectory["sample_index"],
+                rollout_n=trajectory["rollout_n"],
+                validate=trajectory["validate"],
+                name="agent_loop",
         ):
             assert agent_name in _agent_loop_registry, (
                 f"Agent loop {agent_name} not registered, registered agent loops: {_agent_loop_registry.keys()}"
             )
+
             agent_loop_config = _agent_loop_registry[agent_name]
             agent_loop = hydra.utils.instantiate(
                 config=agent_loop_config,
                 trainer_config=_DummyConfig(config=self.config),
                 server_manager=self.server_manager,
                 tokenizer=self.tokenizer,
+                processor=self.processor,
             )
-            if agent_name == "partial_single_turn_agent":
-                output = await agent_loop.run(messages, sampling_params, partial_output)
-            else:
-                output = await agent_loop.run(messages, sampling_params)
-            return output
-
-
-async def get_trajectory_info(step, index, validate):
-    """Get trajectory info.
-
-    Args:
-        step (int): global steps in the trainer.
-        index (list): form datastore extra_info.index column.
-        validate (bool): whether is a validate step.
-
-    Returns:
-        list: trajectory.
-    """
-    trajectory_info = []
-    rollout_n = 0
-    for i in range(len(index)):
-        if i > 0 and index[i - 1] == index[i]:
-            rollout_n += 1
-        else:
-            rollout_n = 0
-        trajectory_info.append({"step": step, "sample_index": index[i], "rollout_n": rollout_n, "validate": validate})
-    return trajectory_info
+            return await agent_loop.run(sampling_params, partial_output, **kwargs)
 
 
-class AgentLoopManager:
-    """Agent loop manager that manages a group of agent loop workers."""
-
-    def __init__(self, config: DictConfig, worker_group: RayWorkerGroup):
-        """Initialize agent loop manager.
-
-        Args:
-            config (DictConfig): trainer config.
-            worker_group (RayWorkerGroup): AsyncActorRolloutRefWorker worker group.
-        """
-        self.config = config
-        self.worker_group = worker_group
-
-        self._initialize_llm_servers()
-        self._init_agent_loop_workers()
-
-        # Initially we're in sleep mode.
-        self.sleep()
-
-    def _initialize_llm_servers(self):
-        self.rollout_tp_size = self.config.actor_rollout_ref.rollout.tensor_model_parallel_size
-        self.rollout_dp_size = self.worker_group.world_size // self.rollout_tp_size
-
-        register_center = ray.get_actor(f"{self.worker_group.name_prefix}_register_center")
-        workers_info = ray.get(register_center.get_worker_info.remote())
-        assert len(workers_info) == self.worker_group.world_size
-
-        self.async_llm_servers = [None] * self.rollout_dp_size
-        self.server_addresses = [None] * self.rollout_dp_size
-
-        if self.config.actor_rollout_ref.rollout.agent.custom_async_server:
-            server_class = async_server_class(
-                rollout_backend=self.config.actor_rollout_ref.rollout.name,
-                rollout_backend_module=self.config.actor_rollout_ref.rollout.agent.custom_async_server.path,
-                rollout_backend_class=self.config.actor_rollout_ref.rollout.agent.custom_async_server.name,
-            )
-        else:
-            server_class = async_server_class(rollout_backend=self.config.actor_rollout_ref.rollout.name)
-
-        # Start all server instances, restart if address already in use.
-        unready_dp_ranks = set(range(self.rollout_dp_size))
-        while len(unready_dp_ranks) > 0:
-            servers = {
-                rollout_dp_rank: server_class.options(
-                    # make sure AsyncvLLMServer colocates with its corresponding workers
-                    scheduling_strategy=ray.util.scheduling_strategies.NodeAffinitySchedulingStrategy(
-                        node_id=workers_info[rollout_dp_rank * self.rollout_tp_size],
-                        soft=False,
-                    ),
-                    name=f"async_llm_server_{rollout_dp_rank}",
-                ).remote(self.config, self.rollout_dp_size, rollout_dp_rank, self.worker_group.name_prefix)
-                for rollout_dp_rank in unready_dp_ranks
-            }
-
-            for rollout_dp_rank, server in servers.items():
-                try:
-                    address = ray.get(server.get_server_address.remote())
-                    self.server_addresses[rollout_dp_rank] = address
-                    self.async_llm_servers[rollout_dp_rank] = server
-                    unready_dp_ranks.remove(rollout_dp_rank)
-                except Exception:
-                    ray.kill(server)
-                    print(f"rollout server {rollout_dp_rank} failed, maybe address already in use, restarting...")
-
-        # All server instances are ready, init AsyncLLM engine.
-        ray.get([server.init_engine.remote() for server in self.async_llm_servers])
-
-    def _init_agent_loop_workers(self):
-        self.agent_loop_workers = []
-        for i in range(self.config.actor_rollout_ref.rollout.agent.num_workers):
-            self.agent_loop_workers.append(
-                AgentLoopWorker.options(
-                    name=f"agent_loop_worker_{i}",
-                ).remote(self.config, self.async_llm_servers)
-            )
-
-    def generate_sequences(self, prompts: DataProto) -> DataProto:
-        """Split input batch and dispatch to agent loop workers.
-
-        Args:
-            prompts (DataProto): Input batch.
-
-        Returns:
-            DataProto: Output batch.
-        """
-        if self.config.actor_rollout_ref.rollout.free_cache_engine:
-            self.wake_up()
-        chunkes = prompts.chunk(len(self.agent_loop_workers))
-        outputs = ray.get(
-            [
-                worker.generate_sequences.remote(chunk)
-                for worker, chunk in zip(self.agent_loop_workers, chunkes, strict=True)
-            ]
-        )
-        output = DataProto.concat(outputs)
-        if self.config.actor_rollout_ref.rollout.free_cache_engine:
-            self.sleep()
-
-        # calculate performance metrics
-        metrics = [output.meta_info["metrics"] for output in outputs]  # List[List[Dict[str, str]]]
-        timing = self._performance_metrics(metrics, output)
-
-        output.meta_info = {"timing": timing}
-        return output
+class FullyAgentLoopManager(AgentLoopManager):
+    def __init__(self, config: DictConfig, worker_group: RayWorkerGroup = None, rm_wg: RayWorkerGroup = None):
+        super().__init__(config, worker_group, rm_wg)
+        self.AgentLoopWorker = FullyAgentLoopWorker
 
     async def generate_single_sample_async(
-        self,
-        sample: DataProto,
-        partial_output_list: Optional[list[AgentLoopOutput]],
+            self,
+            sample: DataProto,
+            partial_output_list: Optional[list[AgentLoopOutput]],
     ) -> list[AgentLoopOutput]:
         """
         异步处理单个样本, 需要复制n次
@@ -629,36 +191,6 @@ def _select_best_worker(self):
         self._worker_index = (self._worker_index + 1) % len(self.agent_loop_workers)
         return worker
 
-    def _performance_metrics(self, metrics: list[list[dict[str, str]]], output: DataProto) -> dict[str, float]:
-        timing = {}
-        t_generate_sequences = np.array([metric["generate_sequences"] for chunk in metrics for metric in chunk])
-        t_tool_calls = np.array([metric["tool_calls"] for chunk in metrics for metric in chunk])
-        timing["agent_loop/generate_sequences/min"] = t_generate_sequences.min()
-        timing["agent_loop/generate_sequences/max"] = t_generate_sequences.max()
-        timing["agent_loop/generate_sequences/mean"] = t_generate_sequences.mean()
-        timing["agent_loop/tool_calls/min"] = t_tool_calls.min()
-        timing["agent_loop/tool_calls/max"] = t_tool_calls.max()
-        timing["agent_loop/tool_calls/mean"] = t_tool_calls.mean()
-
-        # batch sequence generation is bounded by the slowest sample
-        slowest = np.argmax(t_generate_sequences + t_tool_calls)
-        attention_mask = output.batch["attention_mask"][slowest]
-        prompt_length = output.batch["prompts"].shape[1]
-        timing["agent_loop/slowest/generate_sequences"] = t_generate_sequences[slowest]
-        timing["agent_loop/slowest/tool_calls"] = t_tool_calls[slowest]
-        timing["agent_loop/slowest/prompt_length"] = attention_mask[:prompt_length].sum().item()
-        timing["agent_loop/slowest/response_length"] = attention_mask[prompt_length:].sum().item()
-
-        return timing
-
-    def wake_up(self):
-        """Wake up all rollout server instances."""
-        ray.get([server.wake_up.remote() for server in self.async_llm_servers])
-
-    def sleep(self):
-        """Sleep all rollout server instances."""
-        ray.get([server.sleep.remote() for server in self.async_llm_servers])
-
     async def cancel_async(self):
         """Cancel all rollout tasks asynchronously."""
         futures = [server.cancel.remote() for server in self.async_llm_servers]
@@ -668,38 +200,3 @@ async def resume_async(self):
         """Cancel all rollout tasks asynchronously."""
         futures = [server.resume.remote() for server in self.async_llm_servers]
         await asyncio.gather(*[asyncio.wrap_future(future.future()) for future in futures], return_exceptions=True)
-
-
-from verl.workers.rollout.async_server import AsyncServerBase
-
-
-def async_server_class(
-    rollout_backend: str, rollout_backend_module: Optional[str] = None, rollout_backend_class: Optional[str] = None
-) -> type[AsyncServerBase]:
-    """Get async server class.
-
-    Args:
-        rollout_backend: str, rollout backend type (alias), should be "vllm".
-        rollout_backend_module: Optional[str], import path of the rollout backend.
-        rollout_backend_class: Optional[str], class name of the rollout backend.
-
-    Returns:
-        Type[AsyncServerBase]: async server class.
-    """
-    if rollout_backend_class is None and rollout_backend_module is None:
-        # If both are None, use the default backend class
-        # Do not change the original import behavior
-        # importlib.import_module and from ... import ... have subtle differences in ray
-
-        if rollout_backend == "vllm":
-            from recipe.fully_async_policy.vllm_rollout.vllm_async_server import AsyncvLLMServer
-            return AsyncvLLMServer
-        else:
-            raise NotImplementedError(f"rollout backend {rollout_backend} is not supported")
-
-    if rollout_backend_module is None or rollout_backend_class is None:
-        raise ValueError("rollout_backend_module and rollout_backend_class must be both provided for customization")
-
-    from verl.utils.import_utils import load_extern_type
-
-    return load_extern_type(rollout_backend_module, rollout_backend_class)
diff --git a/recipe/fully_async_policy/detach_utils.py b/recipe/fully_async_policy/detach_utils.py
index 75d67ec1ab1..69041d923b5 100644
--- a/recipe/fully_async_policy/detach_utils.py
+++ b/recipe/fully_async_policy/detach_utils.py
@@ -18,15 +18,86 @@
 
 import numpy as np
 import torch
+from tensordict import TensorDict
 
 from verl import DataProto
-from recipe.fully_async_policy.agent_loop.agent_loop import postprocess_agent_loop_outputs
+from verl.experimental.agent_loop.agent_loop import AgentLoopOutput
 from verl.trainer.ppo.ray_trainer import compute_response_mask
 
 
-# Calculate the number of samples needed
-def calculate_one_step_size(minimal_bsz, ppo_mini_batch_size):
-    return minimal_bsz * ppo_mini_batch_size
+def postprocess_agent_loop_outputs(inputs: list[AgentLoopOutput], tokenizer, config) -> DataProto:
+    """Static method to postprocess a list of AgentLoopOutput into DataProto
+
+    Args:
+        inputs: List of AgentLoopOutput
+        tokenizer: Tokenizer instance
+        config: Configuration object
+
+    Returns:
+        DataProto: Processed batch data
+    """
+    # NOTE: consistent with batch version of generate_sequences in vllm_rollout_spmd.py
+    # prompts: left pad
+    # responses: right pad
+    # input_ids: prompt + response
+    # attention_mask: [0,0,0,0,1,1,1,1, | 1,1,1,0,0,0,0,0]
+    # position_ids:   [0,0,0,0,0,1,2,3, | 4,5,6,7,8,9,10,11]
+
+    # prompts
+    tokenizer.padding_side = "left"
+    outputs = tokenizer.pad(
+        [{"input_ids": input.prompt_ids} for input in inputs],
+        padding="max_length",
+        max_length=config.actor_rollout_ref.rollout.prompt_length,
+        return_tensors="pt",
+        return_attention_mask=True,
+    )
+    prompt_ids, prompt_attention_mask = outputs["input_ids"], outputs["attention_mask"]
+
+    # responses
+    tokenizer.padding_side = "right"
+    outputs = tokenizer.pad(
+        [{"input_ids": input.response_ids} for input in inputs],
+        padding="max_length",
+        max_length=config.actor_rollout_ref.rollout.response_length,
+        return_tensors="pt",
+        return_attention_mask=True,
+    )
+    response_ids, response_attention_mask = outputs["input_ids"], outputs["attention_mask"]
+
+    # response_mask
+    outputs = tokenizer.pad(
+        [{"input_ids": input.response_mask} for input in inputs],
+        padding="max_length",
+        max_length=config.actor_rollout_ref.rollout.response_length,
+        return_tensors="pt",
+        return_attention_mask=False,
+    )
+    response_mask = outputs["input_ids"]
+    assert response_ids.shape == response_mask.shape, (
+        f"mismatch in response_ids and response_mask shape: {response_ids.shape} vs {response_mask.shape}"
+    )
+    response_mask = response_mask * response_attention_mask
+
+    input_ids = torch.cat([prompt_ids, response_ids], dim=1)
+    attention_mask = torch.cat([prompt_attention_mask, response_attention_mask], dim=1)
+    position_ids = (attention_mask.cumsum(dim=1) - 1) * attention_mask
+
+    batch = TensorDict(
+        {
+            "prompts": prompt_ids,  # [bsz, prompt_length]
+            "responses": response_ids,  # [bsz, response_length]
+            "response_mask": response_mask,  # [bsz, response_length]
+            "input_ids": input_ids,  # [bsz, prompt_length + response_length]
+            "attention_mask": attention_mask,  # [bsz, prompt_length + response_length]
+            "position_ids": position_ids,  # [bsz, prompt_length + response_length]
+        },
+        batch_size=len(input_ids),
+    )
+
+    num_turns = np.array([input.num_turns for input in inputs], dtype=np.int32)
+    metrics = [input.metrics.model_dump() for input in inputs]
+    return DataProto(batch=batch, non_tensor_batch={"__num_turns__": num_turns}, meta_info={"metrics": metrics})
 
 
 @dataclass
@@ -157,7 +228,7 @@ def merge_rollout_sample(config, tokenizer, rs: RolloutSample):
 
 
 def assemble_batch_from_rollout_samples(
-    rollout_samples: list[RolloutSample], tokenizer, config, balance_batch=None
+        rollout_samples: list[RolloutSample], tokenizer, config, balance_batch=None
 ) -> DataProto:
     """
     Assemble gen_batch_output from RolloutSample objects
@@ -368,7 +439,7 @@ def _special_metrics_aggergate(self, aggregated: dict[str, Any]) -> dict[str, An
         REQUIRED_PERF_KEYS = {"perf/throughput", "perf/total_num_tokens", "perf/time_per_step"}
         if REQUIRED_PERF_KEYS.issubset(aggregated):
             aggregated["perf/throughput"] = aggregated["perf/total_num_tokens"] / (
-                aggregated["perf/time_per_step"] * self.total_gpus
+                    aggregated["perf/time_per_step"] * self.total_gpus
             )
 
         return aggregated
diff --git a/recipe/fully_async_policy/fsdp_workers.py b/recipe/fully_async_policy/fsdp_workers.py
index 7a1b59aa64c..82c23bfa04f 100644
--- a/recipe/fully_async_policy/fsdp_workers.py
+++ b/recipe/fully_async_policy/fsdp_workers.py
@@ -38,8 +38,9 @@
 )
 from verl.utils.import_utils import import_external_libs
 from verl.utils.model import get_generation_config, update_model_config
-from verl.utils.vllm_utils import patch_vllm_moe_model_weight_loader
+from verl.workers.config import HFModelConfig, RolloutConfig
 from verl.workers.fsdp_workers import ActorRolloutRefWorker, AsyncActorRolloutRefWorker, CriticWorker
+from verl.workers.rollout import get_rollout_class
 
 logger = logging.getLogger(__file__)
 logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN"))
@@ -82,6 +83,9 @@ def sync_rollout_weights(self):
         params = self._get_actor_params() if self._is_actor else None
         if self._is_rollout:
             inference_model = get_inference_model(self.rollout)
+
+            from verl.utils.vllm.patch import patch_vllm_moe_model_weight_loader
+
             patch_vllm_moe_model_weight_loader(inference_model)
         for key, shape, dtype in self._weights_info:
             tensor = torch.empty(shape, dtype=dtype, device=get_torch_device().current_device())
@@ -207,32 +211,28 @@ def init_model(self):
         rollout_device_mesh = init_device_mesh(
             device_name, mesh_shape=(dp, infer_tp), mesh_dim_names=["dp", "infer_tp"]
         )
+
+        is_collect = rollout_device_mesh["infer_tp"].get_local_rank() == 0
+        self._register_dispatch_collect_info(
+            "rollout", dp_rank=rollout_device_mesh["dp"].get_local_rank(), is_collect=is_collect
+        )
+
         rollout_name = self.config.rollout.name
         assert rollout_name == "vllm"
 
-        from verl.workers.rollout.vllm_rollout import vLLMRollout
+        rollout_config: RolloutConfig = omega_conf_to_dataclass(self.config.rollout)
+        model_config: HFModelConfig = omega_conf_to_dataclass(self.config.model, dataclass_type=HFModelConfig)
 
         log_gpu_memory_usage(f"Before building {rollout_name} rollout", logger=logger)
-
-        from verl.workers.rollout.vllm_rollout import vLLMAsyncRollout
-
-        vllm_rollout_cls = vLLMRollout if self.config.rollout.mode == "sync" else vLLMAsyncRollout
-        rollout = vllm_rollout_cls(
-            model_path=local_path,
-            config=self.config.rollout,
-            tokenizer=self.tokenizer,
-            model_hf_config=actor_model_config,
-            device_mesh=rollout_device_mesh,
-            trust_remote_code=trust_remote_code,
+        rollout = get_rollout_class(rollout_config.name, rollout_config.mode)(
+            config=rollout_config, model_config=model_config, device_mesh=rollout_device_mesh
         )
         log_gpu_memory_usage(f"After building {rollout_name} rollout", logger=logger)
-
         from .detach_sharding_manager import DetachShardingManager
 
         sharding_manager = DetachShardingManager(
             inference_engine=rollout.inference_engine, device_mesh=rollout_device_mesh
         )
-
         log_gpu_memory_usage("After building sharding manager", logger=logger)
 
         self.rollout = rollout
diff --git a/recipe/fully_async_policy/fully_async_main.py b/recipe/fully_async_policy/fully_async_main.py
index c0f156296a2..b98b3f426e0 100644
--- a/recipe/fully_async_policy/fully_async_main.py
+++ b/recipe/fully_async_policy/fully_async_main.py
@@ -24,8 +24,9 @@
 from recipe.fully_async_policy.fully_async_rollouter import FullyAsyncRollouter
 from recipe.fully_async_policy.fully_async_trainer import FullyAsyncTrainer
 from recipe.fully_async_policy.message_queue import MessageQueue, MessageQueueClient
-from recipe.fully_async_policy.ray_trainer import ResourcePoolManager, Role
+from verl.trainer.ppo.ray_trainer import ResourcePoolManager
 from verl.trainer.ppo.reward import load_reward_manager
+from verl.trainer.ppo.utils import Role
 from verl.utils.fs import copy_to_local
 
 
@@ -270,7 +271,7 @@ def _run_training_loop(self):
                 for future in done_futures:
                     try:
                         ray.get(future)
-                        print(f"[ASYNC MAIN] One component completed successfully")
+                        print("[ASYNC MAIN] One component completed successfully")
                     except Exception as e:
                         print(f"[ASYNC MAIN] Component failed with error: {e}")
                         for remaining_future in remaining_futures:
@@ -291,7 +292,7 @@ def _run_training_loop(self):
 
 @hydra.main(config_path="config", config_name="fully_async_ppo_trainer", version_base=None)
 def main(config):
-    from recipe.fully_async_policy.main_ppo import run_ppo
+    from verl.trainer.main_ppo import run_ppo
 
     # Ensure async training config exists
     if not hasattr(config, "async_training"):
diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py
index 919314ba1b5..e53e6c43ef5 100644
--- a/recipe/fully_async_policy/fully_async_rollouter.py
+++ b/recipe/fully_async_policy/fully_async_rollouter.py
@@ -24,14 +24,16 @@
     prepare_single_generation_data,
 )
 from recipe.fully_async_policy.message_queue import MessageQueueClient
+from recipe.fully_async_policy.ray_trainer import FullyAsyncRayPPOTrainer
 from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup
-from recipe.fully_async_policy.ray_trainer import RayPPOTrainer, ResourcePoolManager, Role, WorkerType
+from verl.trainer.ppo.ray_trainer import ResourcePoolManager
+from verl.trainer.ppo.utils import Role, WorkerType
 from verl.utils.profiler import marked_timer
 from verl.utils.tracking import ValidationGenerationsLogger
 
 
 @ray.remote(num_cpus=10, max_concurrency=100)
-class FullyAsyncRollouter(RayPPOTrainer):
+class FullyAsyncRollouter(FullyAsyncRayPPOTrainer):
     """
     Asynchronous sample generator, responsible for continuously generating training samples
     and putting them into MessageQueue
@@ -227,7 +229,6 @@ def _validate_config(self):
         if not hasattr(self.config, "async_training"):
             raise ValueError("[FullyAsyncRollouter] Missing async_training configuration")
         assert self.config.actor_rollout_ref.rollout.calculate_log_probs, "must rollout calculate log_probs"
-        super()._validate_config()
 
     def _create_actor_rollout_classes(self):
         # only create rollout
@@ -257,10 +258,10 @@ def _create_continuous_iterator(self):
     def _init_async_rollout_manager(self):
         # create async rollout manager and request scheduler
         assert self.config.actor_rollout_ref.rollout.mode == "async"
-        from recipe.fully_async_policy.agent_loop import AgentLoopManager
+        from recipe.fully_async_policy.agent_loop import FullyAgentLoopManager
 
         self.async_rollout_mode = True
-        self.async_rollout_manager = AgentLoopManager(
+        self.async_rollout_manager = FullyAgentLoopManager(
             config=self.config,
             worker_group=self.rollout_wg,
         )
diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py
index 0c1501cbf89..4cba527c857 100644
--- a/recipe/fully_async_policy/fully_async_trainer.py
+++ b/recipe/fully_async_policy/fully_async_trainer.py
@@ -28,20 +28,17 @@
     assemble_batch_from_rollout_samples,
 )
 from recipe.fully_async_policy.message_queue import MessageQueueClient
+from recipe.fully_async_policy.ray_trainer import FullyAsyncRayPPOTrainer
 from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup
 from verl.trainer.ppo import core_algos
 from verl.trainer.ppo.core_algos import AdvantageEstimator
-from recipe.fully_async_policy.ray_trainer import (
-    RayPPOTrainer,
-    ResourcePoolManager,
-    Role,
-    WorkerType,
-)
+from verl.trainer.ppo.ray_trainer import ResourcePoolManager
+from verl.trainer.ppo.utils import Role, WorkerType
 from verl.utils.debug import marked_timer
 
 
 @ray.remote(num_cpus=10)
-class FullyAsyncTrainer(RayPPOTrainer):
+class FullyAsyncTrainer(FullyAsyncRayPPOTrainer):
     """
     A fully asynchronous PPO trainer that obtains samples from a MessageQueue for training.
     Based on an improved implementation of OneStepOffRayTrainer
diff --git a/recipe/fully_async_policy/main_ppo.py b/recipe/fully_async_policy/main_ppo.py
deleted file mode 100644
index 4b240c6ffbf..00000000000
--- a/recipe/fully_async_policy/main_ppo.py
+++ /dev/null
@@ -1,344 +0,0 @@
-# Copyright 2024 Bytedance Ltd. and/or its affiliates
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Note that we don't combine the main with ray_trainer as ray_trainer is used by other main.
-"""
-
-import os
-import socket
-
-import hydra
-import ray
-from omegaconf import OmegaConf
-
-from verl.experimental.dataset.sampler import AbstractSampler
-from verl.trainer.constants_ppo import get_ppo_ray_runtime_env
-from verl.trainer.ppo.ray_trainer import RayPPOTrainer
-from verl.trainer.ppo.reward import load_reward_manager
-from verl.utils.device import is_cuda_available
-from verl.utils.import_utils import load_extern_type
-
-
-@hydra.main(config_path="config", config_name="ppo_trainer", version_base=None)
-def main(config):
-    """Main entry point for PPO training with Hydra configuration management.
-
-    Args:
-        config_dict: Hydra configuration dictionary containing training parameters.
-    """
-    from time import time
-
-    start_time = time()
-    run_ppo(config)
-    print(f"total time: {time() - start_time:.2f} seconds")
-
-
-# Define a function to run the PPO-like training process
-def run_ppo(config, task_runner_class=None) -> None:
-    """Initialize Ray cluster and run distributed PPO training process.
-
-    Args:
-        config: Training configuration object containing all necessary parameters
-                for distributed PPO training including Ray initialization settings,
-                model paths, and training hyperparameters.
-    """
-    # Check if Ray is not initialized
-    if not ray.is_initialized():
-        # Initialize Ray with a local cluster configuration
-        # Set environment variables in the runtime environment to control tokenizer parallelism,
-        # NCCL debug level, VLLM logging level, and allow runtime LoRA updating
-        # `num_cpus` specifies the number of CPU cores Ray can use, obtained from the configuration
-        ray.init(
-            runtime_env=get_ppo_ray_runtime_env(),
-            num_cpus=config.ray_init.num_cpus,
-        )
-    # for recipe to change TaskRunner
-    if task_runner_class is None:
-        task_runner_class = TaskRunner
-
-    # Create a remote instance of the TaskRunner class, and
-    # Execute the `run` method of the TaskRunner instance remotely and wait for it to complete
-    if (
-        is_cuda_available
-        and config.trainer.get("profile_steps") is not None
-        and len(config.trainer.get("profile_steps", [])) > 0
-    ):
-        nsight_options = OmegaConf.to_container(config.trainer.controller_nsight_options)
-        runner = task_runner_class.options(runtime_env={"nsight": nsight_options}).remote()
-    else:
-        runner = task_runner_class.remote()
-    ray.get(runner.run.remote(config))
-
-    # [Optional] get the path of the timeline trace file from the configuration, default to None
-    # This file is used for performance analysis
-    timeline_json_file = config.ray_init.get("timeline_json_file", None)
-    if timeline_json_file:
-        ray.timeline(filename=timeline_json_file)
-
-
-@ray.remote(num_cpus=1)  # please make sure main_task is not scheduled on head
-class TaskRunner:
-    """Ray remote class for executing distributed PPO training tasks.
-
-    This class encapsulates the main training logic and runs as a Ray remote actor
-    to enable distributed execution across multiple nodes and GPUs.
-    """
-
-    def run(self, config):
-        """Execute the main PPO training workflow.
-
-        This method sets up the distributed training environment, initializes
-        workers, datasets, and reward functions, then starts the training process.
-
-        Args:
-            config: Training configuration object containing all parameters needed
-                   for setting up and running the PPO training process.
-        """
-        # Print the initial configuration. `resolve=True` will evaluate symbolic values.
-        from pprint import pprint
-
-        from omegaconf import OmegaConf
-
-        from verl.utils.fs import copy_to_local
-
-        print(f"TaskRunner hostname: {socket.gethostname()}, PID: {os.getpid()}")
-        pprint(OmegaConf.to_container(config, resolve=True))
-        OmegaConf.resolve(config)
-
-        # Download the checkpoint from HDFS to the local machine.
-        # `use_shm` determines whether to use shared memory, which could lead to faster model loading if turned on
-        local_path = copy_to_local(
-            config.actor_rollout_ref.model.path, use_shm=config.actor_rollout_ref.model.get("use_shm", False)
-        )
-
-        # Instantiate the tokenizer and processor.
-        from verl.utils import hf_processor, hf_tokenizer
-
-        trust_remote_code = config.data.get("trust_remote_code", False)
-        tokenizer = hf_tokenizer(local_path, trust_remote_code=trust_remote_code)
-        # Used for multimodal LLM, could be None
-        processor = hf_processor(local_path, trust_remote_code=trust_remote_code, use_fast=True)
-
-        # Define worker classes based on the actor strategy.
-        if config.actor_rollout_ref.actor.strategy in {"fsdp", "fsdp2"}:
-            assert config.critic.strategy in {"fsdp", "fsdp2"}
-            from verl.single_controller.ray import RayWorkerGroup
-            from verl.workers.fsdp_workers import ActorRolloutRefWorker, AsyncActorRolloutRefWorker
-
-            use_legacy_worker_impl = config.trainer.get("use_legacy_worker_impl", "auto")
-            if use_legacy_worker_impl in ["auto", "enable"]:
-                # import warnings
-                # warnings.warn(f"Legacy worker impl is going to be deprecated, will be removed in the future. \
-                #   Please set trainer.use_legacy_worker_impl = false to switch to the new worker implementation.")
-                from verl.workers.fsdp_workers import CriticWorker
-            elif use_legacy_worker_impl == "disable":
-                from verl.workers.roles import CriticWorker
-
-                print("Using new worker implementation")
-            else:
-                raise ValueError(f"Invalid use_legacy_worker_impl: {use_legacy_worker_impl}")
-
-            actor_rollout_cls = (
-                AsyncActorRolloutRefWorker
-                if config.actor_rollout_ref.rollout.mode == "async"
-                else ActorRolloutRefWorker
-            )
-            ray_worker_group_cls = RayWorkerGroup
-
-        elif config.actor_rollout_ref.actor.strategy == "megatron":
-            assert config.actor_rollout_ref.actor.strategy == config.critic.strategy
-            from verl.single_controller.ray.megatron import NVMegatronRayWorkerGroup
-            from verl.workers.megatron_workers import ActorRolloutRefWorker, AsyncActorRolloutRefWorker, CriticWorker
-
-            actor_rollout_cls = (
-                AsyncActorRolloutRefWorker
-                if config.actor_rollout_ref.rollout.mode == "async"
-                else ActorRolloutRefWorker
-            )
-            ray_worker_group_cls = NVMegatronRayWorkerGroup
-
-        else:
-            raise NotImplementedError
-
-        from verl.trainer.ppo.ray_trainer import ResourcePoolManager, Role
-
-        # Map roles to their corresponding remote worker classes.
-        role_worker_mapping = {
-            Role.ActorRollout: ray.remote(actor_rollout_cls),
-            Role.Critic: ray.remote(CriticWorker),
-        }
-
-        # Define the resource pool specification.
-        # Map roles to the resource pool.
-        global_pool_id = "global_pool"
-        resource_pool_spec = {
-            global_pool_id: [config.trainer.n_gpus_per_node] * config.trainer.nnodes,
-        }
-        mapping = {
-            Role.ActorRollout: global_pool_id,
-            Role.Critic: global_pool_id,
-        }
-
-        # We should adopt a multi-source reward function here:
-        # - for rule-based rm, we directly call a reward score
-        # - for model-based rm, we call a model
-        # - for code related prompt, we send to a sandbox if there are test cases
-        # finally, we combine all the rewards together
-        # The reward type depends on the tag of the data
-        if config.reward_model.enable:
-            if config.reward_model.strategy in {"fsdp", "fsdp2"}:
-                from verl.workers.fsdp_workers import RewardModelWorker
-            elif config.reward_model.strategy == "megatron":
-                from verl.workers.megatron_workers import RewardModelWorker
-            else:
-                raise NotImplementedError
-            role_worker_mapping[Role.RewardModel] = ray.remote(RewardModelWorker)
-            mapping[Role.RewardModel] = global_pool_id
-
-        # Add a reference policy worker if KL loss or KL reward is used.
-        if config.algorithm.use_kl_in_reward or config.actor_rollout_ref.actor.use_kl_loss:
-            role_worker_mapping[Role.RefPolicy] = ray.remote(ActorRolloutRefWorker)
-            mapping[Role.RefPolicy] = global_pool_id
-
-        # Load the reward manager for training and validation.
-        reward_fn = load_reward_manager(
-            config, tokenizer, num_examine=0, **config.reward_model.get("reward_kwargs", {})
-        )
-        val_reward_fn = load_reward_manager(
-            config, tokenizer, num_examine=1, **config.reward_model.get("reward_kwargs", {})
-        )
-        resource_pool_manager = ResourcePoolManager(resource_pool_spec=resource_pool_spec, mapping=mapping)
-
-        from verl.utils.dataset.rl_dataset import collate_fn
-
-        # Create training and validation datasets.
-        train_dataset = create_rl_dataset(config.data.train_files, config.data, tokenizer, processor, is_train=True)
-        val_dataset = create_rl_dataset(config.data.val_files, config.data, tokenizer, processor, is_train=False)
-        train_sampler = create_rl_sampler(config.data, train_dataset)
-
-        # Initialize the PPO trainer.
-        trainer = RayPPOTrainer(
-            config=config,
-            tokenizer=tokenizer,
-            processor=processor,
-            role_worker_mapping=role_worker_mapping,
-            resource_pool_manager=resource_pool_manager,
-            ray_worker_group_cls=ray_worker_group_cls,
-            reward_fn=reward_fn,
-            val_reward_fn=val_reward_fn,
-            train_dataset=train_dataset,
-            val_dataset=val_dataset,
-            collate_fn=collate_fn,
-            train_sampler=train_sampler,
-        )
-        # Initialize the workers of the trainer.
-        trainer.init_workers()
-        # Start the training process.
-        trainer.fit()
-
-
-def create_rl_dataset(data_paths, data_config, tokenizer, processor, is_train=True):
-    """Create a dataset.
-
-    Arguments:
-        data_paths: List of paths to data files.
-        data_config: The data config.
-        tokenizer (Tokenizer): The tokenizer.
-        processor (Processor): The processor.
-
-    Returns:
-        dataset (Dataset): The dataset.
-    """
-    from torch.utils.data import Dataset
-
-    from verl.utils.dataset.rl_dataset import RLHFDataset
-
-    # Check if a custom dataset class is specified in the data configuration
-    # and if the path to the custom class is provided
-    if "custom_cls" in data_config and data_config.custom_cls.get("path", None) is not None:
-        # Dynamically load the custom dataset class
-        dataset_cls = load_extern_type(data_config.custom_cls.path, data_config.custom_cls.name)
-        # Verify that the custom dataset class inherits from torch.utils.data.Dataset
-        if not issubclass(dataset_cls, Dataset):
-            raise TypeError(
-                f"The custom dataset class '{data_config.custom_cls.name}' from "
-                f"'{data_config.custom_cls.path}' must inherit from torch.utils.data.Dataset"
-            )
-    elif "datagen" in data_config and data_config.datagen.get("path", None) is not None and is_train:
-        # If a data generation strategy is specified, use the DynamicGenDataset class
-        from verl.utils.dataset.dynamicgen_dataset import DynamicGenDataset
-
-        dataset_cls = DynamicGenDataset
-        print("Using DynamicGenDataset for data generation.")
-
-    else:
-        # Use the default RLHFDataset class if no custom class is specified
-        dataset_cls = RLHFDataset
-    print(f"Using dataset class: {dataset_cls.__name__}")
-
-    # Instantiate the dataset using the determined dataset class
-    dataset = dataset_cls(
-        data_files=data_paths,
-        tokenizer=tokenizer,
-        processor=processor,
-        config=data_config,
-    )
-
-    return dataset
-
-
-def create_rl_sampler(data_config, dataset):
-    """Create a sampler for the dataset.
-
-    Arguments:
-        data_config: The data config.
-        dataset (Dataset): The dataset.
-
-    Returns:
-        sampler (Sampler): The sampler.
-    """
-    import torch
-    from torch.utils.data import RandomSampler, SequentialSampler
-
-    if data_config.sampler is not None and data_config.sampler.get("class_path", None) is not None:
-        curriculum_class = load_extern_type(
-            data_config.sampler.class_path,
-            data_config.sampler.class_name,
-        )
-        sampler = curriculum_class(
-            data_source=dataset,
-            data_config=data_config,
-        )
-        assert isinstance(sampler, AbstractSampler)
-        assert data_config.get("dataloader_num_workers", 8) == 0, (
-            "If using curriculum, num_workers must be 0 to prevent data caching. "
-            "If the dataloader caches data before the batch is done the "
-            "curriculum sampler won't have the opportunity to reorder it. "
-        )
-
-    # Use a sampler to facilitate checkpoint resumption.
-    # If shuffling is enabled in the data configuration, create a random sampler.
-    elif data_config.shuffle:
-        train_dataloader_generator = torch.Generator()
-        train_dataloader_generator.manual_seed(data_config.get("seed", 1))
-        sampler = RandomSampler(data_source=dataset, generator=train_dataloader_generator)
-    else:
-        # If shuffling is disabled, use a sequential sampler to iterate through the dataset in order.
-        sampler = SequentialSampler(data_source=dataset)
-
-    return sampler
-
-
-if __name__ == "__main__":
-    main()
diff --git a/recipe/fully_async_policy/ray_trainer.py b/recipe/fully_async_policy/ray_trainer.py
index dea3aa2c26e..0a74c5ed386 100644
--- a/recipe/fully_async_policy/ray_trainer.py
+++ b/recipe/fully_async_policy/ray_trainer.py
@@ -18,790 +18,41 @@
 This trainer supports model-agonistic model initialization with huggingface
 """
 
-import json
-import os
 import uuid
-import warnings
-from collections import defaultdict
 from copy import deepcopy
-from dataclasses import dataclass, field
-from enum import Enum
 from pprint import pprint
-from typing import Optional
 
 import numpy as np
 import ray
 import torch
-from omegaconf import OmegaConf, open_dict
-from torch.utils.data import Dataset, Sampler
-from torchdata.stateful_dataloader import StatefulDataLoader
+from omegaconf import OmegaConf
 from tqdm import tqdm
 
 from verl import DataProto
 from verl.experimental.dataset.sampler import AbstractCurriculumSampler
-from verl.protocol import pad_dataproto_to_divisor, unpad_dataproto
-from verl.single_controller.base import Worker
-from verl.single_controller.ray import RayClassWithInitArgs, RayResourcePool, RayWorkerGroup
+from verl.single_controller.ray import RayClassWithInitArgs
 from verl.single_controller.ray.base import create_colocated_worker_cls
-from verl.trainer.config import AlgoConfig
-from verl.trainer.ppo import core_algos
 from verl.trainer.ppo.core_algos import AdvantageEstimator, agg_loss
 from verl.trainer.ppo.metric_utils import (
     compute_data_metrics,
     compute_throughout_metrics,
     compute_timing_metrics,
-    process_validation_metrics,
 )
+from verl.trainer.ppo.ray_trainer import RayPPOTrainer, compute_advantage
 from verl.trainer.ppo.reward import compute_reward, compute_reward_async
-from verl.utils.checkpoint.checkpoint_manager import find_latest_ckpt_path, should_save_ckpt_esi
+from verl.trainer.ppo.utils import Role
+from verl.utils.checkpoint.checkpoint_manager import should_save_ckpt_esi
 from verl.utils.config import omega_conf_to_dataclass
 from verl.utils.debug import marked_timer
 from verl.utils.metric import (
     reduce_metrics,
 )
-from verl.utils.seqlen_balancing import get_seqlen_balanced_partitions, log_seqlen_unbalance
-from verl.utils.torch_functional import masked_mean
-from verl.utils.tracking import ValidationGenerationsLogger
-
-WorkerType = type[Worker]
-
-
-class Role(Enum):
-    """
-    To create more roles dynamically, you can subclass Role and add new members
-    """
-
-    Actor = 0
-    Rollout = 1
-    ActorRollout = 2
-    Critic = 3
-    RefPolicy = 4
-    RewardModel = 5
-    ActorRolloutRef = 6
-
-    def __str__(self):
-        """返回与代码中一致的字符串表示"""
-        return self._get_role_string()
-
-    def _get_role_string(self):
-        """获取角色对应的字符串名称"""
-        role_mapping = {
-            Role.Actor: "actor",
-            Role.Rollout: "rollout",
-            Role.ActorRollout: "actor_rollout",
-            Role.Critic: "critic",
-            Role.RefPolicy: "ref",
-            Role.RewardModel: "rm",
-            Role.ActorRolloutRef: "actor_rollout_ref",
-        }
-        return role_mapping.get(self, self.name.lower())
-
-    @classmethod
-    def from_string(cls, name: str):
-        """从字符串创建Role实例"""
-        string_mapping = {
-            "actor": cls.Actor,
-            "rollout": cls.Rollout,
-            "actor_rollout": cls.ActorRollout,
-            "critic": cls.Critic,
-            "ref": cls.RefPolicy,
-            "rm": cls.RewardModel,
-            "actor_rollout_ref": cls.ActorRolloutRef,
-        }
-        role = string_mapping.get(name.lower())
-        if role is None:
-            raise ValueError(f"No Role found for string: {name}")
-        return role
-
-
-@dataclass
-class ResourcePoolManager:
-    """
-    Define a resource pool specification. Resource pool will be initialized first.
-    """
-
-    resource_pool_spec: dict[str, list[int]]
-    mapping: dict[Role, str]
-    resource_pool_dict: dict[str, RayResourcePool] = field(default_factory=dict)
-
-    def create_resource_pool(self):
-        """Create Ray resource pools for distributed training.
-
-        Initializes resource pools based on the resource pool specification,
-        with each pool managing GPU resources across multiple nodes.
-        For FSDP backend, uses max_colocate_count=1 to merge WorkerGroups.
-        For Megatron backend, uses max_colocate_count>1 for different models.
-        """
-        for resource_pool_name, process_on_nodes in self.resource_pool_spec.items():
-            # max_colocate_count means the number of WorkerGroups (i.e. processes) in each RayResourcePool
-            # For FSDP backend, we recommend using max_colocate_count=1 that merge all WorkerGroups into one.
-            # For Megatron backend, we recommend using max_colocate_count>1
-            # that can utilize different WorkerGroup for differnt models
-            resource_pool = RayResourcePool(
-                process_on_nodes=process_on_nodes, use_gpu=True, max_colocate_count=1, name_prefix=resource_pool_name
-            )
-            self.resource_pool_dict[resource_pool_name] = resource_pool
-
-        self._check_resource_available()
-
-    def get_resource_pool(self, role: Role) -> RayResourcePool:
-        """Get the resource pool of the worker_cls"""
-        return self.resource_pool_dict[self.mapping[role]]
-
-    def get_n_gpus(self) -> int:
-        """Get the number of gpus in this cluster."""
-        return sum([n_gpus for process_on_nodes in self.resource_pool_spec.values() for n_gpus in process_on_nodes])
-
-    def _check_resource_available(self):
-        """Check if the resource pool can be satisfied in this ray cluster."""
-        node_available_resources = ray.state.available_resources_per_node()
-        node_available_gpus = {
-            node: node_info.get("GPU", 0) if "GPU" in node_info else node_info.get("NPU", 0)
-            for node, node_info in node_available_resources.items()
-        }
-
-        # check total required gpus can be satisfied
-        total_available_gpus = sum(node_available_gpus.values())
-        total_required_gpus = sum(
-            [n_gpus for process_on_nodes in self.resource_pool_spec.values() for n_gpus in process_on_nodes]
-        )
-        if total_available_gpus < total_required_gpus:
-            raise ValueError(
-                f"Total available GPUs {total_available_gpus} is less than total desired GPUs {total_required_gpus}"
-            )
-
-        # check each resource pool can be satisfied, O(#resource_pools * #nodes)
-        for resource_pool_name, process_on_nodes in self.resource_pool_spec.items():
-            num_gpus, num_nodes = process_on_nodes[0], len(process_on_nodes)
-            for node, available_gpus in node_available_gpus.items():
-                if available_gpus >= num_gpus:
-                    node_available_gpus[node] -= num_gpus
-                    num_nodes -= 1
-                    if num_nodes == 0:
-                        break
-            if num_nodes > 0:
-                raise ValueError(
-                    f"Resource pool {resource_pool_name}: {num_gpus}*{num_nodes}"
-                    + "cannot be satisfied in this ray cluster"
-                )
-
-
-def apply_kl_penalty(data: DataProto, kl_ctrl: core_algos.AdaptiveKLController, kl_penalty="kl"):
-    """Apply KL penalty to the token-level rewards.
-
-    This function computes the KL divergence between the reference policy and current policy,
-    then applies a penalty to the token-level rewards based on this divergence.
-
-    Args:
-        data (DataProto): The data containing batched model outputs and inputs.
-        kl_ctrl (core_algos.AdaptiveKLController): Controller for adaptive KL penalty.
-        kl_penalty (str, optional): Type of KL penalty to apply. Defaults to "kl".
-        multi_turn (bool, optional): Whether the data is from a multi-turn conversation. Defaults to False.
-
-    Returns:
-        tuple: A tuple containing:
-            - The updated data with token-level rewards adjusted by KL penalty
-            - A dictionary of metrics related to the KL penalty
-    """
-    response_mask = data.batch["response_mask"]
-    token_level_scores = data.batch["token_level_scores"]
-    batch_size = data.batch.batch_size[0]
-
-    # compute kl between ref_policy and current policy
-    # When apply_kl_penalty, algorithm.use_kl_in_reward=True, so the reference model has been enabled.
-    kld = core_algos.kl_penalty(
-        data.batch["old_log_probs"], data.batch["ref_log_prob"], kl_penalty=kl_penalty
-    )  # (batch_size, response_length)
-    kld = kld * response_mask
-    beta = kl_ctrl.value
-
-    token_level_rewards = token_level_scores - beta * kld
-
-    current_kl = masked_mean(kld, mask=response_mask, axis=-1)  # average over sequence
-    current_kl = torch.mean(current_kl, dim=0).item()
-
-    # according to https://github.com/huggingface/trl/blob/951ca1841f29114b969b57b26c7d3e80a39f75a0/trl/trainer/ppo_trainer.py#L837
-    kl_ctrl.update(current_kl=current_kl, n_steps=batch_size)
-    data.batch["token_level_rewards"] = token_level_rewards
-
-    metrics = {"actor/reward_kl_penalty": current_kl, "actor/reward_kl_penalty_coeff": beta}
-
-    return data, metrics
-
-
-def compute_response_mask(data: DataProto):
-    """Compute the attention mask for the response part of the sequence.
-
-    This function extracts the portion of the attention mask that corresponds to the model's response,
-    which is used for masking computations that should only apply to response tokens.
-
-    Args:
-        data (DataProto): The data containing batched model outputs and inputs.
-
-    Returns:
-        torch.Tensor: The attention mask for the response tokens.
-    """
-    responses = data.batch["responses"]
-    response_length = responses.size(1)
-    attention_mask = data.batch["attention_mask"]
-    return attention_mask[:, -response_length:]
-
-
-def compute_advantage(
-    data: DataProto,
-    adv_estimator: AdvantageEstimator,
-    gamma: float = 1.0,
-    lam: float = 1.0,
-    num_repeat: int = 1,
-    norm_adv_by_std_in_grpo: bool = True,
-    config: Optional[AlgoConfig] = None,
-) -> DataProto:
-    """Compute advantage estimates for policy optimization.
-
-    This function computes advantage estimates using various estimators like GAE, GRPO, REINFORCE++, etc.
-    The advantage estimates are used to guide policy optimization in RL algorithms.
-
-    Args:
-        data (DataProto): The data containing batched model outputs and inputs.
-        adv_estimator (AdvantageEstimator): The advantage estimator to use (e.g., GAE, GRPO, REINFORCE++).
-        gamma (float, optional): Discount factor for future rewards. Defaults to 1.0.
-        lam (float, optional): Lambda parameter for GAE. Defaults to 1.0.
-        num_repeat (int, optional): Number of times to repeat the computation. Defaults to 1.
-        norm_adv_by_std_in_grpo (bool, optional): Whether to normalize advantages by standard deviation in
-            GRPO. Defaults to True.
-        config (dict, optional): Configuration dictionary for algorithm settings. Defaults to None.
-
-    Returns:
-        DataProto: The updated data with computed advantages and returns.
-    """
-    # Back-compatible with trainers that do not compute response mask in fit
-    if "response_mask" not in data.batch.keys():
-        data.batch["response_mask"] = compute_response_mask(data)
-    # prepare response group
-    if adv_estimator == AdvantageEstimator.GAE:
-        # Compute advantages and returns using Generalized Advantage Estimation (GAE)
-        advantages, returns = core_algos.compute_gae_advantage_return(
-            token_level_rewards=data.batch["token_level_rewards"],
-            values=data.batch["values"],
-            response_mask=data.batch["response_mask"],
-            gamma=gamma,
-            lam=lam,
-        )
-        data.batch["advantages"] = advantages
-        data.batch["returns"] = returns
-        if config.get("use_pf_ppo", False):
-            data = core_algos.compute_pf_ppo_reweight_data(
-                data,
-                config.pf_ppo.get("reweight_method"),
-                config.pf_ppo.get("weight_pow"),
-            )
-    elif adv_estimator == AdvantageEstimator.GRPO:
-        # Initialize the mask for GRPO calculation
-        grpo_calculation_mask = data.batch["response_mask"]
-        # Call compute_grpo_outcome_advantage with parameters matching its definition
-        advantages, returns = core_algos.compute_grpo_outcome_advantage(
-            token_level_rewards=data.batch["token_level_rewards"],
-            response_mask=grpo_calculation_mask,
-            index=data.non_tensor_batch["uid"],
-            norm_adv_by_std_in_grpo=norm_adv_by_std_in_grpo,
-        )
-        data.batch["advantages"] = advantages
-        data.batch["returns"] = returns
-    else:
-        # handle all other adv estimator type other than GAE and GRPO
-        adv_estimator_fn = core_algos.get_adv_estimator_fn(adv_estimator)
-        adv_kwargs = {
-            "token_level_rewards": data.batch["token_level_rewards"],
-            "response_mask": data.batch["response_mask"],
-            "config": config,
-        }
-        if "uid" in data.non_tensor_batch:  # optional
-            adv_kwargs["index"] = data.non_tensor_batch["uid"]
-        if "reward_baselines" in data.batch:  # optional
-            adv_kwargs["reward_baselines"] = data.batch["reward_baselines"]
-
-        # calculate advantage estimator
-        advantages, returns = adv_estimator_fn(**adv_kwargs)
-        data.batch["advantages"] = advantages
-        data.batch["returns"] = returns
-    return data
-
-
-class RayPPOTrainer:
-    """Distributed PPO trainer using Ray for scalable reinforcement learning.
-
-    This trainer orchestrates distributed PPO training across multiple nodes and GPUs,
-    managing actor rollouts, critic training, and reward computation with Ray backend.
-    Supports various model architectures including FSDP, Megatron, and vLLM integration.
-    """
-
-    # TODO: support each role have individual ray_worker_group_cls,
-    # i.e., support different backend of different role
-    def __init__(
-        self,
-        config,
-        tokenizer,
-        role_worker_mapping: dict[Role, WorkerType],
-        resource_pool_manager: ResourcePoolManager,
-        ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup,
-        processor=None,
-        reward_fn=None,
-        val_reward_fn=None,
-        train_dataset: Optional[Dataset] = None,
-        val_dataset: Optional[Dataset] = None,
-        collate_fn=None,
-        train_sampler: Optional[Sampler] = None,
-        device_name=None,
-    ):
-        """
-        Initialize distributed PPO trainer with Ray backend.
-        Note that this trainer runs on the driver process on a single CPU/GPU node.
-
-        Args:
-            config: Configuration object containing training parameters.
-            tokenizer: Tokenizer used for encoding and decoding text.
-            role_worker_mapping (dict[Role, WorkerType]): Mapping from roles to worker classes.
-            resource_pool_manager (ResourcePoolManager): Manager for Ray resource pools.
-            ray_worker_group_cls (RayWorkerGroup, optional): Class for Ray worker groups. Defaults to RayWorkerGroup.
-            processor: Optional data processor, used for multimodal data
-            reward_fn: Function for computing rewards during training.
-            val_reward_fn: Function for computing rewards during validation.
-            train_dataset (Optional[Dataset], optional): Training dataset. Defaults to None.
-            val_dataset (Optional[Dataset], optional): Validation dataset. Defaults to None.
-            collate_fn: Function to collate data samples into batches.
-            train_sampler (Optional[Sampler], optional): Sampler for the training dataset. Defaults to None.
-            device_name (str, optional): Device name for training (e.g., "cuda", "cpu"). Defaults to None.
-        """
-
-        # Store the tokenizer for text processing
-        self.tokenizer = tokenizer
-        self.processor = processor
-        self.config = config
-        self.reward_fn = reward_fn
-        self.val_reward_fn = val_reward_fn
-
-        self.hybrid_engine = config.actor_rollout_ref.hybrid_engine
-        assert self.hybrid_engine, "Currently, only support hybrid engine"
-
-        if self.hybrid_engine:
-            assert Role.ActorRollout in role_worker_mapping, f"{role_worker_mapping.keys()=}"
-
-        self.role_worker_mapping = role_worker_mapping
-        self.resource_pool_manager = resource_pool_manager
-        self.use_reference_policy = Role.RefPolicy in role_worker_mapping
-        self.use_rm = Role.RewardModel in role_worker_mapping
-        self.ray_worker_group_cls = ray_worker_group_cls
-        self.device_name = device_name if device_name else self.config.trainer.device
-        self.validation_generations_logger = ValidationGenerationsLogger(
-            project_name=self.config.trainer.project_name,
-            experiment_name=self.config.trainer.experiment_name,
-        )
-
-        # if ref_in_actor is True, the reference policy will be actor without lora applied
-        self.ref_in_actor = config.actor_rollout_ref.model.get("lora_rank", 0) > 0
-
-        # define in-reward KL control
-        # kl loss control currently not suppoorted
-        if self.config.algorithm.use_kl_in_reward:
-            self.kl_ctrl_in_reward = core_algos.get_kl_controller(self.config.algorithm.kl_ctrl)
-
-        if config.critic.enable is not None:
-            self.use_critic = bool(config.critic.enable)
-        elif self.config.algorithm.adv_estimator == AdvantageEstimator.GAE:
-            self.use_critic = True
-        else:
-            warnings.warn(
-                "Disabled critic as algorithm.adv_estimator != gae. "
-                "If it is not intended, please set critic.enable=True",
-                stacklevel=2,
-            )
-            self.use_critic = False
-
-        self._validate_config()
-        self._create_dataloader(train_dataset, val_dataset, collate_fn, train_sampler)
-
-    def _validate_config(self):
-        config = self.config
-        # number of GPUs total
-        n_gpus = config.trainer.n_gpus_per_node * config.trainer.nnodes
-        if config.actor_rollout_ref.actor.strategy == "megatron":
-            model_parallel_size = (
-                config.actor_rollout_ref.actor.megatron.tensor_model_parallel_size
-                * config.actor_rollout_ref.actor.megatron.pipeline_model_parallel_size
-            )
-            assert (
-                n_gpus % (model_parallel_size * config.actor_rollout_ref.actor.megatron.context_parallel_size) == 0
-            ), (
-                f"n_gpus ({n_gpus}) must be divisible by model_parallel_size ({model_parallel_size}) times "
-                f"context_parallel_size ({config.actor_rollout_ref.actor.megatron.context_parallel_size})"
-            )
-            megatron_dp = n_gpus // (
-                model_parallel_size * config.actor_rollout_ref.actor.megatron.context_parallel_size
-            )
-            self.minimal_bsz = megatron_dp * config.actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu
-        else:
-            self.minimal_bsz = n_gpus
-
-        # 1. Check total batch size for data correctness
-        real_train_batch_size = config.data.train_batch_size * config.actor_rollout_ref.rollout.n
-        assert real_train_batch_size % self.minimal_bsz == 0, (
-            f"real_train_batch_size ({real_train_batch_size}) must be divisible by minimal possible batch size "
-            f"({self.minimal_bsz})"
-        )
-
-        # A helper function to check "micro_batch_size" vs "micro_batch_size_per_gpu"
-        # We throw an error if the user sets both. The new convention is "..._micro_batch_size_per_gpu".
-        def check_mutually_exclusive(mbs, mbs_per_gpu, name: str):
-            """Validate mutually exclusive micro batch size configuration options.
-
-            Ensures that users don't set both deprecated micro_batch_size and
-            the new micro_batch_size_per_gpu parameters simultaneously.
-
-            Args:
-                mbs: Deprecated micro batch size parameter value.
-                mbs_per_gpu: New micro batch size per GPU parameter value.
-                name (str): Configuration section name for error messages.
-
-            Raises:
-                ValueError: If both parameters are set or neither is set.
-            """
-            settings = {
-                "reward_model": "micro_batch_size",
-                "actor_rollout_ref.ref": "log_prob_micro_batch_size",
-                "actor_rollout_ref.rollout": "log_prob_micro_batch_size",
-            }
-
-            if name in settings:
-                param = settings[name]
-                param_per_gpu = f"{param}_per_gpu"
-
-                if mbs is None and mbs_per_gpu is None:
-                    raise ValueError(
-                        f"[{name}] Please set at least one of '{name}.{param}' or '{name}.{param_per_gpu}'."
-                    )
-
-                if mbs is not None and mbs_per_gpu is not None:
-                    raise ValueError(
-                        f"[{name}] You have set both '{name}.{param}' AND '{name}.{param_per_gpu}'. Please remove "
-                        f"'{name}.{param}' because only '*_{param_per_gpu}' is supported (the former is deprecated)."
-                    )
-
-        # Actor validation done in ActorConfig.__post_init__ and validate()
-        actor_config = omega_conf_to_dataclass(config.actor_rollout_ref.actor)
-        actor_config.validate(n_gpus, config.data.train_batch_size, config.actor_rollout_ref.model)
-
-        if not config.actor_rollout_ref.actor.use_dynamic_bsz:
-            if self.use_reference_policy:
-                # reference: log_prob_micro_batch_size vs. log_prob_micro_batch_size_per_gpu
-                check_mutually_exclusive(
-                    config.actor_rollout_ref.ref.log_prob_micro_batch_size,
-                    config.actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu,
-                    "actor_rollout_ref.ref",
-                )
-
-            #  The rollout section also has log_prob_micro_batch_size vs. log_prob_micro_batch_size_per_gpu
-            check_mutually_exclusive(
-                config.actor_rollout_ref.rollout.log_prob_micro_batch_size,
-                config.actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu,
-                "actor_rollout_ref.rollout",
-            )
-
-        # Check for reward model micro-batch size conflicts
-        if config.reward_model.enable and not config.reward_model.use_dynamic_bsz:
-            check_mutually_exclusive(
-                config.reward_model.micro_batch_size, config.reward_model.micro_batch_size_per_gpu, "reward_model"
-            )
-
-        if self.config.algorithm.use_kl_in_reward and config.actor_rollout_ref.actor.use_kl_loss:
-            print("NOTICE: You have both enabled in-reward kl and kl loss.")
-
-        # critic
-        if self.use_critic:
-            critic_config = omega_conf_to_dataclass(config.critic)
-            critic_config.validate(n_gpus, config.data.train_batch_size)
-
-        if config.data.get("val_batch_size", None) is not None:
-            print(
-                "WARNING: val_batch_size is deprecated."
-                + " Validation datasets are sent to inference engines as a whole batch,"
-                + " which will schedule the memory themselves."
-            )
-
-        # check eval config
-        if config.actor_rollout_ref.rollout.val_kwargs.do_sample:
-            assert config.actor_rollout_ref.rollout.temperature > 0, (
-                "validation gen temperature should be greater than 0 when enabling do_sample"
-            )
-
-        print("[validate_config] All configuration checks passed successfully!")
-
-    def _create_dataloader(self, train_dataset, val_dataset, collate_fn, train_sampler: Optional[Sampler]):
-        """
-        Creates the train and validation dataloaders.
-        """
-        # TODO: we have to make sure the batch size is divisible by the dp size
-        from verl.trainer.main_ppo import create_rl_dataset, create_rl_sampler
-
-        if train_dataset is None:
-            train_dataset = create_rl_dataset(
-                self.config.data.train_files, self.config.data, self.tokenizer, self.processor
-            )
-        if val_dataset is None:
-            val_dataset = create_rl_dataset(
-                self.config.data.val_files, self.config.data, self.tokenizer, self.processor
-            )
-        self.train_dataset, self.val_dataset = train_dataset, val_dataset
-
-        if train_sampler is None:
-            train_sampler = create_rl_sampler(self.config.data, self.train_dataset)
-        if collate_fn is None:
-            from verl.utils.dataset.rl_dataset import collate_fn as default_collate_fn
-
-            collate_fn = default_collate_fn
-
-        num_workers = self.config.data["dataloader_num_workers"]
-
-        self.train_dataloader = StatefulDataLoader(
-            dataset=self.train_dataset,
-            batch_size=self.config.data.get("gen_batch_size", self.config.data.train_batch_size),
-            num_workers=num_workers,
-            drop_last=True,
-            collate_fn=collate_fn,
-            sampler=train_sampler,
-        )
-
-        val_batch_size = self.config.data.val_batch_size  # Prefer config value if set
-        if val_batch_size is None:
-            val_batch_size = len(self.val_dataset)
-
-        self.val_dataloader = StatefulDataLoader(
-            dataset=self.val_dataset,
-            batch_size=val_batch_size,
-            num_workers=num_workers,
-            shuffle=self.config.data.get("validation_shuffle", True),
-            drop_last=False,
-            collate_fn=collate_fn,
-        )
-
-        assert len(self.train_dataloader) >= 1, "Train dataloader is empty!"
-        assert len(self.val_dataloader) >= 1, "Validation dataloader is empty!"
-
-        print(
-            f"Size of train dataloader: {len(self.train_dataloader)}, Size of val dataloader: "
-            f"{len(self.val_dataloader)}"
-        )
-
-        total_training_steps = len(self.train_dataloader) * self.config.trainer.total_epochs
-
-        if self.config.trainer.total_training_steps is not None:
-            total_training_steps = self.config.trainer.total_training_steps
-
-        self.total_training_steps = total_training_steps
-        print(f"Total training steps: {self.total_training_steps}")
-
-        try:
-            OmegaConf.set_struct(self.config, True)
-            with open_dict(self.config):
-                if OmegaConf.select(self.config, "actor_rollout_ref.actor.optim"):
-                    self.config.actor_rollout_ref.actor.optim.total_training_steps = total_training_steps
-                if OmegaConf.select(self.config, "critic.optim"):
-                    self.config.critic.optim.total_training_steps = total_training_steps
-        except Exception as e:
-            print(f"Warning: Could not set total_training_steps in config. Structure missing? Error: {e}")
-
-    def _dump_generations(self, inputs, outputs, scores, reward_extra_infos_dict, dump_path):
-        """Dump rollout/validation samples as JSONL."""
-        os.makedirs(dump_path, exist_ok=True)
-        filename = os.path.join(dump_path, f"{self.global_steps}.jsonl")
-
-        n = len(inputs)
-        base_data = {
-            "input": inputs,
-            "output": outputs,
-            "score": scores,
-            "step": [self.global_steps] * n,
-        }
-
-        for k, v in reward_extra_infos_dict.items():
-            if len(v) == n:
-                base_data[k] = v
-
-        lines = []
-        for i in range(n):
-            entry = {k: v[i] for k, v in base_data.items()}
-            lines.append(json.dumps(entry, ensure_ascii=False))
-
-        with open(filename, "w") as f:
-            f.write("\n".join(lines) + "\n")
-
-        print(f"Dumped generations to {filename}")
-
-    def _maybe_log_val_generations(self, inputs, outputs, scores):
-        """Log a table of validation samples to the configured logger (wandb or swanlab)"""
-
-        generations_to_log = self.config.trainer.log_val_generations
-
-        if generations_to_log == 0:
-            return
-
-        import numpy as np
-
-        # Create tuples of (input, output, score) and sort by input text
-        samples = list(zip(inputs, outputs, scores, strict=True))
-        samples.sort(key=lambda x: x[0])  # Sort by input text
+from verl.utils.rollout_skip import RolloutSkip
 
-        # Use fixed random seed for deterministic shuffling
-        rng = np.random.RandomState(42)
-        rng.shuffle(samples)
 
-        # Take first N samples after shuffling
-        samples = samples[:generations_to_log]
-
-        # Log to each configured logger
-        self.validation_generations_logger.log(self.config.trainer.logger, samples, self.global_steps)
-
-    def _validate(self):
-        data_source_lst = []
-        reward_extra_infos_dict: dict[str, list] = defaultdict(list)
-
-        # Lists to collect samples for the table
-        sample_inputs = []
-        sample_outputs = []
-        sample_scores = []
-        sample_turns = []
-
-        for test_data in self.val_dataloader:
-            test_batch = DataProto.from_single_dict(test_data)
-
-            # repeat test batch
-            test_batch = test_batch.repeat(
-                repeat_times=self.config.actor_rollout_ref.rollout.val_kwargs.n, interleave=True
-            )
-
-            # we only do validation on rule-based rm
-            if self.config.reward_model.enable and test_batch[0].non_tensor_batch["reward_model"]["style"] == "model":
-                return {}
-
-            # Store original inputs
-            input_ids = test_batch.batch["input_ids"]
-            # TODO: Can we keep special tokens except for padding tokens?
-            input_texts = [self.tokenizer.decode(ids, skip_special_tokens=True) for ids in input_ids]
-            sample_inputs.extend(input_texts)
-
-            batch_keys_to_pop = ["input_ids", "attention_mask", "position_ids"]
-            non_tensor_batch_keys_to_pop = ["raw_prompt_ids"]
-            if "multi_modal_data" in test_batch.non_tensor_batch:
-                non_tensor_batch_keys_to_pop.append("multi_modal_data")
-            if "raw_prompt" in test_batch.non_tensor_batch:
-                non_tensor_batch_keys_to_pop.append("raw_prompt")
-            if "tools_kwargs" in test_batch.non_tensor_batch:
-                non_tensor_batch_keys_to_pop.append("tools_kwargs")
-            if "interaction_kwargs" in test_batch.non_tensor_batch:
-                non_tensor_batch_keys_to_pop.append("interaction_kwargs")
-            if "agent_name" in test_batch.non_tensor_batch:
-                non_tensor_batch_keys_to_pop.append("agent_name")
-            test_gen_batch = test_batch.pop(
-                batch_keys=batch_keys_to_pop,
-                non_tensor_batch_keys=non_tensor_batch_keys_to_pop,
-            )
-
-            test_gen_batch.meta_info = {
-                "eos_token_id": self.tokenizer.eos_token_id,
-                "pad_token_id": self.tokenizer.pad_token_id,
-                "recompute_log_prob": False,
-                "do_sample": self.config.actor_rollout_ref.rollout.val_kwargs.do_sample,
-                "validate": True,
-                "global_steps": self.global_steps,
-            }
-            print(f"test_gen_batch meta info: {test_gen_batch.meta_info}")
-
-            # pad to be divisible by dp_size
-            size_divisor = (
-                self.actor_rollout_wg.world_size
-                if not self.async_rollout_mode
-                else self.config.actor_rollout_ref.rollout.agent.num_workers
-            )
-            test_gen_batch_padded, pad_size = pad_dataproto_to_divisor(test_gen_batch, size_divisor)
-            if not self.async_rollout_mode:
-                test_output_gen_batch_padded = self.actor_rollout_wg.generate_sequences(test_gen_batch_padded)
-            else:
-                test_output_gen_batch_padded = self.async_rollout_manager.generate_sequences(test_gen_batch_padded)
-
-            # unpad
-            test_output_gen_batch = unpad_dataproto(test_output_gen_batch_padded, pad_size=pad_size)
-
-            print("validation generation end")
-
-            # Store generated outputs
-            output_ids = test_output_gen_batch.batch["responses"]
-            output_texts = [self.tokenizer.decode(ids, skip_special_tokens=True) for ids in output_ids]
-            sample_outputs.extend(output_texts)
-
-            test_batch = test_batch.union(test_output_gen_batch)
-            test_batch.meta_info["validate"] = True
-
-            # evaluate using reward_function
-            result = self.val_reward_fn(test_batch, return_dict=True)
-            reward_tensor = result["reward_tensor"]
-            scores = reward_tensor.sum(-1).cpu().tolist()
-            sample_scores.extend(scores)
-
-            reward_extra_infos_dict["reward"].extend(scores)
-            print(f"len reward_extra_infos_dict['reward']: {len(reward_extra_infos_dict['reward'])}")
-            if "reward_extra_info" in result:
-                for key, lst in result["reward_extra_info"].items():
-                    reward_extra_infos_dict[key].extend(lst)
-                    print(f"len reward_extra_infos_dict['{key}']: {len(reward_extra_infos_dict[key])}")
-
-            # collect num_turns of each prompt
-            if "__num_turns__" in test_batch.non_tensor_batch:
-                sample_turns.append(test_batch.non_tensor_batch["__num_turns__"])
-
-            data_source_lst.append(test_batch.non_tensor_batch.get("data_source", ["unknown"] * reward_tensor.shape[0]))
-
-        self._maybe_log_val_generations(inputs=sample_inputs, outputs=sample_outputs, scores=sample_scores)
-
-        # dump generations
-        val_data_dir = self.config.trainer.get("validation_data_dir", None)
-        if val_data_dir:
-            self._dump_generations(
-                inputs=sample_inputs,
-                outputs=sample_outputs,
-                scores=sample_scores,
-                reward_extra_infos_dict=reward_extra_infos_dict,
-                dump_path=val_data_dir,
-            )
-
-        for key_info, lst in reward_extra_infos_dict.items():
-            assert len(lst) == 0 or len(lst) == len(sample_scores), f"{key_info}: {len(lst)=}, {len(sample_scores)=}"
-
-        data_sources = np.concatenate(data_source_lst, axis=0)
-
-        data_src2var2metric2val = process_validation_metrics(data_sources, sample_inputs, reward_extra_infos_dict)
-        metric_dict = {}
-        for data_source, var2metric2val in data_src2var2metric2val.items():
-            core_var = "acc" if "acc" in var2metric2val else "reward"
-            for var_name, metric2val in var2metric2val.items():
-                n_max = max([int(name.split("@")[-1].split("/")[0]) for name in metric2val.keys()])
-                for metric_name, metric_val in metric2val.items():
-                    if (
-                        (var_name == core_var)
-                        and any(metric_name.startswith(pfx) for pfx in ["mean", "maj", "best"])
-                        and (f"@{n_max}" in metric_name)
-                    ):
-                        metric_sec = "val-core"
-                    else:
-                        metric_sec = "val-aux"
-                    pfx = f"{metric_sec}/{data_source}/{var_name}/{metric_name}"
-                    metric_dict[pfx] = metric_val
-
-        if len(sample_turns) > 0:
-            sample_turns = np.concatenate(sample_turns)
-            metric_dict["val-aux/num_turns/min"] = sample_turns.min()
-            metric_dict["val-aux/num_turns/max"] = sample_turns.max()
-            metric_dict["val-aux/num_turns/mean"] = sample_turns.mean()
-
-        return metric_dict
+class FullyAsyncRayPPOTrainer(RayPPOTrainer):
+    def __init__(self, *args, **kwargs):
+        pass
 
     def init_workers(self):
         """Initialize distributed training workers using Ray backend.
@@ -818,6 +69,7 @@ def init_workers(self):
 
     def _init_resource_pools(self):
         self.resource_pool_manager.create_resource_pool()
+
         self.resource_pool_to_cls = {pool: {} for pool in self.resource_pool_manager.resource_pool_dict.values()}
 
     def _create_worker_classes(self):
@@ -878,14 +130,17 @@ def _init_worker_groups(self):
         wg_kwargs = {}  # Setting up kwargs for RayWorkerGroup
         if OmegaConf.select(self.config.trainer, "ray_wait_register_center_timeout") is not None:
             wg_kwargs["ray_wait_register_center_timeout"] = self.config.trainer.ray_wait_register_center_timeout
-        if OmegaConf.select(self.config.trainer, "profile_steps") is not None:
-            wg_kwargs["profile_steps"] = OmegaConf.select(self.config.trainer, "profile_steps")
-            assert OmegaConf.select(self.config.trainer, "worker_nsight_options") is not None, (
-                "worker_nsight_options must be set when profile_steps is set"
-            )
-            wg_kwargs["worker_nsight_options"] = OmegaConf.to_container(
-                OmegaConf.select(self.config.trainer, "worker_nsight_options")
-            )
+        if OmegaConf.select(self.config.global_profiler, "steps") is not None:
+            wg_kwargs["profile_steps"] = OmegaConf.select(self.config.global_profiler, "steps")
+            # Only require nsight worker options when tool is nsys
+            if OmegaConf.select(self.config.global_profiler, "tool") == "nsys":
+                assert (
+                    OmegaConf.select(self.config.global_profiler.global_tool_config.nsys, "worker_nsight_options")
+                    is not None
+                ), "worker_nsight_options must be set when using nsys with profile_steps"
+                wg_kwargs["worker_nsight_options"] = OmegaConf.to_container(
+                    OmegaConf.select(self.config.global_profiler.global_tool_config.nsys, "worker_nsight_options")
+                )
         wg_kwargs["device_name"] = self.device_name
 
         for resource_pool, class_dict in self.resource_pool_to_cls.items():
@@ -920,170 +175,14 @@ def _init_async_rollout_manager(self):
         # create async rollout manager and request scheduler
         self.async_rollout_mode = False
         if self.config.actor_rollout_ref.rollout.mode == "async":
-            from recipe.fully_async_policy.agent_loop.agent_loop import AgentLoopManager
+            from recipe.fully_async_policy.agent_loop.agent_loop import FullyAgentLoopManager
 
             self.async_rollout_mode = True
-            self.async_rollout_manager = AgentLoopManager(
+            self.async_rollout_manager = FullyAgentLoopManager(
                 config=self.config,
                 worker_group=self.actor_rollout_wg,
             )
 
-    def _save_checkpoint(self):
-        from verl.utils.fs import local_mkdir_safe
-
-        # path: given_path + `/global_step_{global_steps}` + `/actor`
-        local_global_step_folder = os.path.join(
-            self.config.trainer.default_local_dir, f"global_step_{self.global_steps}"
-        )
-
-        print(f"local_global_step_folder: {local_global_step_folder}")
-        actor_local_path = os.path.join(local_global_step_folder, "actor")
-
-        actor_remote_path = (
-            None
-            if self.config.trainer.default_hdfs_dir is None
-            else os.path.join(self.config.trainer.default_hdfs_dir, f"global_step_{self.global_steps}", "actor")
-        )
-
-        remove_previous_ckpt_in_save = self.config.trainer.get("remove_previous_ckpt_in_save", False)
-        if remove_previous_ckpt_in_save:
-            print(
-                "Warning: remove_previous_ckpt_in_save is deprecated,"
-                + " set max_actor_ckpt_to_keep=1 and max_critic_ckpt_to_keep=1 instead"
-            )
-        max_actor_ckpt_to_keep = (
-            self.config.trainer.get("max_actor_ckpt_to_keep", None) if not remove_previous_ckpt_in_save else 1
-        )
-        max_critic_ckpt_to_keep = (
-            self.config.trainer.get("max_critic_ckpt_to_keep", None) if not remove_previous_ckpt_in_save else 1
-        )
-
-        self.actor_rollout_wg.save_checkpoint(
-            actor_local_path, actor_remote_path, self.global_steps, max_ckpt_to_keep=max_actor_ckpt_to_keep
-        )
-
-        if self.use_critic:
-            critic_local_path = os.path.join(local_global_step_folder, "critic")
-            critic_remote_path = (
-                None
-                if self.config.trainer.default_hdfs_dir is None
-                else os.path.join(self.config.trainer.default_hdfs_dir, f"global_step_{self.global_steps}", "critic")
-            )
-            self.critic_wg.save_checkpoint(
-                critic_local_path, critic_remote_path, self.global_steps, max_ckpt_to_keep=max_critic_ckpt_to_keep
-            )
-
-        # save dataloader
-        local_mkdir_safe(local_global_step_folder)
-        dataloader_local_path = os.path.join(local_global_step_folder, "data.pt")
-        dataloader_state_dict = self.train_dataloader.state_dict()
-        torch.save(dataloader_state_dict, dataloader_local_path)
-
-        # latest checkpointed iteration tracker (for atomic usage)
-        local_latest_checkpointed_iteration = os.path.join(
-            self.config.trainer.default_local_dir, "latest_checkpointed_iteration.txt"
-        )
-        with open(local_latest_checkpointed_iteration, "w") as f:
-            f.write(str(self.global_steps))
-
-    def _load_checkpoint(self):
-        if self.config.trainer.resume_mode == "disable":
-            return 0
-
-        # load from hdfs
-        if self.config.trainer.default_hdfs_dir is not None:
-            raise NotImplementedError("load from hdfs is not implemented yet")
-        else:
-            checkpoint_folder = self.config.trainer.default_local_dir  # TODO: check path
-            if not os.path.isabs(checkpoint_folder):
-                working_dir = os.getcwd()
-                checkpoint_folder = os.path.join(working_dir, checkpoint_folder)
-            global_step_folder = find_latest_ckpt_path(checkpoint_folder)  # None if no latest
-
-        # find global_step_folder
-        if self.config.trainer.resume_mode == "auto":
-            if global_step_folder is None:
-                print("Training from scratch")
-                return 0
-        else:
-            if self.config.trainer.resume_mode == "resume_path":
-                assert isinstance(self.config.trainer.resume_from_path, str), "resume ckpt must be str type"
-                assert "global_step_" in self.config.trainer.resume_from_path, (
-                    "resume ckpt must specify the global_steps"
-                )
-                global_step_folder = self.config.trainer.resume_from_path
-                if not os.path.isabs(global_step_folder):
-                    working_dir = os.getcwd()
-                    global_step_folder = os.path.join(working_dir, global_step_folder)
-        print(f"Load from checkpoint folder: {global_step_folder}")
-        # set global step
-        self.global_steps = int(global_step_folder.split("global_step_")[-1])
-
-        print(f"Setting global step to {self.global_steps}")
-        print(f"Resuming from {global_step_folder}")
-
-        actor_path = os.path.join(global_step_folder, "actor")
-        critic_path = os.path.join(global_step_folder, "critic")
-        # load actor
-        self.actor_rollout_wg.load_checkpoint(
-            actor_path, del_local_after_load=self.config.trainer.del_local_ckpt_after_load
-        )
-        # load critic
-        if self.use_critic:
-            self.critic_wg.load_checkpoint(
-                critic_path, del_local_after_load=self.config.trainer.del_local_ckpt_after_load
-            )
-
-        # load dataloader,
-        # TODO: from remote not implemented yet
-        dataloader_local_path = os.path.join(global_step_folder, "data.pt")
-        if os.path.exists(dataloader_local_path):
-            dataloader_state_dict = torch.load(dataloader_local_path, weights_only=False)
-            self.train_dataloader.load_state_dict(dataloader_state_dict)
-        else:
-            print(f"Warning: No dataloader state found at {dataloader_local_path}, will start from scratch")
-
-    def _start_profiling(self, do_profile: bool, timing_raw) -> None:
-        """Start profiling for all worker groups if profiling is enabled."""
-        with marked_timer("start_profile", timing_raw):
-            if do_profile:
-                self.actor_rollout_wg.start_profile(role="e2e", profile_step=self.global_steps)
-                if self.use_reference_policy:
-                    self.ref_policy_wg.start_profile()
-                if self.use_critic:
-                    self.critic_wg.start_profile()
-                if self.use_rm:
-                    self.rm_wg.start_profile()
-
-    def _stop_profiling(self, do_profile: bool, timing_raw) -> None:
-        """Stop profiling for all worker groups if profiling is enabled."""
-        with marked_timer("stop_profile", timing_raw):
-            if do_profile:
-                self.actor_rollout_wg.stop_profile()
-                if self.use_reference_policy:
-                    self.ref_policy_wg.stop_profile()
-                if self.use_critic:
-                    self.critic_wg.stop_profile()
-                if self.use_rm:
-                    self.rm_wg.stop_profile()
-
-    def _balance_batch(self, batch: DataProto, metrics, logging_prefix="global_seqlen"):
-        """Reorder the data on single controller such that each dp rank gets similar total tokens"""
-        attention_mask = batch.batch["attention_mask"]
-        batch_size = attention_mask.shape[0]
-        global_seqlen_lst = batch.batch["attention_mask"].view(batch_size, -1).sum(-1).tolist()  # (train_batch_size,)
-        world_size = self.actor_rollout_wg.world_size
-        global_partition_lst = get_seqlen_balanced_partitions(
-            global_seqlen_lst, k_partitions=world_size, equal_size=True
-        )
-        # reorder based on index. The data will be automatically equally partitioned by dispatch function
-        global_idx = torch.tensor([j for partition in global_partition_lst for j in partition])
-        batch.reorder(global_idx)
-        global_balance_stats = log_seqlen_unbalance(
-            seqlen_list=global_seqlen_lst, partitions=global_partition_lst, prefix=logging_prefix
-        )
-        metrics.update(global_balance_stats)
-
     def fit(self):
         """
         The training loop of PPO.
@@ -1117,6 +216,10 @@ def fit(self):
             if self.config.trainer.get("val_only", False):
                 return
 
+        if self.config.actor_rollout_ref.rollout.get("skip_rollout", False):
+            rollout_skip = RolloutSkip(self.config, self.actor_rollout_wg)
+            rollout_skip.wrap_generate_sequences()
+
         # add tqdm
         progress_bar = tqdm(total=self.total_training_steps, initial=self.global_steps, desc="Training Progress")
 
@@ -1125,17 +228,25 @@ def fit(self):
         last_val_metrics = None
         self.max_steps_duration = 0
 
+        prev_step_profile = False
+        curr_step_profile = (
+            self.global_steps in self.config.global_profiler.steps
+            if self.config.global_profiler.steps is not None
+            else False
+        )
+        next_step_profile = False
+
         for epoch in range(self.config.trainer.total_epochs):
             for batch_dict in self.train_dataloader:
                 metrics = {}
                 timing_raw = {}
 
-                do_profile = (
-                    self.global_steps in self.config.trainer.profile_steps
-                    if self.config.trainer.profile_steps is not None
-                    else False
-                )
-                self._start_profiling(do_profile, timing_raw)
+                with marked_timer("start_profile", timing_raw):
+                    self._start_profiling(
+                        not prev_step_profile and curr_step_profile
+                        if self.config.global_profiler.profile_continuous_steps
+                        else curr_step_profile
+                    )
 
                 batch, gen_batch = self._prepare_generate_batch(batch_dict)
 
@@ -1152,6 +263,9 @@ def fit(self):
                         gen_batch_output.meta_info.pop("timing", None)
 
                     if self.config.algorithm.adv_estimator == AdvantageEstimator.REMAX:
+                        if self.reward_fn is None:
+                            raise ValueError("A reward_fn is required for REMAX advantage estimation.")
+
                         with marked_timer("gen_max", timing_raw, color="purple"):
                             gen_baseline_batch = deepcopy(gen_batch)
                             gen_baseline_batch.meta_info["do_sample"] = False
@@ -1172,10 +286,24 @@ def fit(self):
                     batch = self._post_generate_batch(batch, gen_batch_output, metrics)
                     batch, reward_extra_infos_dict = self._process_batch_common(batch, metrics, timing_raw)
                     self._log_rollout(batch, reward_extra_infos_dict, timing_raw)
-                    last_val_metrics = self._validate_metrics(is_last_step, last_val_metrics, metrics, timing_raw)
-                    self._check_save_checkpoint(is_last_step, timing_raw)
 
-                self._stop_profiling(do_profile, timing_raw)
+                last_val_metrics = self._validate_metrics(is_last_step, last_val_metrics, metrics, timing_raw)
+                self._check_save_checkpoint(is_last_step, timing_raw)
+
+                with marked_timer("stop_profile", timing_raw):
+                    next_step_profile = (
+                        self.global_steps + 1 in self.config.global_profiler.steps
+                        if self.config.global_profiler.steps is not None
+                        else False
+                    )
+                    self._stop_profiling(
+                        curr_step_profile and not next_step_profile
+                        if self.config.global_profiler.profile_continuous_steps
+                        else curr_step_profile
+                    )
+                    prev_step_profile = curr_step_profile
+                    curr_step_profile = next_step_profile
+
                 self._collect_metrics(batch, epoch, metrics, timing_raw)
                 self._post_batch_processing(batch)
 
@@ -1185,6 +313,14 @@ def fit(self):
                 progress_bar.update(1)
                 self.global_steps += 1
 
+                if (
+                    hasattr(self.config.actor_rollout_ref.actor, "profiler")
+                    and self.config.actor_rollout_ref.actor.profiler.tool == "torch_memory"
+                ):
+                    self.actor_rollout_wg.dump_memory_snapshot(
+                        tag=f"post_update_step{self.global_steps}", sub_dir=f"step{self.global_steps}"
+                    )
+
                 if is_last_step:
                     pprint(f"Final validation metrics: {last_val_metrics}")
                     progress_bar.close()
@@ -1192,35 +328,22 @@ def fit(self):
 
     def _prepare_generate_batch(self, batch_dict):
         batch: DataProto = DataProto.from_single_dict(batch_dict)
-        # pop those keys for generation
-        batch_keys_to_pop = ["input_ids", "attention_mask", "position_ids"]
-        non_tensor_batch_keys_to_pop = ["raw_prompt_ids"]
-        if "multi_modal_data" in batch.non_tensor_batch:
-            non_tensor_batch_keys_to_pop.append("multi_modal_data")
-        if "raw_prompt" in batch.non_tensor_batch:
-            non_tensor_batch_keys_to_pop.append("raw_prompt")
-        if "tools_kwargs" in batch.non_tensor_batch:
-            non_tensor_batch_keys_to_pop.append("tools_kwargs")
-        if "interaction_kwargs" in batch.non_tensor_batch:
-            non_tensor_batch_keys_to_pop.append("interaction_kwargs")
-        if "index" in batch.non_tensor_batch:
-            non_tensor_batch_keys_to_pop.append("index")
-        if "agent_name" in batch.non_tensor_batch:
-            non_tensor_batch_keys_to_pop.append("agent_name")
-        gen_batch = batch.pop(
-            batch_keys=batch_keys_to_pop,
-            non_tensor_batch_keys=non_tensor_batch_keys_to_pop,
-        )
+
+        # add uid to batch
+        batch.non_tensor_batch["uid"] = np.array([str(uuid.uuid4()) for _ in range(len(batch.batch))], dtype=object)
+
+        gen_batch = self._get_gen_batch(batch)
+
         # pass global_steps to trace
         gen_batch.meta_info["global_steps"] = self.global_steps
         gen_batch = gen_batch.repeat(repeat_times=self.config.actor_rollout_ref.rollout.n, interleave=True)
         return batch, gen_batch
 
     def _post_generate_batch(self, batch, gen_batch_output, metrics):
-        batch.non_tensor_batch["uid"] = np.array([str(uuid.uuid4()) for _ in range(len(batch.batch))], dtype=object)
         # repeat to align with repeated responses in rollout
         batch = batch.repeat(repeat_times=self.config.actor_rollout_ref.rollout.n, interleave=True)
         batch = batch.union(gen_batch_output)
+
         if "response_mask" not in batch.batch.keys():
             batch.batch["response_mask"] = compute_response_mask(batch)
         # Balance the number of valid tokens across DP ranks.
@@ -1230,8 +353,10 @@ def _post_generate_batch(self, batch, gen_batch_output, metrics):
         # TODO: Decouple the DP balancing and mini-batching.
         if self.config.trainer.balance_batch:
             self._balance_batch(batch, metrics=metrics)
+
         # compute global_valid tokens
         batch.meta_info["global_token_num"] = torch.sum(batch.batch["attention_mask"], dim=-1).tolist()
+
         return batch
 
     def _process_batch_common(self, batch, metrics, timing_raw):
@@ -1245,6 +370,7 @@ def _process_batch_common(self, batch, metrics, timing_raw):
                 future_reward = compute_reward_async.remote(data=batch, reward_fn=self.reward_fn)
             else:
                 reward_tensor, reward_extra_infos_dict = compute_reward(batch, self.reward_fn)
+
         # recompute old_log_probs
         with marked_timer("old_log_prob", timing_raw, color="blue"):
             async_training = self.config.get("async_training", None)
@@ -1265,27 +391,9 @@ def _process_batch_common(self, batch, metrics, timing_raw):
 
                 if "rollout_log_probs" in batch.batch.keys():
                     # TODO: we may want to add diff of probs too.
-                    rollout_old_log_probs = batch.batch["rollout_log_probs"]
-                    actor_old_log_probs = batch.batch["old_log_probs"]
-                    attention_mask = batch.batch["attention_mask"]
-                    responses = batch.batch["responses"]
-                    response_length = responses.size(1)
-                    response_mask = attention_mask[:, -response_length:]
-
-                    rollout_probs = torch.exp(rollout_old_log_probs)
-                    actor_probs = torch.exp(actor_old_log_probs)
-                    rollout_probs_diff = torch.abs(rollout_probs - actor_probs)
-                    rollout_probs_diff = torch.masked_select(rollout_probs_diff, response_mask.bool())
-                    rollout_probs_diff_max = torch.max(rollout_probs_diff)
-                    rollout_probs_diff_mean = torch.mean(rollout_probs_diff)
-                    rollout_probs_diff_std = torch.std(rollout_probs_diff)
-                    metrics.update(
-                        {
-                            "training/rollout_probs_diff_max": rollout_probs_diff_max.detach().item(),
-                            "training/rollout_probs_diff_mean": rollout_probs_diff_mean.detach().item(),
-                            "training/rollout_probs_diff_std": rollout_probs_diff_std.detach().item(),
-                        }
-                    )
+                    from verl.utils.debug.metrics import calculate_debug_metrics
+
+                    metrics.update(calculate_debug_metrics(batch))
 
         if self.use_reference_policy:
             # compute reference log_prob
@@ -1295,11 +403,13 @@ def _process_batch_common(self, batch, metrics, timing_raw):
                 else:
                     ref_log_prob = self.actor_rollout_wg.compute_ref_log_prob(batch)
                 batch = batch.union(ref_log_prob)
+
         # compute values
         if self.use_critic:
             with marked_timer("values", timing_raw, color="cyan"):
                 values = self.critic_wg.compute_values(batch)
                 batch = batch.union(values)
+
         with marked_timer("adv", timing_raw, color="brown"):
             # we combine with rule-based rm
             reward_extra_infos_dict: dict[str, list]
@@ -1334,12 +444,14 @@ def _process_batch_common(self, batch, metrics, timing_raw):
                 norm_adv_by_std_in_grpo=norm_adv_by_std_in_grpo,
                 config=self.config.algorithm,
             )
+
         # update critic
         if self.use_critic:
             with marked_timer("update_critic", timing_raw, color="pink"):
                 critic_output = self.critic_wg.update_critic(batch)
             critic_output_metrics = reduce_metrics(critic_output.meta_info["metrics"])
             metrics.update(critic_output_metrics)
+
         # implement critic warmup
         if self.config.trainer.critic_warmup <= self.global_steps:
             # update actor
@@ -1351,21 +463,25 @@ def _process_batch_common(self, batch, metrics, timing_raw):
         return batch, reward_extra_infos_dict
 
     def _log_rollout(self, batch, reward_extra_infos_dict, timing_raw):
-        """Log rollout generations if enabled"""
+        # Log rollout generations if enabled
         rollout_data_dir = self.config.trainer.get("rollout_data_dir", None)
         if rollout_data_dir:
             with marked_timer("dump_rollout_generations", timing_raw, color="green"):
                 inputs = self.tokenizer.batch_decode(batch.batch["prompts"], skip_special_tokens=True)
                 outputs = self.tokenizer.batch_decode(batch.batch["responses"], skip_special_tokens=True)
                 scores = batch.batch["token_level_scores"].sum(-1).cpu().tolist()
+                sample_gts = [item.non_tensor_batch.get("reward_model", {}).get("ground_truth", None) for item in batch]
+
                 if "request_id" in batch.non_tensor_batch:
                     reward_extra_infos_dict.setdefault(
                         "request_id",
                         batch.non_tensor_batch["request_id"].tolist(),
                     )
+
                 self._dump_generations(
                     inputs=inputs,
                     outputs=outputs,
+                    gts=sample_gts,
                     scores=scores,
                     reward_extra_infos_dict=reward_extra_infos_dict,
                     dump_path=rollout_data_dir,
@@ -1382,7 +498,7 @@ def _validate_metrics(self, is_last_step, last_val_metrics, metrics, timing_raw)
                 if is_last_step:
                     last_val_metrics = val_metrics
             metrics.update(val_metrics)
-        return last_val_metrics
+            return last_val_metrics
 
     def _check_save_checkpoint(self, is_last_step, timing_raw):
         # Check if the ESI (Elastic Server Instance)/training plan is close to expiration.
@@ -1408,6 +524,7 @@ def _check_save_checkpoint(self, is_last_step, timing_raw):
     def _collect_metrics(self, batch, epoch, metrics, timing_raw):
         steps_duration = timing_raw["step"]
         self.max_steps_duration = max(self.max_steps_duration, steps_duration)
+
         # training metrics
         metrics.update(
             {
diff --git a/verl/experimental/agent_loop/__init__.py b/verl/experimental/agent_loop/__init__.py
index 88d61ee4169..c9c1f5bd77d 100644
--- a/verl/experimental/agent_loop/__init__.py
+++ b/verl/experimental/agent_loop/__init__.py
@@ -12,10 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from .agent_loop import AgentLoopBase, AgentLoopManager, AsyncLLMServerManager
+# from .agent_loop import AgentLoopBase, AgentLoopManager, AsyncLLMServerManager, AgentLoopWorker
 from .single_turn_agent_loop import SingleTurnAgentLoop
 from .tool_agent_loop import ToolAgentLoop
 
 _ = [SingleTurnAgentLoop, ToolAgentLoop]
 
-__all__ = ["AgentLoopBase", "AgentLoopManager", "AsyncLLMServerManager"]
+# __all__ = ["AgentLoopBase", "AgentLoopManager", "AsyncLLMServerManager", "AgentLoopWorker"]
diff --git a/verl/experimental/agent_loop/agent_loop.py b/verl/experimental/agent_loop/agent_loop.py
index 1c8d0eac928..f520f89472a 100644
--- a/verl/experimental/agent_loop/agent_loop.py
+++ b/verl/experimental/agent_loop/agent_loop.py
@@ -179,12 +179,12 @@ class AgentLoopBase(ABC):
     _class_initialized = False
 
     def __init__(
-        self,
-        trainer_config: _DummyConfig,
-        server_manager: AsyncLLMServerManager,
-        tokenizer: AutoTokenizer,
-        processor: AutoProcessor,
-        **kwargs,
+            self,
+            trainer_config: _DummyConfig,
+            server_manager: AsyncLLMServerManager,
+            tokenizer: AutoTokenizer,
+            processor: AutoProcessor,
+            **kwargs,
     ):
         """Initialize agent loop, each sample will have its own loop instance.
 
@@ -329,8 +329,8 @@ def __init__(self, config: DictConfig, local_path: str, rm_executor: BatchExecut
         self.rm_executor = rm_executor
 
     def compute_score(
-        self,
-        data: DataProto,
+            self,
+            data: DataProto,
     ) -> dict:
         """Compute reward score for agent loop output.
 
@@ -355,7 +355,7 @@ class AgentLoopWorker:
     """Agent loop worker takes a batch of messages and run each message in an agent loop."""
 
     def __init__(
-        self, config: DictConfig, server_handles: list[ray.actor.ActorHandle], rm_executor: BatchExecutor = None
+            self, config: DictConfig, server_handles: list[ray.actor.ActorHandle], rm_executor: BatchExecutor = None
     ):
         """Initialize agent loop manager.
 
@@ -364,7 +364,11 @@ def __init__(
             server_handles (List[ray.actor.ActorHandle]): OpenAI compatible LLM server actor handles.
         """
         self.config = config
-        self.server_manager = AsyncLLMServerManager(config, server_handles)
+
+        if self.AsyncLLMServerManager == None:
+            self.AsyncLLMServerManager = AsyncLLMServerManager
+
+        self.server_manager = self.AsyncLLMServerManager(config, server_handles)
         self.rm_executor = rm_executor
 
         model_path = config.actor_rollout_ref.model.path
@@ -455,19 +459,19 @@ async def generate_sequences(self, batch: DataProto) -> DataProto:
         return output
 
     async def _run_agent_loop(
-        self,
-        sampling_params: dict[str, Any],
-        trajectory: dict[str, Any],
-        *,
-        agent_name: str,
-        **kwargs,
+            self,
+            sampling_params: dict[str, Any],
+            trajectory: dict[str, Any],
+            *,
+            agent_name: str,
+            **kwargs,
     ) -> _InternalAgentLoopOutput:
         with rollout_trace_attr(
-            step=trajectory["step"],
-            sample_index=trajectory["sample_index"],
-            rollout_n=trajectory["rollout_n"],
-            validate=trajectory["validate"],
-            name="agent_loop",
+                step=trajectory["step"],
+                sample_index=trajectory["sample_index"],
+                rollout_n=trajectory["rollout_n"],
+                validate=trajectory["validate"],
+                name="agent_loop",
         ):
             assert agent_name in _agent_loop_registry, (
                 f"Agent loop {agent_name} not registered, registered agent loops: {_agent_loop_registry.keys()}"
@@ -550,8 +554,8 @@ async def _run_agent_loop(
             # TODO: support other multi-modal inputs
             multi_modal_inputs = None
             if (
-                self.processor is not None
-                and "Qwen2VLImageProcessor" in self.processor.image_processor.__class__.__name__
+                    self.processor is not None
+                    and "Qwen2VLImageProcessor" in self.processor.image_processor.__class__.__name__
             ):
                 from verl.models.transformers.qwen2_vl import get_rope_index
 
@@ -580,8 +584,8 @@ async def _run_agent_loop(
             else:
                 position_ids = compute_position_id_with_mask(attention_mask)  # (1, seq_len)
             enable_async_reward = (
-                self.rm_executor is not None and self.config.reward_model.enable_resource_pool
-            ) or not self.config.reward_model.enable
+                                          self.rm_executor is not None and self.config.reward_model.enable_resource_pool
+                                  ) or not self.config.reward_model.enable
             if output.reward_score is None and enable_async_reward:
                 batch = TensorDict(
                     {
@@ -751,6 +755,9 @@ def batch_fn(data_list: list[DataProto]) -> list[torch.Tensor]:
         if self.config.actor_rollout_ref.rollout.free_cache_engine:
             self.sleep()
 
+        # for recipe to change AgentLoopWorker
+        self.AgentLoopWorker = AgentLoopWorker
+
     def _initialize_llm_servers(self):
         rollout_world_size = self.config.actor_rollout_ref.rollout.tensor_model_parallel_size
         world_size = (
@@ -783,7 +790,7 @@ def _init_agent_loop_workers(self):
             # Round-robin scheduling over the all nodes
             node_id = node_ids[i % len(node_ids)]
             self.agent_loop_workers.append(
-                AgentLoopWorker.options(
+                self.AgentLoopWorker.options(
                     name=f"agent_loop_worker_{i}",
                     scheduling_strategy=ray.util.scheduling_strategies.NodeAffinitySchedulingStrategy(
                         node_id=node_id, soft=True
diff --git a/verl/trainer/main_ppo.py b/verl/trainer/main_ppo.py
index 0a8e1f3d27b..e33d346e482 100644
--- a/verl/trainer/main_ppo.py
+++ b/verl/trainer/main_ppo.py
@@ -43,13 +43,14 @@ def main(config):
 
 
 # Define a function to run the PPO-like training process
-def run_ppo(config) -> None:
+def run_ppo(config, task_runner_class=None) -> None:
     """Initialize Ray cluster and run distributed PPO training process.
 
     Args:
         config: Training configuration object containing all necessary parameters
                 for distributed PPO training including Ray initialization settings,
                 model paths, and training hyperparameters.
+        task_runner_class: For recipe to change TaskRunner.
     """
     # Check if Ray is not initialized
     if not ray.is_initialized():
@@ -65,6 +66,9 @@ def run_ppo(config) -> None:
         print(f"ray init kwargs: {ray_init_kwargs}")
         ray.init(**OmegaConf.to_container(ray_init_kwargs))
 
+    if task_runner_class is None:
+        task_runner_class = TaskRunner
+
     # Create a remote instance of the TaskRunner class, and
     # Execute the `run` method of the TaskRunner instance remotely and wait for it to complete
     if (
@@ -79,9 +83,9 @@ def run_ppo(config) -> None:
         nsight_options = OmegaConf.to_container(
             config.global_profiler.global_tool_config.nsys.controller_nsight_options
         )
-        runner = TaskRunner.options(runtime_env={"nsight": nsight_options}).remote()
+        runner = task_runner_class.options(runtime_env={"nsight": nsight_options}).remote()
     else:
-        runner = TaskRunner.remote()
+        runner = task_runner_class.remote()
     ray.get(runner.run.remote(config))
 
     # [Optional] get the path of the timeline trace file from the configuration, default to None
diff --git a/verl/trainer/ppo/ray_trainer.py b/verl/trainer/ppo/ray_trainer.py
index bb945c0451f..d1c58a67e27 100644
--- a/verl/trainer/ppo/ray_trainer.py
+++ b/verl/trainer/ppo/ray_trainer.py
@@ -642,9 +642,9 @@ def init_workers(self):
             actor_rollout_cls = RayClassWithInitArgs(
                 cls=self.role_worker_mapping[Role.ActorRollout],
                 config=self.config.actor_rollout_ref,
-                role="actor_rollout",
+                role=str(Role.ActorRollout),
             )
-            self.resource_pool_to_cls[resource_pool]["actor_rollout"] = actor_rollout_cls
+            self.resource_pool_to_cls[resource_pool][str(Role.ActorRollout)] = actor_rollout_cls
         else:
             raise NotImplementedError
 
@@ -653,7 +653,7 @@ def init_workers(self):
             resource_pool = self.resource_pool_manager.get_resource_pool(Role.Critic)
             critic_cfg = omega_conf_to_dataclass(self.config.critic)
             critic_cls = RayClassWithInitArgs(cls=self.role_worker_mapping[Role.Critic], config=critic_cfg)
-            self.resource_pool_to_cls[resource_pool]["critic"] = critic_cls
+            self.resource_pool_to_cls[resource_pool][str(Role.Critic)] = critic_cls
 
         # create reference policy if needed
         if self.use_reference_policy:
@@ -661,16 +661,16 @@ def init_workers(self):
             ref_policy_cls = RayClassWithInitArgs(
                 self.role_worker_mapping[Role.RefPolicy],
                 config=self.config.actor_rollout_ref,
-                role="ref",
+                role=str(Role.RefPolicy),
             )
-            self.resource_pool_to_cls[resource_pool]["ref"] = ref_policy_cls
+            self.resource_pool_to_cls[resource_pool][str(Role.RefPolicy)] = ref_policy_cls
 
         # create a reward model if reward_fn is None
         if self.use_rm:
             # we create a RM here
             resource_pool = self.resource_pool_manager.get_resource_pool(Role.RewardModel)
             rm_cls = RayClassWithInitArgs(self.role_worker_mapping[Role.RewardModel], config=self.config.reward_model)
-            self.resource_pool_to_cls[resource_pool]["rm"] = rm_cls
+            self.resource_pool_to_cls[resource_pool][str(Role.RewardModel)] = rm_cls
 
         # initialize WorkerGroup
         # NOTE: if you want to use a different resource pool for each role, which can support different parallel size,
@@ -705,20 +705,20 @@ def init_workers(self):
             all_wg.update(spawn_wg)
 
         if self.use_critic:
-            self.critic_wg = all_wg["critic"]
+            self.critic_wg = all_wg[str(Role.Critic)]
             self.critic_wg.init_model()
 
         if self.use_reference_policy and not self.ref_in_actor:
-            self.ref_policy_wg = all_wg["ref"]
+            self.ref_policy_wg = all_wg[str(Role.RefPolicy)]
             self.ref_policy_wg.init_model()
 
         self.rm_wg = None
         if self.use_rm:
-            self.rm_wg = all_wg["rm"]
+            self.rm_wg = all_wg[str(Role.RewardModel)]
             self.rm_wg.init_model()
 
         # we should create rollout at the end so that vllm can have a better estimation of kv cache memory
-        self.actor_rollout_wg = all_wg["actor_rollout"]
+        self.actor_rollout_wg = all_wg[str(Role.ActorRollout)]
         self.actor_rollout_wg.init_model()
 
         # create async rollout manager and request scheduler
@@ -766,11 +766,13 @@ def _save_checkpoint(self):
         )
 
         if self.use_critic:
-            critic_local_path = os.path.join(local_global_step_folder, "critic")
+            critic_local_path = os.path.join(local_global_step_folder, str(Role.Critic))
             critic_remote_path = (
                 None
                 if self.config.trainer.default_hdfs_dir is None
-                else os.path.join(self.config.trainer.default_hdfs_dir, f"global_step_{self.global_steps}", "critic")
+                else os.path.join(
+                    self.config.trainer.default_hdfs_dir, f"global_step_{self.global_steps}", str(Role.Critic)
+                )
             )
             self.critic_wg.save_checkpoint(
                 critic_local_path, critic_remote_path, self.global_steps, max_ckpt_to_keep=max_critic_ckpt_to_keep
@@ -826,7 +828,7 @@ def _load_checkpoint(self):
         print(f"Resuming from {global_step_folder}")
 
         actor_path = os.path.join(global_step_folder, "actor")
-        critic_path = os.path.join(global_step_folder, "critic")
+        critic_path = os.path.join(global_step_folder, str(Role.Critic))
         # load actor
         self.actor_rollout_wg.load_checkpoint(
             actor_path, del_local_after_load=self.config.trainer.del_local_ckpt_after_load
@@ -1044,7 +1046,7 @@ def fit(self):
 
                     if self.use_reference_policy:
                         # compute reference log_prob
-                        with marked_timer("ref", timing_raw, color="olive"):
+                        with marked_timer(str(Role.RefPolicy), timing_raw, color="olive"):
                             if not self.ref_in_actor:
                                 ref_log_prob = self.ref_policy_wg.compute_ref_log_prob(batch)
                             else:
diff --git a/verl/trainer/ppo/utils.py b/verl/trainer/ppo/utils.py
index 22d00a45052..31e886fd6f2 100644
--- a/verl/trainer/ppo/utils.py
+++ b/verl/trainer/ppo/utils.py
@@ -36,6 +36,37 @@ class Role(Enum):
     RewardModel = 5
     ActorRolloutRef = 6
 
+    def __str__(self):
+        return self._get_role_string()
+
+    def _get_role_string(self):
+        role_mapping = {
+            Role.Actor: "actor",
+            Role.Rollout: "rollout",
+            Role.ActorRollout: "actor_rollout",
+            Role.Critic: "critic",
+            Role.RefPolicy: "ref",
+            Role.RewardModel: "rm",
+            Role.ActorRolloutRef: "actor_rollout_ref",
+        }
+        return role_mapping.get(self, self.name.lower())
+
+    @classmethod
+    def from_string(cls, name: str):
+        string_mapping = {
+            "actor": cls.Actor,
+            "rollout": cls.Rollout,
+            "actor_rollout": cls.ActorRollout,
+            "critic": cls.Critic,
+            "ref": cls.RefPolicy,
+            "rm": cls.RewardModel,
+            "actor_rollout_ref": cls.ActorRolloutRef,
+        }
+        role = string_mapping.get(name.lower())
+        if role is None:
+            raise ValueError(f"No Role found for string: {name}")
+        return role
+
 
 def need_reference_policy(
     role_worker_mapping: dict[Role, WorkerType],

From 073e40f0848ddbf42ca6f469f6c81f7d323f29b5 Mon Sep 17 00:00:00 2001
From: wangshulin02 <953550366@qq.com>
Date: Wed, 17 Sep 2025 10:49:53 +0800
Subject: [PATCH 141/182] cleaned up the fully_async metric, fix
 processing_time, add partial metric, add stale_trajectory_processed

---
 .../agent_loop/agent_loop.py                  | 18 +++++--
 .../partial_single_turn_agent_loop.py         | 17 ++++--
 recipe/fully_async_policy/detach_utils.py     | 45 ++++++++++------
 .../fully_async_rollouter.py                  | 53 ++++++++++---------
 .../fully_async_policy/fully_async_trainer.py | 43 ++++++++-------
 5 files changed, 111 insertions(+), 65 deletions(-)

diff --git a/recipe/fully_async_policy/agent_loop/agent_loop.py b/recipe/fully_async_policy/agent_loop/agent_loop.py
index 4f4496c8999..b6433c0acd7 100644
--- a/recipe/fully_async_policy/agent_loop/agent_loop.py
+++ b/recipe/fully_async_policy/agent_loop/agent_loop.py
@@ -137,6 +137,10 @@ class AgentLoopOutput(BaseModel):
     """Indicates whether the request was interrupted"""
     log_probs: list[float] = None
     """Response token log probs including LLM generated token, tool response token."""
+    param_version_start: int = 0
+    """Indicate start parameter version when this response is generated"""
+    param_version_end: int = 0
+    """Indicate end parameter version when this response is generated, used for partial rollout"""
 
 
 # make hydra.utils.instantiate happy
@@ -381,7 +385,7 @@ async def generate_sequences(self, batch: DataProto) -> DataProto:
         return output
 
     async def generate_sequences_no_post(
-        self, batch: DataProto, partial_output_list: Optional[list[AgentLoopOutput]]
+        self, batch: DataProto, param_version: int, partial_output_list: Optional[list[AgentLoopOutput]]
     ) -> list[AgentLoopOutput]:
         """Generate sequences from agent loop.
 
@@ -433,7 +437,9 @@ async def generate_sequences_no_post(
         ):
             tasks.append(
                 asyncio.create_task(
-                    self._run_agent_loop(agent_name, messages.tolist(), sampling_params, trajectory, partial_output)
+                    self._run_agent_loop(
+                        agent_name, messages.tolist(), sampling_params, trajectory, param_version, partial_output
+                    )
                 )
             )
         outputs = await asyncio.gather(*tasks)
@@ -446,6 +452,7 @@ async def _run_agent_loop(
         messages: list[dict[str, Any]],
         sampling_params: dict[str, Any],
         trajectory: dict[str, Any],
+        param_version: Optional[int] = None,
         partial_output: Optional[AgentLoopOutput] = None,
     ) -> AgentLoopOutput:
         with rollout_trace_attr(
@@ -466,7 +473,7 @@ async def _run_agent_loop(
                 tokenizer=self.tokenizer,
             )
             if agent_name == "partial_single_turn_agent":
-                output = await agent_loop.run(messages, sampling_params, partial_output)
+                output = await agent_loop.run(messages, sampling_params, param_version, partial_output)
             else:
                 output = await agent_loop.run(messages, sampling_params)
             return output
@@ -602,6 +609,7 @@ def generate_sequences(self, prompts: DataProto) -> DataProto:
     async def generate_single_sample_async(
         self,
         sample: DataProto,
+        param_version: int,
         partial_output_list: Optional[list[AgentLoopOutput]],
     ) -> list[AgentLoopOutput]:
         """
@@ -617,7 +625,7 @@ async def generate_single_sample_async(
         # 使用负载均衡选择 worker
         worker = self._select_best_worker()
         # 异步处理单个样本 - 使用无后处理版本获取原始AgentLoopOutput
-        output_future = worker.generate_sequences_no_post.remote(sample, partial_output_list)
+        output_future = worker.generate_sequences_no_post.remote(sample, param_version, partial_output_list)
         return await asyncio.wrap_future(output_future.future())
 
     def _select_best_worker(self):
@@ -665,7 +673,7 @@ async def cancel_async(self):
         await asyncio.gather(*[asyncio.wrap_future(future.future()) for future in futures], return_exceptions=True)
 
     async def resume_async(self):
-        """Cancel all rollout tasks asynchronously."""
+        """Resume all rollout tasks asynchronously."""
         futures = [server.resume.remote() for server in self.async_llm_servers]
         await asyncio.gather(*[asyncio.wrap_future(future.future()) for future in futures], return_exceptions=True)
 
diff --git a/recipe/fully_async_policy/agent_loop/partial_single_turn_agent_loop.py b/recipe/fully_async_policy/agent_loop/partial_single_turn_agent_loop.py
index cf95c1eb965..c97f794bb9c 100644
--- a/recipe/fully_async_policy/agent_loop/partial_single_turn_agent_loop.py
+++ b/recipe/fully_async_policy/agent_loop/partial_single_turn_agent_loop.py
@@ -33,21 +33,30 @@ def __init__(self, *args, **kwargs):
         self.response_length = self.config.actor_rollout_ref.rollout.response_length
 
     async def run(
-        self, messages: list[dict[str, Any]], sampling_params: dict[str, Any], output: Optional[AgentLoopOutput]
+        self,
+        messages: list[dict[str, Any]],
+        sampling_params: dict[str, Any],
+        param_version: int,
+        output: Optional[AgentLoopOutput],
     ) -> AgentLoopOutput:
+        metrics = {}
+        param_version_start = None
+        param_version_end = None
         if not output:
             prompt_ids = await self.loop.run_in_executor(
                 None, lambda: self.tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=True)
             )
+            param_version_start = param_version
         else:
             if output.is_cancel:
                 # 恢复暂停的样本，结果直接添加到 prompt_ids 后面
                 prompt_ids = output.prompt_ids + output.response_ids
+                metrics["generate_sequences"] = output.metrics.generate_sequences
+                param_version_start = output.param_version_start
             else:
                 # 同一批样本，部分cancel，部分没有cancel， 没有cancel的样本直接返回
                 return output
-
-        metrics = {}
+        param_version_end = param_version
         request_id = uuid4().hex
         with simple_timer("generate_sequences", metrics):
             response_ids, log_probs, is_cancel = await self.server_manager.generate_for_partial(
@@ -71,4 +80,6 @@ async def run(
             metrics=metrics,
             is_cancel=is_cancel,
             log_probs=log_probs,
+            param_version_start=param_version_start,
+            param_version_end=param_version_end,
         )
diff --git a/recipe/fully_async_policy/detach_utils.py b/recipe/fully_async_policy/detach_utils.py
index ad12ef69057..fe6fb8cdc69 100644
--- a/recipe/fully_async_policy/detach_utils.py
+++ b/recipe/fully_async_policy/detach_utils.py
@@ -46,6 +46,8 @@ class RolloutSample:
     # Processing metadata
     processing_times: list[float]
     param_version: int
+    param_version_start: list[int]
+    param_version_end: list[int]
     rollout_status: dict[str, Any]
 
 
@@ -149,7 +151,8 @@ def merge_rollout_sample(config, tokenizer, rs: RolloutSample):
     rs.processing_times = []
     for agent_loop in rs.agent_loop_output_list:
         rs.processing_times.append(agent_loop.metrics.generate_sequences)
-
+    rs.param_version_start = [agent_loop.param_version_start for agent_loop in rs.agent_loop_output_list]
+    rs.param_version_end = [agent_loop.param_version_end for agent_loop in rs.agent_loop_output_list]
     # 第四步，清空 agent_loop_output_list
     rs.agent_loop_output_list = []
 
@@ -206,24 +209,34 @@ def assemble_batch_from_rollout_samples(
 
     # 收集统计信息和元数据（直接从 RolloutSample 中获取）
     param_versions = [rs.param_version for rs in rollout_samples]
+    trajectorys_param_versions = [version for rs in rollout_samples for version in rs.param_version_end]
 
     processing_time_stats = {
-        "avg_processing_time": np.mean(processing_times),
-        "max_processing_time": np.max(processing_times),
-        "min_processing_time": np.min(processing_times),
-        "tp50_processing_time": np.percentile(processing_times, 50),  # 中位数
-        "tp99_processing_time": np.percentile(processing_times, 99),  # 99百分位
-        "tp95_processing_time": np.percentile(processing_times, 95),  # 95百分位也很有用
+        "processing_time/avg": np.mean(processing_times),
+        "processing_time/max": np.max(processing_times),
+        "processing_time/min": np.min(processing_times),
+        "processing_time/tp50": np.percentile(processing_times, 50),  # 中位数
+        "processing_time/tp99": np.percentile(processing_times, 99),  # 99百分位
+        "processing_time/tp95": np.percentile(processing_times, 95),  # 95百分位也很有用
     }
     processing_time_stats = {f"fully_async/{key}": value for key, value in processing_time_stats.items()}
 
+    param_version_diff = [abs(a - b) for a, b in zip(rs.param_version_end, rs.param_version_start)]
+    num_diff0 = param_version_diff.count(0)
+    partial_stats = {
+        "fully_async/partial/total_partial_num": len(param_version_diff) - num_diff0,
+        "fully_async/partial/partial_ratio": (len(param_version_diff) - num_diff0) / len(param_version_diff),
+        "fully_async/partial/max_partial_span": max(param_version_diff),
+    }
     # 创建 meta_info
     final_batch.meta_info.update(
         {
             "rollout_param_versions": param_versions,
             "param_version_diversity": len(set(param_versions)) if param_versions else 0,
+            "trajectory_param_versions": trajectorys_param_versions,
             **processing_time_stats,
             **rollout_status,
+            **partial_stats,
         }
     )
 
@@ -255,6 +268,14 @@ def _init_aggregation_rules(self) -> dict[str, dict[str, list[str]]]:
         return {
             # Time-Based metrics, can add metrics here
             "time_sum": ["perf/time_per_step"],
+            "last": [
+                "fully_async/count/total_generated_samples",
+                "fully_async/count/stale_samples_processed",
+                "fully_async/count/stale_trajectory_processed"
+                "fully_async/count/current_param_version",
+                "fully_async/count/dropped_stale_samples",
+                "training/global_step",  # TODO 改为total_step
+            ],
         }
 
     def add_step_metrics(self, metrics: dict[str, Any], sample_count: int, timestamp: float = None):
@@ -293,12 +314,6 @@ def _get_aggregation_type(self, metric_name: str) -> str:
         if any(keyword in metric_lower for keyword in ["weighted_avg"]):
             return "weighted_avg"
 
-        import warnings
-
-        warnings.warn(
-            f"No aggregation rule is matched in init_aggregation_rules. \
-                      For metric {metric_name}, the 'avg' method is used"
-        )
         return "avg"
 
     def _aggregate_single_metric(self, metric_name: str, values: list[float]) -> float:
@@ -372,10 +387,10 @@ def _special_metrics_aggergate(self, aggregated: dict[str, Any]) -> dict[str, An
             aggregated["perf/throughput"] = aggregated["perf/total_num_tokens"] / (
                 aggregated["perf/time_per_step"] * self.total_gpus
             )
-        
+
         # trainer/idle_ratio
         if "timing_s/gen" in aggregated.keys() and "timing_s/step" in aggregated.keys():
-           aggregated["trainer/idle_ratio"] = aggregated["timing_s/gen"] / aggregated["timing_s/step"]
+            aggregated["trainer/idle_ratio"] = aggregated["timing_s/gen"] / aggregated["timing_s/step"]
 
         return aggregated
 
diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py
index 8fbed0f0b65..1027e228c18 100644
--- a/recipe/fully_async_policy/fully_async_rollouter.py
+++ b/recipe/fully_async_policy/fully_async_rollouter.py
@@ -203,13 +203,13 @@ async def update_param_version(self, version: int, validate: bool = False, globa
             timing_raw = {}
             idle_ratio = None
             if self.idle_start_time is not None and self.version_start_time is not None:
-               rollout_active_time = self.idle_start_time - self.version_start_time
-               rollout_version_time = time.time() - self.version_start_time
-               idle_ratio = 1 - rollout_active_time / rollout_version_time
-               timing_raw["rollouter/active_time"] = rollout_active_time
-               timing_raw["rollouter/version_time"] = rollout_version_time
-               timing_raw["rollouter/idle_ratio"] = idle_ratio
-               self.idle_start_time = None
+                rollout_active_time = self.idle_start_time - self.version_start_time
+                rollout_version_time = time.time() - self.version_start_time
+                idle_ratio = 1 - rollout_active_time / rollout_version_time
+                timing_raw["rollouter/active_time"] = rollout_active_time
+                timing_raw["rollouter/version_time"] = rollout_version_time
+                timing_raw["rollouter/idle_ratio"] = idle_ratio
+                self.idle_start_time = None
             print(
                 f"[FullyAsyncRollouter][Public][update_param_version] "
                 f"Parameter version updated from {old_version} to {version} "
@@ -293,6 +293,8 @@ async def _feed_samples(self):
                 sample_id=sample_id,
                 epoch=epoch,
                 param_version=0,  # 待处理后填充
+                param_version_start=[],
+                param_version_end=[],
                 processing_times=[],
                 rollout_status={},
             )
@@ -391,12 +393,10 @@ async def _process_single_sample_streaming(self, rollout_sample: RolloutSample):
         """流式处理单个样本"""
         # 调用异步生成方法
         agent_loop_output_list = await self.async_rollout_manager.generate_single_sample_async(
-            rollout_sample.full_batch, rollout_sample.agent_loop_output_list
+            rollout_sample.full_batch, self.current_param_version, rollout_sample.agent_loop_output_list
         )
         # 直接更新 RolloutSample 对象，填充剩余字段
         rollout_sample.agent_loop_output_list = agent_loop_output_list
-        rollout_sample.param_version = self.current_param_version
-        rollout_sample.rollout_status = await self.get_statistics()
 
         is_cancel = False
         # 收集所有信息
@@ -418,6 +418,8 @@ async def _process_single_sample_streaming(self, rollout_sample: RolloutSample):
             await self.cancel_queue.put(rollout_sample)
         else:
             # 否则放入结果队列
+            rollout_sample.param_version = self.current_param_version
+            rollout_sample.rollout_status = await self.get_statistics()
             await self.result_queue.put(rollout_sample)
 
         self.processed_sample_count += 1
@@ -617,20 +619,23 @@ async def get_statistics(self) -> dict:
         queue_stats = self.message_queue_client.get_statistics_sync()
 
         stats = {
-            "current_param_version": self.current_param_version,
-            "total_generated_samples": self.total_generated_samples,
-            "staleness_samples": self.staleness_samples,
-            "dropped_stale_samples": self.dropped_stale_samples,
-            "max_queue_size": self.max_queue_size,
-            "queue_size": queue_stats["queue_size"],
-            "max_concurrent_samples": self.max_concurrent_samples,
-            "pending_queue_size": self.pending_queue.qsize(),
-            "active_tasks_size": len(self.active_tasks),
-            "result_queue_size": self.result_queue.qsize(),
-            "max_required_samples": self.max_required_samples,
-            "required_samples": self.required_samples,
-            "staleness_threshold": self.staleness_threshold,
-            "cancel_queue_size": self.cancel_queue.qsize(),
+            # static stats
+            "static/max_required_samples": self.max_required_samples,
+            "static/required_samples": self.required_samples,
+            "static/staleness_threshold": self.staleness_threshold,
+            "static/max_queue_size": self.max_queue_size,
+            "static/max_concurrent_samples": self.max_concurrent_samples,
+            # counting stats
+            "count/current_param_version": self.current_param_version,
+            "count/total_generated_samples": self.total_generated_samples,
+            "count/staleness_samples": self.staleness_samples,
+            "count/dropped_stale_samples": self.dropped_stale_samples,
+            # monitor stats
+            "monitor/active_tasks_size": len(self.active_tasks),
+            "monitor/queue/pending_queue_size": self.pending_queue.qsize(),
+            "monitor/queue/cancel_queue_size": self.cancel_queue.qsize(),
+            "monitor/queue/result_queue_size": self.result_queue.qsize(),
+            "monitor/queue/mq_queue_size": queue_stats["queue_size"],
         }
 
         return stats
diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py
index 0f0c35d7db5..2f8bf2ddfa5 100644
--- a/recipe/fully_async_policy/fully_async_trainer.py
+++ b/recipe/fully_async_policy/fully_async_trainer.py
@@ -107,6 +107,7 @@ def __init__(
         self.local_trigger_step = 1
         self.processed_samples = 0
         self.stale_samples_processed = 0
+        self.stale_trajectory_processed = 0
         self.current_param_version = 0
         self.total_train_steps = None
         self.progress_bar = None
@@ -262,7 +263,7 @@ def fit(self):
             if val_data.metrics:
                 self.logger.log(data=val_data.metrics, step=val_data.param_version)
                 pprint(f"[FullyAsyncTrainer] Initial validation metrics: {val_data.metrics}")
-            self.logger.log(data=val_data.timing_raw, step=val_data.param_version) 
+            self.logger.log(data=val_data.timing_raw, step=val_data.param_version)
 
         # Use queue mode, no need for traditional dataloader iterator
         # Initialize to get the first batch of data
@@ -275,23 +276,7 @@ def fit(self):
                     epoch, batch = self._get_samples_from_queue()
                     if batch is None:
                         break
-
-                    # 从meta_info中获取参数版本信息
-                    if hasattr(batch, "meta_info") and batch.meta_info:
-                        # 统计陈旧样本
-                        rollout_param_versions = batch.meta_info["rollout_param_versions"]
-                        stale_count = sum(1 for v in rollout_param_versions if self.current_param_version - v > 1)
-                        self.stale_samples_processed += stale_count
-                        metrics.update(
-                            {
-                                "fully_async/stale_samples_ratio": stale_count / len(rollout_param_versions),
-                                "fully_async/stale_samples_processed": self.stale_samples_processed,
-                                "fully_async/current_param_version": self.current_param_version,
-                            }
-                        )
-                        for key, value in batch.meta_info.items():
-                            if key.startswith("fully_async"):
-                                metrics[key] = value
+                    self._collect_metrics_from_samples(batch, metrics)
 
                 batch, reward_extra_infos_dict = self._process_batch_common(batch, metrics, timing_raw)
                 self._log_rollout(batch, reward_extra_infos_dict, timing_raw)
@@ -342,6 +327,28 @@ def fit(self):
     def load_checkpoint(self):
         return self._load_checkpoint()
 
+    def _collect_metrics_from_samples(self, batch, metrics):
+        """
+        Collect metrics from samples
+        """
+        if hasattr(batch, "meta_info") and batch.meta_info:
+            samples_param_versions = batch.meta_info["rollout_param_versions"]
+            stale_count = sum(1 for v in samples_param_versions if self.current_param_version - v > 1)
+            self.stale_samples_processed += stale_count
+            trajectory_param_versions = batch.meta_info["trajectory_param_versions"]
+            stale_traj_count = sum(1 for v in trajectory_param_versions if self.current_param_version - v > 1)
+            self.stale_trajectory_processed += stale_traj_count
+            metrics.update(
+                {
+                    "fully_async/count/stale_samples_processed": self.stale_samples_processed,
+                    "fully_async/count/stale_trajectory_processed": self.stale_trajectory_processed,
+                    "fully_async/count/current_param_version": self.current_param_version,
+                }
+            )
+            for key, value in batch.meta_info.items():
+                if key.startswith("fully_async"):
+                    metrics[key] = value
+
     def _trigger_parameter_sync_after_step(self, validate: bool = False, global_steps: int = None):
         """
         Trigger parameter synchronization after training step

From f029e30967499bcf51899ccffbdf44738ed80b96 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Wed, 17 Sep 2025 10:55:49 +0800
Subject: [PATCH 142/182] refactor 2

---
 .../fully_async_policy/agent_loop/__init__.py |  3 +++
 .../agent_loop/agent_loop.py                  |  4 ++--
 verl/experimental/agent_loop/agent_loop.py    | 23 +++++++++++--------
 3 files changed, 19 insertions(+), 11 deletions(-)

diff --git a/recipe/fully_async_policy/agent_loop/__init__.py b/recipe/fully_async_policy/agent_loop/__init__.py
index 5f059078964..f1e1c647e51 100644
--- a/recipe/fully_async_policy/agent_loop/__init__.py
+++ b/recipe/fully_async_policy/agent_loop/__init__.py
@@ -14,3 +14,6 @@
 
 from .partial_single_turn_agent_loop import PartialSingleTurnAgentLoop
 _ = [PartialSingleTurnAgentLoop]
+
+
+from  .agent_loop import FullyAgentLoopManager
\ No newline at end of file
diff --git a/recipe/fully_async_policy/agent_loop/agent_loop.py b/recipe/fully_async_policy/agent_loop/agent_loop.py
index 38c461629dc..4527347994e 100644
--- a/recipe/fully_async_policy/agent_loop/agent_loop.py
+++ b/recipe/fully_async_policy/agent_loop/agent_loop.py
@@ -60,7 +60,7 @@ class PartialAgentLoopOutput(AgentLoopOutput):
 
 
 @ray.remote
-class FullyAgentLoopWorker(AgentLoopWorker):
+class FullyAgentLoopWorker(AgentLoopWorkerBase):
     def __init__(
             self, config: DictConfig, server_handles: list[ray.actor.ActorHandle], rm_executor: BatchExecutor = None
     ):
@@ -158,8 +158,8 @@ async def _partial_run_agent_loop(
 
 class FullyAgentLoopManager(AgentLoopManager):
     def __init__(self, config: DictConfig, worker_group: RayWorkerGroup = None, rm_wg: RayWorkerGroup = None):
-        super().__init__(config, worker_group, rm_wg)
         self.AgentLoopWorker = FullyAgentLoopWorker
+        super().__init__(config, worker_group, rm_wg)
 
     async def generate_single_sample_async(
             self,
diff --git a/verl/experimental/agent_loop/agent_loop.py b/verl/experimental/agent_loop/agent_loop.py
index f520f89472a..9458c8f8123 100644
--- a/verl/experimental/agent_loop/agent_loop.py
+++ b/verl/experimental/agent_loop/agent_loop.py
@@ -84,12 +84,12 @@ def _choose_server(self, request_id: str) -> ray.actor.ActorHandle:
 
     @rollout_trace_op
     async def generate(
-        self,
-        request_id,
-        *,
-        prompt_ids: list[int],
-        sampling_params: dict[str, Any],
-        image_data: Optional[list[Any]] = None,
+            self,
+            request_id,
+            *,
+            prompt_ids: list[int],
+            sampling_params: dict[str, Any],
+            image_data: Optional[list[Any]] = None,
     ) -> TokenOutput:
         """Generate tokens from prompt ids.
 
@@ -350,8 +350,7 @@ def compute_score(
         return {"reward_score": reward_score, "reward_extra_info": reward_extra_info}
 
 
-@ray.remote
-class AgentLoopWorker:
+class AgentLoopWorkerBase:
     """Agent loop worker takes a batch of messages and run each message in an agent loop."""
 
     def __init__(
@@ -690,6 +689,12 @@ def _postprocess(self, inputs: list[_InternalAgentLoopOutput]) -> DataProto:
         )
 
 
+@ray.remote
+class AgentLoopWorker(AgentLoopWorkerBase):
+    def __init__(self, config: DictConfig, server_handles: list[ray.actor.ActorHandle],
+                 rm_executor: BatchExecutor = None):
+        super().__init__(config, server_handles, rm_executor)
+
 async def get_trajectory_info(step, index, validate):
     """Get trajectory info.
 
@@ -854,7 +859,7 @@ def _performance_metrics(self, metrics: list[list[dict[str, str]]], output: Data
 
         return timing
 
-    def wake_up(self):
+    async def wake_up(self):
         """Wake up all rollout replica instances."""
         self._run_all([replica.wake_up() for replica in self.rollout_replicas])
 

From 94d681dce4911c3ad6abb011a2d03c62d318868d Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Wed, 17 Sep 2025 11:03:52 +0800
Subject: [PATCH 143/182] qwen3-32b-64-64

---
 .../exp/qwen2-32B_128/fsdp2_fully-async_64-64/run.sh      | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_64-64/run.sh b/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_64-64/run.sh
index 8427547d161..48be3ab3c84 100644
--- a/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_64-64/run.sh
+++ b/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_64-64/run.sh
@@ -2,7 +2,7 @@
 set -xeuo pipefail
 
 project_name='DAPO'
-exp_name='dapo_qwen2-32B_20k_fsdp2_fully-async_64-64-tps1'
+exp_name='dapo_qwen2-32B_20k_fsdp2_fully-async_64-64'
 
 # Paths
 MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/.friday/models/Qwen2.5-32B
@@ -62,11 +62,11 @@ NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
 train_prompt_bsz=0
 gen_prompt_bsz=1
 n_resp_per_prompt=16
-train_prompt_mini_bsz=128
-total_rollout_steps=$(((512*400)))
+train_prompt_mini_bsz=16
+total_rollout_steps=$(((512*200)))
 test_freq=20
 staleness_threshold=0.1
-trigger_parameter_sync_step=1
+trigger_parameter_sync_step=8
 partial_rollout=True
 
 python -m recipe.fully_async_policy.fully_async_main \

From a382f9af5b496b4778721f281bf85750a4788357 Mon Sep 17 00:00:00 2001
From: wangshulin02 <953550366@qq.com>
Date: Wed, 17 Sep 2025 11:49:25 +0800
Subject: [PATCH 144/182] add param_sync time log

---
 recipe/fully_async_policy/fully_async_trainer.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py
index 2f8bf2ddfa5..3a81fcc1892 100644
--- a/recipe/fully_async_policy/fully_async_trainer.py
+++ b/recipe/fully_async_policy/fully_async_trainer.py
@@ -366,9 +366,13 @@ def _trigger_parameter_sync_after_step(self, validate: bool = False, global_step
         )
         self.progress_bar.update(1)
         self.metrics_aggregator.reset()
-        ray.get(self.param_synchronizer.wait_last_valid.remote())
-        ray.get(
-            self.param_synchronizer.sync_weights.remote(
-                self.current_param_version, validate=validate, global_steps=global_steps
+        timing_param_sync = {}
+        with marked_timer("timing_s/wait_last_valid", timing_param_sync):
+            ray.get(self.param_synchronizer.wait_last_valid.remote())
+        with marked_timer("timing_s/param_sync", timing_param_sync):
+            ray.get(
+                self.param_synchronizer.sync_weights.remote(
+                    self.current_param_version, validate=validate, global_steps=global_steps
+                )
             )
-        )
+        self.logger.log(data=timing_param_sync, step=self.current_param_version)
\ No newline at end of file

From e6d51d32c20ee902abe4713e632baf17847e371d Mon Sep 17 00:00:00 2001
From: wangshulin02 <953550366@qq.com>
Date: Wed, 17 Sep 2025 11:52:09 +0800
Subject: [PATCH 145/182] fix typo

---
 recipe/fully_async_policy/agent_loop/agent_loop.py | 1 +
 recipe/fully_async_policy/detach_utils.py          | 2 +-
 recipe/fully_async_policy/fully_async_trainer.py   | 2 +-
 3 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/recipe/fully_async_policy/agent_loop/agent_loop.py b/recipe/fully_async_policy/agent_loop/agent_loop.py
index 72d1b29c1c0..2e61b0fc725 100644
--- a/recipe/fully_async_policy/agent_loop/agent_loop.py
+++ b/recipe/fully_async_policy/agent_loop/agent_loop.py
@@ -701,6 +701,7 @@ def async_server_class(
 
         if rollout_backend == "vllm":
             from recipe.fully_async_policy.vllm_rollout.vllm_async_server import AsyncvLLMServer
+
             return AsyncvLLMServer
         else:
             raise NotImplementedError(f"rollout backend {rollout_backend} is not supported")
diff --git a/recipe/fully_async_policy/detach_utils.py b/recipe/fully_async_policy/detach_utils.py
index fe6fb8cdc69..133d8178ee7 100644
--- a/recipe/fully_async_policy/detach_utils.py
+++ b/recipe/fully_async_policy/detach_utils.py
@@ -271,7 +271,7 @@ def _init_aggregation_rules(self) -> dict[str, dict[str, list[str]]]:
             "last": [
                 "fully_async/count/total_generated_samples",
                 "fully_async/count/stale_samples_processed",
-                "fully_async/count/stale_trajectory_processed"
+                "fully_async/count/stale_trajectory_processed",
                 "fully_async/count/current_param_version",
                 "fully_async/count/dropped_stale_samples",
                 "training/global_step",  # TODO 改为total_step
diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py
index 3a81fcc1892..f014993f13e 100644
--- a/recipe/fully_async_policy/fully_async_trainer.py
+++ b/recipe/fully_async_policy/fully_async_trainer.py
@@ -375,4 +375,4 @@ def _trigger_parameter_sync_after_step(self, validate: bool = False, global_step
                     self.current_param_version, validate=validate, global_steps=global_steps
                 )
             )
-        self.logger.log(data=timing_param_sync, step=self.current_param_version)
\ No newline at end of file
+        self.logger.log(data=timing_param_sync, step=self.current_param_version)

From d759cfe92e4329ce160c7efcbe0bff0ee0e5cca4 Mon Sep 17 00:00:00 2001
From: wangshulin02 <953550366@qq.com>
Date: Wed, 17 Sep 2025 13:10:07 +0800
Subject: [PATCH 146/182] fix typo

---
 recipe/fully_async_policy/agent_loop/agent_loop.py | 4 +---
 recipe/fully_async_policy/detach_utils.py          | 2 +-
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/recipe/fully_async_policy/agent_loop/agent_loop.py b/recipe/fully_async_policy/agent_loop/agent_loop.py
index 2e61b0fc725..901e584f7c3 100644
--- a/recipe/fully_async_policy/agent_loop/agent_loop.py
+++ b/recipe/fully_async_policy/agent_loop/agent_loop.py
@@ -34,6 +34,7 @@
 from verl.utils import hf_tokenizer
 from verl.utils.fs import copy_to_local
 from verl.utils.rollout_trace import RolloutTraceConfig, rollout_trace_attr, rollout_trace_op
+from verl.workers.rollout.async_server import AsyncServerBase
 
 logger = logging.getLogger(__file__)
 logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN"))
@@ -678,9 +679,6 @@ async def resume_async(self):
         await asyncio.gather(*[asyncio.wrap_future(future.future()) for future in futures], return_exceptions=True)
 
 
-from verl.workers.rollout.async_server import AsyncServerBase
-
-
 def async_server_class(
     rollout_backend: str, rollout_backend_module: Optional[str] = None, rollout_backend_class: Optional[str] = None
 ) -> type[AsyncServerBase]:
diff --git a/recipe/fully_async_policy/detach_utils.py b/recipe/fully_async_policy/detach_utils.py
index 133d8178ee7..32a36bb882a 100644
--- a/recipe/fully_async_policy/detach_utils.py
+++ b/recipe/fully_async_policy/detach_utils.py
@@ -289,7 +289,7 @@ def add_step_metrics(self, metrics: dict[str, Any], sample_count: int, timestamp
 
         # Store all metrics values
         for key, value in metrics.items():
-            if isinstance(value, (int, float, np.number)):
+            if isinstance(value, int | float | np.number):
                 self.metric_values[key].append(float(value))
             elif isinstance(value, torch.Tensor):
                 self.metric_values[key].append(float(value.item()))

From c8db507eb62054974c40a5691078e55147ac7055 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Wed, 17 Sep 2025 15:40:14 +0800
Subject: [PATCH 147/182] refactor 3

---
 .../fully_async_policy/agent_loop/__init__.py |   5 +-
 .../agent_loop/agent_loop.py                  |  84 ++--
 .../partial_single_turn_agent_loop.py         |  27 +-
 .../agent_loop/single_turn_agent_loop.py      |  55 ---
 recipe/fully_async_policy/detach_utils.py     |   4 +-
 .../fully_async_rollouter.py                  |   4 +-
 .../fully_async_policy/fully_async_trainer.py |   2 -
 recipe/fully_async_policy/ray_trainer.py      |   4 +-
 .../vllm_rollout/__init__.py                  |  13 +
 .../vllm_rollout/vllm_async_server.py         | 364 +++---------------
 verl/experimental/agent_loop/agent_loop.py    |  70 ++--
 .../rollout/vllm_rollout/vllm_async_server.py |   6 +-
 12 files changed, 170 insertions(+), 468 deletions(-)
 delete mode 100644 recipe/fully_async_policy/agent_loop/single_turn_agent_loop.py

diff --git a/recipe/fully_async_policy/agent_loop/__init__.py b/recipe/fully_async_policy/agent_loop/__init__.py
index f1e1c647e51..40dcd0ac7a3 100644
--- a/recipe/fully_async_policy/agent_loop/__init__.py
+++ b/recipe/fully_async_policy/agent_loop/__init__.py
@@ -1,4 +1,4 @@
-# Copyright 2024 Bytedance Ltd. and/or its affiliates
+# Copyright 2025 Meituan Ltd. and/or its affiliates
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,7 +13,8 @@
 # limitations under the License.
 
 from .partial_single_turn_agent_loop import PartialSingleTurnAgentLoop
+
 _ = [PartialSingleTurnAgentLoop]
 
 
-from  .agent_loop import FullyAgentLoopManager
\ No newline at end of file
+from .agent_loop import PartialAgentLoopManager
diff --git a/recipe/fully_async_policy/agent_loop/agent_loop.py b/recipe/fully_async_policy/agent_loop/agent_loop.py
index 4527347994e..72f7c0afc2e 100644
--- a/recipe/fully_async_policy/agent_loop/agent_loop.py
+++ b/recipe/fully_async_policy/agent_loop/agent_loop.py
@@ -1,4 +1,4 @@
-# Copyright 2024 Bytedance Ltd. and/or its affiliates
+# Copyright 2025 Meituan Ltd. and/or its affiliates
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -14,23 +14,9 @@
 import asyncio
 import logging
 import os
-from typing import Any, Optional
-
-import hydra
-import numpy as np
-import ray
-import torch
-from omegaconf import DictConfig, OmegaConf
-from tensordict import TensorDict
-
-from verl.protocol import DataProto
-from verl.single_controller.ray.base import RayWorkerGroup
-from verl.utils import hf_tokenizer
-from verl.utils.fs import copy_to_local
-from verl.utils.rollout_trace import RolloutTraceConfig, rollout_trace_attr
-from verl.workers.rollout.replica import TokenOutput
 
 from verl.experimental.agent_loop.agent_loop import AgentLoopOutput, _agent_loop_registry, _DummyConfig
+from verl.protocol import DataProto
 
 logger = logging.getLogger(__file__)
 logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN"))
@@ -60,15 +46,15 @@ class PartialAgentLoopOutput(AgentLoopOutput):
 
 
 @ray.remote
-class FullyAgentLoopWorker(AgentLoopWorkerBase):
+class PartialAgentLoopWorker(AgentLoopWorkerBase):
     def __init__(
-            self, config: DictConfig, server_handles: list[ray.actor.ActorHandle], rm_executor: BatchExecutor = None
+        self, config: DictConfig, server_handles: list[ray.actor.ActorHandle], rm_executor: BatchExecutor = None
     ):
         self.AsyncLLMServerManager = PartialAsyncLLMServerManager
         super().__init__(config, server_handles, rm_executor)
 
     async def generate_sequences_no_post(
-            self, batch: DataProto, partial_output_list: Optional[list[AgentLoopOutput]]
+        self, batch: DataProto, partial_output_list: Optional[list[AgentLoopOutput]]
     ) -> list[AgentLoopOutput]:
         """Generate sequences from agent loop.
 
@@ -105,7 +91,7 @@ async def generate_sequences_no_post(
         if "index" in batch.non_tensor_batch:
             index = batch.non_tensor_batch["index"]
         else:
-            index = np.arange(len(raw_prompts))
+            index = np.arange(len(batch))
 
         trajectory_info = await get_trajectory_info(
             batch.meta_info.get("global_steps", -1), index, batch.meta_info.get("validate", False)
@@ -117,29 +103,26 @@ async def generate_sequences_no_post(
         tasks = []
         for i in range(len(batch)):
             kwargs = {k: v[i] for k, v in batch.non_tensor_batch.items()}
+            kwargs["output"] = partial_output_list[i]
             tasks.append(
-                asyncio.create_task(
-                    self._partial_run_agent_loop(sampling_params,
-                                                 trajectory_info[i],
-                                                 partial_output_list[i],
-                                                 **kwargs)))
+                asyncio.create_task(self._partial_run_agent_loop(sampling_params, trajectory_info[i], **kwargs))
+            )
         return await asyncio.gather(*tasks)
 
     async def _partial_run_agent_loop(
-            self,
-            sampling_params: dict[str, Any],
-            trajectory: dict[str, Any],
-            partial_output: Optional[AgentLoopOutput] = None,
-            *,
-            agent_name: str,
-            **kwargs,
+        self,
+        sampling_params: dict[str, Any],
+        trajectory: dict[str, Any],
+        *,
+        agent_name: str,
+        **kwargs,
     ) -> AgentLoopOutput:
         with rollout_trace_attr(
-                step=trajectory["step"],
-                sample_index=trajectory["sample_index"],
-                rollout_n=trajectory["rollout_n"],
-                validate=trajectory["validate"],
-                name="agent_loop",
+            step=trajectory["step"],
+            sample_index=trajectory["sample_index"],
+            rollout_n=trajectory["rollout_n"],
+            validate=trajectory["validate"],
+            name="agent_loop",
         ):
             assert agent_name in _agent_loop_registry, (
                 f"Agent loop {agent_name} not registered, registered agent loops: {_agent_loop_registry.keys()}"
@@ -153,18 +136,18 @@ async def _partial_run_agent_loop(
                 tokenizer=self.tokenizer,
                 processor=self.processor,
             )
-            return await agent_loop.run(sampling_params, partial_output, **kwargs)
+            return await agent_loop.run(sampling_params, **kwargs)
 
 
-class FullyAgentLoopManager(AgentLoopManager):
+class PartialAgentLoopManager(AgentLoopManager):
     def __init__(self, config: DictConfig, worker_group: RayWorkerGroup = None, rm_wg: RayWorkerGroup = None):
-        self.AgentLoopWorker = FullyAgentLoopWorker
+        self.AgentLoopWorker = PartialAgentLoopWorker
         super().__init__(config, worker_group, rm_wg)
 
     async def generate_single_sample_async(
-            self,
-            sample: DataProto,
-            partial_output_list: Optional[list[AgentLoopOutput]],
+        self,
+        sample: DataProto,
+        partial_output_list: Optional[list[AgentLoopOutput]],
     ) -> list[AgentLoopOutput]:
         """
         异步处理单个样本, 需要复制n次
@@ -191,12 +174,23 @@ def _select_best_worker(self):
         self._worker_index = (self._worker_index + 1) % len(self.agent_loop_workers)
         return worker
 
+    async def sleep(self):
+        futures = [replica.sleep.remote() for replica in self.rollout_replicas]
+        await asyncio.gather(*[asyncio.wrap_future(future.future()) for future in futures], return_exceptions=True)
+
+    async def wake_up(self):
+        futures = [replica.wake_up.remote() for replica in self.rollout_replicas]
+        await asyncio.gather(*[asyncio.wrap_future(future.future()) for future in futures], return_exceptions=True)
+
     async def cancel_async(self):
         """Cancel all rollout tasks asynchronously."""
-        futures = [server.cancel.remote() for server in self.async_llm_servers]
+        futures = [replica.cancel.remote() for replica in self.rollout_replicas]
         await asyncio.gather(*[asyncio.wrap_future(future.future()) for future in futures], return_exceptions=True)
 
     async def resume_async(self):
         """Cancel all rollout tasks asynchronously."""
-        futures = [server.resume.remote() for server in self.async_llm_servers]
+        futures = [replica.resume.remote() for replica in self.rollout_replicas]
         await asyncio.gather(*[asyncio.wrap_future(future.future()) for future in futures], return_exceptions=True)
+
+    def _run_all(self, tasks: list[asyncio.Task]):
+        pass
diff --git a/recipe/fully_async_policy/agent_loop/partial_single_turn_agent_loop.py b/recipe/fully_async_policy/agent_loop/partial_single_turn_agent_loop.py
index cf95c1eb965..5e512093bfe 100644
--- a/recipe/fully_async_policy/agent_loop/partial_single_turn_agent_loop.py
+++ b/recipe/fully_async_policy/agent_loop/partial_single_turn_agent_loop.py
@@ -1,4 +1,4 @@
-# Copyright 2024 Bytedance Ltd. and/or its affiliates
+# Copyright 2025 Meituan Ltd. and/or its affiliates
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -16,7 +16,12 @@
 from typing import Any, Optional
 from uuid import uuid4
 
-from recipe.fully_async_policy.agent_loop.agent_loop import AgentLoopBase, AgentLoopOutput, register
+from recipe.fully_async_policy.agent_loop.agent_loop import (
+    AgentLoopBase,
+    AgentLoopOutput,
+    PartialAgentLoopOutput,
+    register,
+)
 from verl.utils.profiler import simple_timer
 
 logger = logging.getLogger(__file__)
@@ -31,13 +36,21 @@ def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.prompt_length = self.config.actor_rollout_ref.rollout.prompt_length
         self.response_length = self.config.actor_rollout_ref.rollout.response_length
+        self.apply_chat_template_kwargs = self.config.data.get("apply_chat_template_kwargs", {})
+
+    async def run(self, sampling_params: dict[str, Any], **kwargs) -> AgentLoopOutput:
+        output: Optional[PartialAgentLoopOutput] = kwargs.get("output", None)
+        messages = list(kwargs["raw_prompt"])
+
+        metrics = {}
+        request_id = uuid4().hex
 
-    async def run(
-        self, messages: list[dict[str, Any]], sampling_params: dict[str, Any], output: Optional[AgentLoopOutput]
-    ) -> AgentLoopOutput:
         if not output:
             prompt_ids = await self.loop.run_in_executor(
-                None, lambda: self.tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=True)
+                None,
+                lambda: self.tokenizer.apply_chat_template(
+                    messages, add_generation_prompt=True, tokenize=True, **self.apply_chat_template_kwargs
+                ),
             )
         else:
             if output.is_cancel:
@@ -63,7 +76,7 @@ async def run(
             response_ids = output.response_ids + response_ids
             response_mask = [1] * len(response_ids)
 
-        return AgentLoopOutput(
+        return PartialAgentLoopOutput(
             prompt_ids=prompt_ids,
             response_ids=response_ids[: self.response_length],
             response_mask=response_mask[: self.response_length],
diff --git a/recipe/fully_async_policy/agent_loop/single_turn_agent_loop.py b/recipe/fully_async_policy/agent_loop/single_turn_agent_loop.py
deleted file mode 100644
index 6dcdf327b09..00000000000
--- a/recipe/fully_async_policy/agent_loop/single_turn_agent_loop.py
+++ /dev/null
@@ -1,55 +0,0 @@
-# Copyright 2024 Bytedance Ltd. and/or its affiliates
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import logging
-import os
-from typing import Any
-from uuid import uuid4
-
-from recipe.fully_async_policy.agent_loop.agent_loop import AgentLoopBase, AgentLoopOutput, register
-from verl.utils.profiler import simple_timer
-
-logger = logging.getLogger(__file__)
-logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN"))
-
-
-@register("single_turn_agent")
-class SingleTurnAgentLoop(AgentLoopBase):
-    """Naive agent loop that only do single turn chat completion."""
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.prompt_length = self.config.actor_rollout_ref.rollout.prompt_length
-        self.response_length = self.config.actor_rollout_ref.rollout.response_length
-
-    async def run(self, messages: list[dict[str, Any]], sampling_params: dict[str, Any]) -> AgentLoopOutput:
-        metrics = {}
-        request_id = uuid4().hex
-        prompt_ids = await self.loop.run_in_executor(
-            None, lambda: self.tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=True)
-        )
-
-        with simple_timer("generate_sequences", metrics):
-            response_ids = await self.server_manager.generate(
-                request_id=request_id, prompt_ids=prompt_ids, sampling_params=sampling_params
-            )
-        response_mask = [1] * len(response_ids)
-
-        output = AgentLoopOutput(
-            prompt_ids=prompt_ids,
-            response_ids=response_ids[: self.response_length],
-            response_mask=response_mask[: self.response_length],
-            num_turns=2,
-            metrics=metrics,
-        )
-        return output
diff --git a/recipe/fully_async_policy/detach_utils.py b/recipe/fully_async_policy/detach_utils.py
index 69041d923b5..450b67b9ff9 100644
--- a/recipe/fully_async_policy/detach_utils.py
+++ b/recipe/fully_async_policy/detach_utils.py
@@ -228,7 +228,7 @@ def merge_rollout_sample(config, tokenizer, rs: RolloutSample):
 
 
 def assemble_batch_from_rollout_samples(
-        rollout_samples: list[RolloutSample], tokenizer, config, balance_batch=None
+    rollout_samples: list[RolloutSample], tokenizer, config, balance_batch=None
 ) -> DataProto:
     """
     Assemble gen_batch_output from RolloutSample objects
@@ -439,7 +439,7 @@ def _special_metrics_aggergate(self, aggregated: dict[str, Any]) -> dict[str, An
         REQUIRED_PERF_KEYS = {"perf/throughput", "perf/total_num_tokens", "perf/time_per_step"}
         if REQUIRED_PERF_KEYS.issubset(aggregated):
             aggregated["perf/throughput"] = aggregated["perf/total_num_tokens"] / (
-                    aggregated["perf/time_per_step"] * self.total_gpus
+                aggregated["perf/time_per_step"] * self.total_gpus
             )
 
         return aggregated
diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py
index e53e6c43ef5..048f727ea0d 100644
--- a/recipe/fully_async_policy/fully_async_rollouter.py
+++ b/recipe/fully_async_policy/fully_async_rollouter.py
@@ -258,10 +258,10 @@ def _create_continuous_iterator(self):
     def _init_async_rollout_manager(self):
         # create async rollout manager and request scheduler
         assert self.config.actor_rollout_ref.rollout.mode == "async"
-        from recipe.fully_async_policy.agent_loop import FullyAgentLoopManager
+        from recipe.fully_async_policy.agent_loop import PartialAgentLoopManager
 
         self.async_rollout_mode = True
-        self.async_rollout_manager = FullyAgentLoopManager(
+        self.async_rollout_manager = PartialAgentLoopManager(
             config=self.config,
             worker_group=self.rollout_wg,
         )
diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py
index 4cba527c857..5d945137ab2 100644
--- a/recipe/fully_async_policy/fully_async_trainer.py
+++ b/recipe/fully_async_policy/fully_async_trainer.py
@@ -93,8 +93,6 @@ def __init__(
             )
             self.use_critic = False
 
-        self._validate_config()
-
         self.message_queue_client = None
         self.param_synchronizer = None
 
diff --git a/recipe/fully_async_policy/ray_trainer.py b/recipe/fully_async_policy/ray_trainer.py
index 0a74c5ed386..33601621993 100644
--- a/recipe/fully_async_policy/ray_trainer.py
+++ b/recipe/fully_async_policy/ray_trainer.py
@@ -175,10 +175,10 @@ def _init_async_rollout_manager(self):
         # create async rollout manager and request scheduler
         self.async_rollout_mode = False
         if self.config.actor_rollout_ref.rollout.mode == "async":
-            from recipe.fully_async_policy.agent_loop.agent_loop import FullyAgentLoopManager
+            from recipe.fully_async_policy.agent_loop.agent_loop import PartialAgentLoopManager
 
             self.async_rollout_mode = True
-            self.async_rollout_manager = FullyAgentLoopManager(
+            self.async_rollout_manager = PartialAgentLoopManager(
                 config=self.config,
                 worker_group=self.actor_rollout_wg,
             )
diff --git a/recipe/fully_async_policy/vllm_rollout/__init__.py b/recipe/fully_async_policy/vllm_rollout/__init__.py
index e69de29bb2d..9cd3ed5b8e9 100644
--- a/recipe/fully_async_policy/vllm_rollout/__init__.py
+++ b/recipe/fully_async_policy/vllm_rollout/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2025 Meituan Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/recipe/fully_async_policy/vllm_rollout/vllm_async_server.py b/recipe/fully_async_policy/vllm_rollout/vllm_async_server.py
index 4826ebaa1d0..19a70c8d44b 100644
--- a/recipe/fully_async_policy/vllm_rollout/vllm_async_server.py
+++ b/recipe/fully_async_policy/vllm_rollout/vllm_async_server.py
@@ -1,4 +1,4 @@
-# Copyright 2024 Bytedance Ltd. and/or its affiliates
+# Copyright 2025 Meituan Ltd. and/or its affiliates
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,199 +13,39 @@
 # limitations under the License.
 import asyncio
 import logging
-import os
-import pickle
-from typing import Any, Callable, Optional, Sequence
+from typing import Any, Optional, Sequence
 
 import ray
-import zmq
 from omegaconf import DictConfig
-from starlette.requests import Request
-from starlette.responses import JSONResponse, StreamingResponse
+from ray.actor import ActorHandle
 from vllm import SamplingParams
-from vllm.engine.arg_utils import AsyncEngineArgs
-from vllm.entrypoints.logger import RequestLogger
-from vllm.entrypoints.openai.protocol import ChatCompletionRequest, ChatCompletionResponse, ErrorResponse
-from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
-from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
 from vllm.inputs import TokensPrompt
 from vllm.outputs import RequestOutput
-from vllm.v1.engine.async_llm import AsyncLLM
-from vllm.v1.executor.abstract import Executor
-from vllm.worker.worker_base import WorkerWrapperBase
 
-from verl.utils.fs import copy_to_local
-from verl.workers.rollout.async_server import AsyncServerBase
+from verl.workers.rollout.replica import RolloutMode
+from verl.workers.rollout.vllm_rollout.vllm_async_server import (
+    _qwen2_5_vl_dedup_image_tokens,
+    vLLMHttpServer,
+    vLLMReplica,
+)
 
 logger = logging.getLogger(__file__)
-
-
-def _get_model_runner_workers(vllm_config, init_ray: bool = True):
-    assert vllm_config.instance_id is not None, "instance_id must be set for external ray actors."
-
-    fields = vllm_config.instance_id.split(":")
-    assert len(fields) == 4, (
-        f"instance_id: {vllm_config.instance_id} must be in the format of "
-        f"<namespace>:<wg_prefix>:<vllm_dp_size>:<vllm_dp_rank>."
-    )
-    namespace, wg_prefix, vllm_dp_size, vllm_dp_rank = fields[0], fields[1], int(fields[2]), int(fields[3])
-
-    # Make sure subprocess in same namespace as parent actor.
-    # actor name format: {name_prefix}WorkerDict_{pg_idx}:{local_rank}
-    if init_ray:
-        ray.init(namespace=namespace)
-    actor_names = [
-        actor_name for actor_name in ray.util.list_named_actors() if actor_name.startswith(f"{wg_prefix}WorkerDict")
-    ]
-
-    vllm_tp_size = vllm_config.parallel_config.tensor_parallel_size
-    assert len(actor_names) == vllm_dp_size * vllm_tp_size, (
-        f"instance_id: {vllm_config.instance_id} has {len(actor_names)} actors, but vllm_dp_size: "
-        f"{vllm_dp_size} * vllm_tp_size: {vllm_tp_size} = {vllm_dp_size * vllm_tp_size} is expected."
-    )
-
-    def get_pg_index_and_local_rank(actor_name) -> tuple[int, int]:
-        fields = actor_name.split(":")
-        assert len(fields) == 2, f"invalid actor name: {actor_name}"
-        pg_index, local_rank = int(fields[0].split("_")[-1]), int(fields[1])
-        return pg_index, local_rank
-
-    # sort actor names by pg_index and local_rank
-    actor_names = sorted(actor_names, key=get_pg_index_and_local_rank)
-    actor_names = actor_names[vllm_dp_rank * vllm_tp_size : (vllm_dp_rank + 1) * vllm_tp_size]
-    workers: list[WorkerWrapperBase] = [ray.get_actor(actor_name) for actor_name in actor_names]
-    print(f"instance_id: {vllm_config.instance_id} initializes with external actors: {actor_names}")
-
-    return workers
-
-
-class ExternalRayDistributedExecutor(Executor):
-    """An executor that engines are launched by external ray actors."""
-
-    uses_ray: bool = False
-
-    def _init_executor(self) -> None:
-        self.workers = _get_model_runner_workers(vllm_config=self.vllm_config, init_ray=True)
-
-        kwargs = dict(
-            vllm_config=self.vllm_config,
-            local_rank=None,
-            rank=None,
-            distributed_init_method="env://",
-            is_driver_worker=True,
-        )
-        self.collective_rpc("init_worker", args=([kwargs],))
-        self.collective_rpc("init_device")
-        self.collective_rpc("load_model")
-        print(f"instance_id: {self.vllm_config.instance_id} initializes finished.")
-
-    def collective_rpc(
-        self,
-        method: str | Callable,
-        timeout: Optional[float] = None,
-        args: tuple = (),
-        kwargs: Optional[dict[str, Any]] = None,
-    ) -> list[Any]:
-        # TODO(wuxibin): support ray compiled graph
-        if isinstance(method, str):
-            sent_method = method
-        else:
-            sent_method = pickle.dumps(method)
-        del method
-
-        # ~3ms overhead per schedule step due to SchedulerOutput/ModelRunnerOutput serialization/deserialization.
-        outputs = ray.get(
-            [worker.execute_method.remote(sent_method, *args, **(kwargs or {})) for worker in self.workers]
-        )
-        return outputs
-
-    def check_health(self):
-        return
-
-
-class ExternalZeroMQDistributedExecutor(Executor):
-    """An executor that engines are launched by external ray actors."""
-
-    uses_ray: bool = False
-
-    def _init_executor(self) -> None:
-        addresses = os.environ["VERL_VLLM_ZMQ_ADDRESSES"].split(",")
-        self.context = zmq.Context()
-        self.sockets = []
-        for address in addresses:
-            socket = self.context.socket(zmq.REQ)
-            socket.connect(address)
-            self.sockets.append(socket)
-
-        kwargs = dict(
-            vllm_config=self.vllm_config,
-            local_rank=None,
-            rank=None,
-            distributed_init_method="env://",
-            is_driver_worker=True,
-        )
-        self.collective_rpc("init_worker", args=([kwargs],))
-        self.collective_rpc("init_device")
-        self.collective_rpc("load_model")
-
-    def collective_rpc(
-        self,
-        method: str | Callable,
-        timeout: Optional[float] = None,
-        args: tuple = (),
-        kwargs: Optional[dict[str, Any]] = None,
-    ) -> list[Any]:
-        if isinstance(method, str):
-            sent_method = method
-        else:
-            sent_method = pickle.dumps(method)
-        del method
-
-        message = pickle.dumps((sent_method, args, kwargs or {}))
-        for socket in self.sockets:
-            socket.send(message, zmq.DONTWAIT)
-
-        outputs = []
-        for socket in self.sockets:
-            outputs.append(pickle.loads(socket.recv()))
-        return outputs
-
-    def check_health(self):
-        return
+logger.setLevel(logging.INFO)
 
 
 @ray.remote(num_cpus=1)
-class AsyncvLLMServer(AsyncServerBase):
-    """
-    AsyncvLLMServer is a wrapper for AsyncLLM, it uses ExternalRayDistributedExecutor to launch engines
-    in hybrid rollout workers, i.e AsyncActorRolloutRefWorker.
-
-    AsyncvLLMServer works as follows:
-    1. Start FastAPI server first.
-    2. Initialize AsyncLLM with ExternalRayDistributedExecutor.
-    3. AsyncLLM spawn EngineCore in subprocess.
-    4. EngineCore initialize ExternalRayDistributedExecutor.
-    5. ExternalRayDistributedExecutor lookup its corresponding actors by name.
-    6. ExternalRayDistributedExecutor init executor: init_worker, init_device, load_model.
-
-    For vLLM AsyncLLM design, see: https://github.com/vllm-project/vllm/pull/9826
-    """
-
-    def __init__(self, config: DictConfig, vllm_dp_size: int, vllm_dp_rank: int, wg_prefix: str):
-        """
-        Args:
-            config: DictConfig.
-            vllm_dp_size: int, vllm data parallel size.
-            vllm_dp_rank: int, vllm data parallel rank.
-            wg_prefix: str, worker group prefix, used to lookup actors.
-        """
-        super().__init__()
-
-        self.config = config.actor_rollout_ref
-        self.vllm_dp_size = vllm_dp_size
-        self.vllm_dp_rank = vllm_dp_rank
-        self.wg_prefix = wg_prefix
-        self.engine: AsyncLLM = None
+class vLLMHttpServerForPartial(vLLMHttpServer):
+    def __init__(
+        self,
+        config: DictConfig,
+        rollout_mode: RolloutMode,
+        workers: list[ActorHandle],
+        replica_rank: int,
+        node_rank: int,
+        gpus_per_node: int,
+        nnodes: int,
+    ):
+        super().__init__(config, rollout_mode, workers, replica_rank, node_rank, gpus_per_node, nnodes)
 
         # for cancel LLMServer
         self.paused = False
@@ -213,131 +53,21 @@ def __init__(self, config: DictConfig, vllm_dp_size: int, vllm_dp_rank: int, wg_
         self.cancel_event: dict[str, asyncio.Event] = {}
         self.req_output: dict[str, Optional[RequestOutput]] = {}
 
-    async def init_engine(self):
-        """Init vLLM AsyncLLM engine."""
-        config = self.config
-        model_path = config.model.path
-        model_name = "/".join(model_path.split("/")[-2:])
-        local_path = copy_to_local(model_path)
-        trust_remote_code = config.model.get("trust_remote_code", False)
-        config = config.rollout
-
-        tensor_parallel_size = config.get("tensor_model_parallel_size", 1)
-        max_num_batched_tokens = config.get("max_num_batched_tokens", 8192)
-        max_model_len = config.max_model_len if config.max_model_len else config.prompt_length + config.response_length
-        self.max_model_len = int(max_model_len)
-
-        # Override default generation config from hugging face model config,
-        # user can still override them by passing kwargs in each request.
-        kwargs = dict(
-            n=1,
-            logprobs=0,
-            repetition_penalty=1.0,
-            max_new_tokens=config.response_length,
-        )
-        for k in config.keys():
-            if hasattr(SamplingParams(), str(k)):
-                kwargs[k] = config.get(k)
-        print(f"override_generation_config: {kwargs}")
-
-        backend = os.environ.get("VERL_VLLM_DISTRIBUTED_BACKEND", "zeromq")
-        if backend == "zeromq":
-            distributed_executor_backend = ExternalZeroMQDistributedExecutor
-        elif backend == "ray":
-            distributed_executor_backend = ExternalRayDistributedExecutor
-        else:
-            distributed_executor_backend = None
-
-        engine_args = AsyncEngineArgs(
-            model=local_path,
-            enable_sleep_mode=config.free_cache_engine,
-            override_generation_config=kwargs,
-            tensor_parallel_size=tensor_parallel_size,
-            distributed_executor_backend=distributed_executor_backend,
-            dtype=config.dtype,
-            enforce_eager=config.enforce_eager,
-            gpu_memory_utilization=config.gpu_memory_utilization,
-            disable_custom_all_reduce=True,
-            skip_tokenizer_init=False,
-            max_model_len=self.max_model_len,
-            load_format="auto",
-            disable_log_stats=config.disable_log_stats,
-            max_num_batched_tokens=max_num_batched_tokens,
-            enable_chunked_prefill=config.enable_chunked_prefill,
-            enable_prefix_caching=True,
-            trust_remote_code=trust_remote_code,
-            seed=config.get("seed", 0),
-        )
-
-        # init async llm engine
-        vllm_config = self._create_engine_config(engine_args)
-        self.engine = AsyncLLM.from_vllm_config(vllm_config)
-
-        # build serving chat
-        model_config = self.engine.model_config
-        BASE_MODEL_PATHS = [BaseModelPath(name=model_name, model_path=model_path)]
-        models = OpenAIServingModels(self.engine, model_config, BASE_MODEL_PATHS)
-        self.openai_serving_chat = OpenAIServingChat(
-            self.engine,
-            model_config,
-            models,
-            "assistant",
-            request_logger=RequestLogger(max_log_len=4096),
-            chat_template=None,
-            chat_template_content_format="auto",
-            enable_auto_tools=config.multi_turn.tool_config_path is not None,
-            tool_parser=config.multi_turn.format,  # hermes, llama3_json, ...
-        )
-
-    def _create_engine_config(self, engine_args: AsyncEngineArgs):
-        vllm_config = engine_args.create_engine_config()
-        namespace = ray.get_runtime_context().namespace
-        vllm_config.instance_id = f"{namespace}:{self.wg_prefix}:{self.vllm_dp_size}:{self.vllm_dp_rank}"
-
-        # VERL_VLLM_ZMQ_ADDRESSES
-        if engine_args.distributed_executor_backend == ExternalZeroMQDistributedExecutor:
-            workers = _get_model_runner_workers(vllm_config=vllm_config, init_ray=False)
-            zmq_addresses = ray.get([worker.get_zeromq_address.remote() for worker in workers])
-            print(f"VERL_VLLM_ZMQ_ADDRESSES: {zmq_addresses}")
-            os.environ["VERL_VLLM_ZMQ_ADDRESSES"] = ",".join(zmq_addresses)
-
-        return vllm_config
-
-    async def chat_completion(self, raw_request: Request):
-        """OpenAI-compatible HTTP endpoint.
-
-        API reference: https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html
-        """
-        request_json = await raw_request.json()
-        request = ChatCompletionRequest(**request_json)
-        generator = await self.openai_serving_chat.create_chat_completion(request, raw_request)
-
-        if isinstance(generator, ErrorResponse):
-            return JSONResponse(content=generator.model_dump(), status_code=generator.code)
-        if request.stream:
-            return StreamingResponse(content=generator, media_type="text/event-stream")
-        else:
-            assert isinstance(generator, ChatCompletionResponse)
-            return JSONResponse(content=generator.model_dump())
-
-    async def generate(self, prompt_ids: list[int], sampling_params: dict[str, Any], request_id: str) -> list[int]:
-        max_tokens = self.max_model_len - len(prompt_ids)
+    async def _generate_step(
+        self,
+        prompt_ids: list[int],
+        sampling_params: dict[str, Any],
+        request_id: str,
+        image_data: Optional[list[Any]] = None,
+    ):
+        max_tokens = self.config.max_model_len - len(prompt_ids)
+        sampling_params["logprobs"] = 1
+        sampling_params.setdefault("repetition_penalty", self.config.get("repetition_penalty", 1.0))
         sampling_params = SamplingParams(max_tokens=max_tokens, **sampling_params)
-        prompt = TokensPrompt(prompt_token_ids=prompt_ids)
-        generator = self.engine.generate(prompt=prompt, sampling_params=sampling_params, request_id=request_id)
-
-        # Get final response
-        final_res: Optional[RequestOutput] = None
-        async for output in generator:
-            final_res = output
-        assert final_res is not None
-
-        return final_res.outputs[0].token_ids
-
-    async def _generate_step(self, prompt_ids: list[int], sampling_params: dict[str, Any], request_id: str):
-        max_tokens = self.max_model_len - len(prompt_ids)
-        sampling_params = SamplingParams(max_tokens=max_tokens, logprobs=1, **sampling_params)
-        prompt = TokensPrompt(prompt_token_ids=prompt_ids)
+        prompt_ids = _qwen2_5_vl_dedup_image_tokens(prompt_ids, self.model_config.processor)
+        prompt = TokensPrompt(
+            prompt_token_ids=prompt_ids, multi_modal_data={"image": image_data} if image_data else None
+        )
         generator = self.engine.generate(prompt=prompt, sampling_params=sampling_params, request_id=request_id)
 
         # Get final response
@@ -347,7 +77,11 @@ async def _generate_step(self, prompt_ids: list[int], sampling_params: dict[str,
         assert self.req_output[request_id] is not None
 
     async def generate_for_partial(
-        self, prompt_ids: list[int], sampling_params: dict[str, Any], request_id: str
+        self,
+        prompt_ids: list[int],
+        sampling_params: dict[str, Any],
+        request_id: str,
+        image_data: Optional[list[Any]] = None,
     ) -> tuple[list[Any], list[Any], bool] | tuple[Sequence[int], list[float], Any]:
         # 设置中断标志
         async with self.lock:
@@ -356,7 +90,9 @@ async def generate_for_partial(
                 return [], [], True
             self.cancel_event[request_id] = asyncio.Event()
             cancel_handle = asyncio.create_task(self.cancel_event[request_id].wait())
-            generation_handle = asyncio.create_task(self._generate_step(prompt_ids, sampling_params, request_id))
+            generation_handle = asyncio.create_task(
+                self._generate_step(prompt_ids, sampling_params, request_id, image_data)
+            )
 
         done, pend = await asyncio.wait([generation_handle, cancel_handle], return_when=asyncio.FIRST_COMPLETED)
 
@@ -388,12 +124,8 @@ async def resume(self):
         async with self.lock:
             self.paused = False
 
-    async def wake_up(self):
-        if self.config.rollout.free_cache_engine:
-            await self.engine.wake_up()
 
-    async def sleep(self):
-        # TODO: https://github.com/vllm-project/vllm/issues/17103
-        await self.engine.reset_prefix_cache()
-        if self.config.rollout.free_cache_engine:
-            await self.engine.sleep()
+class vLLMReplicaForPartial(vLLMReplica):
+    def __init__(self, replica_rank: int, config: DictConfig, gpus_per_node: int = 8):
+        super().__init__(replica_rank, config, gpus_per_node)
+        self.server_class = vLLMHttpServerForPartail
diff --git a/verl/experimental/agent_loop/agent_loop.py b/verl/experimental/agent_loop/agent_loop.py
index 9458c8f8123..8a32691e1a9 100644
--- a/verl/experimental/agent_loop/agent_loop.py
+++ b/verl/experimental/agent_loop/agent_loop.py
@@ -84,12 +84,12 @@ def _choose_server(self, request_id: str) -> ray.actor.ActorHandle:
 
     @rollout_trace_op
     async def generate(
-            self,
-            request_id,
-            *,
-            prompt_ids: list[int],
-            sampling_params: dict[str, Any],
-            image_data: Optional[list[Any]] = None,
+        self,
+        request_id,
+        *,
+        prompt_ids: list[int],
+        sampling_params: dict[str, Any],
+        image_data: Optional[list[Any]] = None,
     ) -> TokenOutput:
         """Generate tokens from prompt ids.
 
@@ -179,12 +179,12 @@ class AgentLoopBase(ABC):
     _class_initialized = False
 
     def __init__(
-            self,
-            trainer_config: _DummyConfig,
-            server_manager: AsyncLLMServerManager,
-            tokenizer: AutoTokenizer,
-            processor: AutoProcessor,
-            **kwargs,
+        self,
+        trainer_config: _DummyConfig,
+        server_manager: AsyncLLMServerManager,
+        tokenizer: AutoTokenizer,
+        processor: AutoProcessor,
+        **kwargs,
     ):
         """Initialize agent loop, each sample will have its own loop instance.
 
@@ -329,8 +329,8 @@ def __init__(self, config: DictConfig, local_path: str, rm_executor: BatchExecut
         self.rm_executor = rm_executor
 
     def compute_score(
-            self,
-            data: DataProto,
+        self,
+        data: DataProto,
     ) -> dict:
         """Compute reward score for agent loop output.
 
@@ -354,7 +354,7 @@ class AgentLoopWorkerBase:
     """Agent loop worker takes a batch of messages and run each message in an agent loop."""
 
     def __init__(
-            self, config: DictConfig, server_handles: list[ray.actor.ActorHandle], rm_executor: BatchExecutor = None
+        self, config: DictConfig, server_handles: list[ray.actor.ActorHandle], rm_executor: BatchExecutor = None
     ):
         """Initialize agent loop manager.
 
@@ -364,7 +364,7 @@ def __init__(
         """
         self.config = config
 
-        if self.AsyncLLMServerManager == None:
+        if self.AsyncLLMServerManager is None:
             self.AsyncLLMServerManager = AsyncLLMServerManager
 
         self.server_manager = self.AsyncLLMServerManager(config, server_handles)
@@ -458,19 +458,19 @@ async def generate_sequences(self, batch: DataProto) -> DataProto:
         return output
 
     async def _run_agent_loop(
-            self,
-            sampling_params: dict[str, Any],
-            trajectory: dict[str, Any],
-            *,
-            agent_name: str,
-            **kwargs,
+        self,
+        sampling_params: dict[str, Any],
+        trajectory: dict[str, Any],
+        *,
+        agent_name: str,
+        **kwargs,
     ) -> _InternalAgentLoopOutput:
         with rollout_trace_attr(
-                step=trajectory["step"],
-                sample_index=trajectory["sample_index"],
-                rollout_n=trajectory["rollout_n"],
-                validate=trajectory["validate"],
-                name="agent_loop",
+            step=trajectory["step"],
+            sample_index=trajectory["sample_index"],
+            rollout_n=trajectory["rollout_n"],
+            validate=trajectory["validate"],
+            name="agent_loop",
         ):
             assert agent_name in _agent_loop_registry, (
                 f"Agent loop {agent_name} not registered, registered agent loops: {_agent_loop_registry.keys()}"
@@ -553,8 +553,8 @@ async def _run_agent_loop(
             # TODO: support other multi-modal inputs
             multi_modal_inputs = None
             if (
-                    self.processor is not None
-                    and "Qwen2VLImageProcessor" in self.processor.image_processor.__class__.__name__
+                self.processor is not None
+                and "Qwen2VLImageProcessor" in self.processor.image_processor.__class__.__name__
             ):
                 from verl.models.transformers.qwen2_vl import get_rope_index
 
@@ -583,8 +583,8 @@ async def _run_agent_loop(
             else:
                 position_ids = compute_position_id_with_mask(attention_mask)  # (1, seq_len)
             enable_async_reward = (
-                                          self.rm_executor is not None and self.config.reward_model.enable_resource_pool
-                                  ) or not self.config.reward_model.enable
+                self.rm_executor is not None and self.config.reward_model.enable_resource_pool
+            ) or not self.config.reward_model.enable
             if output.reward_score is None and enable_async_reward:
                 batch = TensorDict(
                     {
@@ -691,10 +691,12 @@ def _postprocess(self, inputs: list[_InternalAgentLoopOutput]) -> DataProto:
 
 @ray.remote
 class AgentLoopWorker(AgentLoopWorkerBase):
-    def __init__(self, config: DictConfig, server_handles: list[ray.actor.ActorHandle],
-                 rm_executor: BatchExecutor = None):
+    def __init__(
+        self, config: DictConfig, server_handles: list[ray.actor.ActorHandle], rm_executor: BatchExecutor = None
+    ):
         super().__init__(config, server_handles, rm_executor)
 
+
 async def get_trajectory_info(step, index, validate):
     """Get trajectory info.
 
@@ -859,7 +861,7 @@ def _performance_metrics(self, metrics: list[list[dict[str, str]]], output: Data
 
         return timing
 
-    async def wake_up(self):
+    def wake_up(self):
         """Wake up all rollout replica instances."""
         self._run_all([replica.wake_up() for replica in self.rollout_replicas])
 
diff --git a/verl/workers/rollout/vllm_rollout/vllm_async_server.py b/verl/workers/rollout/vllm_rollout/vllm_async_server.py
index 02c02417744..c4feae92c3f 100644
--- a/verl/workers/rollout/vllm_rollout/vllm_async_server.py
+++ b/verl/workers/rollout/vllm_rollout/vllm_async_server.py
@@ -328,6 +328,10 @@ async def sleep(self):
 
 
 class vLLMReplica(RolloutReplica):
+    def __init__(self, replica_rank: int, config: DictConfig, gpus_per_node: int = 8):
+        super().__init__(replica_rank, config, gpus_per_node)
+        self.server_class = vLLMHttpServer
+
     def get_ray_class_with_init_args(self) -> RayClassWithInitArgs:
         """Get rollout worker actor class for colocated and standalone mode."""
         worker_dict_cls = RayClassWithInitArgs(
@@ -362,7 +366,7 @@ async def launch_servers(self):
         for node_rank in range(nnodes):
             workers = self.workers[node_rank * gpus_per_node : (node_rank + 1) * gpus_per_node]
             node_id = worker_node_ids[node_rank * gpus_per_node]
-            server = vLLMHttpServer.options(
+            server = self.server_class.options(
                 scheduling_strategy=ray.util.scheduling_strategies.NodeAffinitySchedulingStrategy(
                     node_id=node_id,
                     soft=False,

From 3898c5fc4666cf10665d3cf9289a7355abcaf8f3 Mon Sep 17 00:00:00 2001
From: wangshulin02 <953550366@qq.com>
Date: Wed, 17 Sep 2025 15:52:52 +0800
Subject: [PATCH 148/182] translate

---
 .../agent_loop/agent_loop.py                  |  13 +-
 .../partial_single_turn_agent_loop.py         |  10 +-
 recipe/fully_async_policy/detach_utils.py     |  74 ++++-----
 recipe/fully_async_policy/fsdp_workers.py     |   2 +-
 .../fully_async_rollouter.py                  | 148 ++++++++----------
 .../fully_async_policy/fully_async_trainer.py |   5 +-
 recipe/fully_async_policy/ray_trainer.py      |   5 +-
 .../vllm_rollout/vllm_async_server.py         |   6 +-
 8 files changed, 124 insertions(+), 139 deletions(-)

diff --git a/recipe/fully_async_policy/agent_loop/agent_loop.py b/recipe/fully_async_policy/agent_loop/agent_loop.py
index 901e584f7c3..db29229915d 100644
--- a/recipe/fully_async_policy/agent_loop/agent_loop.py
+++ b/recipe/fully_async_policy/agent_loop/agent_loop.py
@@ -614,23 +614,24 @@ async def generate_single_sample_async(
         partial_output_list: Optional[list[AgentLoopOutput]],
     ) -> list[AgentLoopOutput]:
         """
-        异步处理单个样本, 需要复制n次
+        Asynchronously process a single sample
 
         Args:
-            sample: 单个样本数据
+            sample: Single sample data
             partial_output_list: Optional[List[AgentLoopOutput]]: already rollout result.
 
         Returns:
-            tuple[AgentLoopOutput, float]: 处理结果和处理时间
+            list[AgentLoopOutput]: Processing results
         """
-        # 使用负载均衡选择 worker
+        # select a worker
         worker = self._select_best_worker()
-        # 异步处理单个样本 - 使用无后处理版本获取原始AgentLoopOutput
+        # Process a single sample asynchronously，
+        # get the raw AgentLoopOutput using the no post-processing version
         output_future = worker.generate_sequences_no_post.remote(sample, param_version, partial_output_list)
         return await asyncio.wrap_future(output_future.future())
 
     def _select_best_worker(self):
-        """选择最佳的 worker（简单的轮询负载均衡）"""
+        """Select the best worker, simple round-robin load balancing"""
         if not hasattr(self, "_worker_index"):
             self._worker_index = 0
 
diff --git a/recipe/fully_async_policy/agent_loop/partial_single_turn_agent_loop.py b/recipe/fully_async_policy/agent_loop/partial_single_turn_agent_loop.py
index c97f794bb9c..25964406753 100644
--- a/recipe/fully_async_policy/agent_loop/partial_single_turn_agent_loop.py
+++ b/recipe/fully_async_policy/agent_loop/partial_single_turn_agent_loop.py
@@ -49,12 +49,16 @@ async def run(
             param_version_start = param_version
         else:
             if output.is_cancel:
-                # 恢复暂停的样本，结果直接添加到 prompt_ids 后面
+                # Resume the paused sample,
+                # add the result directly after prompt_ids,
+                # and reset generate_sequences metric
                 prompt_ids = output.prompt_ids + output.response_ids
                 metrics["generate_sequences"] = output.metrics.generate_sequences
                 param_version_start = output.param_version_start
             else:
-                # 同一批样本，部分cancel，部分没有cancel， 没有cancel的样本直接返回
+                # In the same batch of samples,
+                # ome are canceled and some are not.
+                # The samples without partial rollout are returned directly.
                 return output
         param_version_end = param_version
         request_id = uuid4().hex
@@ -65,8 +69,8 @@ async def run(
 
         if not output:
             response_mask = [1] * len(response_ids)
-        # 暂停待恢复样本, 把输出结果加到 response_ids 后，并重置 response_mask
         else:
+            # Pause the sample to be resumed, add the output result to response_ids, and reset response_mask
             prompt_ids = output.prompt_ids
             log_probs = output.log_probs + log_probs
             response_ids = output.response_ids + response_ids
diff --git a/recipe/fully_async_policy/detach_utils.py b/recipe/fully_async_policy/detach_utils.py
index 32a36bb882a..35128a4ade1 100644
--- a/recipe/fully_async_policy/detach_utils.py
+++ b/recipe/fully_async_policy/detach_utils.py
@@ -53,6 +53,8 @@ class RolloutSample:
 
 @dataclass
 class ValidateMetrics:
+    """Metrics for validation"""
+
     timing_raw: dict[str, Any]
     metrics: Optional[dict[str, Any]] = None
     global_steps: Optional[int] = None
@@ -61,23 +63,15 @@ class ValidateMetrics:
 
 def prepare_single_generation_data(batch_dict, global_steps, rollout_n) -> DataProto:
     """
-    类似 ray_trainer._prepare_generate_batch 的逻辑，但针对单个样本
-    分离出用于生成的数据和需要保留的原始数据
+    Similar to the logic of ray_trainer._prepare_generate_batch, but for a single sample.
+    Separate the data used for generation from the original data.
 
     Returns:
         tuple: (original_batch_dict, gen_data_for_single_sample)
     """
 
-    # 创建完整的 DataProto
     full_batch = DataProto.from_single_dict(batch_dict)
 
-    # batch : TensorDict { input_ids, attention_mask, position_ids}
-    # non_tensor_batch: raw_prompt_ids, raw_prompt,
-    #                   multi_modal_data, tools_kwargs, interaction_kwargs, index, agent_name,
-    #                   data_source, ability, reward_model
-    # meta_info: {}
-
-    # 定义需要传递给生成服务器的字段
     batch_keys_to_pop = ["input_ids", "attention_mask", "position_ids"]
     non_tensor_batch_keys_to_pop = ["raw_prompt_ids"]
 
@@ -86,10 +80,10 @@ def prepare_single_generation_data(batch_dict, global_steps, rollout_n) -> DataP
         non_tensor_batch_keys=non_tensor_batch_keys_to_pop,
     )
 
-    # 设置使用支持partial的agent
+    # Setting agent - partial_single_turn_agent, that supports partial
     full_batch.non_tensor_batch["agent_name"] = np.array(["partial_single_turn_agent"] * len(full_batch), dtype=object)
 
-    # 添加全局步数到生成数据
+    # Add global step count to generated data
     full_batch.meta_info["global_steps"] = global_steps
     full_batch = full_batch.repeat(repeat_times=rollout_n, interleave=True)
     return full_batch
@@ -97,32 +91,29 @@ def prepare_single_generation_data(batch_dict, global_steps, rollout_n) -> DataP
 
 def process_rollout_log_probs(data_proto: DataProto, rollout_log_probs: list[list[float]]) -> torch.Tensor:
     """
-    根据 DataProto 中的 mask 逻辑处理 rollout_log_probs
-    # attention_mask: [0,0,0,0,1,1,1,1, | 1,1,1,0,0,0,0,0]
+    Process rollout_log_probs according to the mask in DataProto
+    mask: [0,0,0,0,1,1,1,1, | 1,1,1,0,0,0,0,0]
 
     Args:
-        data_proto: 包含 batch 信息的 DataProto 对象
-        rollout_log_probs: 二维列表，每个子列表包含一个样本的 log_probs
+        data_proto: A DataProto object containing batch information
+        rollout_log_probs: A two-dimensional list, each sublist containing the log_probs of a sample
 
     Returns:
-        torch.Tensor: 处理后的 log_probs tensor，形状为 [bsz, response_length]
+        torch.Tensor: The processed log_probs tensor, with shape: [bsz, response_length]
     """
 
     batch = data_proto.batch
     response_mask = batch["response_mask"]
-    bsz, response_length = response_mask.shape
-
-    # 初始化结果 tensor
-    rollout_log_probs_tensor = torch.zeros((bsz, response_length), dtype=torch.float32) - 1
+    rollout_log_probs_tensor = torch.zeros(response_mask.shape, dtype=torch.float32) - 1
 
     for i, log_probs_seq in enumerate(rollout_log_probs):
-        # 获取当前样本的有效长度（mask 中为 1 的位置数量）
+        # Get the effective length of the current sample (the number of positions with 1 in the mask)
         valid_length = response_mask[i].sum().item()
 
-        # 确保 log_probs_seq 的长度不超过有效长度
+        # Ensure that the length of log_probs_seq does not exceed the valid length
         actual_length = min(len(log_probs_seq), valid_length)
 
-        # 将 log_probs 填入对应位置
+        # Fill log_probs into the corresponding position
         if actual_length > 0:
             rollout_log_probs_tensor[i, :actual_length] = torch.tensor(log_probs_seq[:actual_length])
 
@@ -131,29 +122,32 @@ def process_rollout_log_probs(data_proto: DataProto, rollout_log_probs: list[lis
 
 
 def merge_rollout_sample(config, tokenizer, rs: RolloutSample):
-    # 第一步：从 AgentLoopOutput 创建生成结果的 DataProto
+    """
+    Supplement and refine the RolloutSample object,
+    """
+    # Step 1: Create a DataProto from the AgentLoopOutput to generate the result
     gen_batch_output = postprocess_agent_loop_outputs(rs.agent_loop_output_list, tokenizer, config)
     rollout_log_probs = [x.log_probs for x in rs.agent_loop_output_list]
     rollout_log_probs = process_rollout_log_probs(gen_batch_output, rollout_log_probs)
     gen_batch_output.batch["rollout_log_probs"] = rollout_log_probs.to(torch.float32)
 
-    # 第二步：添加 uid
+    # Step 2: Add uid
     rs.full_batch.non_tensor_batch["uid"] = np.array([f"uid_{rs.sample_id}"] * len(rs.full_batch), dtype=object)
 
-    # 第二步：合并batch
-    # 将 original_batch 的 non_tensor_batch 和 meta_info 合并到 final_batch
+    # Step 2: Merge batches
+    # Merge the non_tensor_batch and meta_info of original_batch into final_batch
     for key, value in rs.full_batch.non_tensor_batch.items():
         gen_batch_output.non_tensor_batch[key] = value
     gen_batch_output.meta_info.update(rs.full_batch.meta_info)
 
-    # 第三步，设置 full_batch
+    # Step 3, set full_batch
     rs.full_batch = gen_batch_output
     rs.processing_times = []
     for agent_loop in rs.agent_loop_output_list:
         rs.processing_times.append(agent_loop.metrics.generate_sequences)
     rs.param_version_start = [agent_loop.param_version_start for agent_loop in rs.agent_loop_output_list]
     rs.param_version_end = [agent_loop.param_version_end for agent_loop in rs.agent_loop_output_list]
-    # 第四步，清空 agent_loop_output_list
+    # Step 4, clear agent_loop_output_list
     rs.agent_loop_output_list = []
 
     return rs
@@ -164,7 +158,7 @@ def assemble_batch_from_rollout_samples(
 ) -> DataProto:
     """
     Assemble gen_batch_output from RolloutSample objects
-    从 RolloutSample 对象中组装批次，类似 ray_trainer 的 _post_generate_batch 逻辑
+    Assembles batches from RolloutSample objects, similar to the _post_generate_batch logic in ray_trainer.
 
     Args:
         rollout_samples: List of RolloutSample objects
@@ -188,7 +182,7 @@ def assemble_batch_from_rollout_samples(
     rollout_samples_batch = []
     processing_times = []
     rollout_status = rollout_samples[0].rollout_status
-    # 为 rollout_status 的所有 key 添加前缀
+    # Add a prefix to all rollout_status keys
     rollout_status = {f"fully_async/{key}": value for key, value in rollout_status.items()}
 
     for rs in rollout_samples:
@@ -196,18 +190,18 @@ def assemble_batch_from_rollout_samples(
         processing_times.extend(rs.processing_times)
     final_batch = DataProto.concat(rollout_samples_batch)
 
-    # 计算 response_mask（如果不存在）
+    # Calculate response_mask (if not present)
     if "response_mask" not in final_batch.batch.keys():
         final_batch.batch["response_mask"] = compute_response_mask(final_batch)
 
     if balance_batch:
         balance_batch(final_batch, metrics={})
 
-    # 计算全局有效 token 数
+    # Calculate the global valid token number
     if "attention_mask" in final_batch.batch:
         final_batch.meta_info["global_token_num"] = torch.sum(final_batch.batch["attention_mask"], dim=-1).tolist()
 
-    # 收集统计信息和元数据（直接从 RolloutSample 中获取）
+    # Collect statistics
     param_versions = [rs.param_version for rs in rollout_samples]
     trajectorys_param_versions = [version for rs in rollout_samples for version in rs.param_version_end]
 
@@ -215,9 +209,9 @@ def assemble_batch_from_rollout_samples(
         "processing_time/avg": np.mean(processing_times),
         "processing_time/max": np.max(processing_times),
         "processing_time/min": np.min(processing_times),
-        "processing_time/tp50": np.percentile(processing_times, 50),  # 中位数
-        "processing_time/tp99": np.percentile(processing_times, 99),  # 99百分位
-        "processing_time/tp95": np.percentile(processing_times, 95),  # 95百分位也很有用
+        "processing_time/tp50": np.percentile(processing_times, 50),
+        "processing_time/tp99": np.percentile(processing_times, 99),
+        "processing_time/tp95": np.percentile(processing_times, 95),
     }
     processing_time_stats = {f"fully_async/{key}": value for key, value in processing_time_stats.items()}
 
@@ -228,7 +222,7 @@ def assemble_batch_from_rollout_samples(
         "fully_async/partial/partial_ratio": (len(param_version_diff) - num_diff0) / len(param_version_diff),
         "fully_async/partial/max_partial_span": max(param_version_diff),
     }
-    # 创建 meta_info
+    # add meta_info
     final_batch.meta_info.update(
         {
             "rollout_param_versions": param_versions,
@@ -274,7 +268,7 @@ def _init_aggregation_rules(self) -> dict[str, dict[str, list[str]]]:
                 "fully_async/count/stale_trajectory_processed",
                 "fully_async/count/current_param_version",
                 "fully_async/count/dropped_stale_samples",
-                "training/global_step",  # TODO 改为total_step
+                "training/global_step",  # TODO change name to: total_step
             ],
         }
 
diff --git a/recipe/fully_async_policy/fsdp_workers.py b/recipe/fully_async_policy/fsdp_workers.py
index 7a1b59aa64c..9e4f96c2e8b 100644
--- a/recipe/fully_async_policy/fsdp_workers.py
+++ b/recipe/fully_async_policy/fsdp_workers.py
@@ -55,7 +55,7 @@ def get_inference_model(rollout):
     Args:
         rollout: rollout object
     Returns:
-        model: 模型对象
+        model: model object
     """
     inference_engine = rollout.inference_engine
     if hasattr(inference_engine, "llm_engine"):
diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py
index 1027e228c18..81ebf9780f8 100644
--- a/recipe/fully_async_policy/fully_async_rollouter.py
+++ b/recipe/fully_async_policy/fully_async_rollouter.py
@@ -115,7 +115,6 @@ def __init__(
         self.staleness_threshold: float = config.async_training.get("staleness_threshold", 1)
         self.required_samples = None
         self.max_required_samples = None
-        # 单次最多扔一次更新需要的样本
         self.max_concurrent_samples = None
         # queue size
         self.max_queue_size = None
@@ -125,22 +124,22 @@ def __init__(
         self.total_generated_samples = 0
         self.staleness_samples = 0
         self.dropped_stale_samples = 0
-        self.processed_sample_count = 0  # 已处理的样本计数
+        self.processed_sample_count = 0
         self.global_steps = 0
         self.idle_start_time = None
         self.version_start_time = None
 
         # Concurrency control
+        # Modified by self.pause() or self._should_pause_generation()
         self.paused = False
         self.running = True
-        # 通过 pause 和 resume 控制 monitor_loop 中，是否进行 尝试恢复 操作
         self.monitor_loop_trigger = True
 
         # Initialize async locks directly
         self.lock = asyncio.Lock()
         self.condition = asyncio.Condition(self.lock)
 
-        # 初始化异步队列
+        # Initialize async queues
         self.pending_queue = asyncio.Queue(maxsize=128)
         self.active_tasks = set()
         self.result_queue = asyncio.Queue()
@@ -164,7 +163,7 @@ async def set_required_samples(self, required_samples: int):
                 / (self.required_samples * self.config.async_training.trigger_parameter_sync_step)
             )
 
-            # 单次最多扔一次更新需要的样本
+            # max_concurrent_samples should be related to the resources
             self.max_concurrent_samples = self.async_rollout_manager.rollout_dp_size * 16
             self.max_concurrent_samples = min(self.max_concurrent_samples, self.max_required_samples)
             self.max_queue_size = self.max_required_samples
@@ -275,12 +274,12 @@ def _init_async_rollout_manager(self):
             worker_group=self.rollout_wg,
         )
 
-    # 添加样本到待处理队列的协程
+    # Add samples to the pending_queue
     async def _feed_samples(self):
         continuous_iterator = self._create_continuous_iterator()
 
         for epoch, batch_dict in continuous_iterator:
-            # 类似 _prepare_generate_batch 的逻辑：分离数据
+            # Similar to _prepare_generate_batch: Separate data
             full_batch = prepare_single_generation_data(
                 batch_dict, self.global_steps, self.config.actor_rollout_ref.rollout.n
             )
@@ -289,10 +288,10 @@ async def _feed_samples(self):
 
             rollout_sample = RolloutSample(
                 full_batch=full_batch,
-                agent_loop_output_list=[None] * self.config.actor_rollout_ref.rollout.n,  # 待处理后填充
+                agent_loop_output_list=[None] * self.config.actor_rollout_ref.rollout.n,
                 sample_id=sample_id,
                 epoch=epoch,
-                param_version=0,  # 待处理后填充
+                param_version=0,
                 param_version_start=[],
                 param_version_end=[],
                 processing_times=[],
@@ -301,23 +300,25 @@ async def _feed_samples(self):
 
             await self.pending_queue.put(rollout_sample)
 
-            # 检查是否到达最后一步
+            # Check if have reached the last step
             if self.global_steps >= self.total_rollout_steps:
                 print(
                     f"[FullyAsyncRollouter][Feed] "
-                    f"达到最大步数，停止添加新样本 "
+                    f"Maximum count has been reached, stop adding new samples"
                     f"{self.global_steps} >= {self.total_rollout_steps}"
                 )
                 break
 
             self.global_steps += 1
 
-        # 发送结束信号
+        # End signal
         await self.pending_queue.put("DONE")
-        print(f"[FullyAsyncRollouter][Feed] 样本添加完成，总共添加了 {self.global_steps} 个步骤的样本")
+        print(f"[FullyAsyncRollouter][Feed] Sample addition is complete, {self.global_steps} samples have been added")
 
     async def _processor_worker(self):
-        """流式处理工作协程 - 逐个样本立即提交处理，不等待批次"""
+        """
+        Streaming worker coroutines, a sample is submitted for processing without waiting for batches
+        """
 
         while True:
             simple_from_cancel_queue = False
@@ -328,15 +329,15 @@ async def _processor_worker(self):
                 rollout_sample = await self.pending_queue.get()
                 self.staleness_samples += 1
 
-            # 判断是否需要暂停
-            # self.paused 由 pause() 和 self._should_pause_generation() 负责修改
             if self.paused or await self._should_pause_generation():
-                print("[FullyAsyncRollouter][Processor] 收到暂停信号，等待剩余任务完成...")
+                print(
+                    "[FullyAsyncRollouter][Processor] Received pause signal, waiting for remaining tasks to return..."
+                )
                 async with self.lock:
                     self.paused = True
                 while self.active_tasks:
                     async with self.lock:
-                        # 获取锁后，active_tasks 数量会发生变化，需要再次校验
+                        # After acquiring the lock, the number of active_tasks may change, need to be verified again
                         if self.active_tasks:
                             done_tasks, self.active_tasks = await asyncio.wait(
                                 self.active_tasks, return_when=asyncio.FIRST_COMPLETED
@@ -349,9 +350,10 @@ async def _processor_worker(self):
                         self.idle_start_time = time.time()
                         await self.condition.wait()
 
-            # 获取待处理的部分 RolloutSample
             if rollout_sample == "DONE":
-                print("[FullyAsyncRollouter][Processor] 收到结束信号，等待剩余任务完成...")
+                print(
+                    "[FullyAsyncRollouter][Processor] Received end signal, waiting for the remaining tasks to complete..."
+                )
                 while self.active_tasks:
                     async with self.lock:
                         if self.active_tasks:
@@ -362,7 +364,7 @@ async def _processor_worker(self):
                             await task
                 break
 
-            # 检查并发数是否超限
+            # Check whether the number of concurrent tasks exceeds the limit
             while len(self.active_tasks) >= self.max_concurrent_samples:
                 async with self.lock:
                     if self.active_tasks:
@@ -372,9 +374,10 @@ async def _processor_worker(self):
                     for task in done_tasks:
                         await task
 
-            # 立即提交单个样本处理
+            # Submit single sample processing
             async with self.lock:
-                # pause结束后，获取到锁，还需要判断是否是暂停阶段，否则继续等待
+                # After the pause is over, the lock is acquired and it is necessary
+                # to determine whether it is the pause phase, otherwise continue to wait
                 while self.paused:
                     await self.condition.wait()
                 task = asyncio.create_task(
@@ -383,41 +386,29 @@ async def _processor_worker(self):
                 )
                 self.active_tasks.add(task)
 
-            # 标记队列任务完成
             if simple_from_cancel_queue:
                 self.cancel_queue.task_done()
             else:
                 self.pending_queue.task_done()
 
     async def _process_single_sample_streaming(self, rollout_sample: RolloutSample):
-        """流式处理单个样本"""
-        # 调用异步生成方法
+        """Process a single sample streamingly"""
+        # Calling asynchronous generation methods
         agent_loop_output_list = await self.async_rollout_manager.generate_single_sample_async(
             rollout_sample.full_batch, self.current_param_version, rollout_sample.agent_loop_output_list
         )
-        # 直接更新 RolloutSample 对象，填充剩余字段
         rollout_sample.agent_loop_output_list = agent_loop_output_list
 
         is_cancel = False
-        # 收集所有信息
         for agent_loop in agent_loop_output_list:
             if not is_cancel and agent_loop.is_cancel:
                 is_cancel = True
 
-        # rollout_data = {
-        #     "cost": [f"{agent_loop.metrics.generate_sequences:.2f}s" for agent_loop in agent_loop_output_list],
-        #     "len": [len(agent_loop.response_ids) for agent_loop in agent_loop_output_list],
-        # }
-        # if is_cancel:
-        #     rollout_data["cancel"] = [agent_loop.is_cancel for agent_loop in agent_loop_output_list]
-        # formatted_data = pformat(rollout_data, width=200, compact=True)
-        # print(f"[FullyAsyncRollouter] rollout {rollout_sample.sample_id} {formatted_data}")
-
         if is_cancel:
-            # 放入 cancel 队列中，等待恢复生成
+            # Put in the cancel queue and wait for the generation to resume
             await self.cancel_queue.put(rollout_sample)
         else:
-            # 否则放入结果队列
+            # put into the result_queue
             rollout_sample.param_version = self.current_param_version
             rollout_sample.rollout_status = await self.get_statistics()
             await self.result_queue.put(rollout_sample)
@@ -425,13 +416,15 @@ async def _process_single_sample_streaming(self, rollout_sample: RolloutSample):
         self.processed_sample_count += 1
 
     async def _consumer_worker(self):
-        """消费者协程，负责从结果队列获取处理结果并放入消息队列"""
+        """
+        The consumer coroutine is responsible for obtaining the processing results
+        from the result queue and putting them into the message queue
+        """
         while True:
-            # 从结果队列获取 RolloutSample
             rollout_sample = await self.result_queue.get()
             rollout_sample = merge_rollout_sample(self.config, self.tokenizer, rollout_sample)
 
-            # 将 RolloutSample 放入消息队列
+            # Put RolloutSample into the message queue
             success = await self.message_queue_client.put_sample(
                 sample=ray.cloudpickle.dumps(rollout_sample),
                 param_version=rollout_sample.param_version,
@@ -441,55 +434,50 @@ async def _consumer_worker(self):
             else:
                 self.dropped_stale_samples += 1
 
-            # 标记结果队列任务完成
             self.result_queue.task_done()
 
     async def _streaming_generation_main(self):
-        """流式处理的主入口方法，包含初始化和验证逻辑"""
+        """The main entry method for stream processing"""
 
         # we start from step 1
         self.global_steps += 1
 
-        # 确保async_rollout_manager已经初始化
         if self.async_rollout_manager is None:
             self._init_async_rollout_manager()
 
-        # 启动流式处理循环
-        print(f"[FullyAsyncRollouter] 启动流式处理模式，最大并发样本数: {self.max_concurrent_samples}")
+        # Start the streaming loop
+        print(f"[FullyAsyncRollouter] Start streaming mode, maximum concurrent samples: {self.max_concurrent_samples}")
 
-        # 启动流式处理协程和消费者协程
+        # Start sample feed coroutine, streaming process coroutine and consumer coroutine
         self.feed_task = asyncio.create_task(self._feed_samples())
         self.processor_task = asyncio.create_task(self._processor_worker())
         self.consumer_task = asyncio.create_task(self._consumer_worker())
-        # 启动样本添加协程
 
         try:
-            # 等待样本添加完成
+            # Wait for sample feed to complete
             await self.feed_task
-            print("[FullyAsyncRollouter] 样本添加完成")
+            print("[FullyAsyncRollouter] Sample feed completed")
 
-            # 等待流式处理完成
+            # Wait for streaming to complete
             await self.processor_task
-            print("[FullyAsyncRollouter] 流式处理完成")
+            print("[FullyAsyncRollouter] Streaming process completed")
 
-            # 等待结果队列清空
+            # Waiting for the result queue to clear
             await self.result_queue.join()
-            print("[FullyAsyncRollouter] 所有结果处理完成")
+            print("[FullyAsyncRollouter] Result queue cleared")
 
         except Exception as e:
-            print(f"[FullyAsyncRollouter] 流式处理异常: {e}")
+            print(f"[FullyAsyncRollouter] Streaming process exception:{e}")
 
         finally:
-            # 取消所有任务
             if self.processor_task:
                 self.processor_task.cancel()
             if self.consumer_task:
                 self.consumer_task.cancel()
 
-            # 等待任务结束
             await asyncio.gather(self.processor_task, self.consumer_task, return_exceptions=True)
 
-        # 发送终止信号
+        # Send a finish signal
         await self.message_queue_client.put_sample(
             sample=None,
             param_version=self.current_param_version,
@@ -501,35 +489,35 @@ async def _streaming_generation_main(self):
     async def fit(self):
         """
         Start the async rollouter - entry point that sets up and runs async tasks
-        Main async fit method that coordinates all coroutines"""
+        Main async fit method that coordinates all coroutines
+        """
 
         print("[FullyAsyncRollouter] Starting FullyAsyncRollouter...")
 
         if self.message_queue_client is None:
             raise ValueError("MessageQueue client not set. Call set_message_queue_client() first.")
 
-        # 设置运行状态
+        # Set the running status flag
         async with self.lock:
             self.paused = False
             self.running = True
 
-        # 创建主要的异步任务
+        # Create the main asynchronous task
         generation_task = asyncio.create_task(self._streaming_generation_main())
         monitor_task = asyncio.create_task(self._async_monitor_loop())
 
         try:
-            # 并发运行生成和监控任务
+            # Run build and monitoring tasks concurrently
             await asyncio.gather(generation_task, monitor_task, return_exceptions=True)
         except Exception as e:
-            print(f"[FullyAsyncRollouter] 异步任务执行出错: {e}")
+            print(f"[FullyAsyncRollouter] Asynchronous task execution error: {e}")
         finally:
-            # 清理任务
             if not generation_task.done():
                 generation_task.cancel()
             if not monitor_task.done():
                 monitor_task.cancel()
 
-            # 等待任务完成
+            # Wait for the task to complete
             await asyncio.gather(generation_task, monitor_task, return_exceptions=True)
 
         print("[FullyAsyncRollouter] Rollouter fit completed")
@@ -549,14 +537,14 @@ async def _async_monitor_loop(self):
                 if not self.running:
                     break
             await asyncio.sleep(check_interval)
-            # 定期打印统计信息
+            # Print statistics periodically
             current_time = time.time()
             if current_time - last_stats_time >= stats_interval:
                 stats = await self.get_statistics()
                 print(f"[FullyAsyncRollouter][MonitorLoop][Statistics] {pformat(stats)}")
                 last_stats_time = current_time
 
-            # pause 和 resume 之间，不进行恢复操作
+            # Trigger rollout recovery
             if self.monitor_loop_trigger:
                 if not await self._should_pause_generation():
                     async with self.lock:
@@ -594,7 +582,7 @@ async def pause(self):
         print("[FullyAsyncRollouter][Public][Pause]")
         async with self.lock:
             self.paused = True
-            # 取消rollout所有任务
+            # Cancel all rollout tasks
             if self.config.async_training.partial_rollout:
                 await self.async_rollout_manager.cancel_async()
             if self.active_tasks:
@@ -619,23 +607,23 @@ async def get_statistics(self) -> dict:
         queue_stats = self.message_queue_client.get_statistics_sync()
 
         stats = {
-            # static stats
-            "static/max_required_samples": self.max_required_samples,
-            "static/required_samples": self.required_samples,
-            "static/staleness_threshold": self.staleness_threshold,
-            "static/max_queue_size": self.max_queue_size,
-            "static/max_concurrent_samples": self.max_concurrent_samples,
-            # counting stats
-            "count/current_param_version": self.current_param_version,
-            "count/total_generated_samples": self.total_generated_samples,
-            "count/staleness_samples": self.staleness_samples,
-            "count/dropped_stale_samples": self.dropped_stale_samples,
             # monitor stats
             "monitor/active_tasks_size": len(self.active_tasks),
             "monitor/queue/pending_queue_size": self.pending_queue.qsize(),
             "monitor/queue/cancel_queue_size": self.cancel_queue.qsize(),
             "monitor/queue/result_queue_size": self.result_queue.qsize(),
             "monitor/queue/mq_queue_size": queue_stats["queue_size"],
+            # counting stats
+            "count/current_param_version": self.current_param_version,
+            "count/total_generated_samples": self.total_generated_samples,
+            "count/staleness_samples": self.staleness_samples,
+            "count/dropped_stale_samples": self.dropped_stale_samples,
+            # static stats
+            "static/max_required_samples": self.max_required_samples,
+            "static/required_samples": self.required_samples,
+            "static/staleness_threshold": self.staleness_threshold,
+            "static/max_queue_size": self.max_queue_size,
+            "static/max_concurrent_samples": self.max_concurrent_samples,
         }
 
         return stats
diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py
index f014993f13e..c3d5773bde2 100644
--- a/recipe/fully_async_policy/fully_async_trainer.py
+++ b/recipe/fully_async_policy/fully_async_trainer.py
@@ -158,11 +158,10 @@ def _get_samples_from_queue(self) -> tuple[None, None] | tuple[int, Any]:
         queue_samples = []
 
         while len(queue_samples) < self.required_samples:
-            # 获取单个样本，会一直等待直到有样本或收到None
+            # Get a single sample and wait until there is a sample or None is received
             sample, queue_len = self.message_queue_client.get_sample_sync()
 
             if sample is None:
-                # 检测到结束信号（None），立即退出
                 print(
                     f"[FullyAsyncTrainer] Detected termination signal (None), stopping sample collection. "
                     f"Collected {len(queue_samples)}/{self.required_samples} samples"
@@ -322,7 +321,7 @@ def fit(self):
             pprint(f"[FullyAsyncTrainer] Final validation metrics: {val_data.metrics}")
         self.progress_bar.close()
 
-        self._check_save_checkpoint(True, timing_raw)  # TODO: 检查checkpoint
+        self._check_save_checkpoint(True, timing_raw)  # TODO: check checkpoint
 
     def load_checkpoint(self):
         return self._load_checkpoint()
diff --git a/recipe/fully_async_policy/ray_trainer.py b/recipe/fully_async_policy/ray_trainer.py
index dea3aa2c26e..c0543191e8e 100644
--- a/recipe/fully_async_policy/ray_trainer.py
+++ b/recipe/fully_async_policy/ray_trainer.py
@@ -80,11 +80,10 @@ class Role(Enum):
     ActorRolloutRef = 6
 
     def __str__(self):
-        """返回与代码中一致的字符串表示"""
         return self._get_role_string()
 
     def _get_role_string(self):
-        """获取角色对应的字符串名称"""
+        """Get the string name corresponding to the role"""
         role_mapping = {
             Role.Actor: "actor",
             Role.Rollout: "rollout",
@@ -98,7 +97,7 @@ def _get_role_string(self):
 
     @classmethod
     def from_string(cls, name: str):
-        """从字符串创建Role实例"""
+        """Create a Role instance from a string"""
         string_mapping = {
             "actor": cls.Actor,
             "rollout": cls.Rollout,
diff --git a/recipe/fully_async_policy/vllm_rollout/vllm_async_server.py b/recipe/fully_async_policy/vllm_rollout/vllm_async_server.py
index 4826ebaa1d0..2f2cef94a0e 100644
--- a/recipe/fully_async_policy/vllm_rollout/vllm_async_server.py
+++ b/recipe/fully_async_policy/vllm_rollout/vllm_async_server.py
@@ -349,10 +349,9 @@ async def _generate_step(self, prompt_ids: list[int], sampling_params: dict[str,
     async def generate_for_partial(
         self, prompt_ids: list[int], sampling_params: dict[str, Any], request_id: str
     ) -> tuple[list[Any], list[Any], bool] | tuple[Sequence[int], list[float], Any]:
-        # 设置中断标志
         async with self.lock:
             if self.paused:
-                # cancel 后， 所有任务直接返回，等待下次提交
+                # After cancel, all tasks will return directly and wait for the next submission
                 return [], [], True
             self.cancel_event[request_id] = asyncio.Event()
             cancel_handle = asyncio.create_task(self.cancel_event[request_id].wait())
@@ -370,7 +369,8 @@ async def generate_for_partial(
             token_ids = self.req_output[request_id].outputs[0].token_ids
             log_probs: list[float] = []
             for i, x in enumerate(self.req_output[request_id].outputs[0].logprobs):
-                # sampling_params 中 logprobs 设置为1，应该返回1个, 但是实测会有多个，取token_id所对应的log_prob
+                # In sampling_params, logprobs is set to 1, which should return 1,
+                # but in practice there are multiple. Take the log_prob corresponding to token_id
                 token_id = self.req_output[request_id].outputs[0].token_ids[i]
                 log_probs.append(x[token_id].logprob)
             is_cancel = generation_handle not in done

From 106f5eb75adb1f0e3614d8a5d2d807cdd301effb Mon Sep 17 00:00:00 2001
From: wangshulin02 <953550366@qq.com>
Date: Wed, 17 Sep 2025 16:11:46 +0800
Subject: [PATCH 149/182] fix typo

---
 recipe/fully_async_policy/fully_async_rollouter.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py
index 81ebf9780f8..dcc84f81993 100644
--- a/recipe/fully_async_policy/fully_async_rollouter.py
+++ b/recipe/fully_async_policy/fully_async_rollouter.py
@@ -352,7 +352,7 @@ async def _processor_worker(self):
 
             if rollout_sample == "DONE":
                 print(
-                    "[FullyAsyncRollouter][Processor] Received end signal, waiting for the remaining tasks to complete..."
+                    "[FullyAsyncRollouter][Processor] Received end signal, waiting for remaining tasks to complete..."
                 )
                 while self.active_tasks:
                     async with self.lock:

From c2219e0938c5b333a0c568ab51ce9e694099355f Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Thu, 18 Sep 2025 11:29:31 +0800
Subject: [PATCH 150/182] qwen3-32B-sta0

---
 .../exp/qwen2-32B_128/fsdp2_fully-async_64-64/run.sh          | 4 ++--
 .../qwen2-32B_128/fsdp2_fully-async_64-64/runtime_env.yaml    | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_64-64/run.sh b/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_64-64/run.sh
index 48be3ab3c84..c79c960a701 100644
--- a/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_64-64/run.sh
+++ b/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_64-64/run.sh
@@ -65,9 +65,9 @@ n_resp_per_prompt=16
 train_prompt_mini_bsz=16
 total_rollout_steps=$(((512*200)))
 test_freq=20
-staleness_threshold=0.1
+staleness_threshold=0
 trigger_parameter_sync_step=8
-partial_rollout=True
+partial_rollout=False
 
 python -m recipe.fully_async_policy.fully_async_main \
     data.train_files="${TRAIN_FILE}" \
diff --git a/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_64-64/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_64-64/runtime_env.yaml
index ea506be787e..61c0adbaca7 100644
--- a/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_64-64/runtime_env.yaml
+++ b/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_64-64/runtime_env.yaml
@@ -1,4 +1,4 @@
 env_vars:
   VLLM_USE_V1: "1"
-  TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-32B-128/dapo_qwen2-32B_20k_fsdp2_fully-async_64-64-tps1"
+  TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-32B-128/dapo_qwen2-32B_20k_fsdp2_fully-async_64-64-sta0"
   HYDRA_FULL_ERROR: "1"
\ No newline at end of file

From 91d199c9dd941ff03ce9415956aff3d0172897b9 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Thu, 18 Sep 2025 16:07:59 +0800
Subject: [PATCH 151/182] refactor 5

---
 .../grpo_trainer/run_qwen2-7b_seq_balance.sh  |   6 +-
 .../agent_loop/agent_loop.py                  | 175 +++---------------
 .../partial_single_turn_agent_loop.py         |  20 +-
 recipe/fully_async_policy/detach_utils.py     |   1 -
 .../fully_async_rollouter.py                  |  12 +-
 .../fully_async_policy/fully_async_trainer.py |   1 +
 recipe/fully_async_policy/ray_trainer.py      |   5 -
 .../vllm_rollout/vllm_async_server.py         |   2 +-
 tests/special_e2e/run_fully_async_policy.sh   |   2 +-
 verl/experimental/agent_loop/agent_loop.py    |   7 +-
 verl/workers/rollout/replica.py               |   1 -
 11 files changed, 54 insertions(+), 178 deletions(-)

diff --git a/examples/grpo_trainer/run_qwen2-7b_seq_balance.sh b/examples/grpo_trainer/run_qwen2-7b_seq_balance.sh
index fdc1ef606d7..f4ca9a41d7e 100644
--- a/examples/grpo_trainer/run_qwen2-7b_seq_balance.sh
+++ b/examples/grpo_trainer/run_qwen2-7b_seq_balance.sh
@@ -3,7 +3,7 @@ set -x
 
 # For async rollout mode, dataset should return raw chat.
 rollout_mode="async"
-rollout_name="sglang" # sglang or vllm
+rollout_name="vllm" # sglang or vllm
 if [ "$rollout_mode" = "async" ]; then
     export VLLM_USE_V1=1
     return_raw_chat="True"
@@ -19,7 +19,7 @@ python3 -m verl.trainer.main_ppo \
     data.max_response_length=1024 \
     data.filter_overlong_prompts=True \
     data.truncation='error' \
-    actor_rollout_ref.model.path=Qwen/Qwen2-7B-Instruct \
+    actor_rollout_ref.model.path=Qwen/Qwen2.5-0.5B-Instruct \
     actor_rollout_ref.actor.optim.lr=1e-6 \
     actor_rollout_ref.model.use_remove_padding=True \
     actor_rollout_ref.actor.ppo_mini_batch_size=256 \
@@ -41,7 +41,7 @@ python3 -m verl.trainer.main_ppo \
     actor_rollout_ref.ref.fsdp_config.param_offload=True \
     algorithm.use_kl_in_reward=False \
     trainer.critic_warmup=0 \
-    trainer.logger='["console","wandb"]' \
+    trainer.logger='["console","tensorboard"]' \
     trainer.project_name='verl_grpo_example_gsm8k' \
     trainer.experiment_name='qwen2_7b_function_rm_kl1e-3' \
     trainer.val_before_train=False \
diff --git a/recipe/fully_async_policy/agent_loop/agent_loop.py b/recipe/fully_async_policy/agent_loop/agent_loop.py
index a13c4744e08..1b0b9218087 100644
--- a/recipe/fully_async_policy/agent_loop/agent_loop.py
+++ b/recipe/fully_async_policy/agent_loop/agent_loop.py
@@ -14,16 +14,26 @@
 import asyncio
 import logging
 import os
-
-from recipe.fully_async_policy.vllm_rollout.vllm_async_server import vLLMReplicaForPartial
-from verl.experimental.agent_loop.agent_loop import AgentLoopOutput, _agent_loop_registry, _DummyConfig
+from typing import Optional, Any
+
+import hydra
+import numpy as np
+import ray
+import torch
+from omegaconf import DictConfig
+
+from recipe.fully_async_policy.vllm_rollout.vllm_async_server import FullyAsyncvLLMReplica
+from verl.experimental.agent_loop.agent_loop import (AgentLoopOutput, _agent_loop_registry, _DummyConfig,
+                                                     AsyncLLMServerManager, AgentLoopWorkerBase, BatchExecutor,
+                                                     get_trajectory_info, AgentLoopManager)
 from verl.protocol import DataProto
+from verl.single_controller.ray import RayWorkerGroup
+from verl.utils.rollout_trace import rollout_trace_attr
+from verl.workers.rollout.replica import TokenOutput
 
 logger = logging.getLogger(__file__)
 logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN"))
 
-from verl.experimental.agent_loop.agent_loop import *
-
 
 class FullyAsyncLLMServerManager(AsyncLLMServerManager):
     async def generate_for_partial(self, request_id, prompt_ids, sampling_params) -> TokenOutput:
@@ -37,61 +47,24 @@ async def generate_for_partial(self, request_id, prompt_ids, sampling_params) ->
         return output
 
 
-class PartialAgentLoopOutput(AgentLoopOutput):
+class FullyAsyncAgentLoopOutput(AgentLoopOutput):
     """Agent loop output."""
-
     is_cancel: bool = False
     """Indicates whether the request was interrupted"""
     log_probs: list[float] = None
     """Response token log probs including LLM generated token, tool response token."""
-
+    param_version_start: int = 0
+    """Indicate start parameter version when this response is generated"""
+    param_version_end: int = 0
+    """Indicate end parameter version when this response is generated, used for partial rollout"""
 
 @ray.remote
 class FullyAsyncAgentLoopWorker(AgentLoopWorkerBase):
     def __init__(
             self, config: DictConfig, server_handles: list[ray.actor.ActorHandle], rm_executor: BatchExecutor = None
     ):
-        """Initialize agent loop manager.
-
-        Args:
-            config (DictConfig): YAML config.
-            server_handles (List[ray.actor.ActorHandle]): OpenAI compatible LLM server actor handles.
-        """
-        self.config = config
-
-        self.server_manager = FullyAsyncLLMServerManager(config, server_handles)
-        self.rm_executor = rm_executor
-
-        model_path = config.actor_rollout_ref.model.path
-        self.model_name = "/".join(model_path.split("/")[-2:])
-        local_path = copy_to_local(config.actor_rollout_ref.model.path)
-        self.tokenizer = hf_tokenizer(local_path, trust_remote_code=True)
-        self.processor = hf_processor(local_path, trust_remote_code=True)
-
-        agent_loop_config_path = config.actor_rollout_ref.rollout.agent.agent_loop_config_path
-        if agent_loop_config_path:
-            agent_loop_configs = OmegaConf.load(agent_loop_config_path)
-            for agent_loop_config in agent_loop_configs:
-                _agent_loop_registry[agent_loop_config.name] = agent_loop_config
-        if self.config.actor_rollout_ref.model.get("custom_chat_template", None) is not None:
-            if self.processor is not None:
-                self.processor.chat_template = self.config.actor_rollout_ref.model.custom_chat_template
-            self.tokenizer.chat_template = self.config.actor_rollout_ref.model.custom_chat_template
-
-        self.reward_manager_worker = RewardManagerWorker.options(
-            scheduling_strategy=ray.util.scheduling_strategies.NodeAffinitySchedulingStrategy(
-                node_id=ray.get_runtime_context().get_node_id(),
-                soft=False,
-            ),
-        ).remote(self.config, local_path, self.rm_executor)
-
-        trace_config = self.config.actor_rollout_ref.rollout.get("trace", {})
-        RolloutTraceConfig.init(
-            self.config.trainer.project_name,
-            self.config.trainer.experiment_name,
-            trace_config.get("backend"),
-            trace_config.get("token2text", False),
-        )
+        self.server_manager_class = FullyAsyncLLMServerManager(config, server_handles)
+        super().__init__(config, server_handles, rm_executor)
 
     async def generate_sequences_no_post(
             self, batch: DataProto, partial_output_list: Optional[list[AgentLoopOutput]]
@@ -103,13 +76,7 @@ async def generate_sequences_no_post(
             partial_output_list: Optional[List[AgentLoopOutput]]: already rollout result.
 
         Returns:
-            list[AgentLoopOutput]: List of agent loop outputs, one per sample in the batch.
-            Each AgentLoopOutput contains:
-            - prompt_ids: prompt token ids
-            - response_ids: response token ids including LLM generated and tool response tokens
-            - response_mask: 1 for LLM generated tokens, 0 for tool response tokens
-            - num_turns: number of chat turns
-            - metrics: performance metrics
+            list[FullyAsyncAgentLoopOutput]: List of agent loop outputs, one per sample in the batch.
         """
         config = self.config.actor_rollout_ref.rollout
         sampling_params = dict(
@@ -172,7 +139,7 @@ async def _partial_run_agent_loop(
             agent_loop = hydra.utils.instantiate(
                 config=agent_loop_config,
                 trainer_config=_DummyConfig(config=self.config),
-                server_manager=self.server_manager,
+                server_manager=self.server_manager_class,
                 tokenizer=self.tokenizer,
                 processor=self.processor,
             )
@@ -187,7 +154,7 @@ def __init__(self, config: DictConfig, worker_group: RayWorkerGroup = None, rm_w
         self.rm_executor = None
         self.rm_micro_batch_size = None
         self.agent_loop_workers_class = FullyAsyncAgentLoopWorker
-        self.rollout_replica_class = vLLMReplicaForPartial
+        self.rollout_replica_class = FullyAsyncvLLMReplica
 
         # 初始化其他必要属性为None，稍后在异步初始化中设置
         self.rm_wg = rm_wg
@@ -199,7 +166,6 @@ def __init__(self, config: DictConfig, worker_group: RayWorkerGroup = None, rm_w
     @classmethod
     async def create(cls, config: DictConfig, worker_group: RayWorkerGroup = None, rm_wg: RayWorkerGroup = None):
         """异步工厂方法来创建和初始化 PartialAgentLoopManager 实例"""
-        print("异步工厂方法来创建和初始化 PartialAgentLoopManager 实例")
         instance = cls(config, worker_group, rm_wg)
         await instance._async_init()
         return instance
@@ -207,7 +173,6 @@ async def create(cls, config: DictConfig, worker_group: RayWorkerGroup = None, r
     async def _async_init(self):
         """异步初始化方法"""
         # 处理 rm_wg 相关初始化
-        print("处理 rm_wg 相关初始化")
         if self.rm_wg:
             def batch_fn(data_list: list[DataProto]) -> list[torch.Tensor]:
                 new_data_list = []
@@ -229,14 +194,8 @@ def batch_fn(data_list: list[DataProto]) -> list[torch.Tensor]:
 
             self.rm_micro_batch_size = self.rm_wg.world_size
 
-        # 初始化 LLM 服务器
-        print("初始化 LLM 服务器")
         await self._initialize_llm_servers_async()
-        await self._init_agent_loop_workers_async()
-
-        # 最初处于睡眠模式
-        if self.config.actor_rollout_ref.rollout.free_cache_engine:
-            await self.sleep()
+        self._init_agent_loop_workers()
 
     async def _initialize_llm_servers_async(self):
         """异步初始化 LLM 服务器"""
@@ -256,37 +215,16 @@ async def _initialize_llm_servers_async(self):
         ]
 
         if self.worker_group:
-            print("await asyncio.gather(*[server.init_hybrid(self.worker_group) for server in self.rollout_replicas])")
             await asyncio.gather(*[server.init_hybrid(self.worker_group) for server in self.rollout_replicas])
         else:
-            print("asyncio.gather(*[server.init_standalone() for server in self.rollout_replicas])")
             await asyncio.gather(*[server.init_standalone() for server in self.rollout_replicas])
 
         self.server_handles = [server._server_handle for server in self.rollout_replicas]
         self.server_addresses = [server._server_address for server in self.rollout_replicas]
 
-    async def _init_agent_loop_workers_async(self):
-        """异步初始化 agent loop workers"""
-        self.agent_loop_workers = []
-        num_workers = self.config.actor_rollout_ref.rollout.agent.num_workers
-
-        node_ids = [node["NodeID"] for node in ray.nodes() if node["Alive"] and node["Resources"].get("CPU", 0) > 0]
-        tasks = []
-        for i in range(num_workers):
-            # Round-robin scheduling over the all nodes
-            node_id = node_ids[i % len(node_ids)]
-            worker = self.agent_loop_workers_class.options(
-                name=f"agent_loop_worker_{i}",
-                scheduling_strategy=ray.util.scheduling_strategies.NodeAffinitySchedulingStrategy(
-                    node_id=node_id, soft=True
-                ),
-            ).remote(self.config, self.server_handles, self.rm_executor)
-            self.agent_loop_workers.append(worker)
-
     async def generate_single_sample_async(
             self,
             sample: DataProto,
-            param_version: int,
             partial_output_list: Optional[list[AgentLoopOutput]],
     ) -> list[AgentLoopOutput]:
         """
@@ -294,16 +232,13 @@ async def generate_single_sample_async(
 
         Args:
             sample: 单个样本数据
-            param_version: 参数版本
             partial_output_list: Optional[List[AgentLoopOutput]]: 已经 rollout 的结果
 
         Returns:
             list[AgentLoopOutput]: 处理结果列表
         """
-        # 使用负载均衡选择 worker
         worker = self._select_best_worker()
-        # 异步处理单个样本 - 使用无后处理版本获取原始AgentLoopOutput
-        output_future = worker.generate_sequences_no_post.remote(sample, param_version, partial_output_list)
+        output_future = worker.generate_sequences_no_post.remote(sample, partial_output_list)
         return await asyncio.wrap_future(output_future.future())
 
     def _select_best_worker(self):
@@ -326,59 +261,3 @@ async def wake_up(self):
 
     async def sleep(self):
         await asyncio.gather(*[replica.sleep() for replica in self.rollout_replicas])
-
-# class PartialAgentLoopManager(AgentLoopManager):
-#     def __init__(self, config: DictConfig, worker_group: RayWorkerGroup = None, rm_wg: RayWorkerGroup = None):
-#         self.agent_loop_workers_class = FullyAsyncAgentLoopWorker
-#         self.rollout_replica_class = vLLMReplicaForPartial
-#         super().__init__(config, worker_group, rm_wg)
-#
-#     async def generate_single_sample_async(
-#             self,
-#             sample: DataProto,
-#             param_version: int,
-#             partial_output_list: Optional[list[AgentLoopOutput]],
-#     ) -> list[AgentLoopOutput]:
-#         """
-#         异步处理单个样本, 需要复制n次
-#
-#         Args:
-#             sample: 单个样本数据
-#             partial_output_list: Optional[List[AgentLoopOutput]]: already rollout result.
-#
-#         Returns:
-#             tuple[AgentLoopOutput, float]: 处理结果和处理时间
-#         """
-#         # 使用负载均衡选择 worker
-#         worker = self._select_best_worker()
-#         # 异步处理单个样本 - 使用无后处理版本获取原始AgentLoopOutput
-#         output_future = worker.generate_sequences_no_post.remote(sample, param_version, partial_output_list)
-#         return await asyncio.wrap_future(output_future.future())
-#
-#     def _select_best_worker(self):
-#         """选择最佳的 worker（简单的轮询负载均衡）"""
-#         if not hasattr(self, "_worker_index"):
-#             self._worker_index = 0
-#
-#         worker = self.agent_loop_workers[self._worker_index]
-#         self._worker_index = (self._worker_index + 1) % len(self.agent_loop_workers)
-#         return worker
-#
-#     def cancel(self):
-#         """Cancel all rollout tasks asynchronously."""
-#         self._run_all([replica.cancel() for replica in self.rollout_replicas])
-#
-#     def resume(self):
-#         """Resume all rollout tasks asynchronously."""
-#         self._run_all([replica.resume() for replica in self.rollout_replicas])
-#
-#     def _run_all(self, tasks: list[asyncio.Task]):
-#         async def run_all():
-#             await asyncio.gather(*tasks)
-#
-#         try:
-#             loop = asyncio.get_running_loop()
-#             future = asyncio.run_coroutine_threadsafe(run_all(), loop)
-#             future.result()
-#         except RuntimeError:
-#             asyncio.run(run_all())
diff --git a/recipe/fully_async_policy/agent_loop/partial_single_turn_agent_loop.py b/recipe/fully_async_policy/agent_loop/partial_single_turn_agent_loop.py
index e4b2a7115bb..85b2cd24d96 100644
--- a/recipe/fully_async_policy/agent_loop/partial_single_turn_agent_loop.py
+++ b/recipe/fully_async_policy/agent_loop/partial_single_turn_agent_loop.py
@@ -17,11 +17,11 @@
 from uuid import uuid4
 
 from recipe.fully_async_policy.agent_loop.agent_loop import (
-    AgentLoopBase,
     AgentLoopOutput,
-    PartialAgentLoopOutput,
-    register,
+    FullyAsyncAgentLoopOutput
 )
+from verl.experimental.agent_loop.agent_loop import register
+from verl.experimental.agent_loop import AgentLoopBase
 from verl.utils.profiler import simple_timer
 
 logger = logging.getLogger(__file__)
@@ -39,11 +39,16 @@ def __init__(self, *args, **kwargs):
         self.apply_chat_template_kwargs = self.config.data.get("apply_chat_template_kwargs", {})
 
     async def run(self, sampling_params: dict[str, Any], **kwargs) -> AgentLoopOutput:
-        output: Optional[PartialAgentLoopOutput] = kwargs.get("output", None)
+        output: Optional[FullyAsyncAgentLoopOutput] = kwargs.get("output", None)
         messages = list(kwargs["raw_prompt"])
+        param_version = kwargs.get("param_version", 0)
 
         metrics = {}
         request_id = uuid4().hex
+
+        param_version_start = param_version
+        param_version_end = param_version
+
         if not output:
             prompt_ids = await self.loop.run_in_executor(
                 None,
@@ -56,15 +61,14 @@ async def run(self, sampling_params: dict[str, Any], **kwargs) -> AgentLoopOutpu
                 # 恢复暂停的样本，结果直接添加到 prompt_ids 后面
                 prompt_ids = output.prompt_ids + output.response_ids
                 metrics["generate_sequences"] = output.metrics.generate_sequences
+                param_version_start = output.param_version_start
             else:
                 # 同一批样本，部分cancel，部分没有cancel， 没有cancel的样本直接返回
                 return output
-        request_id = uuid4().hex
         with simple_timer("generate_sequences", metrics):
             response_ids, log_probs, is_cancel = await self.server_manager.generate_for_partial(
                 request_id=request_id, prompt_ids=prompt_ids, sampling_params=sampling_params
             )
-
         if not output:
             response_mask = [1] * len(response_ids)
         # 暂停待恢复样本, 把输出结果加到 response_ids 后，并重置 response_mask
@@ -74,7 +78,7 @@ async def run(self, sampling_params: dict[str, Any], **kwargs) -> AgentLoopOutpu
             response_ids = output.response_ids + response_ids
             response_mask = [1] * len(response_ids)
 
-        return PartialAgentLoopOutput(
+        return FullyAsyncAgentLoopOutput(
             prompt_ids=prompt_ids,
             response_ids=response_ids[: self.response_length],
             response_mask=response_mask[: self.response_length],
@@ -82,4 +86,6 @@ async def run(self, sampling_params: dict[str, Any], **kwargs) -> AgentLoopOutpu
             metrics=metrics,
             is_cancel=is_cancel,
             log_probs=log_probs,
+            param_version_start=param_version_start,
+            param_version_end=param_version_end,
         )
diff --git a/recipe/fully_async_policy/detach_utils.py b/recipe/fully_async_policy/detach_utils.py
index dcb22972e27..0f6a26ca13f 100644
--- a/recipe/fully_async_policy/detach_utils.py
+++ b/recipe/fully_async_policy/detach_utils.py
@@ -226,7 +226,6 @@ def merge_rollout_sample(config, tokenizer, rs: RolloutSample):
     rs.param_version_end = [agent_loop.param_version_end for agent_loop in rs.agent_loop_output_list]
     # 第四步，清空 agent_loop_output_list
     rs.agent_loop_output_list = []
-
     return rs
 
 
diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py
index 82dc0bac008..40bf8eefa64 100644
--- a/recipe/fully_async_policy/fully_async_rollouter.py
+++ b/recipe/fully_async_policy/fully_async_rollouter.py
@@ -97,14 +97,14 @@ def __init__(
 
         self._create_dataloader(train_dataset, val_dataset, collate_fn, train_sampler)
 
+        # ==================== fully async config ====================
+
         self.total_rollout_steps = len(self.train_dataloader) * self.config.trainer.total_epochs
         if self.config.rollout.total_rollout_steps is not None:
             self.total_rollout_steps = min(self.config.rollout.total_rollout_steps, self.total_rollout_steps)
         print(f"[FullyAsyncRollouter] Total rollout steps: {self.total_rollout_steps}")
         self.total_train_steps = None
 
-        # ==================== fully async config ====================
-
         # Rollouter parameter configuration
         self.message_queue_client = None
 
@@ -247,15 +247,10 @@ async def init_workers(self):
         1. Ray resource pools from configuration
         2. Worker groups for each role (actor, critic, etc.)
         """
-        print("_init_resource_pools")
         self._init_resource_pools()
-        print("_create_worker_classes")
         self._create_worker_classes()
-        print("_init_worker_groups")
         self._init_worker_groups()
-        print("_init_models")
         self._init_models()
-        print("_init_async_rollout_manager")
         await self._init_async_rollout_manager()
 
     def _create_actor_rollout_classes(self):
@@ -285,7 +280,6 @@ def _create_continuous_iterator(self):
 
     async def _init_async_rollout_manager(self):
         # create async rollout manager and request scheduler
-        print(f"_init_async_rollout_manager !!!!!!!!!!!!! {self.config.actor_rollout_ref.rollout.mode}")
         assert self.config.actor_rollout_ref.rollout.mode == "async"
         from recipe.fully_async_policy.agent_loop import PartialAgentLoopManager
         self.async_rollout_mode = True
@@ -415,7 +409,7 @@ async def _process_single_sample_streaming(self, rollout_sample: RolloutSample):
         """流式处理单个样本"""
         # 调用异步生成方法
         agent_loop_output_list = await self.async_rollout_manager.generate_single_sample_async(
-            rollout_sample.full_batch, self.current_param_version, rollout_sample.agent_loop_output_list
+            rollout_sample.full_batch, rollout_sample.agent_loop_output_list
         )
         # 直接更新 RolloutSample 对象，填充剩余字段
         rollout_sample.agent_loop_output_list = agent_loop_output_list
diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py
index 4ff64d8f787..f13781c850e 100644
--- a/recipe/fully_async_policy/fully_async_trainer.py
+++ b/recipe/fully_async_policy/fully_async_trainer.py
@@ -83,6 +83,7 @@ def __init__(
         if self.config.algorithm.use_kl_in_reward:
             self.kl_ctrl_in_reward = core_algos.get_kl_controller(self.config.algorithm.kl_ctrl)
 
+        # ==================== fully async config ====================
 
         self.message_queue_client = None
         self.param_synchronizer = None
diff --git a/recipe/fully_async_policy/ray_trainer.py b/recipe/fully_async_policy/ray_trainer.py
index d4e09e794d2..86992de4c13 100644
--- a/recipe/fully_async_policy/ray_trainer.py
+++ b/recipe/fully_async_policy/ray_trainer.py
@@ -61,15 +61,10 @@ def init_workers(self):
         1. Ray resource pools from configuration
         2. Worker groups for each role (actor, critic, etc.)
         """
-        print("_init_resource_pools")
         self._init_resource_pools()
-        print("_create_worker_classes")
         self._create_worker_classes()
-        print("_init_worker_groups")
         self._init_worker_groups()
-        print("_init_models")
         self._init_models()
-        print("_init_async_rollout_manager")
         self._init_async_rollout_manager()
 
     def _init_resource_pools(self):
diff --git a/recipe/fully_async_policy/vllm_rollout/vllm_async_server.py b/recipe/fully_async_policy/vllm_rollout/vllm_async_server.py
index cbbea43e2ad..2faf6b89c0e 100644
--- a/recipe/fully_async_policy/vllm_rollout/vllm_async_server.py
+++ b/recipe/fully_async_policy/vllm_rollout/vllm_async_server.py
@@ -125,7 +125,7 @@ async def resume(self):
             self.paused = False
 
 
-class vLLMReplicaForPartial(vLLMReplica):
+class FullyAsyncvLLMReplica(vLLMReplica):
     def __init__(self, replica_rank: int, config: DictConfig, gpus_per_node: int = 8):
         super().__init__(replica_rank, config, gpus_per_node)
         self.server_class = vLLMHttpServerForPartial
diff --git a/tests/special_e2e/run_fully_async_policy.sh b/tests/special_e2e/run_fully_async_policy.sh
index 2ddc61910ba..fa7587204f1 100644
--- a/tests/special_e2e/run_fully_async_policy.sh
+++ b/tests/special_e2e/run_fully_async_policy.sh
@@ -118,7 +118,7 @@ common_params=(
     trainer.logger=['console']
     trainer.project_name='verl-test-fully-async'
     trainer.experiment_name="${exp_name}"
-    trainer.val_before_train=False
+    trainer.val_before_train=True
     trainer.save_freq=-1
     trainer.resume_mode=disable
     trainer.nnodes=1
diff --git a/verl/experimental/agent_loop/agent_loop.py b/verl/experimental/agent_loop/agent_loop.py
index 11d541df17b..a6fdda210a0 100644
--- a/verl/experimental/agent_loop/agent_loop.py
+++ b/verl/experimental/agent_loop/agent_loop.py
@@ -364,7 +364,10 @@ def __init__(
         """
         self.config = config
 
-        self.server_manager = AsyncLLMServerManager(config, server_handles)
+        # for recipe to change
+        if not hasattr(self, 'server_manager_class'):
+            self.server_manager_class = AsyncLLMServerManager(config, server_handles)
+
         self.rm_executor = rm_executor
 
         model_path = config.actor_rollout_ref.model.path
@@ -477,7 +480,7 @@ async def _run_agent_loop(
             agent_loop = hydra.utils.instantiate(
                 config=agent_loop_config,
                 trainer_config=_DummyConfig(config=self.config),
-                server_manager=self.server_manager,
+                server_manager=self.server_manager_class,
                 tokenizer=self.tokenizer,
                 processor=self.processor,
             )
diff --git a/verl/workers/rollout/replica.py b/verl/workers/rollout/replica.py
index d673bb51cd7..5b289af4b7e 100644
--- a/verl/workers/rollout/replica.py
+++ b/verl/workers/rollout/replica.py
@@ -110,7 +110,6 @@ async def init_hybrid(self, worker_group: RayWorkerGroup):
         Args:
             worker_group: RayWorkerGroup, fused workers where training engine(fsdp/megatron) have been initialized.
         """
-        print("=========== init_hybrid ============")
         self.rollout_mode = RolloutMode.HYBRID
         self.workers = worker_group.workers[
             self.world_size * self.replica_rank : self.world_size * (self.replica_rank + 1)

From 0e88084eb2dfb3e36e9fab53e0900053503a0576 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Thu, 18 Sep 2025 20:18:17 +0800
Subject: [PATCH 152/182] refactor 6

---
 .../agent_loop/agent_loop.py                  | 50 ++++++++++-------
 .../partial_single_turn_agent_loop.py         |  7 +--
 recipe/fully_async_policy/detach_utils.py     |  2 +-
 recipe/fully_async_policy/fsdp_workers.py     | 13 +----
 .../fully_async_rollouter.py                  |  5 +-
 .../fully_async_policy/fully_async_trainer.py |  5 +-
 .../vllm_rollout/vllm_async_server.py         |  4 +-
 tests/special_e2e/run_fully_async_policy.sh   |  4 +-
 verl/experimental/agent_loop/__init__.py      |  2 +-
 verl/experimental/agent_loop/agent_loop.py    |  7 +--
 .../rollout/vllm_rollout/vllm_async_server.py | 56 +++++++++----------
 .../rollout/vllm_rollout/vllm_rollout_spmd.py | 48 ++++++++--------
 12 files changed, 95 insertions(+), 108 deletions(-)

diff --git a/recipe/fully_async_policy/agent_loop/agent_loop.py b/recipe/fully_async_policy/agent_loop/agent_loop.py
index 1b0b9218087..d9cb2c9187e 100644
--- a/recipe/fully_async_policy/agent_loop/agent_loop.py
+++ b/recipe/fully_async_policy/agent_loop/agent_loop.py
@@ -14,7 +14,7 @@
 import asyncio
 import logging
 import os
-from typing import Optional, Any
+from typing import Any, Optional
 
 import hydra
 import numpy as np
@@ -23,9 +23,16 @@
 from omegaconf import DictConfig
 
 from recipe.fully_async_policy.vllm_rollout.vllm_async_server import FullyAsyncvLLMReplica
-from verl.experimental.agent_loop.agent_loop import (AgentLoopOutput, _agent_loop_registry, _DummyConfig,
-                                                     AsyncLLMServerManager, AgentLoopWorkerBase, BatchExecutor,
-                                                     get_trajectory_info, AgentLoopManager)
+from verl.experimental.agent_loop.agent_loop import (
+    AgentLoopManager,
+    AgentLoopOutput,
+    AgentLoopWorkerBase,
+    AsyncLLMServerManager,
+    BatchExecutor,
+    _agent_loop_registry,
+    _DummyConfig,
+    get_trajectory_info,
+)
 from verl.protocol import DataProto
 from verl.single_controller.ray import RayWorkerGroup
 from verl.utils.rollout_trace import rollout_trace_attr
@@ -49,6 +56,7 @@ async def generate_for_partial(self, request_id, prompt_ids, sampling_params) ->
 
 class FullyAsyncAgentLoopOutput(AgentLoopOutput):
     """Agent loop output."""
+
     is_cancel: bool = False
     """Indicates whether the request was interrupted"""
     log_probs: list[float] = None
@@ -58,16 +66,17 @@ class FullyAsyncAgentLoopOutput(AgentLoopOutput):
     param_version_end: int = 0
     """Indicate end parameter version when this response is generated, used for partial rollout"""
 
+
 @ray.remote
 class FullyAsyncAgentLoopWorker(AgentLoopWorkerBase):
     def __init__(
-            self, config: DictConfig, server_handles: list[ray.actor.ActorHandle], rm_executor: BatchExecutor = None
+        self, config: DictConfig, server_handles: list[ray.actor.ActorHandle], rm_executor: BatchExecutor = None
     ):
         self.server_manager_class = FullyAsyncLLMServerManager(config, server_handles)
         super().__init__(config, server_handles, rm_executor)
 
     async def generate_sequences_no_post(
-            self, batch: DataProto, partial_output_list: Optional[list[AgentLoopOutput]]
+        self, batch: DataProto, partial_output_list: Optional[list[AgentLoopOutput]]
     ) -> list[AgentLoopOutput]:
         """Generate sequences from agent loop.
 
@@ -117,19 +126,19 @@ async def generate_sequences_no_post(
         return await asyncio.gather(*tasks)
 
     async def _partial_run_agent_loop(
-            self,
-            sampling_params: dict[str, Any],
-            trajectory: dict[str, Any],
-            *,
-            agent_name: str,
-            **kwargs,
+        self,
+        sampling_params: dict[str, Any],
+        trajectory: dict[str, Any],
+        *,
+        agent_name: str,
+        **kwargs,
     ) -> AgentLoopOutput:
         with rollout_trace_attr(
-                step=trajectory["step"],
-                sample_index=trajectory["sample_index"],
-                rollout_n=trajectory["rollout_n"],
-                validate=trajectory["validate"],
-                name="agent_loop",
+            step=trajectory["step"],
+            sample_index=trajectory["sample_index"],
+            rollout_n=trajectory["rollout_n"],
+            validate=trajectory["validate"],
+            name="agent_loop",
         ):
             assert agent_name in _agent_loop_registry, (
                 f"Agent loop {agent_name} not registered, registered agent loops: {_agent_loop_registry.keys()}"
@@ -174,6 +183,7 @@ async def _async_init(self):
         """异步初始化方法"""
         # 处理 rm_wg 相关初始化
         if self.rm_wg:
+
             def batch_fn(data_list: list[DataProto]) -> list[torch.Tensor]:
                 new_data_list = []
                 for data in data_list:
@@ -223,9 +233,9 @@ async def _initialize_llm_servers_async(self):
         self.server_addresses = [server._server_address for server in self.rollout_replicas]
 
     async def generate_single_sample_async(
-            self,
-            sample: DataProto,
-            partial_output_list: Optional[list[AgentLoopOutput]],
+        self,
+        sample: DataProto,
+        partial_output_list: Optional[list[AgentLoopOutput]],
     ) -> list[AgentLoopOutput]:
         """
         异步处理单个样本
diff --git a/recipe/fully_async_policy/agent_loop/partial_single_turn_agent_loop.py b/recipe/fully_async_policy/agent_loop/partial_single_turn_agent_loop.py
index 85b2cd24d96..e7223eea894 100644
--- a/recipe/fully_async_policy/agent_loop/partial_single_turn_agent_loop.py
+++ b/recipe/fully_async_policy/agent_loop/partial_single_turn_agent_loop.py
@@ -16,12 +16,9 @@
 from typing import Any, Optional
 from uuid import uuid4
 
-from recipe.fully_async_policy.agent_loop.agent_loop import (
-    AgentLoopOutput,
-    FullyAsyncAgentLoopOutput
-)
-from verl.experimental.agent_loop.agent_loop import register
+from recipe.fully_async_policy.agent_loop.agent_loop import AgentLoopOutput, FullyAsyncAgentLoopOutput
 from verl.experimental.agent_loop import AgentLoopBase
+from verl.experimental.agent_loop.agent_loop import register
 from verl.utils.profiler import simple_timer
 
 logger = logging.getLogger(__file__)
diff --git a/recipe/fully_async_policy/detach_utils.py b/recipe/fully_async_policy/detach_utils.py
index 0f6a26ca13f..4225340e539 100644
--- a/recipe/fully_async_policy/detach_utils.py
+++ b/recipe/fully_async_policy/detach_utils.py
@@ -291,7 +291,7 @@ def assemble_batch_from_rollout_samples(
     }
     processing_time_stats = {f"fully_async/{key}": value for key, value in processing_time_stats.items()}
 
-    param_version_diff = [abs(a - b) for a, b in zip(rs.param_version_end, rs.param_version_start)]
+    param_version_diff = [abs(a - b) for a, b in zip(rs.param_version_end, rs.param_version_start, strict=False)]
     num_diff0 = param_version_diff.count(0)
     partial_stats = {
         "fully_async/partial/total_partial_num": len(param_version_diff) - num_diff0,
diff --git a/recipe/fully_async_policy/fsdp_workers.py b/recipe/fully_async_policy/fsdp_workers.py
index 241ce46272a..8471897fc83 100644
--- a/recipe/fully_async_policy/fsdp_workers.py
+++ b/recipe/fully_async_policy/fsdp_workers.py
@@ -18,29 +18,18 @@
 
 import torch
 import torch.distributed
-from omegaconf import DictConfig, OmegaConf
-from torch.distributed.device_mesh import init_device_mesh
+from omegaconf import DictConfig
 from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
-from transformers import AutoConfig
 
-from verl.single_controller.base import Worker
 from verl.single_controller.base.decorator import Dispatch, register
-from verl.utils import hf_processor, hf_tokenizer, omega_conf_to_dataclass
-from verl.utils.debug import DistProfiler, DistProfilerExtension, log_gpu_memory_usage
 from verl.utils.device import (
     get_device_name,
-    get_nccl_backend,
     get_torch_device,
 )
-from verl.utils.fs import copy_to_local
 from verl.utils.fsdp_utils import (
     fsdp_version,
 )
-from verl.utils.import_utils import import_external_libs
-from verl.utils.model import get_generation_config, update_model_config
-from verl.workers.config import HFModelConfig, RolloutConfig
 from verl.workers.fsdp_workers import ActorRolloutRefWorker, AsyncActorRolloutRefWorker, CriticWorker
-from verl.workers.rollout import get_rollout_class
 
 logger = logging.getLogger(__file__)
 logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN"))
diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py
index 40bf8eefa64..273f8348929 100644
--- a/recipe/fully_async_policy/fully_async_rollouter.py
+++ b/recipe/fully_async_policy/fully_async_rollouter.py
@@ -282,6 +282,7 @@ async def _init_async_rollout_manager(self):
         # create async rollout manager and request scheduler
         assert self.config.actor_rollout_ref.rollout.mode == "async"
         from recipe.fully_async_policy.agent_loop import PartialAgentLoopManager
+
         self.async_rollout_mode = True
         print(f"{self.async_rollout_mode}")
         self.async_rollout_manager = await PartialAgentLoopManager.create(
@@ -290,7 +291,6 @@ async def _init_async_rollout_manager(self):
         )
         print(f"self.async_rollout_manager {self.async_rollout_manager}")
 
-
     # 添加样本到待处理队列的协程
     async def _feed_samples(self):
         continuous_iterator = self._create_continuous_iterator()
@@ -604,8 +604,7 @@ async def _should_pause_generation(self) -> bool:
         return False
 
     async def pause(self):
-        """pause rollout
-        """
+        """pause rollout"""
         print("[FullyAsyncRollouter][Public][Pause]")
         async with self.lock:
             self.paused = True
diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py
index f13781c850e..a4c59c33701 100644
--- a/recipe/fully_async_policy/fully_async_trainer.py
+++ b/recipe/fully_async_policy/fully_async_trainer.py
@@ -13,7 +13,6 @@
 # limitations under the License.
 
 import time
-import warnings
 from datetime import datetime
 from pprint import pprint
 from typing import Any
@@ -31,11 +30,9 @@
 from recipe.fully_async_policy.ray_trainer import FullyAsyncRayPPOTrainer
 from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup
 from verl.trainer.ppo import core_algos
-from verl.trainer.ppo.core_algos import AdvantageEstimator
 from verl.trainer.ppo.ray_trainer import ResourcePoolManager
-from verl.trainer.ppo.utils import Role, WorkerType, need_reference_policy, need_reward_model, need_critic
+from verl.trainer.ppo.utils import Role, WorkerType, need_critic, need_reference_policy, need_reward_model
 from verl.utils.debug import marked_timer
-from verl.utils.tracking import ValidationGenerationsLogger
 
 
 @ray.remote(num_cpus=10)
diff --git a/recipe/fully_async_policy/vllm_rollout/vllm_async_server.py b/recipe/fully_async_policy/vllm_rollout/vllm_async_server.py
index 2faf6b89c0e..0831aebd5b4 100644
--- a/recipe/fully_async_policy/vllm_rollout/vllm_async_server.py
+++ b/recipe/fully_async_policy/vllm_rollout/vllm_async_server.py
@@ -25,8 +25,8 @@
 from verl.workers.rollout.replica import RolloutMode
 from verl.workers.rollout.vllm_rollout.vllm_async_server import (
     _qwen2_5_vl_dedup_image_tokens,
-    vLLMHttpServer,
-    vLLMReplica, vLLMHttpServerBase,
+    vLLMHttpServerBase,
+    vLLMReplica,
 )
 
 logger = logging.getLogger(__file__)
diff --git a/tests/special_e2e/run_fully_async_policy.sh b/tests/special_e2e/run_fully_async_policy.sh
index fa7587204f1..096cb05c7a1 100644
--- a/tests/special_e2e/run_fully_async_policy.sh
+++ b/tests/special_e2e/run_fully_async_policy.sh
@@ -49,8 +49,8 @@ top_k=-1
 val_top_p=0.7
 
 # Fully async specific parameters
-n_gpus_rollout=1
-n_gpus_training=1
+n_gpus_rollout=4
+n_gpus_training=4
 
 train_prompt_bsz=0
 gen_prompt_bsz=1
diff --git a/verl/experimental/agent_loop/__init__.py b/verl/experimental/agent_loop/__init__.py
index 27b633e5055..fd3d2ca1b84 100644
--- a/verl/experimental/agent_loop/__init__.py
+++ b/verl/experimental/agent_loop/__init__.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from .agent_loop import AgentLoopBase, AgentLoopManager, AsyncLLMServerManager, AgentLoopWorker
+from .agent_loop import AgentLoopBase, AgentLoopManager, AgentLoopWorker, AsyncLLMServerManager
 from .single_turn_agent_loop import SingleTurnAgentLoop
 from .tool_agent_loop import ToolAgentLoop
 
diff --git a/verl/experimental/agent_loop/agent_loop.py b/verl/experimental/agent_loop/agent_loop.py
index a6fdda210a0..5183865ee64 100644
--- a/verl/experimental/agent_loop/agent_loop.py
+++ b/verl/experimental/agent_loop/agent_loop.py
@@ -365,7 +365,7 @@ def __init__(
         self.config = config
 
         # for recipe to change
-        if not hasattr(self, 'server_manager_class'):
+        if not hasattr(self, "server_manager_class"):
             self.server_manager_class = AsyncLLMServerManager(config, server_handles)
 
         self.rm_executor = rm_executor
@@ -756,9 +756,9 @@ def batch_fn(data_list: list[DataProto]) -> list[torch.Tensor]:
             self.rm_micro_batch_size = rm_wg.world_size
 
         # for recipe to change
-        if not hasattr(self, 'rollout_replica_class'):
+        if not hasattr(self, "rollout_replica_class"):
             self.rollout_replica_class = get_rollout_replica_class(self.config.actor_rollout_ref.rollout.name)
-        if not hasattr(self, 'agent_loop_workers_class'):
+        if not hasattr(self, "agent_loop_workers_class"):
             self.agent_loop_workers_class = AgentLoopWorker
 
         self._initialize_llm_servers()
@@ -790,7 +790,6 @@ def _initialize_llm_servers(self):
         self.server_handles = [server._server_handle for server in self.rollout_replicas]
         self.server_addresses = [server._server_address for server in self.rollout_replicas]
 
-
     def _init_agent_loop_workers(self):
         self.agent_loop_workers = []
         num_workers = self.config.actor_rollout_ref.rollout.agent.num_workers
diff --git a/verl/workers/rollout/vllm_rollout/vllm_async_server.py b/verl/workers/rollout/vllm_rollout/vllm_async_server.py
index e996df19247..a3b22765ba4 100644
--- a/verl/workers/rollout/vllm_rollout/vllm_async_server.py
+++ b/verl/workers/rollout/vllm_rollout/vllm_async_server.py
@@ -74,11 +74,11 @@ def _init_executor(self) -> None:
         self.collective_rpc("load_model")
 
     def collective_rpc(
-            self,
-            method: str | Callable,
-            timeout: Optional[float] = None,
-            args: tuple = (),
-            kwargs: Optional[dict[str, Any]] = None,
+        self,
+        method: str | Callable,
+        timeout: Optional[float] = None,
+        args: tuple = (),
+        kwargs: Optional[dict[str, Any]] = None,
     ) -> list[Any]:
         if isinstance(method, str):
             sent_method = method
@@ -107,14 +107,14 @@ class vLLMHttpServerBase:
     """
 
     def __init__(
-            self,
-            config: DictConfig,
-            rollout_mode: RolloutMode,
-            workers: list[ActorHandle],
-            replica_rank: int,
-            node_rank: int,
-            gpus_per_node: int,
-            nnodes: int,
+        self,
+        config: DictConfig,
+        rollout_mode: RolloutMode,
+        workers: list[ActorHandle],
+        replica_rank: int,
+        node_rank: int,
+        gpus_per_node: int,
+        nnodes: int,
     ):
         """
         Args:
@@ -244,7 +244,7 @@ async def launch_server(self, master_address: str = None, master_port: int = Non
         print(
             "=" * 1000,
             f"replica_rank={self.replica_rank}, node_rank={self.node_rank}, nnodes={self.nnodes}, "
-            f"get worker zmq addresses: {zmq_addresses}"
+            f"get worker zmq addresses: {zmq_addresses}",
         )
         os.environ["VERL_VLLM_ZMQ_ADDRESSES"] = ",".join(zmq_addresses)
 
@@ -273,11 +273,11 @@ async def run_server(self, args: argparse.Namespace):
             engine_client.shutdown = lambda: None
 
     async def generate(
-            self,
-            prompt_ids: list[int],
-            sampling_params: dict[str, Any],
-            request_id: str,
-            image_data: Optional[list[Any]] = None,
+        self,
+        prompt_ids: list[int],
+        sampling_params: dict[str, Any],
+        request_id: str,
+        image_data: Optional[list[Any]] = None,
     ) -> TokenOutput:
         """Generate sequence with token-in-token-out."""
         # TODO(@wuxibin): switch to `/generate` http endpoint once multi-modal support ready.
@@ -333,14 +333,14 @@ class vLLMHttpServer(vLLMHttpServerBase):
     """
 
     def __init__(
-            self,
-            config: DictConfig,
-            rollout_mode: RolloutMode,
-            workers: list[ActorHandle],
-            replica_rank: int,
-            node_rank: int,
-            gpus_per_node: int,
-            nnodes: int,
+        self,
+        config: DictConfig,
+        rollout_mode: RolloutMode,
+        workers: list[ActorHandle],
+        replica_rank: int,
+        node_rank: int,
+        gpus_per_node: int,
+        nnodes: int,
     ):
         super().__init__(config, rollout_mode, workers, replica_rank, node_rank, gpus_per_node, nnodes)
 
@@ -385,7 +385,7 @@ async def launch_servers(self):
 
         # create server actor in each node with node affinity
         for node_rank in range(nnodes):
-            workers = self.workers[node_rank * gpus_per_node: (node_rank + 1) * gpus_per_node]
+            workers = self.workers[node_rank * gpus_per_node : (node_rank + 1) * gpus_per_node]
             node_id = worker_node_ids[node_rank * gpus_per_node]
             server = self.server_class.options(
                 scheduling_strategy=ray.util.scheduling_strategies.NodeAffinitySchedulingStrategy(
diff --git a/verl/workers/rollout/vllm_rollout/vllm_rollout_spmd.py b/verl/workers/rollout/vllm_rollout/vllm_rollout_spmd.py
index 5e084509aee..ff301dfb3ac 100644
--- a/verl/workers/rollout/vllm_rollout/vllm_rollout_spmd.py
+++ b/verl/workers/rollout/vllm_rollout/vllm_rollout_spmd.py
@@ -91,10 +91,10 @@ def _pre_process_inputs(pad_token_id, prompt_token_ids: torch.Tensor) -> list[in
 
 class vLLMRollout(BaseRollout):
     def __init__(
-            self,
-            config: RolloutConfig,
-            model_config: HFModelConfig,
-            device_mesh: DeviceMesh,
+        self,
+        config: RolloutConfig,
+        model_config: HFModelConfig,
+        device_mesh: DeviceMesh,
     ):
         super().__init__(config, model_config, device_mesh)
 
@@ -125,11 +125,11 @@ def __init__(
             if hasattr(model_hf_config, "max_position_embeddings"):
                 max_position_embeddings = model_hf_config.max_position_embeddings
             elif hasattr(model_hf_config, "llm_config") and hasattr(
-                    model_hf_config.llm_config, "max_position_embeddings"
+                model_hf_config.llm_config, "max_position_embeddings"
             ):
                 max_position_embeddings = model_hf_config.llm_config.max_position_embeddings
             elif hasattr(model_hf_config, "text_config") and hasattr(
-                    model_hf_config.text_config, "max_position_embeddings"
+                model_hf_config.text_config, "max_position_embeddings"
             ):
                 max_position_embeddings = model_hf_config.text_config.max_position_embeddings
             if max_position_embeddings is None:
@@ -144,12 +144,12 @@ def __init__(
             rope_scaling_factor = rope_scaling_config.get("factor", 1.0)
 
             assert (
-                    model_hf_config.max_position_embeddings * rope_scaling_factor
-                    >= config.prompt_length + config.response_length
+                model_hf_config.max_position_embeddings * rope_scaling_factor
+                >= config.prompt_length + config.response_length
             ), (
-                    "model context length should be greater than total sequence length, "
-                    + f"got rope_scaling_factor={rope_scaling_factor} and "
-                    + f"max_position_embeddings={model_hf_config.max_position_embeddings}"
+                "model context length should be greater than total sequence length, "
+                + f"got rope_scaling_factor={rope_scaling_factor} and "
+                + f"max_position_embeddings={model_hf_config.max_position_embeddings}"
             )
 
         max_model_len = int(config.max_model_len or config.prompt_length + config.response_length)
@@ -289,7 +289,7 @@ def generate_sequences(self, prompts: DataProto, **kwargs) -> DataProto:
         if "multi_modal_data" in non_tensor_batch:
             vllm_inputs = []
             for raw_prompt_ids, multi_modal_data in zip(
-                    non_tensor_batch.pop("raw_prompt_ids"), non_tensor_batch.pop("multi_modal_data"), strict=True
+                non_tensor_batch.pop("raw_prompt_ids"), non_tensor_batch.pop("multi_modal_data"), strict=True
             ):
                 vllm_inputs.append({"prompt_token_ids": raw_prompt_ids, "multi_modal_data": multi_modal_data})
         else:
@@ -332,9 +332,8 @@ def generate_sequences(self, prompts: DataProto, **kwargs) -> DataProto:
             if len(lora_int_ids) > 0:
                 lora_int_id = lora_int_ids[0]
                 lora_requests = [
-                                    LoRARequest(lora_name=f"{lora_int_id}", lora_int_id=lora_int_id,
-                                                lora_path="/simon-stub-path")
-                                ] * batch_size
+                    LoRARequest(lora_name=f"{lora_int_id}", lora_int_id=lora_int_id, lora_path="/simon-stub-path")
+                ] * batch_size
 
         # users can customize different sampling_params at different run
         with self.update_sampling_params(**kwargs):
@@ -459,9 +458,9 @@ def _monkey_patch_compute_logits(model, vocab_size: int):
     original_compute_logits = model.compute_logits
 
     def compute_logits(
-            self,
-            hidden_states: torch.Tensor,
-            sampling_metadata: SamplingMetadata,
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
     ) -> torch.Tensor:
         logits = original_compute_logits(hidden_states, sampling_metadata)
         logits[..., vocab_size:] = float("-inf")
@@ -474,10 +473,10 @@ class vLLMAsyncRollout(BaseRollout):
     """vLLMAsyncRollout is a thin wrapper of WorkerWrapperBase, which is engine in single worker process."""
 
     def __init__(
-            self,
-            config: RolloutConfig,
-            model_config: HFModelConfig,
-            device_mesh: DeviceMesh,
+        self,
+        config: RolloutConfig,
+        model_config: HFModelConfig,
+        device_mesh: DeviceMesh,
     ):
         super().__init__(config, model_config, device_mesh)
 
@@ -535,10 +534,7 @@ async def _loop_forever(self):
     def _init_worker(self, all_kwargs: list[dict[str, Any]]):
         """Initialize worker engine."""
 
-        print("=" * 100, "\n",
-              "=" * 100, "\n",
-              "=" * 100, "\n",
-              "Initializing vLLMAsyncRollout...")
+        print("=" * 100, "\n", "=" * 100, "\n", "=" * 100, "\n", "Initializing vLLMAsyncRollout...")
 
         all_kwargs[0]["rank"] = int(os.environ["RANK"])
         device_name = "NPU" if is_npu_available else "GPU"

From a48ec884b82fb01161b21803b6e847318b9c835f Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Thu, 18 Sep 2025 20:49:43 +0800
Subject: [PATCH 153/182] refactor 7

---
 examples/grpo_trainer/run_qwen2-7b_seq_balance.sh      | 6 +++---
 recipe/fully_async_policy/agent_loop/__init__.py       | 5 ++---
 recipe/fully_async_policy/agent_loop/agent_loop.py     | 4 ++--
 recipe/fully_async_policy/fully_async_rollouter.py     | 3 ---
 verl/experimental/agent_loop/agent_loop.py             | 6 +++---
 verl/workers/rollout/vllm_rollout/vllm_async_server.py | 5 ++---
 verl/workers/rollout/vllm_rollout/vllm_rollout_spmd.py | 3 ---
 7 files changed, 12 insertions(+), 20 deletions(-)

diff --git a/examples/grpo_trainer/run_qwen2-7b_seq_balance.sh b/examples/grpo_trainer/run_qwen2-7b_seq_balance.sh
index f4ca9a41d7e..fdc1ef606d7 100644
--- a/examples/grpo_trainer/run_qwen2-7b_seq_balance.sh
+++ b/examples/grpo_trainer/run_qwen2-7b_seq_balance.sh
@@ -3,7 +3,7 @@ set -x
 
 # For async rollout mode, dataset should return raw chat.
 rollout_mode="async"
-rollout_name="vllm" # sglang or vllm
+rollout_name="sglang" # sglang or vllm
 if [ "$rollout_mode" = "async" ]; then
     export VLLM_USE_V1=1
     return_raw_chat="True"
@@ -19,7 +19,7 @@ python3 -m verl.trainer.main_ppo \
     data.max_response_length=1024 \
     data.filter_overlong_prompts=True \
     data.truncation='error' \
-    actor_rollout_ref.model.path=Qwen/Qwen2.5-0.5B-Instruct \
+    actor_rollout_ref.model.path=Qwen/Qwen2-7B-Instruct \
     actor_rollout_ref.actor.optim.lr=1e-6 \
     actor_rollout_ref.model.use_remove_padding=True \
     actor_rollout_ref.actor.ppo_mini_batch_size=256 \
@@ -41,7 +41,7 @@ python3 -m verl.trainer.main_ppo \
     actor_rollout_ref.ref.fsdp_config.param_offload=True \
     algorithm.use_kl_in_reward=False \
     trainer.critic_warmup=0 \
-    trainer.logger='["console","tensorboard"]' \
+    trainer.logger='["console","wandb"]' \
     trainer.project_name='verl_grpo_example_gsm8k' \
     trainer.experiment_name='qwen2_7b_function_rm_kl1e-3' \
     trainer.val_before_train=False \
diff --git a/recipe/fully_async_policy/agent_loop/__init__.py b/recipe/fully_async_policy/agent_loop/__init__.py
index 40dcd0ac7a3..773dab10572 100644
--- a/recipe/fully_async_policy/agent_loop/__init__.py
+++ b/recipe/fully_async_policy/agent_loop/__init__.py
@@ -13,8 +13,7 @@
 # limitations under the License.
 
 from .partial_single_turn_agent_loop import PartialSingleTurnAgentLoop
+from .agent_loop import PartialAgentLoopManager
 
 _ = [PartialSingleTurnAgentLoop]
-
-
-from .agent_loop import PartialAgentLoopManager
+__all__ = ["PartialAgentLoopManager"]
diff --git a/recipe/fully_async_policy/agent_loop/agent_loop.py b/recipe/fully_async_policy/agent_loop/agent_loop.py
index d9cb2c9187e..3fce8d65e60 100644
--- a/recipe/fully_async_policy/agent_loop/agent_loop.py
+++ b/recipe/fully_async_policy/agent_loop/agent_loop.py
@@ -72,7 +72,7 @@ class FullyAsyncAgentLoopWorker(AgentLoopWorkerBase):
     def __init__(
         self, config: DictConfig, server_handles: list[ray.actor.ActorHandle], rm_executor: BatchExecutor = None
     ):
-        self.server_manager_class = FullyAsyncLLMServerManager(config, server_handles)
+        self.server_manager = FullyAsyncLLMServerManager(config, server_handles)
         super().__init__(config, server_handles, rm_executor)
 
     async def generate_sequences_no_post(
@@ -148,7 +148,7 @@ async def _partial_run_agent_loop(
             agent_loop = hydra.utils.instantiate(
                 config=agent_loop_config,
                 trainer_config=_DummyConfig(config=self.config),
-                server_manager=self.server_manager_class,
+                server_manager=self.server_manager,
                 tokenizer=self.tokenizer,
                 processor=self.processor,
             )
diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py
index 273f8348929..ca5e312a1a1 100644
--- a/recipe/fully_async_policy/fully_async_rollouter.py
+++ b/recipe/fully_async_policy/fully_async_rollouter.py
@@ -282,14 +282,11 @@ async def _init_async_rollout_manager(self):
         # create async rollout manager and request scheduler
         assert self.config.actor_rollout_ref.rollout.mode == "async"
         from recipe.fully_async_policy.agent_loop import PartialAgentLoopManager
-
         self.async_rollout_mode = True
-        print(f"{self.async_rollout_mode}")
         self.async_rollout_manager = await PartialAgentLoopManager.create(
             config=self.config,
             worker_group=self.rollout_wg,
         )
-        print(f"self.async_rollout_manager {self.async_rollout_manager}")
 
     # 添加样本到待处理队列的协程
     async def _feed_samples(self):
diff --git a/verl/experimental/agent_loop/agent_loop.py b/verl/experimental/agent_loop/agent_loop.py
index 5183865ee64..ae56c2a187a 100644
--- a/verl/experimental/agent_loop/agent_loop.py
+++ b/verl/experimental/agent_loop/agent_loop.py
@@ -365,8 +365,8 @@ def __init__(
         self.config = config
 
         # for recipe to change
-        if not hasattr(self, "server_manager_class"):
-            self.server_manager_class = AsyncLLMServerManager(config, server_handles)
+        if not hasattr(self, "server_manager"):
+            self.server_manager = AsyncLLMServerManager(config, server_handles)
 
         self.rm_executor = rm_executor
 
@@ -480,7 +480,7 @@ async def _run_agent_loop(
             agent_loop = hydra.utils.instantiate(
                 config=agent_loop_config,
                 trainer_config=_DummyConfig(config=self.config),
-                server_manager=self.server_manager_class,
+                server_manager=self.server_manager,
                 tokenizer=self.tokenizer,
                 processor=self.processor,
             )
diff --git a/verl/workers/rollout/vllm_rollout/vllm_async_server.py b/verl/workers/rollout/vllm_rollout/vllm_async_server.py
index a3b22765ba4..75195009057 100644
--- a/verl/workers/rollout/vllm_rollout/vllm_async_server.py
+++ b/verl/workers/rollout/vllm_rollout/vllm_async_server.py
@@ -241,10 +241,9 @@ async def launch_server(self, master_address: str = None, master_port: int = Non
         server_args.distributed_executor_backend = distributed_executor_backend
 
         zmq_addresses = ray.get([worker.get_zeromq_address.remote() for worker in self.workers])
-        print(
-            "=" * 1000,
+        logger.info(
             f"replica_rank={self.replica_rank}, node_rank={self.node_rank}, nnodes={self.nnodes}, "
-            f"get worker zmq addresses: {zmq_addresses}",
+            f"get worker zmq addresses: {zmq_addresses}"
         )
         os.environ["VERL_VLLM_ZMQ_ADDRESSES"] = ",".join(zmq_addresses)
 
diff --git a/verl/workers/rollout/vllm_rollout/vllm_rollout_spmd.py b/verl/workers/rollout/vllm_rollout/vllm_rollout_spmd.py
index ff301dfb3ac..baef0c9315e 100644
--- a/verl/workers/rollout/vllm_rollout/vllm_rollout_spmd.py
+++ b/verl/workers/rollout/vllm_rollout/vllm_rollout_spmd.py
@@ -68,7 +68,6 @@
 logger = logging.getLogger(__file__)
 logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN"))
 
-
 # TODO
 # 1. support pp in vllm
 # 2. passing tokenizer is not necessary? no encoding/decoding is happending here
@@ -534,8 +533,6 @@ async def _loop_forever(self):
     def _init_worker(self, all_kwargs: list[dict[str, Any]]):
         """Initialize worker engine."""
 
-        print("=" * 100, "\n", "=" * 100, "\n", "=" * 100, "\n", "Initializing vLLMAsyncRollout...")
-
         all_kwargs[0]["rank"] = int(os.environ["RANK"])
         device_name = "NPU" if is_npu_available else "GPU"
         all_kwargs[0]["local_rank"] = (

From e6819cdb180f516253386a45807066fc2c12aa52 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Thu, 18 Sep 2025 21:06:42 +0800
Subject: [PATCH 154/182] refactor 8

---
 .../fully_async_policy/agent_loop/__init__.py |   4 +-
 .../agent_loop/agent_loop.py                  |  14 +-
 recipe/fully_async_policy/fsdp_workers.py     |   2 +-
 .../fully_async_rollouter.py                  |  10 +-
 recipe/fully_async_policy/ray_trainer.py      |  16 +-
 .../unittest/test_batch_utils.py              | 344 ----------------
 recipe/fully_async_policy/unittest/test_mq.py | 387 ------------------
 .../vllm_rollout/vllm_async_server.py         |   8 +
 verl/experimental/agent_loop/__init__.py      |   2 +-
 9 files changed, 25 insertions(+), 762 deletions(-)
 delete mode 100644 recipe/fully_async_policy/unittest/test_batch_utils.py
 delete mode 100644 recipe/fully_async_policy/unittest/test_mq.py

diff --git a/recipe/fully_async_policy/agent_loop/__init__.py b/recipe/fully_async_policy/agent_loop/__init__.py
index 773dab10572..e30d78f1a8a 100644
--- a/recipe/fully_async_policy/agent_loop/__init__.py
+++ b/recipe/fully_async_policy/agent_loop/__init__.py
@@ -12,8 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from .agent_loop import FullyAsyncAgentLoopManager
 from .partial_single_turn_agent_loop import PartialSingleTurnAgentLoop
-from .agent_loop import PartialAgentLoopManager
 
 _ = [PartialSingleTurnAgentLoop]
-__all__ = ["PartialAgentLoopManager"]
+__all__ = [FullyAsyncAgentLoopManager]
diff --git a/recipe/fully_async_policy/agent_loop/agent_loop.py b/recipe/fully_async_policy/agent_loop/agent_loop.py
index 3fce8d65e60..2ccfa712c54 100644
--- a/recipe/fully_async_policy/agent_loop/agent_loop.py
+++ b/recipe/fully_async_policy/agent_loop/agent_loop.py
@@ -155,9 +155,8 @@ async def _partial_run_agent_loop(
             return await agent_loop.run(sampling_params, **kwargs)
 
 
-class PartialAgentLoopManager(AgentLoopManager):
+class FullyAsyncAgentLoopManager(AgentLoopManager):
     def __init__(self, config: DictConfig, worker_group: RayWorkerGroup = None, rm_wg: RayWorkerGroup = None):
-        # 初始化基本属性，但不执行异步操作
         self.config = config
         self.worker_group = worker_group
         self.rm_executor = None
@@ -165,7 +164,6 @@ def __init__(self, config: DictConfig, worker_group: RayWorkerGroup = None, rm_w
         self.agent_loop_workers_class = FullyAsyncAgentLoopWorker
         self.rollout_replica_class = FullyAsyncvLLMReplica
 
-        # 初始化其他必要属性为None，稍后在异步初始化中设置
         self.rm_wg = rm_wg
         self.rollout_replicas = None
         self.server_handles = None
@@ -174,14 +172,11 @@ def __init__(self, config: DictConfig, worker_group: RayWorkerGroup = None, rm_w
 
     @classmethod
     async def create(cls, config: DictConfig, worker_group: RayWorkerGroup = None, rm_wg: RayWorkerGroup = None):
-        """异步工厂方法来创建和初始化 PartialAgentLoopManager 实例"""
         instance = cls(config, worker_group, rm_wg)
         await instance._async_init()
         return instance
 
     async def _async_init(self):
-        """异步初始化方法"""
-        # 处理 rm_wg 相关初始化
         if self.rm_wg:
 
             def batch_fn(data_list: list[DataProto]) -> list[torch.Tensor]:
@@ -208,7 +203,6 @@ def batch_fn(data_list: list[DataProto]) -> list[torch.Tensor]:
         self._init_agent_loop_workers()
 
     async def _initialize_llm_servers_async(self):
-        """异步初始化 LLM 服务器"""
         rollout_world_size = self.config.actor_rollout_ref.rollout.tensor_model_parallel_size
         world_size = (
             self.worker_group.world_size
@@ -239,11 +233,9 @@ async def generate_single_sample_async(
     ) -> list[AgentLoopOutput]:
         """
         异步处理单个样本
-
         Args:
             sample: 单个样本数据
             partial_output_list: Optional[List[AgentLoopOutput]]: 已经 rollout 的结果
-
         Returns:
             list[AgentLoopOutput]: 处理结果列表
         """
@@ -252,7 +244,6 @@ async def generate_single_sample_async(
         return await asyncio.wrap_future(output_future.future())
 
     def _select_best_worker(self):
-        """选择最佳的 worker（简单的轮询负载均衡）"""
         if not hasattr(self, "_worker_index"):
             self._worker_index = 0
 
@@ -271,3 +262,6 @@ async def wake_up(self):
 
     async def sleep(self):
         await asyncio.gather(*[replica.sleep() for replica in self.rollout_replicas])
+
+    async def reset_prefix_cache(self):
+        await asyncio.gather(*[replica.reset_prefix_cache() for replica in self.rollout_replicas])
diff --git a/recipe/fully_async_policy/fsdp_workers.py b/recipe/fully_async_policy/fsdp_workers.py
index 8471897fc83..ffe50941187 100644
--- a/recipe/fully_async_policy/fsdp_workers.py
+++ b/recipe/fully_async_policy/fsdp_workers.py
@@ -36,7 +36,7 @@
 
 device_name = get_device_name()
 
-__all__ = ["DetachActorWorker", "DetachRolloutWorker", "DetachAsyncRolloutWorker", "CriticWorker"]
+__all__ = ["DetachActorWorker", "DetachAsyncRolloutWorker", "CriticWorker"]
 
 
 def get_inference_model(rollout):
diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py
index ca5e312a1a1..d10c0684be4 100644
--- a/recipe/fully_async_policy/fully_async_rollouter.py
+++ b/recipe/fully_async_policy/fully_async_rollouter.py
@@ -281,9 +281,10 @@ def _create_continuous_iterator(self):
     async def _init_async_rollout_manager(self):
         # create async rollout manager and request scheduler
         assert self.config.actor_rollout_ref.rollout.mode == "async"
-        from recipe.fully_async_policy.agent_loop import PartialAgentLoopManager
+        from recipe.fully_async_policy.agent_loop import FullyAsyncAgentLoopManager
+
         self.async_rollout_mode = True
-        self.async_rollout_manager = await PartialAgentLoopManager.create(
+        self.async_rollout_manager = await FullyAsyncAgentLoopManager.create(
             config=self.config,
             worker_group=self.rollout_wg,
         )
@@ -405,6 +406,9 @@ async def _processor_worker(self):
     async def _process_single_sample_streaming(self, rollout_sample: RolloutSample):
         """流式处理单个样本"""
         # 调用异步生成方法
+        rollout_sample.full_batch.non_tensor_batch["param_version"] = [self.current_param_version] * len(
+            rollout_sample.full_batch
+        )
         agent_loop_output_list = await self.async_rollout_manager.generate_single_sample_async(
             rollout_sample.full_batch, rollout_sample.agent_loop_output_list
         )
@@ -612,7 +616,7 @@ async def pause(self):
                 await asyncio.gather(*self.active_tasks, return_exceptions=True)
                 self.active_tasks.clear()
                 print("[FullyAsyncRollouter][Public][Pause] All active tasks completed")
-            # TODO async_rollout_manager clear kv cache
+            await self.async_rollout_manager.reset_prefix_cache()
             self.monitor_loop_trigger = False
 
     async def resume(self):
diff --git a/recipe/fully_async_policy/ray_trainer.py b/recipe/fully_async_policy/ray_trainer.py
index 86992de4c13..b82d9fe0aae 100644
--- a/recipe/fully_async_policy/ray_trainer.py
+++ b/recipe/fully_async_policy/ray_trainer.py
@@ -38,7 +38,7 @@
     compute_throughout_metrics,
     compute_timing_metrics,
 )
-from verl.trainer.ppo.ray_trainer import RayPPOTrainer, compute_advantage
+from verl.trainer.ppo.ray_trainer import RayPPOTrainer, apply_kl_penalty, compute_advantage, compute_response_mask
 from verl.trainer.ppo.reward import compute_reward, compute_reward_async
 from verl.trainer.ppo.utils import Role
 from verl.utils.checkpoint.checkpoint_manager import should_save_ckpt_esi
@@ -51,9 +51,6 @@
 
 
 class FullyAsyncRayPPOTrainer(RayPPOTrainer):
-    def __init__(self, *args, **kwargs):
-        pass
-
     def init_workers(self):
         """Initialize distributed training workers using Ray backend.
 
@@ -161,16 +158,7 @@ def _init_models(self):
         self.actor_rollout_wg.init_model()
 
     def _init_async_rollout_manager(self):
-        # create async rollout manager and request scheduler
-        self.async_rollout_mode = False
-        if self.config.actor_rollout_ref.rollout.mode == "async":
-            from recipe.fully_async_policy.agent_loop.agent_loop import PartialAgentLoopManager
-
-            self.async_rollout_mode = True
-            self.async_rollout_manager = PartialAgentLoopManager(
-                config=self.config,
-                worker_group=self.actor_rollout_wg,
-            )
+        pass
 
     def fit(self):
         """
diff --git a/recipe/fully_async_policy/unittest/test_batch_utils.py b/recipe/fully_async_policy/unittest/test_batch_utils.py
deleted file mode 100644
index 363423b589d..00000000000
--- a/recipe/fully_async_policy/unittest/test_batch_utils.py
+++ /dev/null
@@ -1,344 +0,0 @@
-#!/usr/bin/env python3
-
-# Copyright 2025 Meituan Ltd. and/or its affiliates
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import sys
-import time
-import unittest
-from dataclasses import dataclass
-from unittest.mock import MagicMock
-
-import numpy as np
-import torch
-from tensordict import TensorDict
-
-sys.path.append(os.path.join(os.path.dirname(__file__), "..", "..", ".."))
-
-from recipe.fully_async_policy.detach_utils import RolloutSample, assemble_batch_from_rollout_samples
-from verl import DataProto
-
-
-@dataclass
-class MockAgentLoopMetrics:
-    """Mock AgentLoopMetrics for testing"""
-
-    generate_sequences: float = 0.5
-    tool_calls: float = 0.0
-
-
-@dataclass
-class MockAgentLoopOutput:
-    """Mock AgentLoopOutput for testing"""
-
-    prompt_ids: list[int]
-    response_ids: list[int]
-    response_mask: list[int]
-    num_turns: int = 1
-    metrics: MockAgentLoopMetrics = None
-
-    def __post_init__(self):
-        if self.metrics is None:
-            self.metrics = MockAgentLoopMetrics()
-
-
-class MockConfig:
-    """Mock configuration object"""
-
-    def __init__(self):
-        self.trainer = MockTrainerConfig()
-
-
-class MockTrainerConfig:
-    """Mock trainer configuration"""
-
-    def __init__(self):
-        self.balance_batch = False
-
-
-class TestBatchUtils(unittest.TestCase):
-    def setUp(self):
-        """设置测试环境"""
-        self.tokenizer = MagicMock()
-        self.config = MockConfig()
-
-        # Mock postprocess_agent_loop_outputs function
-        self.mock_postprocess = MagicMock()
-
-        # Patch the postprocess function
-        import recipe.fully_async_policy.detach_utils as detach_utils_module
-
-        self.original_postprocess = detach_utils_module.postprocess_agent_loop_outputs
-        detach_utils_module.postprocess_agent_loop_outputs = self.mock_postprocess
-
-        # Mock compute_response_mask function
-        self.original_compute_response_mask = detach_utils_module.compute_response_mask
-        detach_utils_module.compute_response_mask = MagicMock(return_value=torch.ones(2, 128, dtype=torch.int64))
-
-    def tearDown(self):
-        """清理测试环境"""
-        import recipe.fully_async_policy.detach_utils as detach_utils_module
-
-        detach_utils_module.postprocess_agent_loop_outputs = self.original_postprocess
-        detach_utils_module.compute_response_mask = self.original_compute_response_mask
-
-    def create_mock_rollout_sample(self, sample_id: str, param_version: int = 1) -> RolloutSample:
-        """创建测试用的 RolloutSample"""
-        # 创建 mock AgentLoopOutput
-        agent_loop_output = MockAgentLoopOutput(
-            prompt_ids=torch.randint(0, 32000, (175,)).tolist(),
-            response_ids=torch.randint(0, 32000, (175,)).tolist(),
-            response_mask=[1] * 175,  # 真实的response长度
-            num_turns=2,
-            metrics=MockAgentLoopMetrics(generate_sequences=1.6468379497528076, tool_calls=0.0),
-        )
-
-        # 创建mock _gen_data
-        mock_gen_data = DataProto(
-            non_tensor_batch={
-                "raw_prompt": np.array(
-                    [
-                        [
-                            {
-                                "content": "Tom receives a $12 allowance per month.",
-                                "role": "user",
-                            }
-                        ]
-                    ],
-                    dtype=object,
-                ),
-                "tools_kwargs": np.array([{}], dtype=object),
-                "interaction_kwargs": np.array([{}], dtype=object),
-                "index": np.array([4570], dtype=object),
-            },
-            meta_info={"global_steps": 1},
-        )
-
-        return RolloutSample(
-            full_batch=mock_gen_data,
-            agent_loop_output_list=agent_loop_output,
-            sample_id=sample_id,
-            epoch=0,
-            rollout_n_index=0,
-            original_sample_index=0,
-            processing_time=1.6468379497528076,
-            generation_timestamp=time.time(),
-            param_version=param_version,
-        )
-
-    # def test_assemble_batch_empty_input(self):
-    #     """测试空输入的情况"""
-    #     with self.assertRaises(ValueError) as context:
-    #         assemble_batch_from_rollout_samples([], self.tokenizer, self.config)
-    #
-    #     self.assertIn("Empty rollout_samples", str(context.exception))
-    #
-    # def test_assemble_batch_single_sample(self):
-    #     """测试单个样本的批次组装"""
-    #     # 设置mock返回值 - 使用正确的TensorDict格式
-    #     mock_gen_batch = DataProto(
-    #         batch=TensorDict({
-    #             "input_ids": torch.randint(0, 1000, (1, 256)),
-    #             "attention_mask": torch.ones(1, 256, dtype=torch.int64),
-    #             "position_ids": torch.arange(256).unsqueeze(0),
-    #             "prompts": torch.randint(0, 1000, (1, 128)),
-    #             "responses": torch.randint(0, 1000, (1, 128)),
-    #             "response_mask": torch.ones(1, 128, dtype=torch.int64),
-    #         }, batch_size=1),
-    #         non_tensor_batch={"__test_key": np.array(["test_value"], dtype=object)},
-    #         meta_info={"test_meta": "test_value"}
-    #     )
-    #     self.mock_postprocess.return_value = mock_gen_batch
-    #
-    #     # 创建测试样本
-    #     rollout_samples = [self.create_mock_rollout_sample("sample_1")]
-    #
-    #     # 调用函数
-    #     result = assemble_batch_from_rollout_samples(
-    #         rollout_samples=rollout_samples,
-    #         tokenizer=self.tokenizer,
-    #         config=self.config
-    #     )
-    #
-    #     # 验证结果
-    #     self.assertIsInstance(result, DataProto)
-    #     self.assertIn("uid", result.non_tensor_batch)
-    #     self.assertEqual(result.non_tensor_batch["uid"][0], "uid_sample_1")
-    #
-    #     # 验证meta_info包含预期字段
-    #     expected_fields = [
-    #         "rollout_param_versions", "sample_timestamps", "avg_processing_time",
-    #         "max_processing_time", "param_version_diversity", "avg_sample_age", "assembly_time"
-    #     ]
-    #     for field in expected_fields:
-    #         self.assertIn(field, result.meta_info)
-    #
-    #     # 验证统计信息
-    #     self.assertEqual(result.meta_info["rollout_param_versions"], [1])
-    #     self.assertAlmostEqual(result.meta_info["avg_processing_time"], 1.6468379497528076, places=5)
-    #     self.assertEqual(result.meta_info["param_version_diversity"], 1)
-
-    def test_assemble_batch_multiple_samples(self):
-        """测试多个样本的批次组装"""
-        # 设置mock返回值 - 使用正确的TensorDict格式
-        mock_gen_batch = DataProto(
-            batch=TensorDict(
-                {
-                    "input_ids": torch.randint(0, 1000, (2, 256)),
-                    "attention_mask": torch.ones(2, 256, dtype=torch.int64),
-                    "position_ids": torch.arange(256).unsqueeze(0).repeat(2, 1),
-                    "prompts": torch.randint(0, 1000, (2, 128)),
-                    "responses": torch.randint(0, 1000, (2, 128)),
-                    "response_mask": torch.ones(2, 128, dtype=torch.int64),
-                },
-                batch_size=2,
-            ),
-            non_tensor_batch={"__test_key": np.array(["test_value1", "test_value2"], dtype=object)},
-            meta_info={"test_meta": "test_value"},
-        )
-        self.mock_postprocess.return_value = mock_gen_batch
-
-        # 创建测试样本
-        rollout_samples = [
-            self.create_mock_rollout_sample("sample_1", param_version=1),
-            self.create_mock_rollout_sample("sample_2", param_version=2),
-        ]
-
-        print(rollout_samples)
-
-        # 调用函数
-        result = assemble_batch_from_rollout_samples(
-            rollout_samples=rollout_samples, tokenizer=self.tokenizer, config=self.config
-        )
-
-        # 验证结果
-        self.assertIsInstance(result, DataProto)
-        self.assertEqual(len(result.non_tensor_batch["uid"]), 2)
-        self.assertListEqual(list(result.non_tensor_batch["uid"]), ["uid_sample_1", "uid_sample_2"])
-
-        # 验证多样本统计
-        self.assertEqual(result.meta_info["rollout_param_versions"], [1, 2])
-        self.assertEqual(result.meta_info["param_version_diversity"], 2)  # 两个不同版本
-        self.assertAlmostEqual(result.meta_info["avg_processing_time"], 1.6468379497528076, places=5)
-
-    # def test_assemble_batch_with_balance_batch_flag(self):
-    #     """测试启用balance_batch标志的情况"""
-    #     # 设置mock返回值 - 使用正确的TensorDict格式
-    #     mock_gen_batch = DataProto(
-    #         batch=TensorDict({
-    #             "input_ids": torch.randint(0, 1000, (1, 256)),
-    #             "attention_mask": torch.ones(1, 256, dtype=torch.int64),
-    #             "position_ids": torch.arange(256).unsqueeze(0),
-    #             "prompts": torch.randint(0, 1000, (1, 128)),
-    #             "responses": torch.randint(0, 1000, (1, 128)),
-    #             "response_mask": torch.ones(1, 128, dtype=torch.int64),
-    #         }, batch_size=1),
-    #         non_tensor_batch={"__test_key": np.array(["test_value"], dtype=object)},
-    #         meta_info={"test_meta": "test_value"}
-    #     )
-    #     self.mock_postprocess.return_value = mock_gen_batch
-    #
-    #     # 设置config启用balance_batch
-    #     self.config.trainer.balance_batch = True
-    #
-    #     # 创建测试样本
-    #     rollout_samples = [self.create_mock_rollout_sample("sample_1")]
-    #
-    #     # 调用函数
-    #     result = assemble_batch_from_rollout_samples(
-    #         rollout_samples=rollout_samples,
-    #         tokenizer=self.tokenizer,
-    #         config=self.config,
-    #         balance_batch=True
-    #     )
-    #
-    #     # 验证结果（主要验证没有抛出异常）
-    #     self.assertIsInstance(result, DataProto)
-    #
-    # def test_assemble_batch_attention_mask_processing(self):
-    #     """测试attention_mask处理逻辑"""
-    #     # 设置mock返回值 - 使用正确的TensorDict格式
-    #     mock_gen_batch = DataProto(
-    #         batch=TensorDict({
-    #             "input_ids": torch.randint(0, 1000, (2, 256)),
-    #             "attention_mask": torch.ones(2, 256, dtype=torch.int64),
-    #             "position_ids": torch.arange(256).unsqueeze(0).repeat(2, 1),
-    #             "prompts": torch.randint(0, 1000, (2, 128)),
-    #             "responses": torch.randint(0, 1000, (2, 128)),
-    #             "response_mask": torch.ones(2, 128, dtype=torch.int64),
-    #         }, batch_size=2),
-    #         non_tensor_batch={"__test_key": np.array(["test_value1", "test_value2"], dtype=object)},
-    #         meta_info={"test_meta": "test_value"}
-    #     )
-    #     self.mock_postprocess.return_value = mock_gen_batch
-    #
-    #     # 创建测试样本
-    #     rollout_samples = [
-    #         self.create_mock_rollout_sample("sample_1"),
-    #         self.create_mock_rollout_sample("sample_2"),
-    #     ]
-    #
-    #     # 调用函数
-    #     result = assemble_batch_from_rollout_samples(
-    #         rollout_samples=rollout_samples,
-    #         tokenizer=self.tokenizer,
-    #         config=self.config
-    #     )
-    #
-    #     # 验证global_token_num被正确计算
-    #     self.assertIn("global_token_num", result.meta_info)
-    #     self.assertIsInstance(result.meta_info["global_token_num"], list)
-    #
-    # def test_mock_postprocess_called_correctly(self):
-    #     """测试postprocess_agent_loop_outputs被正确调用"""
-    #     # 设置mock返回值 - 使用正确的TensorDict格式
-    #     mock_gen_batch = DataProto(
-    #         batch=TensorDict({
-    #             "input_ids": torch.randint(0, 1000, (1, 256)),
-    #             "attention_mask": torch.ones(1, 256, dtype=torch.int64),
-    #             "position_ids": torch.arange(256).unsqueeze(0),
-    #             "prompts": torch.randint(0, 1000, (1, 128)),
-    #             "responses": torch.randint(0, 1000, (1, 128)),
-    #             "response_mask": torch.ones(1, 128, dtype=torch.int64),
-    #         }, batch_size=1),
-    #         non_tensor_batch={"__test_key": np.array(["test_value"], dtype=object)},
-    #         meta_info={"test_meta": "test_value"}
-    #     )
-    #     self.mock_postprocess.return_value = mock_gen_batch
-    #
-    #     # 创建测试样本
-    #     rollout_samples = [self.create_mock_rollout_sample("sample_1")]
-    #
-    #     # 调用函数
-    #     result = assemble_batch_from_rollout_samples(
-    #         rollout_samples=rollout_samples,
-    #         tokenizer=self.tokenizer,
-    #         config=self.config
-    #     )
-    #
-    #     # 验证postprocess_agent_loop_outputs被调用
-    #     self.mock_postprocess.assert_called_once()
-    #     call_args = self.mock_postprocess.call_args
-    #
-    #     # 验证调用参数
-    #     agent_loop_outputs, tokenizer, config = call_args[0]
-    #     self.assertEqual(len(agent_loop_outputs), 1)
-    #     self.assertEqual(tokenizer, self.tokenizer)
-    #     self.assertEqual(config, self.config)
-    #
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/recipe/fully_async_policy/unittest/test_mq.py b/recipe/fully_async_policy/unittest/test_mq.py
deleted file mode 100644
index 7af4945f311..00000000000
--- a/recipe/fully_async_policy/unittest/test_mq.py
+++ /dev/null
@@ -1,387 +0,0 @@
-# Copyright 2025 Meituan Ltd. and/or its affiliates
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import threading
-import time
-from unittest.mock import Mock
-
-import pytest
-import ray
-from omegaconf import DictConfig
-
-from recipe.fully_async_policy.message_queue import MessageQueue, MessageQueueClient
-
-
-@pytest.fixture
-def mock_sample():
-    """Mock sample data object"""
-    return Mock()
-
-
-@pytest.fixture
-def basic_config():
-    """Basic configuration"""
-    return DictConfig({"async_training": {"staleness_threshold": 3}})
-
-
-@pytest.fixture
-def queue_config():
-    """Queue configuration with different staleness threshold"""
-    return DictConfig({"async_training": {"staleness_threshold": 2}})
-
-
-@pytest.fixture
-def ray_setup():
-    """Setup Ray environment"""
-    if not ray.is_initialized():
-        ray.init(local_mode=True, ignore_reinit_error=True)
-    yield
-    ray.shutdown()
-
-
-@pytest.fixture
-def message_queue_client(ray_setup, basic_config):
-    """Create MessageQueue actor and return its client"""
-    actor = MessageQueue.remote(basic_config, max_queue_size=10)
-    client = MessageQueueClient(actor)
-    yield client
-    client.shutdown()
-
-
-class TestMessageQueue:
-    """Test MessageQueue (through MessageQueueClient)"""
-
-    def test_put_sample_success(self, message_queue_client, mock_sample):
-        """Test successfully putting a sample"""
-        result = message_queue_client.put_sample(sample=mock_sample, param_version=1)
-        assert result is True
-
-        # Check queue size
-        queue_size = message_queue_client.get_queue_size()
-        assert queue_size == 1
-
-        # Check statistics
-        stats = message_queue_client.get_statistics()
-        assert stats["total_produced"] == 1
-        assert stats["queue_size"] == 1
-
-    def test_put_multiple_samples(self, message_queue_client, mock_sample):
-        """Test putting multiple samples"""
-        for i in range(3):
-            result = message_queue_client.put_sample(sample=mock_sample, param_version=1)
-            assert result is True
-
-        # Check queue size
-        queue_size = message_queue_client.get_queue_size()
-        assert queue_size == 3
-
-        # Check statistics
-        stats = message_queue_client.get_statistics()
-        assert stats["total_produced"] == 3
-        assert stats["queue_size"] == 3
-
-    def test_put_sample_staleness_check(self, message_queue_client, mock_sample):
-        """Test freshness check when putting samples"""
-        # Update parameter version to 5
-        message_queue_client.update_param_version(5)
-
-        # Try to put a stale sample (version difference >= 3 will be rejected)
-        result = message_queue_client.put_sample(
-            sample=mock_sample,
-            param_version=2,  # 5-2=3, reaches threshold
-        )
-
-        assert result is False
-
-        # Check dropped samples count in statistics
-        stats = message_queue_client.get_statistics()
-        assert stats["dropped_samples"] == 1
-
-    def test_put_sample_queue_overflow(self, message_queue_client, mock_sample):
-        """Test queue overflow handling"""
-        # Fill the queue (max capacity 10)
-        for i in range(12):  # Put 12 samples, exceeding max capacity 10
-            message_queue_client.put_sample(sample=mock_sample, param_version=1)
-
-        # Queue size should stay at maximum value
-        queue_size = message_queue_client.get_queue_size()
-        assert queue_size == 10
-
-        # Check statistics
-        stats = message_queue_client.get_statistics()
-        assert stats["dropped_samples"] == 2  # 2 samples should be dropped
-
-    def test_get_samples_success(self, message_queue_client, mock_sample):
-        """Test successfully getting samples"""
-        # First put some samples
-        for i in range(3):
-            message_queue_client.put_sample(sample=mock_sample, param_version=1)
-
-        # Get 2 samples
-        retrieved_samples = message_queue_client.get_samples(min_batch_count=2)
-
-        assert retrieved_samples is not None
-        assert len(retrieved_samples) == 2
-
-        # Check queue size decreased
-        queue_size = message_queue_client.get_queue_size()
-        assert queue_size == 1
-
-        # Check statistics
-        stats = message_queue_client.get_statistics()
-        assert stats["total_consumed"] == 2
-
-    def test_get_samples_blocking_behavior(self, message_queue_client, mock_sample):
-        """Test blocking behavior"""
-        result = []
-
-        def get_samples():
-            # This will block until enough samples are available
-            samples = message_queue_client.get_samples(min_batch_count=2)
-            result.append(samples)
-
-        def put_samples_later():
-            time.sleep(0.5)  # Delay putting samples
-            message_queue_client.put_sample(sample=mock_sample, param_version=1)
-            message_queue_client.put_sample(sample=mock_sample, param_version=1)
-
-        # Start consumer thread
-        consumer_thread = threading.Thread(target=get_samples)
-        producer_thread = threading.Thread(target=put_samples_later)
-
-        consumer_thread.start()
-        producer_thread.start()
-
-        # Wait for both threads to complete
-        producer_thread.join(timeout=2)
-        consumer_thread.join(timeout=2)
-
-        assert len(result) == 1
-        assert len(result[0]) == 2
-
-    def test_update_param_version(self, message_queue_client):
-        """Test updating parameter version"""
-        message_queue_client.update_param_version(10)
-        stats = message_queue_client.get_statistics()
-        assert stats["current_param_version"] == 10
-
-    def test_clear_queue(self, message_queue_client, mock_sample):
-        """Test clearing the queue"""
-        # First add some samples
-        for i in range(3):
-            message_queue_client.put_sample(sample=mock_sample, param_version=1)
-
-        # Clear the queue
-        message_queue_client.clear_queue()
-
-        # Check queue size
-        queue_size = message_queue_client.get_queue_size()
-        assert queue_size == 0
-
-    def test_get_queue_size(self, message_queue_client, mock_sample):
-        """Test getting queue size"""
-        assert message_queue_client.get_queue_size() == 0
-
-        message_queue_client.put_sample(sample=mock_sample, param_version=1)
-        assert message_queue_client.get_queue_size() == 1
-
-    def test_get_statistics(self, message_queue_client):
-        """Test getting statistics"""
-        stats = message_queue_client.get_statistics()
-
-        expected_keys = {
-            "queue_size",
-            "total_produced",
-            "total_consumed",
-            "dropped_samples",
-            "current_param_version",
-            "staleness_threshold",
-            "max_queue_size",
-        }
-        assert set(stats.keys()) == expected_keys
-        assert isinstance(stats["queue_size"], int)
-        assert isinstance(stats["total_produced"], int)
-        assert isinstance(stats["total_consumed"], int)
-
-    def test_get_memory_usage(self, message_queue_client, mock_sample):
-        """Test getting memory usage statistics"""
-        # Add some samples
-        for i in range(2):
-            message_queue_client.put_sample(sample=mock_sample, param_version=1)
-
-        memory_stats = message_queue_client.get_memory_usage()
-
-        expected_keys = {"queue_samples", "estimated_memory_bytes", "estimated_memory_mb"}
-        assert set(memory_stats.keys()) == expected_keys
-        assert memory_stats["queue_samples"] == 2
-        assert memory_stats["estimated_memory_bytes"] > 0
-        assert memory_stats["estimated_memory_mb"] > 0
-
-    def test_shutdown(self, ray_setup, basic_config):
-        """Test shutdown functionality"""
-        # Create new actor for testing shutdown
-        actor = MessageQueue.remote(basic_config, max_queue_size=10)
-        client = MessageQueueClient(actor)
-
-        # Shutdown should not throw exceptions
-        client.shutdown()
-
-
-class TestConcurrency:
-    """Test concurrent scenarios"""
-
-    def setup_method(self):
-        """Setup before each test method"""
-        if not ray.is_initialized():
-            ray.init(local_mode=True, ignore_reinit_error=True)
-
-    def teardown_method(self):
-        """Cleanup after each test method"""
-        if ray.is_initialized():
-            ray.shutdown()
-
-    def create_message_queue_client(self, config=None):
-        """Helper method to create MessageQueue client"""
-        if config is None:
-            config = DictConfig({"async_training": {"staleness_threshold": 3}})
-        actor = MessageQueue.remote(config, max_queue_size=10)
-        return MessageQueueClient(actor)
-
-    def test_concurrent_put_get(self, mock_sample):
-        """Test concurrent put and get"""
-        client = self.create_message_queue_client()
-        try:
-            results = []
-
-            def producer():
-                for i in range(50):
-                    samples = [mock_sample, mock_sample]
-                    result = client.put_sample(sample=samples, param_version=1, rollout_metadata=None)
-                    results.append(("put", result))
-                    time.sleep(0.1)
-
-            def consumer():
-                for _ in range(100):
-                    try:
-                        retrieved_samples = client.get_samples(min_batch_count=1)
-                        results.append(("get", len(retrieved_samples) > 0))
-                    except Exception as e:
-                        print(e)
-                        results.append(("get", False))
-                    time.sleep(0.1)
-
-            # Start producer and consumer threads
-            producer_thread = threading.Thread(target=producer)
-            consumer_thread = threading.Thread(target=consumer)
-
-            producer_thread.start()
-            time.sleep(0.05)
-            consumer_thread.start()
-
-            producer_thread.join(timeout=5)
-            consumer_thread.join(timeout=5)
-
-            # Check results
-            put_results = [r[1] for r in results if r[0] == "put"]
-            get_results = [r[1] for r in results if r[0] == "get"]
-
-            assert all(put_results)
-            assert all(get_results)
-        finally:
-            client.shutdown()
-
-    def test_consume_first_produce_later(self, message_queue_client, mock_data_proto):
-        """Test consume first, produce later scenario - verify blocking and wake-up mechanism"""
-        consumer_result = []
-        producer_result = []
-
-        def consumer_task():
-            """Consumer task: start first, wait for producer to generate data"""
-            # Record the start time of consumption
-            consumer_start = time.time()
-            # This will block until at least 3 samples are available
-            samples = message_queue_client.get_samples(min_batch_count=3)
-            consumer_end = time.time()
-            consumer_result.append(
-                {
-                    "success": True,
-                    "samples_count": len(samples),
-                    "wait_time": consumer_end - consumer_start,
-                    "samples": samples,
-                }
-            )
-
-        def producer_task():
-            """Producer task: start producing after a delay"""
-            time.sleep(4.0)
-            producer_start = time.time()
-            message_queue_client.put_sample(
-                sample=mock_data_proto,
-                param_version=1,
-            )
-            time.sleep(1)
-            message_queue_client.put_sample(
-                sample=mock_data_proto,
-                param_version=1,
-            )
-            time.sleep(1)
-            message_queue_client.put_sample(
-                sample=mock_data_proto,
-                param_version=1,
-            )
-            producer_end = time.time()
-            producer_result.append(
-                {
-                    "put_count": 3,
-                    "produce_time": producer_end - producer_start,
-                }
-            )
-
-            print("produce finish")
-
-        # Start consumer thread (first)
-        consumer_thread = threading.Thread(target=consumer_task, name="Consumer")
-        time.sleep(3)
-        # Start producer thread (later)
-        producer_thread = threading.Thread(target=producer_task, name="Producer")
-
-        consumer_thread.start()
-        time.sleep(0.1)
-        producer_thread.start()
-
-        print("=========", flush=True)
-
-        producer_thread.join()
-        print("producer_result", producer_result, flush=True)
-        consumer_thread.join()
-        print("consumer_result", consumer_result, flush=True)
-
-        assert len(consumer_result) == 1, "消费者应该执行一次"
-
-        consumer_data = consumer_result[0]
-        producer_data = producer_result[0]
-
-        assert producer_data["put_count"] == 3
-        assert consumer_data["samples_count"] == 3
-
-        final_queue_size = message_queue_client.get_queue_size()
-        assert final_queue_size == 0
-
-        stats = message_queue_client.get_statistics()
-        assert stats["total_produced"] == 3
-        assert stats["total_consumed"] == 3
-
-
-if __name__ == "__main__":
-    pytest.main([__file__, "-v", "--tb=short"])
diff --git a/recipe/fully_async_policy/vllm_rollout/vllm_async_server.py b/recipe/fully_async_policy/vllm_rollout/vllm_async_server.py
index 0831aebd5b4..3bcd3e0a959 100644
--- a/recipe/fully_async_policy/vllm_rollout/vllm_async_server.py
+++ b/recipe/fully_async_policy/vllm_rollout/vllm_async_server.py
@@ -124,6 +124,10 @@ async def resume(self):
         async with self.lock:
             self.paused = False
 
+    async def reset_prefix_cache(self):
+        async with self.lock:
+            await self.engine.reset_prefix_cache()
+
 
 class FullyAsyncvLLMReplica(vLLMReplica):
     def __init__(self, replica_rank: int, config: DictConfig, gpus_per_node: int = 8):
@@ -137,3 +141,7 @@ async def cancel(self):
     async def resume(self):
         """Resume each rollout server."""
         await asyncio.gather(*[server.resume.remote() for server in self.servers])
+
+    async def reset_prefix_cache(self):
+        """reset kv cache in each rollout server."""
+        await asyncio.gather(*[server.reset_prefix_cache.remote() for server in self.servers])
diff --git a/verl/experimental/agent_loop/__init__.py b/verl/experimental/agent_loop/__init__.py
index fd3d2ca1b84..d43683df3e4 100644
--- a/verl/experimental/agent_loop/__init__.py
+++ b/verl/experimental/agent_loop/__init__.py
@@ -18,4 +18,4 @@
 
 _ = [SingleTurnAgentLoop, ToolAgentLoop]
 
-# __all__ = ["AgentLoopBase", "AgentLoopManager", "AsyncLLMServerManager", "AgentLoopWorker"]
+__all__ = ["AgentLoopBase", "AgentLoopManager", "AsyncLLMServerManager", "AgentLoopWorker"]

From 8f62a942ab086273cfd6c4663682b54abc7fea7e Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Thu, 18 Sep 2025 21:07:23 +0800
Subject: [PATCH 155/182] refactor 8

---
 .../unittest/simple_streaming_demo.py         | 100 +++++++++---------
 1 file changed, 50 insertions(+), 50 deletions(-)

diff --git a/recipe/fully_async_policy/unittest/simple_streaming_demo.py b/recipe/fully_async_policy/unittest/simple_streaming_demo.py
index d3ae0702e3f..209c2aae39b 100644
--- a/recipe/fully_async_policy/unittest/simple_streaming_demo.py
+++ b/recipe/fully_async_policy/unittest/simple_streaming_demo.py
@@ -18,7 +18,7 @@
 
 
 class SimpleStreamingSystem:
-    """简化的流式处理系统演示"""
+    """Simplified streaming system demonstration"""
 
     def __init__(self, max_concurrent_tasks: int = 4):
         self.max_concurrent_tasks = max_concurrent_tasks
@@ -26,148 +26,148 @@ def __init__(self, max_concurrent_tasks: int = 4):
         self.result_queue = asyncio.Queue()
         self.consumer_count = 0
 
-    # 数据流协程
+    # Data stream coroutine
     async def data_stream(self):
-        # 添加初始数据
-        # 准备测试数据
-        test_data = [{"id": f"task_{i}", "content": f"数据_{i}"} for i in range(8)]
+        # Add initial data
+        # Prepare test data
+        test_data = [{"id": f"task_{i}", "content": f"data_{i}"} for i in range(8)]
         await self.add_data_stream(test_data)
 
-        # 模拟后续数据流
+        # Simulate subsequent data stream
         await asyncio.sleep(3)
-        print("\n添加第二批数据...")
-        extra_data = [{"id": f"extra_{i}", "content": f"额外数据_{i}"} for i in range(5)]
+        print("\nAdding second batch of data...")
+        extra_data = [{"id": f"extra_{i}", "content": f"extra_data_{i}"} for i in range(5)]
         await self.add_data_stream(extra_data)
 
-        # 发送结束信号
+        # Send termination signal
         await asyncio.sleep(1)
         await self.data_queue.put("DONE")
-        print("发送结束信号")
+        print("Sending termination signal")
 
     async def add_data_stream(self, data_list: list[dict]):
-        """模拟数据流"""
-        print("开始添加数据流...")
+        """Simulate data stream"""
+        print("Starting to add data stream...")
 
         for i, data_item in enumerate(data_list):
             await self.data_queue.put(data_item)
-            print(f"数据 {data_item['id']} 进入待处理队列")
+            print(f"Data {data_item['id']} added to pending queue")
 
-            # 模拟数据流的间隔
-            if i < len(data_list) - 1:  # 最后一个不等待
+            # Simulate interval between data streams
+            if i < len(data_list) - 1:  # Don't wait after the last item
                 await asyncio.sleep(0.8)
 
-        print("初始数据流添加完成")
+        print("Initial data stream added successfully")
 
     async def _process_data_async(self, data_item: dict):
-        """异步处理单个数据项"""
+        """Asynchronously process a single data item"""
         data_id = data_item["id"]
         content = data_item["content"]
 
-        # 模拟不同的处理时间（1-3秒）
+        # Simulate different processing times (1-3 seconds)
         processing_time = random.uniform(1, 3)
 
-        print(f"    开始处理 {data_id}，预计耗时 {processing_time:.1f}s")
+        print(f"    Starting to process {data_id}, estimated time {processing_time:.1f}s")
 
-        # 异步等待处理完成
+        # Asynchronously wait for processing completion
         await asyncio.sleep(processing_time)
 
         result = {
             "id": data_id,
-            "processed_content": f"处理后的{content}",
+            "processed_content": f"Processed {content}",
             "processing_time": round(processing_time, 2),
             "completed_at": time.time(),
         }
 
-        # 立即放入结果队列
+        # Immediately put into result queue
         await self.result_queue.put(result)
-        print(f"    {data_id} 处理完成！(耗时 {processing_time:.1f}s) -> 进入结果队列")
+        print(f"    {data_id} processing completed! (took {processing_time:.1f}s) -> Added to result queue")
 
     async def _submit_worker(self):
-        """流式提交工作协程"""
+        """Stream submission worker coroutine"""
         active_tasks = set()
 
-        print("流式提交器启动...")
+        print("Stream submitter started...")
 
         while True:
-            # 获取待处理数据
+            # Get data to process
             data_item = await self.data_queue.get()
 
             if data_item == "DONE":
-                print("收到结束信号，等待剩余任务完成...")
+                print("Received termination signal, waiting for remaining tasks to complete...")
                 if active_tasks:
                     await asyncio.gather(*active_tasks, return_exceptions=True)
                 break
 
-            # 检查并发数限制
+            # Check concurrent limit
             while len(active_tasks) >= self.max_concurrent_tasks:
-                print(f"达到最大并发数 {self.max_concurrent_tasks}，等待任务完成...")
+                print(f"Reached maximum concurrency {self.max_concurrent_tasks}, waiting for tasks to complete...")
                 done_tasks, active_tasks = await asyncio.wait(active_tasks, return_when=asyncio.FIRST_COMPLETED)
 
-                # 清理完成的任务
+                # Clean up completed tasks
                 for task in done_tasks:
                     try:
                         await task
-                        print(f"task 完成 {task}")
+                        print(f"Task completed {task}")
                     except Exception as e:
-                        print(f"任务执行失败: {e}")
+                        print(f"Task execution failed: {e}")
 
-            # 立即提交新任务
+            # Immediately submit new task
             task = asyncio.create_task(self._process_data_async(data_item), name=f"active {data_item}")
             active_tasks.add(task)
 
-            print(f"提交任务 {data_item['id']}，当前并发数: {len(active_tasks)}")
+            print(f"Submitted task {data_item['id']}, current concurrency: {len(active_tasks)}")
 
     async def _consumer_worker(self):
-        """结果消费协程"""
-        print("消费者启动...")
+        """Result consumer coroutine"""
+        print("Consumer started...")
 
         while True:
             try:
-                # 从结果队列获取处理结果
+                # Get processing result from result queue
                 result = await asyncio.wait_for(self.result_queue.get(), timeout=2.0)
 
                 self.consumer_count += 1
 
                 print(
-                    f"消费 #{self.consumer_count}: {result['id']} "
-                    f"(处理时间 {result['processing_time']}s) - {result['processed_content']}"
+                    f"Consumed #{self.consumer_count}: {result['id']} "
+                    f"(processing time {result['processing_time']}s) - {result['processed_content']}"
                 )
 
             except asyncio.TimeoutError:
-                print("    消费者等待中...")
+                print("    Consumer waiting...")
                 await asyncio.sleep(0.5)
 
     async def run_demo(self):
-        """运行演示"""
+        """Run demonstration"""
         print("=" * 60)
-        print(f"最大并发数: {self.max_concurrent_tasks}")
+        print(f"Maximum concurrency: {self.max_concurrent_tasks}")
         print("=" * 60)
 
-        # 启动核心协程
+        # Start core coroutines
         stream_task = asyncio.create_task(self.data_stream())
         submit_task = asyncio.create_task(self._submit_worker())
         consumer_task = asyncio.create_task(self._consumer_worker())
 
         try:
-            # 等待数据流完成
+            # Wait for data stream to complete
             await stream_task
-            print("数据流完成")
+            print("Data stream completed")
 
-            # 等待处理完成
+            # Wait for processing to complete
             await submit_task
-            print("所有任务处理完成")
+            print("All tasks processed")
 
         finally:
-            # 清理
+            # Cleanup
             submit_task.cancel()
             consumer_task.cancel()
             await asyncio.gather(submit_task, consumer_task, return_exceptions=True)
 
-        print(f"\n最终统计: 消费了 {self.consumer_count} 个结果")
+        print(f"\nFinal statistics: Consumed {self.consumer_count} results")
 
 
 async def main():
-    """主函数"""
+    """Main function"""
     system = SimpleStreamingSystem(max_concurrent_tasks=3)
     await system.run_demo()
 

From 26849432bc2328b3fea7b15d93d937a21af7cc49 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Thu, 18 Sep 2025 21:52:25 +0800
Subject: [PATCH 156/182] refactor 10

---
 .../exp/qwen2-32B_128/fsdp2_colocate/run.sh   | 133 --------------
 .../fsdp2_colocate/runtime_env.yaml           |   5 -
 .../fsdp2_fully-async_64-64/run.sh            | 153 ----------------
 .../fsdp2_fully-async_64-64/runtime_env.yaml  |   4 -
 .../fsdp2_fully-async_80-48/run.sh            | 153 ----------------
 .../fsdp2_fully-async_80-48/runtime_env.yaml  |   4 -
 .../fsdp2_fully-async_96-32/run.sh            | 153 ----------------
 .../fsdp2_fully-async_96-32/runtime_env.yaml  |   4 -
 .../qwen2-7B-math_128/fsdp2_colocate/run.sh   | 133 --------------
 .../fsdp2_colocate/runtime_env.yaml           |   3 -
 .../fsdp2_fully-async_64-64/runtime_env.yaml  |   4 -
 .../runtime_env.yaml                          |   4 -
 .../megatron_colocate/run.sh                  | 135 --------------
 .../megatron_colocate/runtime_env.yaml        |   3 -
 .../qwen2-7B-math_32/fsdp2_colocate/run.sh    | 133 --------------
 .../fsdp2_colocate/runtime_env.yaml           |   3 -
 .../fsdp2_fully-async_16-16/run.sh            | 154 ----------------
 .../fsdp2_fully-async_16-16/runtime_env.yaml  |   4 -
 .../fsdp2_fully-async_24-8/run.sh             | 168 ------------------
 .../fsdp2_fully-async_24-8/runtime_env.yaml   |   5 -
 .../fsdp2_fully-async_8-24/run.sh             | 168 ------------------
 .../fsdp2_fully-async_8-24/runtime_env.yaml   |   5 -
 .../qwen2-7B-math_32/megatron_colocate/run.sh | 135 --------------
 .../megatron_colocate/runtime_env.yaml        |   3 -
 .../qwen2-7B-math_64/fsdp2_colocate/run.sh    | 133 --------------
 .../fsdp2_colocate/runtime_env.yaml           |   3 -
 .../fsdp2_fully-async_24-40/run.sh            | 168 ------------------
 .../fsdp2_fully-async_24-40/runtime_env.yaml  |   5 -
 .../fsdp2_fully-async_32-32/run.sh            | 168 ------------------
 .../fsdp2_fully-async_32-32/runtime_env.yaml  |   4 -
 .../fsdp2_fully-async_40-24/run.sh            | 168 ------------------
 .../fsdp2_fully-async_40-24/runtime_env.yaml  |   5 -
 .../qwen2-7B-math_64/megatron_colocate/run.sh | 135 --------------
 .../megatron_colocate/runtime_env.yaml        |   3 -
 .../qwen3-30BA3B_128/fsdp2_colocate/run.sh    | 125 -------------
 .../fsdp2_colocate/runtime_env.yaml           |   3 -
 .../qwen3-30BA3B_128/megatron_colocate/run.sh | 161 -----------------
 .../megatron_colocate/runtime_env.yaml        |   5 -
 .../exp/qwen3-32B_128/fsdp2_colocate/run.sh   | 125 -------------
 .../fsdp2_colocate/runtime_env.yaml           |   3 -
 .../qwen3-32B_128/megatron_colocate/run.sh    | 156 ----------------
 .../megatron_colocate/runtime_env.yaml        |   5 -
 recipe/fully_async_policy/fully_async_main.py |   6 +-
 .../fully_async_rollouter.py                  |   6 +-
 .../fully_async_policy/fully_async_trainer.py |   5 +-
 .../run.sh => shell/dapo-32B_fsdp2_64_64.sh}  |  20 +--
 .../shell/dapo_7b_math_fsdp2_4_12.sh          |  10 +-
 ...fsdp2_2_6.sh => dapo_7b_math_fsdp2_4_4.sh} |  14 +-
 .../dapo_7b_math_fsdp2_64_64.sh}              |  10 +-
 .../shell/dapo_7b_math_fsdp2_8_8.sh           |  22 +--
 .../shell/dapo_7b_math_fsdp2_colocate.sh      | 141 ---------------
 .../shell/dapo_7b_math_fsdp2_server.sh        | 148 ---------------
 .../shell/dapo_7b_math_megatron_colocate.sh   | 142 ---------------
 .../fully_async_policy/shell/runtime_env.yaml |   1 -
 recipe/one_step_off_policy/fsdp_workers.py    |   1 -
 .../rollout/vllm_rollout/vllm_rollout_spmd.py |   1 -
 56 files changed, 43 insertions(+), 3528 deletions(-)
 delete mode 100644 recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_colocate/run.sh
 delete mode 100644 recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_colocate/runtime_env.yaml
 delete mode 100644 recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_64-64/run.sh
 delete mode 100644 recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_64-64/runtime_env.yaml
 delete mode 100644 recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_80-48/run.sh
 delete mode 100644 recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_80-48/runtime_env.yaml
 delete mode 100644 recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_96-32/run.sh
 delete mode 100644 recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_96-32/runtime_env.yaml
 delete mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_colocate/run.sh
 delete mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_colocate/runtime_env.yaml
 delete mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64/runtime_env.yaml
 delete mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64_stal0.1/runtime_env.yaml
 delete mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_128/megatron_colocate/run.sh
 delete mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_128/megatron_colocate/runtime_env.yaml
 delete mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_colocate/run.sh
 delete mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_colocate/runtime_env.yaml
 delete mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/run.sh
 delete mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/runtime_env.yaml
 delete mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_24-8/run.sh
 delete mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_24-8/runtime_env.yaml
 delete mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_8-24/run.sh
 delete mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_8-24/runtime_env.yaml
 delete mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_32/megatron_colocate/run.sh
 delete mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_32/megatron_colocate/runtime_env.yaml
 delete mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_colocate/run.sh
 delete mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_colocate/runtime_env.yaml
 delete mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_24-40/run.sh
 delete mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_24-40/runtime_env.yaml
 delete mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_32-32/run.sh
 delete mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_32-32/runtime_env.yaml
 delete mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_40-24/run.sh
 delete mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_40-24/runtime_env.yaml
 delete mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_64/megatron_colocate/run.sh
 delete mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_64/megatron_colocate/runtime_env.yaml
 delete mode 100644 recipe/fully_async_policy/exp/qwen3-30BA3B_128/fsdp2_colocate/run.sh
 delete mode 100644 recipe/fully_async_policy/exp/qwen3-30BA3B_128/fsdp2_colocate/runtime_env.yaml
 delete mode 100644 recipe/fully_async_policy/exp/qwen3-30BA3B_128/megatron_colocate/run.sh
 delete mode 100644 recipe/fully_async_policy/exp/qwen3-30BA3B_128/megatron_colocate/runtime_env.yaml
 delete mode 100644 recipe/fully_async_policy/exp/qwen3-32B_128/fsdp2_colocate/run.sh
 delete mode 100644 recipe/fully_async_policy/exp/qwen3-32B_128/fsdp2_colocate/runtime_env.yaml
 delete mode 100644 recipe/fully_async_policy/exp/qwen3-32B_128/megatron_colocate/run.sh
 delete mode 100644 recipe/fully_async_policy/exp/qwen3-32B_128/megatron_colocate/runtime_env.yaml
 rename recipe/fully_async_policy/{exp/qwen2-7B-math_128/fsdp2_fully-async_64-64_stal0.1/run.sh => shell/dapo-32B_fsdp2_64_64.sh} (91%)
 rename recipe/fully_async_policy/shell/{dapo_7b_math_fsdp2_2_6.sh => dapo_7b_math_fsdp2_4_4.sh} (94%)
 rename recipe/fully_async_policy/{exp/qwen2-7B-math_128/fsdp2_fully-async_64-64/run.sh => shell/dapo_7b_math_fsdp2_64_64.sh} (93%)
 delete mode 100644 recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_colocate.sh
 delete mode 100644 recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_server.sh
 delete mode 100644 recipe/fully_async_policy/shell/dapo_7b_math_megatron_colocate.sh

diff --git a/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_colocate/run.sh b/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_colocate/run.sh
deleted file mode 100644
index 92203a7d87a..00000000000
--- a/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_colocate/run.sh
+++ /dev/null
@@ -1,133 +0,0 @@
-#!/usr/bin/env bash
-set -xeuo pipefail
-
-project_name='DAPO'
-exp_name='dapo_qwen2-32B_20k_fsdp2_colocate_128'
-
-adv_estimator=grpo
-
-use_kl_in_reward=False
-kl_coef=0.0
-use_kl_loss=False
-kl_loss_coef=0.0
-
-clip_ratio_low=0.2
-clip_ratio_high=0.28
-
-max_prompt_length=$((1024 * 2))
-max_response_length=$((1024 * 20))
-enable_overlong_buffer=True
-overlong_buffer_len=$((1024 * 4))
-overlong_penalty_factor=1.0
-
-loss_agg_mode="token-mean"
-
-train_prompt_bsz=512
-n_resp_per_prompt=16
-train_prompt_mini_bsz=32
-
-# Ray
-# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
-# WORKING_DIR=${WORKING_DIR:-"${PWD}"}
-# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
-NNODES=${NNODES:-16}
-NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
-# Paths
-RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
-# very important! please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface
-
-MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/.friday/models/Qwen2.5-32B
-CKPTS_DIR=./ckpts/${project_name}/${exp_name}
-TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet
-TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet
-# Algorithm
-temperature=1.0
-top_p=1.0
-top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
-val_top_p=0.7
-
-# Performance Related Parameter
-use_dynamic_bsz=True
-actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
-infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
-offload=True
-gen_tp=4
-sp_size=8
-fsdp_size=-1
-
-# reference run wandb: https://wandb.ai/verl-org/DAPO%20Reproduction%20on%20verl/runs/ow47vvon?nw=nwusertongyuxuan361
-
-
-python -m verl.trainer.main_ppo \
-    data.train_files="${TRAIN_FILE}" \
-    data.val_files="${TEST_FILE}" \
-    data.prompt_key=prompt \
-    data.truncation='left' \
-    data.max_prompt_length=${max_prompt_length} \
-    data.max_response_length=${max_response_length} \
-    data.train_batch_size=${train_prompt_bsz} \
-    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
-    algorithm.adv_estimator=${adv_estimator} \
-    algorithm.use_kl_in_reward=${use_kl_in_reward} \
-    algorithm.kl_ctrl.kl_coef=${kl_coef} \
-    actor_rollout_ref.actor.strategy=fsdp2 \
-    critic.strategy=fsdp2 \
-    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
-    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
-    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
-    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
-    actor_rollout_ref.actor.clip_ratio_c=10.0 \
-    actor_rollout_ref.model.use_remove_padding=True \
-    +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \
-    actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
-    actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
-    actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
-    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
-    actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
-    actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
-    actor_rollout_ref.model.path="${MODEL_PATH}" \
-    actor_rollout_ref.model.enable_gradient_checkpointing=True \
-    actor_rollout_ref.actor.optim.lr=1e-6 \
-    actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
-    actor_rollout_ref.actor.optim.weight_decay=0.1 \
-    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
-    actor_rollout_ref.actor.fsdp_config.param_offload=${offload} \
-    actor_rollout_ref.actor.fsdp_config.optimizer_offload=${offload} \
-    actor_rollout_ref.actor.entropy_coeff=0 \
-    actor_rollout_ref.actor.grad_clip=1.0 \
-    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
-    actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \
-    actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
-    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
-    actor_rollout_ref.rollout.enable_chunked_prefill=True \
-    actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
-    actor_rollout_ref.rollout.temperature=${temperature} \
-    actor_rollout_ref.rollout.top_p=${top_p} \
-    actor_rollout_ref.rollout.top_k=${top_k} \
-    actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
-    actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
-    actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
-    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
-    actor_rollout_ref.rollout.val_kwargs.n=1 \
-    actor_rollout_ref.ref.fsdp_config.param_offload=${offload} \
-    actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \
-    actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \
-    reward_model.reward_manager=dapo \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
-    +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
-    trainer.logger=['console','tensorboard'] \
-    trainer.project_name="${project_name}" \
-    trainer.experiment_name="${exp_name}" \
-    trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
-    trainer.nnodes="${NNODES}" \
-    trainer.val_before_train=True \
-    trainer.test_freq=20 \
-    trainer.save_freq=-1 \
-    trainer.total_epochs=10 \
-    trainer.total_training_steps=400 \
-    trainer.default_local_dir="${CKPTS_DIR}" \
-    trainer.resume_mode=auto \
-    trainer.log_val_generations=10
diff --git a/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_colocate/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_colocate/runtime_env.yaml
deleted file mode 100644
index e33cfd681ca..00000000000
--- a/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_colocate/runtime_env.yaml
+++ /dev/null
@@ -1,5 +0,0 @@
-env_vars:
-  TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-32B-128/dapo_qwen2-32B_20k_fsdp2_colocate_128"
-  HYDRA_FULL_ERROR: "1"
-  TORCH_NCCL_AVOID_RECORD_STREAMS: "1"
-  VLLM_USE_V1: "1"
\ No newline at end of file
diff --git a/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_64-64/run.sh b/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_64-64/run.sh
deleted file mode 100644
index 48be3ab3c84..00000000000
--- a/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_64-64/run.sh
+++ /dev/null
@@ -1,153 +0,0 @@
-#!/usr/bin/env bash
-set -xeuo pipefail
-
-project_name='DAPO'
-exp_name='dapo_qwen2-32B_20k_fsdp2_fully-async_64-64'
-
-# Paths
-MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/.friday/models/Qwen2.5-32B
-CKPTS_DIR=./ckpts/${project_name}/${exp_name}
-TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet
-TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet
-
-rollout_mode="async"
-rollout_name="vllm" # sglang or vllm
-if [ "$rollout_mode" = "async" ]; then
-    export VLLM_USE_V1=1
-    return_raw_chat="True"
-fi
-
-# Algorithm parameters
-adv_estimator=grpo
-
-use_kl_in_reward=False
-kl_coef=0.0
-use_kl_loss=False
-kl_loss_coef=0.0
-
-clip_ratio_low=0.2
-clip_ratio_high=0.28
-
-# Response length parameters
-max_prompt_length=$((1024 * 2))
-max_response_length=$((1024 * 20))
-enable_overlong_buffer=True
-overlong_buffer_len=$((1024 * 4))
-overlong_penalty_factor=1.0
-
-# Training parameters
-loss_agg_mode="token-mean"
-
-# Algorithm
-temperature=1.0
-top_p=1.0
-top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
-val_top_p=0.7
-
-# Performance Related Parameter
-use_dynamic_bsz=True
-actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
-infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
-ref_offload=True
-actor_offload=False
-gen_tp=4
-sp_size=8
-fsdp_size=-1
-
-# Fully async specific parameters
-NNODES_ROLLOUT=${NNODES_ROLLOUT:-8}
-NNODES_TRAIN=${NNODES_TRAIN:-8}
-NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
-
-train_prompt_bsz=0
-gen_prompt_bsz=1
-n_resp_per_prompt=16
-train_prompt_mini_bsz=16
-total_rollout_steps=$(((512*200)))
-test_freq=20
-staleness_threshold=0.1
-trigger_parameter_sync_step=8
-partial_rollout=True
-
-python -m recipe.fully_async_policy.fully_async_main \
-    data.train_files="${TRAIN_FILE}" \
-    data.val_files="${TEST_FILE}" \
-    data.prompt_key=prompt \
-    data.truncation='left' \
-    data.max_prompt_length=${max_prompt_length} \
-    data.max_response_length=${max_response_length} \
-    data.train_batch_size=${train_prompt_bsz} \
-    data.gen_batch_size=${gen_prompt_bsz} \
-    data.return_raw_chat=${return_raw_chat} \
-    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
-    algorithm.adv_estimator=${adv_estimator} \
-    algorithm.use_kl_in_reward=${use_kl_in_reward} \
-    algorithm.kl_ctrl.kl_coef=${kl_coef} \
-    actor_rollout_ref.actor.strategy=fsdp2 \
-    critic.strategy=fsdp2 \
-    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
-    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
-    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
-    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
-    actor_rollout_ref.actor.clip_ratio_c=10.0 \
-    actor_rollout_ref.model.use_remove_padding=True \
-    actor_rollout_ref.hybrid_engine=False \
-    +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \
-    actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
-    actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
-    actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
-    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
-    actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
-    actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
-    actor_rollout_ref.model.path="${MODEL_PATH}" \
-    actor_rollout_ref.actor.optim.lr=1e-6 \
-    actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
-    actor_rollout_ref.actor.optim.weight_decay=0.1 \
-    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
-    actor_rollout_ref.actor.fsdp_config.param_offload=${actor_offload} \
-    actor_rollout_ref.actor.fsdp_config.optimizer_offload=${actor_offload} \
-    actor_rollout_ref.actor.entropy_coeff=0 \
-    actor_rollout_ref.actor.grad_clip=1.0 \
-    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
-    actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \
-    actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
-    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
-    actor_rollout_ref.rollout.enable_chunked_prefill=True \
-    actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
-    actor_rollout_ref.rollout.temperature=${temperature} \
-    actor_rollout_ref.rollout.top_p=${top_p} \
-    actor_rollout_ref.rollout.top_k=${top_k} \
-    actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
-    actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
-    actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
-    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
-    actor_rollout_ref.rollout.val_kwargs.n=1 \
-    actor_rollout_ref.ref.fsdp_config.param_offload=${ref_offload} \
-    actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \
-    actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \
-    actor_rollout_ref.rollout.name=${rollout_name} \
-    actor_rollout_ref.rollout.mode=${rollout_mode} \
-    actor_rollout_ref.rollout.calculate_log_probs=True \
-    reward_model.reward_manager=dapo \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
-    +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
-    trainer.logger=['console','tensorboard'] \
-    trainer.project_name="${project_name}" \
-    trainer.experiment_name="${exp_name}" \
-    trainer.val_before_train=True \
-    trainer.save_freq=-1 \
-    trainer.default_local_dir="${CKPTS_DIR}" \
-    trainer.resume_mode=auto \
-    trainer.nnodes="${NNODES_TRAIN}" \
-    trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
-    rollout.nnodes="${NNODES_ROLLOUT}" \
-    rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \
-    rollout.total_rollout_steps="${total_rollout_steps}" \
-    rollout.total_epochs=10 \
-    rollout.test_freq="${test_freq}" \
-    async_training.staleness_threshold="${staleness_threshold}" \
-    async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \
-    async_training.partial_rollout="${partial_rollout}"
diff --git a/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_64-64/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_64-64/runtime_env.yaml
deleted file mode 100644
index ea506be787e..00000000000
--- a/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_64-64/runtime_env.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-env_vars:
-  VLLM_USE_V1: "1"
-  TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-32B-128/dapo_qwen2-32B_20k_fsdp2_fully-async_64-64-tps1"
-  HYDRA_FULL_ERROR: "1"
\ No newline at end of file
diff --git a/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_80-48/run.sh b/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_80-48/run.sh
deleted file mode 100644
index fd2874d0f98..00000000000
--- a/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_80-48/run.sh
+++ /dev/null
@@ -1,153 +0,0 @@
-#!/usr/bin/env bash
-set -xeuo pipefail
-
-project_name='DAPO'
-exp_name='dapo_qwen2-32B_20k_fsdp2_fully-async_80-48-tps1'
-
-# Paths
-MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/.friday/models/Qwen2.5-32B
-CKPTS_DIR=./ckpts/${project_name}/${exp_name}
-TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet
-TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet
-
-rollout_mode="async"
-rollout_name="vllm" # sglang or vllm
-if [ "$rollout_mode" = "async" ]; then
-    export VLLM_USE_V1=1
-    return_raw_chat="True"
-fi
-
-# Algorithm parameters
-adv_estimator=grpo
-
-use_kl_in_reward=False
-kl_coef=0.0
-use_kl_loss=False
-kl_loss_coef=0.0
-
-clip_ratio_low=0.2
-clip_ratio_high=0.28
-
-# Response length parameters
-max_prompt_length=$((1024 * 2))
-max_response_length=$((1024 * 20))
-enable_overlong_buffer=True
-overlong_buffer_len=$((1024 * 4))
-overlong_penalty_factor=1.0
-
-# Training parameters
-loss_agg_mode="token-mean"
-
-# Algorithm
-temperature=1.0
-top_p=1.0
-top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
-val_top_p=0.7
-
-# Performance Related Parameter
-use_dynamic_bsz=True
-actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
-infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
-ref_offload=True
-actor_offload=False
-gen_tp=4
-sp_size=8
-fsdp_size=-1
-
-# Fully async specific parameters
-NNODES_ROLLOUT=${NNODES_ROLLOUT:-10}
-NNODES_TRAIN=${NNODES_TRAIN:-6}
-NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
-
-train_prompt_bsz=0
-gen_prompt_bsz=1
-n_resp_per_prompt=16
-train_prompt_mini_bsz=128
-total_rollout_steps=$(((512*400)))
-test_freq=20
-staleness_threshold=0.1
-trigger_parameter_sync_step=1
-partial_rollout=True
-
-python -m recipe.fully_async_policy.fully_async_main \
-    data.train_files="${TRAIN_FILE}" \
-    data.val_files="${TEST_FILE}" \
-    data.prompt_key=prompt \
-    data.truncation='left' \
-    data.max_prompt_length=${max_prompt_length} \
-    data.max_response_length=${max_response_length} \
-    data.train_batch_size=${train_prompt_bsz} \
-    data.gen_batch_size=${gen_prompt_bsz} \
-    data.return_raw_chat=${return_raw_chat} \
-    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
-    algorithm.adv_estimator=${adv_estimator} \
-    algorithm.use_kl_in_reward=${use_kl_in_reward} \
-    algorithm.kl_ctrl.kl_coef=${kl_coef} \
-    actor_rollout_ref.actor.strategy=fsdp2 \
-    critic.strategy=fsdp2 \
-    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
-    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
-    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
-    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
-    actor_rollout_ref.actor.clip_ratio_c=10.0 \
-    actor_rollout_ref.model.use_remove_padding=True \
-    actor_rollout_ref.hybrid_engine=False \
-    +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \
-    actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
-    actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
-    actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
-    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
-    actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
-    actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
-    actor_rollout_ref.model.path="${MODEL_PATH}" \
-    actor_rollout_ref.actor.optim.lr=1e-6 \
-    actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
-    actor_rollout_ref.actor.optim.weight_decay=0.1 \
-    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
-    actor_rollout_ref.actor.fsdp_config.param_offload=${actor_offload} \
-    actor_rollout_ref.actor.fsdp_config.optimizer_offload=${actor_offload} \
-    actor_rollout_ref.actor.entropy_coeff=0 \
-    actor_rollout_ref.actor.grad_clip=1.0 \
-    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
-    actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \
-    actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
-    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
-    actor_rollout_ref.rollout.enable_chunked_prefill=True \
-    actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
-    actor_rollout_ref.rollout.temperature=${temperature} \
-    actor_rollout_ref.rollout.top_p=${top_p} \
-    actor_rollout_ref.rollout.top_k=${top_k} \
-    actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
-    actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
-    actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
-    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
-    actor_rollout_ref.rollout.val_kwargs.n=1 \
-    actor_rollout_ref.ref.fsdp_config.param_offload=${ref_offload} \
-    actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \
-    actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \
-    actor_rollout_ref.rollout.name=${rollout_name} \
-    actor_rollout_ref.rollout.mode=${rollout_mode} \
-    actor_rollout_ref.rollout.calculate_log_probs=True \
-    reward_model.reward_manager=dapo \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
-    +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
-    trainer.logger=['console','tensorboard'] \
-    trainer.project_name="${project_name}" \
-    trainer.experiment_name="${exp_name}" \
-    trainer.val_before_train=True \
-    trainer.save_freq=-1 \
-    trainer.default_local_dir="${CKPTS_DIR}" \
-    trainer.resume_mode=auto \
-    trainer.nnodes="${NNODES_TRAIN}" \
-    trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
-    rollout.nnodes="${NNODES_ROLLOUT}" \
-    rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \
-    rollout.total_rollout_steps="${total_rollout_steps}" \
-    rollout.total_epochs=10 \
-    rollout.test_freq="${test_freq}" \
-    async_training.staleness_threshold="${staleness_threshold}" \
-    async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \
-    async_training.partial_rollout="${partial_rollout}"
diff --git a/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_80-48/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_80-48/runtime_env.yaml
deleted file mode 100644
index 9997c4130f2..00000000000
--- a/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_80-48/runtime_env.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-env_vars:
-  VLLM_USE_V1: "1"
-  TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-32B-128/dapo_qwen2-32B_20k_fsdp2_fully-async_80-48-tps1"
-  HYDRA_FULL_ERROR: "1"
\ No newline at end of file
diff --git a/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_96-32/run.sh b/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_96-32/run.sh
deleted file mode 100644
index 827e9a30e41..00000000000
--- a/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_96-32/run.sh
+++ /dev/null
@@ -1,153 +0,0 @@
-#!/usr/bin/env bash
-set -xeuo pipefail
-
-project_name='DAPO'
-exp_name='dapo_qwen2-32B_20k_fsdp2_fully-async_96-32-tps1'
-
-# Paths
-MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/.friday/models/Qwen2.5-32B
-CKPTS_DIR=./ckpts/${project_name}/${exp_name}
-TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet
-TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet
-
-rollout_mode="async"
-rollout_name="vllm" # sglang or vllm
-if [ "$rollout_mode" = "async" ]; then
-    export VLLM_USE_V1=1
-    return_raw_chat="True"
-fi
-
-# Algorithm parameters
-adv_estimator=grpo
-
-use_kl_in_reward=False
-kl_coef=0.0
-use_kl_loss=False
-kl_loss_coef=0.0
-
-clip_ratio_low=0.2
-clip_ratio_high=0.28
-
-# Response length parameters
-max_prompt_length=$((1024 * 2))
-max_response_length=$((1024 * 20))
-enable_overlong_buffer=True
-overlong_buffer_len=$((1024 * 4))
-overlong_penalty_factor=1.0
-
-# Training parameters
-loss_agg_mode="token-mean"
-
-# Algorithm
-temperature=1.0
-top_p=1.0
-top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
-val_top_p=0.7
-
-# Performance Related Parameter
-use_dynamic_bsz=True
-actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
-infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
-ref_offload=True
-actor_offload=False
-gen_tp=4
-sp_size=8
-fsdp_size=-1
-
-# Fully async specific parameters
-NNODES_ROLLOUT=${NNODES_ROLLOUT:-12}
-NNODES_TRAIN=${NNODES_TRAIN:-4}
-NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
-
-train_prompt_bsz=0
-gen_prompt_bsz=1
-n_resp_per_prompt=16
-train_prompt_mini_bsz=128
-total_rollout_steps=$(((512*400)))
-test_freq=20
-staleness_threshold=0.1
-trigger_parameter_sync_step=2
-partial_rollout=True
-
-python -m recipe.fully_async_policy.fully_async_main \
-    data.train_files="${TRAIN_FILE}" \
-    data.val_files="${TEST_FILE}" \
-    data.prompt_key=prompt \
-    data.truncation='left' \
-    data.max_prompt_length=${max_prompt_length} \
-    data.max_response_length=${max_response_length} \
-    data.train_batch_size=${train_prompt_bsz} \
-    data.gen_batch_size=${gen_prompt_bsz} \
-    data.return_raw_chat=${return_raw_chat} \
-    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
-    algorithm.adv_estimator=${adv_estimator} \
-    algorithm.use_kl_in_reward=${use_kl_in_reward} \
-    algorithm.kl_ctrl.kl_coef=${kl_coef} \
-    actor_rollout_ref.actor.strategy=fsdp2 \
-    critic.strategy=fsdp2 \
-    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
-    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
-    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
-    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
-    actor_rollout_ref.actor.clip_ratio_c=10.0 \
-    actor_rollout_ref.model.use_remove_padding=True \
-    actor_rollout_ref.hybrid_engine=False \
-    +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \
-    actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
-    actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
-    actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
-    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
-    actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
-    actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
-    actor_rollout_ref.model.path="${MODEL_PATH}" \
-    actor_rollout_ref.actor.optim.lr=1e-6 \
-    actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
-    actor_rollout_ref.actor.optim.weight_decay=0.1 \
-    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
-    actor_rollout_ref.actor.fsdp_config.param_offload=${actor_offload} \
-    actor_rollout_ref.actor.fsdp_config.optimizer_offload=${actor_offload} \
-    actor_rollout_ref.actor.entropy_coeff=0 \
-    actor_rollout_ref.actor.grad_clip=1.0 \
-    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
-    actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \
-    actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
-    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
-    actor_rollout_ref.rollout.enable_chunked_prefill=True \
-    actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
-    actor_rollout_ref.rollout.temperature=${temperature} \
-    actor_rollout_ref.rollout.top_p=${top_p} \
-    actor_rollout_ref.rollout.top_k=${top_k} \
-    actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
-    actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
-    actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
-    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
-    actor_rollout_ref.rollout.val_kwargs.n=1 \
-    actor_rollout_ref.ref.fsdp_config.param_offload=${ref_offload} \
-    actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \
-    actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \
-    actor_rollout_ref.rollout.name=${rollout_name} \
-    actor_rollout_ref.rollout.mode=${rollout_mode} \
-    actor_rollout_ref.rollout.calculate_log_probs=True \
-    reward_model.reward_manager=dapo \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
-    +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
-    trainer.logger=['console','tensorboard'] \
-    trainer.project_name="${project_name}" \
-    trainer.experiment_name="${exp_name}" \
-    trainer.val_before_train=True \
-    trainer.save_freq=-1 \
-    trainer.default_local_dir="${CKPTS_DIR}" \
-    trainer.resume_mode=auto \
-    trainer.nnodes="${NNODES_TRAIN}" \
-    trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
-    rollout.nnodes="${NNODES_ROLLOUT}" \
-    rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \
-    rollout.total_rollout_steps="${total_rollout_steps}" \
-    rollout.total_epochs=10 \
-    rollout.test_freq="${test_freq}" \
-    async_training.staleness_threshold="${staleness_threshold}" \
-    async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \
-    async_training.partial_rollout="${partial_rollout}"
diff --git a/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_96-32/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_96-32/runtime_env.yaml
deleted file mode 100644
index be4ab6a6349..00000000000
--- a/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_96-32/runtime_env.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-env_vars:
-  VLLM_USE_V1: "1"
-  TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-32B-128/dapo_qwen2-32B_20k_fsdp2_fully-async_96-32-tps1"
-  HYDRA_FULL_ERROR: "1"
\ No newline at end of file
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_colocate/run.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_colocate/run.sh
deleted file mode 100644
index 3538722d8a1..00000000000
--- a/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_colocate/run.sh
+++ /dev/null
@@ -1,133 +0,0 @@
-#!/usr/bin/env bash
-set -xeuo pipefail
-
-project_name='DAPO'
-exp_name='dapo_qwen2-7B-math_28k_fsdp2_colocate_128_mbs32'
-
-adv_estimator=grpo
-
-use_kl_in_reward=False
-kl_coef=0.0
-use_kl_loss=False
-kl_loss_coef=0.0
-
-clip_ratio_low=0.2
-clip_ratio_high=0.28
-
-max_prompt_length=$((1024 * 2))
-max_response_length=$((1024 * 28))
-enable_overlong_buffer=True
-overlong_buffer_len=$((1024 * 4))
-overlong_penalty_factor=1.0
-
-loss_agg_mode="token-mean"
-
-train_prompt_bsz=512
-n_resp_per_prompt=16
-train_prompt_mini_bsz=32
-
-# Ray
-# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
-# WORKING_DIR=${WORKING_DIR:-"${PWD}"}
-# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
-NNODES=${NNODES:-16}
-NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
-# Paths
-RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
-# very important! please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface
-
-MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B
-CKPTS_DIR=./ckpts/${project_name}/${exp_name}
-TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet
-TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet
-# Algorithm
-temperature=1.0
-top_p=1.0
-top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
-val_top_p=0.7
-
-# Performance Related Parameter
-use_dynamic_bsz=True
-actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
-infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
-offload=True
-gen_tp=4
-sp_size=4
-fsdp_size=2
-
-# reference run wandb: https://wandb.ai/verl-org/DAPO%20Reproduction%20on%20verl/runs/ow47vvon?nw=nwusertongyuxuan361
-
-
-python -m verl.trainer.main_ppo \
-    data.train_files="${TRAIN_FILE}" \
-    data.val_files="${TEST_FILE}" \
-    data.prompt_key=prompt \
-    data.truncation='left' \
-    data.max_prompt_length=${max_prompt_length} \
-    data.max_response_length=${max_response_length} \
-    data.train_batch_size=${train_prompt_bsz} \
-    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
-    algorithm.adv_estimator=${adv_estimator} \
-    algorithm.use_kl_in_reward=${use_kl_in_reward} \
-    algorithm.kl_ctrl.kl_coef=${kl_coef} \
-    actor_rollout_ref.actor.strategy=fsdp2 \
-    critic.strategy=fsdp2 \
-    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
-    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
-    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
-    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
-    actor_rollout_ref.actor.clip_ratio_c=10.0 \
-    actor_rollout_ref.model.use_remove_padding=True \
-    +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \
-    actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
-    actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
-    actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
-    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
-    actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
-    actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
-    actor_rollout_ref.model.path="${MODEL_PATH}" \
-    actor_rollout_ref.model.enable_gradient_checkpointing=True \
-    actor_rollout_ref.actor.optim.lr=1e-6 \
-    actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
-    actor_rollout_ref.actor.optim.weight_decay=0.1 \
-    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
-    actor_rollout_ref.actor.fsdp_config.param_offload=${offload} \
-    actor_rollout_ref.actor.fsdp_config.optimizer_offload=${offload} \
-    actor_rollout_ref.actor.entropy_coeff=0 \
-    actor_rollout_ref.actor.grad_clip=1.0 \
-    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
-    actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \
-    actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
-    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
-    actor_rollout_ref.rollout.enable_chunked_prefill=True \
-    actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
-    actor_rollout_ref.rollout.temperature=${temperature} \
-    actor_rollout_ref.rollout.top_p=${top_p} \
-    actor_rollout_ref.rollout.top_k=${top_k} \
-    actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
-    actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
-    actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
-    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
-    actor_rollout_ref.rollout.val_kwargs.n=1 \
-    actor_rollout_ref.ref.fsdp_config.param_offload=${offload} \
-    actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \
-    actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \
-    reward_model.reward_manager=dapo \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
-    +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
-    trainer.logger=['console','tensorboard'] \
-    trainer.project_name="${project_name}" \
-    trainer.experiment_name="${exp_name}" \
-    trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
-    trainer.nnodes="${NNODES}" \
-    trainer.val_before_train=True \
-    trainer.test_freq=20 \
-    trainer.save_freq=-1 \
-    trainer.total_epochs=10 \
-    trainer.total_training_steps=400 \
-    trainer.default_local_dir="${CKPTS_DIR}" \
-    trainer.resume_mode=auto \
-    trainer.log_val_generations=10
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_colocate/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_colocate/runtime_env.yaml
deleted file mode 100644
index 8fc2de3e70b..00000000000
--- a/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_colocate/runtime_env.yaml
+++ /dev/null
@@ -1,3 +0,0 @@
-env_vars:
-  TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-7B-math-128/dapo_qwen2-7B-math_28k_fsdp2_colocate_128_mbs32"
-  HYDRA_FULL_ERROR: "1"
\ No newline at end of file
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64/runtime_env.yaml
deleted file mode 100644
index 5dfe2294911..00000000000
--- a/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64/runtime_env.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-env_vars:
-  VLLM_USE_V1: "1"
-  TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-7B-math-128/dapo_qwen2-7B-math_28k_fsdp2_fully-async_64-64_mbs32_tpf4_fixmcs"
-  HYDRA_FULL_ERROR: "1"
\ No newline at end of file
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64_stal0.1/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64_stal0.1/runtime_env.yaml
deleted file mode 100644
index 92bacbdd204..00000000000
--- a/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64_stal0.1/runtime_env.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-env_vars:
-  VLLM_USE_V1: "1"
-  TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-7B-math-128/dapo_qwen2-7B-math_28k_fsdp2_fully-async_64-64_mbs32_tpf4_stal0.1"
-  HYDRA_FULL_ERROR: "1"
\ No newline at end of file
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_128/megatron_colocate/run.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_128/megatron_colocate/run.sh
deleted file mode 100644
index f98aeb86b57..00000000000
--- a/recipe/fully_async_policy/exp/qwen2-7B-math_128/megatron_colocate/run.sh
+++ /dev/null
@@ -1,135 +0,0 @@
-#!/usr/bin/env bash
-set -xeuo pipefail
-
-project_name='DAPO'
-exp_name='dapo_qwen2-7B-math_28k_megatron_colocate_64_mbs32'
-
-adv_estimator=grpo
-
-use_kl_in_reward=False
-kl_coef=0.0
-use_kl_loss=False
-kl_loss_coef=0.0
-
-clip_ratio_low=0.2
-clip_ratio_high=0.28
-
-max_prompt_length=$((1024 * 2))
-max_response_length=$((1024 * 28))
-enable_overlong_buffer=True
-overlong_buffer_len=$((1024 * 4))
-overlong_penalty_factor=1.0
-
-loss_agg_mode="token-mean"
-
-train_prompt_bsz=512
-n_resp_per_prompt=16
-train_prompt_mini_bsz=32
-
-# Ray
-# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
-# WORKING_DIR=${WORKING_DIR:-"${PWD}"}
-# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
-NNODES=${NNODES:-8}
-NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
-# Paths
-MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B
-CKPTS_DIR=./ckpts/${project_name}/${exp_name}
-TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet
-TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet
-
-# Algorithm
-temperature=1.0
-top_p=1.0
-top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
-val_top_p=0.7
-
-# Performance Related Parameter
-use_dynamic_bsz=True
-actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
-infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
-offload=True
-gen_tp=4
-train_tp=4
-train_pp=2
-
-# TODO: support dynamic_bsz for megatron
-# actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
-# actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
-# actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
-# actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
-# actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
-# actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
-
-python3 -m verl.trainer.main_ppo \
-    --config-path=config \
-    --config-name='ppo_megatron_trainer.yaml' \
-    data.train_files="${TRAIN_FILE}" \
-    data.val_files="${TEST_FILE}" \
-    data.prompt_key=prompt \
-    data.truncation='left' \
-    data.max_prompt_length=${max_prompt_length} \
-    data.max_response_length=${max_response_length} \
-    data.train_batch_size=${train_prompt_bsz} \
-    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
-    algorithm.adv_estimator=${adv_estimator} \
-    algorithm.use_kl_in_reward=${use_kl_in_reward} \
-    algorithm.kl_ctrl.kl_coef=${kl_coef} \
-    actor_rollout_ref.actor.strategy=megatron \
-    critic.strategy=megatron \
-    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
-    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
-    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
-    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
-    actor_rollout_ref.actor.clip_ratio_c=10.0 \
-    +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \
-    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \
-    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
-    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
-    actor_rollout_ref.model.path="${MODEL_PATH}" \
-    actor_rollout_ref.actor.optim.lr=1e-6 \
-    actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
-    actor_rollout_ref.actor.optim.weight_decay=0.1 \
-    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
-    actor_rollout_ref.actor.megatron.param_offload=${offload} \
-    actor_rollout_ref.actor.megatron.optimizer_offload=${offload} \
-    actor_rollout_ref.actor.megatron.grad_offload=${offload} \
-    actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${train_pp} \
-    actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${train_tp} \
-    actor_rollout_ref.actor.entropy_coeff=0 \
-    actor_rollout_ref.actor.optim.clip_grad=1.0 \
-    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
-    actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
-    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
-    actor_rollout_ref.rollout.enable_chunked_prefill=True \
-    actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
-    actor_rollout_ref.rollout.temperature=${temperature} \
-    actor_rollout_ref.rollout.top_p=${top_p} \
-    actor_rollout_ref.rollout.top_k=${top_k} \
-    actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
-    actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
-    actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
-    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
-    actor_rollout_ref.rollout.val_kwargs.n=1 \
-    actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=${train_pp} \
-    actor_rollout_ref.ref.megatron.tensor_model_parallel_size=${train_tp} \
-    actor_rollout_ref.ref.megatron.param_offload=${offload} \
-    reward_model.reward_manager=dapo \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
-    +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
-    trainer.logger=['console','tensorboard'] \
-    trainer.project_name="${project_name}" \
-    trainer.experiment_name="${exp_name}" \
-    trainer.n_gpus_per_node=8 \
-    trainer.nnodes="${NNODES}" \
-    trainer.val_before_train=True \
-    trainer.test_freq=20 \
-    trainer.save_freq=-1 \
-    trainer.total_epochs=10 \
-    trainer.total_training_steps=400 \
-    trainer.default_local_dir="${CKPTS_DIR}" \
-    trainer.resume_mode=auto \
-    trainer.log_val_generations=10
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_128/megatron_colocate/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_128/megatron_colocate/runtime_env.yaml
deleted file mode 100644
index 6e33f46a65a..00000000000
--- a/recipe/fully_async_policy/exp/qwen2-7B-math_128/megatron_colocate/runtime_env.yaml
+++ /dev/null
@@ -1,3 +0,0 @@
-env_vars:
-  TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-7B-math-128/dapo_qwen2-7B-math_28k_megatron_colocate_128_mbs32"
-  HYDRA_FULL_ERROR: "1"
\ No newline at end of file
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_colocate/run.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_colocate/run.sh
deleted file mode 100644
index 8d42dca04ca..00000000000
--- a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_colocate/run.sh
+++ /dev/null
@@ -1,133 +0,0 @@
-#!/usr/bin/env bash
-set -xeuo pipefail
-
-project_name='DAPO'
-exp_name='dapo_qwen2-7B-math_28k_fsdp2_colocate_32_mbs32'
-
-adv_estimator=grpo
-
-use_kl_in_reward=False
-kl_coef=0.0
-use_kl_loss=False
-kl_loss_coef=0.0
-
-clip_ratio_low=0.2
-clip_ratio_high=0.28
-
-max_prompt_length=$((1024 * 2))
-max_response_length=$((1024 * 28))
-enable_overlong_buffer=True
-overlong_buffer_len=$((1024 * 4))
-overlong_penalty_factor=1.0
-
-loss_agg_mode="token-mean"
-
-train_prompt_bsz=512
-n_resp_per_prompt=16
-train_prompt_mini_bsz=32
-
-# Ray
-# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
-# WORKING_DIR=${WORKING_DIR:-"${PWD}"}
-# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
-NNODES=${NNODES:-4}
-NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
-# Paths
-RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
-# very important! please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface
-
-MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B
-CKPTS_DIR=./ckpts/${project_name}/${exp_name}
-TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet
-TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet
-# Algorithm
-temperature=1.0
-top_p=1.0
-top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
-val_top_p=0.7
-
-# Performance Related Parameter
-use_dynamic_bsz=True
-actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
-infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
-offload=True
-gen_tp=4
-sp_size=4
-fsdp_size=2
-
-# reference run wandb: https://wandb.ai/verl-org/DAPO%20Reproduction%20on%20verl/runs/ow47vvon?nw=nwusertongyuxuan361
-
-
-python -m verl.trainer.main_ppo \
-    data.train_files="${TRAIN_FILE}" \
-    data.val_files="${TEST_FILE}" \
-    data.prompt_key=prompt \
-    data.truncation='left' \
-    data.max_prompt_length=${max_prompt_length} \
-    data.max_response_length=${max_response_length} \
-    data.train_batch_size=${train_prompt_bsz} \
-    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
-    algorithm.adv_estimator=${adv_estimator} \
-    algorithm.use_kl_in_reward=${use_kl_in_reward} \
-    algorithm.kl_ctrl.kl_coef=${kl_coef} \
-    actor_rollout_ref.actor.strategy=fsdp2 \
-    critic.strategy=fsdp2 \
-    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
-    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
-    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
-    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
-    actor_rollout_ref.actor.clip_ratio_c=10.0 \
-    actor_rollout_ref.model.use_remove_padding=True \
-    +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \
-    actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
-    actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
-    actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
-    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
-    actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
-    actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
-    actor_rollout_ref.model.path="${MODEL_PATH}" \
-    actor_rollout_ref.model.enable_gradient_checkpointing=True \
-    actor_rollout_ref.actor.optim.lr=1e-6 \
-    actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
-    actor_rollout_ref.actor.optim.weight_decay=0.1 \
-    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
-    actor_rollout_ref.actor.fsdp_config.param_offload=${offload} \
-    actor_rollout_ref.actor.fsdp_config.optimizer_offload=${offload} \
-    actor_rollout_ref.actor.entropy_coeff=0 \
-    actor_rollout_ref.actor.grad_clip=1.0 \
-    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
-    actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \
-    actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
-    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
-    actor_rollout_ref.rollout.enable_chunked_prefill=True \
-    actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
-    actor_rollout_ref.rollout.temperature=${temperature} \
-    actor_rollout_ref.rollout.top_p=${top_p} \
-    actor_rollout_ref.rollout.top_k=${top_k} \
-    actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
-    actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
-    actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
-    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
-    actor_rollout_ref.rollout.val_kwargs.n=1 \
-    actor_rollout_ref.ref.fsdp_config.param_offload=${offload} \
-    actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \
-    actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \
-    reward_model.reward_manager=dapo \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
-    +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
-    trainer.logger=['console','tensorboard'] \
-    trainer.project_name="${project_name}" \
-    trainer.experiment_name="${exp_name}" \
-    trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
-    trainer.nnodes="${NNODES}" \
-    trainer.val_before_train=True \
-    trainer.test_freq=20 \
-    trainer.save_freq=-1 \
-    trainer.total_epochs=10 \
-    trainer.total_training_steps=400 \
-    trainer.default_local_dir="${CKPTS_DIR}" \
-    trainer.resume_mode=auto \
-    trainer.log_val_generations=10
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_colocate/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_colocate/runtime_env.yaml
deleted file mode 100644
index 39c5a3593e8..00000000000
--- a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_colocate/runtime_env.yaml
+++ /dev/null
@@ -1,3 +0,0 @@
-env_vars:
-  TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-7B-math/dapo_qwen2-7B-math_28k_fsdp2_colocate_32_mbs32"
-  HYDRA_FULL_ERROR: "1"
\ No newline at end of file
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/run.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/run.sh
deleted file mode 100644
index 9fca6da9878..00000000000
--- a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/run.sh
+++ /dev/null
@@ -1,154 +0,0 @@
-#!/usr/bin/env bash
-set -xeuo pipefail
-
-project_name='DAPO'
-exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_16-16_mbs32_tpf16_fixmcs'
-
-# Ray
-MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B
-CKPTS_DIR=./ckpts/${project_name}/${exp_name}
-TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet
-TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet
-
-rollout_mode="async"
-rollout_name="vllm" # sglang or vllm
-if [ "$rollout_mode" = "async" ]; then
-    export VLLM_USE_V1=1
-    return_raw_chat="True"
-fi
-
-# Algorithm parameters
-adv_estimator=grpo
-
-use_kl_in_reward=False
-kl_coef=0.0
-use_kl_loss=False
-kl_loss_coef=0.0
-
-clip_ratio_low=0.2
-clip_ratio_high=0.28
-
-# Response length parameters
-max_prompt_length=$((1024 * 2))
-max_response_length=$((1024 * 28))
-enable_overlong_buffer=True
-overlong_buffer_len=$((1024 * 4))
-overlong_penalty_factor=1.0
-
-# Training parameters
-loss_agg_mode="token-mean"
-
-# Algorithm
-temperature=1.0
-top_p=1.0
-top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
-val_top_p=0.7
-
-# Performance Related Parameter
-use_dynamic_bsz=True
-actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
-infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
-ref_offload=True
-actor_offload=False
-gen_tp=4
-sp_size=4
-fsdp_size=8
-
-# Fully async specific parameters
-NNODES_ROLLOUT=${NNODES_ROLLOUT:-2}
-NNODES_TRAIN=${NNODES_TRAIN:-2}
-NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
-
-
-train_prompt_bsz=0
-gen_prompt_bsz=1
-n_resp_per_prompt=16
-train_prompt_mini_bsz=32
-total_rollout_steps=$(((512*400)))
-test_freq=20
-staleness_threshold=0.1
-trigger_parameter_sync_step=16
-partial_rollout=True
-
-python -m recipe.fully_async_policy.fully_async_main \
-    data.train_files="${TRAIN_FILE}" \
-    data.val_files="${TEST_FILE}" \
-    data.prompt_key=prompt \
-    data.truncation='left' \
-    data.max_prompt_length=${max_prompt_length} \
-    data.max_response_length=${max_response_length} \
-    data.train_batch_size=${train_prompt_bsz} \
-    data.gen_batch_size=${gen_prompt_bsz} \
-    data.return_raw_chat=${return_raw_chat} \
-    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
-    algorithm.adv_estimator=${adv_estimator} \
-    algorithm.use_kl_in_reward=${use_kl_in_reward} \
-    algorithm.kl_ctrl.kl_coef=${kl_coef} \
-    actor_rollout_ref.actor.strategy=fsdp2 \
-    critic.strategy=fsdp2 \
-    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
-    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
-    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
-    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
-    actor_rollout_ref.actor.clip_ratio_c=10.0 \
-    actor_rollout_ref.model.use_remove_padding=True \
-    actor_rollout_ref.hybrid_engine=False \
-    +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \
-    actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
-    actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
-    actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
-    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
-    actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
-    actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
-    actor_rollout_ref.model.path="${MODEL_PATH}" \
-    actor_rollout_ref.actor.optim.lr=1e-6 \
-    actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
-    actor_rollout_ref.actor.optim.weight_decay=0.1 \
-    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
-    actor_rollout_ref.actor.fsdp_config.param_offload=${actor_offload} \
-    actor_rollout_ref.actor.fsdp_config.optimizer_offload=${actor_offload} \
-    actor_rollout_ref.actor.entropy_coeff=0 \
-    actor_rollout_ref.actor.grad_clip=1.0 \
-    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
-    actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \
-    actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
-    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
-    actor_rollout_ref.rollout.enable_chunked_prefill=True \
-    actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
-    actor_rollout_ref.rollout.temperature=${temperature} \
-    actor_rollout_ref.rollout.top_p=${top_p} \
-    actor_rollout_ref.rollout.top_k=${top_k} \
-    actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
-    actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
-    actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
-    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
-    actor_rollout_ref.rollout.val_kwargs.n=1 \
-    actor_rollout_ref.ref.fsdp_config.param_offload=${ref_offload} \
-    actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \
-    actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \
-    actor_rollout_ref.rollout.name=${rollout_name} \
-    actor_rollout_ref.rollout.mode=${rollout_mode} \
-    actor_rollout_ref.rollout.calculate_log_probs=True \
-    reward_model.reward_manager=dapo \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
-    +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
-    trainer.logger=['console','tensorboard'] \
-    trainer.project_name="${project_name}" \
-    trainer.experiment_name="${exp_name}" \
-    trainer.val_before_train=True \
-    trainer.save_freq=-1 \
-    trainer.default_local_dir="${CKPTS_DIR}" \
-    trainer.resume_mode=auto \
-    trainer.nnodes="${NNODES_TRAIN}" \
-    trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
-    rollout.nnodes="${NNODES_ROLLOUT}" \
-    rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \
-    rollout.total_rollout_steps="${total_rollout_steps}" \
-    rollout.total_epochs=10 \
-    rollout.test_freq="${test_freq}" \
-    async_training.staleness_threshold="${staleness_threshold}" \
-    async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \
-    async_training.partial_rollout="${partial_rollout}"
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/runtime_env.yaml
deleted file mode 100644
index 5f0292d2c0d..00000000000
--- a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/runtime_env.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-env_vars:
-  VLLM_USE_V1: "1"
-  TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-7B-math/dapo_qwen2-7B-math_28k_fsdp2_fully-async_16-16_mbs32_tpf16-fsdpsize_8"
-  HYDRA_FULL_ERROR: "1"
\ No newline at end of file
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_24-8/run.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_24-8/run.sh
deleted file mode 100644
index 3de9279a9bc..00000000000
--- a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_24-8/run.sh
+++ /dev/null
@@ -1,168 +0,0 @@
-#!/usr/bin/env bash
-set -xeuo pipefail
-
-project_name='DAPO'
-exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_24-8_mbs32_tpf32'
-
-# Ray
-# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
-# WORKING_DIR=${WORKING_DIR:-"${PWD}"}
-# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
-# Paths
-RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
-# very important! please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface
-MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen2.5-Math-7B"}
-CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"}
-TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"}
-TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"}
-
-# MODEL_PATH=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/modelscope/hub/models/Qwen/Qwen2___5-Math-7B
-MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B
-CKPTS_DIR=./ckpts/${project_name}/${exp_name}
-# TRAIN_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/dapo-math-17k.parquet
-# TEST_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/aime-2024.parquet
-TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet
-TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet
-
-rollout_mode="async"
-rollout_name="vllm" # sglang or vllm
-if [ "$rollout_mode" = "async" ]; then
-    export VLLM_USE_V1=1
-    return_raw_chat="True"
-fi
-
-# Algorithm parameters
-adv_estimator=grpo
-
-use_kl_in_reward=False
-kl_coef=0.0
-use_kl_loss=False
-kl_loss_coef=0.0
-
-clip_ratio_low=0.2
-clip_ratio_high=0.28
-
-# Response length parameters
-max_prompt_length=$((1024 * 2))
-max_response_length=$((1024 * 28))
-enable_overlong_buffer=True
-overlong_buffer_len=$((1024 * 4))
-overlong_penalty_factor=1.0
-
-# Training parameters
-loss_agg_mode="token-mean"
-
-# Algorithm
-temperature=1.0
-top_p=1.0
-top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
-val_top_p=0.7
-
-# Performance Related Parameter
-use_dynamic_bsz=True
-actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
-infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
-ref_offload=True
-actor_offload=False
-gen_tp=4
-sp_size=4
-fsdp_size=2
-
-# Fully async specific parameters
-NNODES_ROLLOUT=${NNODES_ROLLOUT:-3}
-NNODES_TRAIN=${NNODES_TRAIN:-1}
-NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
-
-
-train_prompt_bsz=0
-gen_prompt_bsz=1
-n_resp_per_prompt=16
-train_prompt_mini_bsz=32
-total_rollout_steps=$(((512*400)))
-test_freq=20
-staleness_threshold=0.1
-trigger_parameter_sync_step=32
-partial_rollout=True
-
-python -m recipe.fully_async_policy.fully_async_main \
-    data.train_files="${TRAIN_FILE}" \
-    data.val_files="${TEST_FILE}" \
-    data.prompt_key=prompt \
-    data.truncation='left' \
-    data.max_prompt_length=${max_prompt_length} \
-    data.max_response_length=${max_response_length} \
-    data.train_batch_size=${train_prompt_bsz} \
-    data.gen_batch_size=${gen_prompt_bsz} \
-    data.return_raw_chat=${return_raw_chat} \
-    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
-    algorithm.adv_estimator=${adv_estimator} \
-    algorithm.use_kl_in_reward=${use_kl_in_reward} \
-    algorithm.kl_ctrl.kl_coef=${kl_coef} \
-    actor_rollout_ref.actor.strategy=fsdp2 \
-    critic.strategy=fsdp2 \
-    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
-    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
-    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
-    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
-    actor_rollout_ref.actor.clip_ratio_c=10.0 \
-    actor_rollout_ref.model.use_remove_padding=True \
-    actor_rollout_ref.hybrid_engine=False \
-    +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \
-    actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
-    actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
-    actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
-    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
-    actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
-    actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
-    actor_rollout_ref.model.path="${MODEL_PATH}" \
-    actor_rollout_ref.actor.optim.lr=1e-6 \
-    actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
-    actor_rollout_ref.actor.optim.weight_decay=0.1 \
-    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
-    actor_rollout_ref.actor.fsdp_config.param_offload=${actor_offload} \
-    actor_rollout_ref.actor.fsdp_config.optimizer_offload=${actor_offload} \
-    actor_rollout_ref.actor.entropy_coeff=0 \
-    actor_rollout_ref.actor.grad_clip=1.0 \
-    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
-    actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \
-    actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
-    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
-    actor_rollout_ref.rollout.enable_chunked_prefill=True \
-    actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
-    actor_rollout_ref.rollout.temperature=${temperature} \
-    actor_rollout_ref.rollout.top_p=${top_p} \
-    actor_rollout_ref.rollout.top_k=${top_k} \
-    actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
-    actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
-    actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
-    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
-    actor_rollout_ref.rollout.val_kwargs.n=1 \
-    actor_rollout_ref.ref.fsdp_config.param_offload=${ref_offload} \
-    actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \
-    actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \
-    actor_rollout_ref.rollout.name=${rollout_name} \
-    actor_rollout_ref.rollout.mode=${rollout_mode} \
-    actor_rollout_ref.rollout.calculate_log_probs=True \
-    reward_model.reward_manager=dapo \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
-    +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
-    trainer.logger=['console','tensorboard'] \
-    trainer.project_name="${project_name}" \
-    trainer.experiment_name="${exp_name}" \
-    trainer.val_before_train=True \
-    trainer.save_freq=-1 \
-    trainer.default_local_dir="${CKPTS_DIR}" \
-    trainer.resume_mode=auto \
-    trainer.nnodes="${NNODES_TRAIN}" \
-    trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
-    rollout.nnodes="${NNODES_ROLLOUT}" \
-    rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \
-    rollout.total_rollout_steps="${total_rollout_steps}" \
-    rollout.total_epochs=10 \
-    rollout.test_freq="${test_freq}" \
-    async_training.staleness_threshold="${staleness_threshold}" \
-    async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \
-    async_training.partial_rollout="${partial_rollout}"
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_24-8/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_24-8/runtime_env.yaml
deleted file mode 100644
index 7402c1b37b0..00000000000
--- a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_24-8/runtime_env.yaml
+++ /dev/null
@@ -1,5 +0,0 @@
-env_vars:
-  VLLM_USE_V1: "1"
-  TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-7B-math/dapo_qwen2-7B-math_28k_fsdp2_fully-async_24-8_mbs32_tpf32"
-  NCCL_DEBUG: "INFO"
-  HYDRA_FULL_ERROR: "1"
\ No newline at end of file
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_8-24/run.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_8-24/run.sh
deleted file mode 100644
index 4ba49146329..00000000000
--- a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_8-24/run.sh
+++ /dev/null
@@ -1,168 +0,0 @@
-#!/usr/bin/env bash
-set -xeuo pipefail
-
-project_name='DAPO'
-exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf11'
-
-# Ray
-# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
-# WORKING_DIR=${WORKING_DIR:-"${PWD}"}
-# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
-# Paths
-RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
-# very important! please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface
-MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen2.5-Math-7B"}
-CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"}
-TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"}
-TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"}
-
-# MODEL_PATH=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/modelscope/hub/models/Qwen/Qwen2___5-Math-7B
-MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B
-CKPTS_DIR=./ckpts/${project_name}/${exp_name}
-# TRAIN_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/dapo-math-17k.parquet
-# TEST_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/aime-2024.parquet
-TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet
-TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet
-
-rollout_mode="async"
-rollout_name="vllm" # sglang or vllm
-if [ "$rollout_mode" = "async" ]; then
-    export VLLM_USE_V1=1
-    return_raw_chat="True"
-fi
-
-# Algorithm parameters
-adv_estimator=grpo
-
-use_kl_in_reward=False
-kl_coef=0.0
-use_kl_loss=False
-kl_loss_coef=0.0
-
-clip_ratio_low=0.2
-clip_ratio_high=0.28
-
-# Response length parameters
-max_prompt_length=$((1024 * 2))
-max_response_length=$((1024 * 28))
-enable_overlong_buffer=True
-overlong_buffer_len=$((1024 * 4))
-overlong_penalty_factor=1.0
-
-# Training parameters
-loss_agg_mode="token-mean"
-
-# Algorithm
-temperature=1.0
-top_p=1.0
-top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
-val_top_p=0.7
-
-# Performance Related Parameter
-use_dynamic_bsz=True
-actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
-infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
-ref_offload=True
-actor_offload=False
-gen_tp=4
-sp_size=4
-fsdp_size=2
-
-# Fully async specific parameters
-NNODES_ROLLOUT=${NNODES_ROLLOUT:-1}
-NNODES_TRAIN=${NNODES_TRAIN:-3}
-NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
-
-
-train_prompt_bsz=0
-gen_prompt_bsz=1
-n_resp_per_prompt=16
-train_prompt_mini_bsz=32
-total_rollout_steps=$(((512*400)))
-test_freq=20
-staleness_threshold=0.1
-trigger_parameter_sync_step=11
-partial_rollout=True
-
-python -m recipe.fully_async_policy.fully_async_main \
-    data.train_files="${TRAIN_FILE}" \
-    data.val_files="${TEST_FILE}" \
-    data.prompt_key=prompt \
-    data.truncation='left' \
-    data.max_prompt_length=${max_prompt_length} \
-    data.max_response_length=${max_response_length} \
-    data.train_batch_size=${train_prompt_bsz} \
-    data.gen_batch_size=${gen_prompt_bsz} \
-    data.return_raw_chat=${return_raw_chat} \
-    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
-    algorithm.adv_estimator=${adv_estimator} \
-    algorithm.use_kl_in_reward=${use_kl_in_reward} \
-    algorithm.kl_ctrl.kl_coef=${kl_coef} \
-    actor_rollout_ref.actor.strategy=fsdp2 \
-    critic.strategy=fsdp2 \
-    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
-    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
-    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
-    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
-    actor_rollout_ref.actor.clip_ratio_c=10.0 \
-    actor_rollout_ref.model.use_remove_padding=True \
-    actor_rollout_ref.hybrid_engine=False \
-    +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \
-    actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
-    actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
-    actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
-    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
-    actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
-    actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
-    actor_rollout_ref.model.path="${MODEL_PATH}" \
-    actor_rollout_ref.actor.optim.lr=1e-6 \
-    actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
-    actor_rollout_ref.actor.optim.weight_decay=0.1 \
-    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
-    actor_rollout_ref.actor.fsdp_config.param_offload=${actor_offload} \
-    actor_rollout_ref.actor.fsdp_config.optimizer_offload=${actor_offload} \
-    actor_rollout_ref.actor.entropy_coeff=0 \
-    actor_rollout_ref.actor.grad_clip=1.0 \
-    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
-    actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \
-    actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
-    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
-    actor_rollout_ref.rollout.enable_chunked_prefill=True \
-    actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
-    actor_rollout_ref.rollout.temperature=${temperature} \
-    actor_rollout_ref.rollout.top_p=${top_p} \
-    actor_rollout_ref.rollout.top_k=${top_k} \
-    actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
-    actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
-    actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
-    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
-    actor_rollout_ref.rollout.val_kwargs.n=1 \
-    actor_rollout_ref.ref.fsdp_config.param_offload=${ref_offload} \
-    actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \
-    actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \
-    actor_rollout_ref.rollout.name=${rollout_name} \
-    actor_rollout_ref.rollout.mode=${rollout_mode} \
-    actor_rollout_ref.rollout.calculate_log_probs=True \
-    reward_model.reward_manager=dapo \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
-    +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
-    trainer.logger=['console','tensorboard'] \
-    trainer.project_name="${project_name}" \
-    trainer.experiment_name="${exp_name}" \
-    trainer.val_before_train=True \
-    trainer.save_freq=-1 \
-    trainer.default_local_dir="${CKPTS_DIR}" \
-    trainer.resume_mode=auto \
-    trainer.nnodes="${NNODES_TRAIN}" \
-    trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
-    rollout.nnodes="${NNODES_ROLLOUT}" \
-    rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \
-    rollout.total_rollout_steps="${total_rollout_steps}" \
-    rollout.total_epochs=10 \
-    rollout.test_freq="${test_freq}" \
-    async_training.staleness_threshold="${staleness_threshold}" \
-    async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \
-    async_training.partial_rollout="${partial_rollout}"
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_8-24/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_8-24/runtime_env.yaml
deleted file mode 100644
index fc404cfd985..00000000000
--- a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_8-24/runtime_env.yaml
+++ /dev/null
@@ -1,5 +0,0 @@
-env_vars:
-  VLLM_USE_V1: "1"
-  TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-7B-math/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf11"
-  NCCL_DEBUG: "INFO"
-  HYDRA_FULL_ERROR: "1"
\ No newline at end of file
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_32/megatron_colocate/run.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_32/megatron_colocate/run.sh
deleted file mode 100644
index 3879a99df67..00000000000
--- a/recipe/fully_async_policy/exp/qwen2-7B-math_32/megatron_colocate/run.sh
+++ /dev/null
@@ -1,135 +0,0 @@
-#!/usr/bin/env bash
-set -xeuo pipefail
-
-project_name='DAPO'
-exp_name='dapo_qwen2-7B-math_28k_megatron_colocate_32_mbs32'
-
-adv_estimator=grpo
-
-use_kl_in_reward=False
-kl_coef=0.0
-use_kl_loss=False
-kl_loss_coef=0.0
-
-clip_ratio_low=0.2
-clip_ratio_high=0.28
-
-max_prompt_length=$((1024 * 2))
-max_response_length=$((1024 * 28))
-enable_overlong_buffer=True
-overlong_buffer_len=$((1024 * 4))
-overlong_penalty_factor=1.0
-
-loss_agg_mode="token-mean"
-
-train_prompt_bsz=512
-n_resp_per_prompt=16
-train_prompt_mini_bsz=32
-
-# Ray
-# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
-# WORKING_DIR=${WORKING_DIR:-"${PWD}"}
-# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
-NNODES=${NNODES:-4}
-NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
-# Paths
-MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B
-CKPTS_DIR=./ckpts/${project_name}/${exp_name}
-TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet
-TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet
-
-# Algorithm
-temperature=1.0
-top_p=1.0
-top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
-val_top_p=0.7
-
-# Performance Related Parameter
-use_dynamic_bsz=True
-actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
-infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
-offload=True
-gen_tp=4
-train_tp=4
-train_pp=2
-
-# TODO: support dynamic_bsz for megatron
-# actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
-# actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
-# actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
-# actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
-# actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
-# actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
-
-python3 -m verl.trainer.main_ppo \
-    --config-path=config \
-    --config-name='ppo_megatron_trainer.yaml' \
-    data.train_files="${TRAIN_FILE}" \
-    data.val_files="${TEST_FILE}" \
-    data.prompt_key=prompt \
-    data.truncation='left' \
-    data.max_prompt_length=${max_prompt_length} \
-    data.max_response_length=${max_response_length} \
-    data.train_batch_size=${train_prompt_bsz} \
-    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
-    algorithm.adv_estimator=${adv_estimator} \
-    algorithm.use_kl_in_reward=${use_kl_in_reward} \
-    algorithm.kl_ctrl.kl_coef=${kl_coef} \
-    actor_rollout_ref.actor.strategy=megatron \
-    critic.strategy=megatron \
-    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
-    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
-    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
-    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
-    actor_rollout_ref.actor.clip_ratio_c=10.0 \
-    +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \
-    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \
-    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
-    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
-    actor_rollout_ref.model.path="${MODEL_PATH}" \
-    actor_rollout_ref.actor.optim.lr=1e-6 \
-    actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
-    actor_rollout_ref.actor.optim.weight_decay=0.1 \
-    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
-    actor_rollout_ref.actor.megatron.param_offload=${offload} \
-    actor_rollout_ref.actor.megatron.optimizer_offload=${offload} \
-    actor_rollout_ref.actor.megatron.grad_offload=${offload} \
-    actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${train_pp} \
-    actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${train_tp} \
-    actor_rollout_ref.actor.entropy_coeff=0 \
-    actor_rollout_ref.actor.optim.clip_grad=1.0 \
-    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
-    actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
-    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
-    actor_rollout_ref.rollout.enable_chunked_prefill=True \
-    actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
-    actor_rollout_ref.rollout.temperature=${temperature} \
-    actor_rollout_ref.rollout.top_p=${top_p} \
-    actor_rollout_ref.rollout.top_k=${top_k} \
-    actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
-    actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
-    actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
-    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
-    actor_rollout_ref.rollout.val_kwargs.n=1 \
-    actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=${train_pp} \
-    actor_rollout_ref.ref.megatron.tensor_model_parallel_size=${train_tp} \
-    actor_rollout_ref.ref.megatron.param_offload=${offload} \
-    reward_model.reward_manager=dapo \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
-    +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
-    trainer.logger=['console','tensorboard'] \
-    trainer.project_name="${project_name}" \
-    trainer.experiment_name="${exp_name}" \
-    trainer.n_gpus_per_node=8 \
-    trainer.nnodes="${NNODES}" \
-    trainer.val_before_train=True \
-    trainer.test_freq=20 \
-    trainer.save_freq=-1 \
-    trainer.total_epochs=10 \
-    trainer.total_training_steps=400 \
-    trainer.default_local_dir="${CKPTS_DIR}" \
-    trainer.resume_mode=auto \
-    trainer.log_val_generations=10
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_32/megatron_colocate/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_32/megatron_colocate/runtime_env.yaml
deleted file mode 100644
index 3a35b4a52ad..00000000000
--- a/recipe/fully_async_policy/exp/qwen2-7B-math_32/megatron_colocate/runtime_env.yaml
+++ /dev/null
@@ -1,3 +0,0 @@
-env_vars:
-  TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-7B-math/dapo_qwen2-7B-math_28k_megatron_colocate_32_mbs32"
-  HYDRA_FULL_ERROR: "1"
\ No newline at end of file
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_colocate/run.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_colocate/run.sh
deleted file mode 100644
index e6ab551869d..00000000000
--- a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_colocate/run.sh
+++ /dev/null
@@ -1,133 +0,0 @@
-#!/usr/bin/env bash
-set -xeuo pipefail
-
-project_name='DAPO'
-exp_name='dapo_qwen2-7B-math_28k_fsdp2_colocate_64_mbs32'
-
-adv_estimator=grpo
-
-use_kl_in_reward=False
-kl_coef=0.0
-use_kl_loss=False
-kl_loss_coef=0.0
-
-clip_ratio_low=0.2
-clip_ratio_high=0.28
-
-max_prompt_length=$((1024 * 2))
-max_response_length=$((1024 * 28))
-enable_overlong_buffer=True
-overlong_buffer_len=$((1024 * 4))
-overlong_penalty_factor=1.0
-
-loss_agg_mode="token-mean"
-
-train_prompt_bsz=512
-n_resp_per_prompt=16
-train_prompt_mini_bsz=32
-
-# Ray
-# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
-# WORKING_DIR=${WORKING_DIR:-"${PWD}"}
-# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
-NNODES=${NNODES:-8}
-NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
-# Paths
-RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
-# very important! please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface
-
-MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B
-CKPTS_DIR=./ckpts/${project_name}/${exp_name}
-TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet
-TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet
-# Algorithm
-temperature=1.0
-top_p=1.0
-top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
-val_top_p=0.7
-
-# Performance Related Parameter
-use_dynamic_bsz=True
-actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
-infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
-offload=True
-gen_tp=4
-sp_size=4
-fsdp_size=2
-
-# reference run wandb: https://wandb.ai/verl-org/DAPO%20Reproduction%20on%20verl/runs/ow47vvon?nw=nwusertongyuxuan361
-
-
-python -m verl.trainer.main_ppo \
-    data.train_files="${TRAIN_FILE}" \
-    data.val_files="${TEST_FILE}" \
-    data.prompt_key=prompt \
-    data.truncation='left' \
-    data.max_prompt_length=${max_prompt_length} \
-    data.max_response_length=${max_response_length} \
-    data.train_batch_size=${train_prompt_bsz} \
-    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
-    algorithm.adv_estimator=${adv_estimator} \
-    algorithm.use_kl_in_reward=${use_kl_in_reward} \
-    algorithm.kl_ctrl.kl_coef=${kl_coef} \
-    actor_rollout_ref.actor.strategy=fsdp2 \
-    critic.strategy=fsdp2 \
-    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
-    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
-    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
-    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
-    actor_rollout_ref.actor.clip_ratio_c=10.0 \
-    actor_rollout_ref.model.use_remove_padding=True \
-    +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \
-    actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
-    actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
-    actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
-    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
-    actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
-    actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
-    actor_rollout_ref.model.path="${MODEL_PATH}" \
-    actor_rollout_ref.model.enable_gradient_checkpointing=True \
-    actor_rollout_ref.actor.optim.lr=1e-6 \
-    actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
-    actor_rollout_ref.actor.optim.weight_decay=0.1 \
-    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
-    actor_rollout_ref.actor.fsdp_config.param_offload=${offload} \
-    actor_rollout_ref.actor.fsdp_config.optimizer_offload=${offload} \
-    actor_rollout_ref.actor.entropy_coeff=0 \
-    actor_rollout_ref.actor.grad_clip=1.0 \
-    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
-    actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \
-    actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
-    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
-    actor_rollout_ref.rollout.enable_chunked_prefill=True \
-    actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
-    actor_rollout_ref.rollout.temperature=${temperature} \
-    actor_rollout_ref.rollout.top_p=${top_p} \
-    actor_rollout_ref.rollout.top_k=${top_k} \
-    actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
-    actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
-    actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
-    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
-    actor_rollout_ref.rollout.val_kwargs.n=1 \
-    actor_rollout_ref.ref.fsdp_config.param_offload=${offload} \
-    actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \
-    actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \
-    reward_model.reward_manager=dapo \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
-    +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
-    trainer.logger=['console','tensorboard'] \
-    trainer.project_name="${project_name}" \
-    trainer.experiment_name="${exp_name}" \
-    trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
-    trainer.nnodes="${NNODES}" \
-    trainer.val_before_train=True \
-    trainer.test_freq=20 \
-    trainer.save_freq=-1 \
-    trainer.total_epochs=10 \
-    trainer.total_training_steps=400 \
-    trainer.default_local_dir="${CKPTS_DIR}" \
-    trainer.resume_mode=auto \
-    trainer.log_val_generations=10
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_colocate/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_colocate/runtime_env.yaml
deleted file mode 100644
index 514ab9a73f0..00000000000
--- a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_colocate/runtime_env.yaml
+++ /dev/null
@@ -1,3 +0,0 @@
-env_vars:
-  TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-7B-math-64/dapo_qwen2-7B-math_28k_fsdp2_colocate_64_mbs32"
-  HYDRA_FULL_ERROR: "1"
\ No newline at end of file
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_24-40/run.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_24-40/run.sh
deleted file mode 100644
index 3d56ea8b403..00000000000
--- a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_24-40/run.sh
+++ /dev/null
@@ -1,168 +0,0 @@
-#!/usr/bin/env bash
-set -xeuo pipefail
-
-project_name='DAPO'
-exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_24-40_mbs32_tpf6'
-
-# Ray
-# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
-# WORKING_DIR=${WORKING_DIR:-"${PWD}"}
-# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
-# Paths
-RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
-# very important! please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface
-MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen2.5-Math-7B"}
-CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"}
-TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"}
-TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"}
-
-# MODEL_PATH=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/modelscope/hub/models/Qwen/Qwen2___5-Math-7B
-MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B
-CKPTS_DIR=./ckpts/${project_name}/${exp_name}
-# TRAIN_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/dapo-math-17k.parquet
-# TEST_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/aime-2024.parquet
-TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet
-TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet
-
-rollout_mode="async"
-rollout_name="vllm" # sglang or vllm
-if [ "$rollout_mode" = "async" ]; then
-    export VLLM_USE_V1=1
-    return_raw_chat="True"
-fi
-
-# Algorithm parameters
-adv_estimator=grpo
-
-use_kl_in_reward=False
-kl_coef=0.0
-use_kl_loss=False
-kl_loss_coef=0.0
-
-clip_ratio_low=0.2
-clip_ratio_high=0.28
-
-# Response length parameters
-max_prompt_length=$((1024 * 2))
-max_response_length=$((1024 * 28))
-enable_overlong_buffer=True
-overlong_buffer_len=$((1024 * 4))
-overlong_penalty_factor=1.0
-
-# Training parameters
-loss_agg_mode="token-mean"
-
-# Algorithm
-temperature=1.0
-top_p=1.0
-top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
-val_top_p=0.7
-
-# Performance Related Parameter
-use_dynamic_bsz=True
-actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
-infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
-ref_offload=True
-actor_offload=False
-gen_tp=4
-sp_size=4
-fsdp_size=2
-
-# Fully async specific parameters
-NNODES_ROLLOUT=${NNODES_ROLLOUT:-3}
-NNODES_TRAIN=${NNODES_TRAIN:-5}
-NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
-
-
-train_prompt_bsz=0
-gen_prompt_bsz=1
-n_resp_per_prompt=16
-train_prompt_mini_bsz=32
-total_rollout_steps=$(((512*400)))
-test_freq=20
-staleness_threshold=0.1
-trigger_parameter_sync_step=6
-partial_rollout=True
-
-python -m recipe.fully_async_policy.fully_async_main \
-    data.train_files="${TRAIN_FILE}" \
-    data.val_files="${TEST_FILE}" \
-    data.prompt_key=prompt \
-    data.truncation='left' \
-    data.max_prompt_length=${max_prompt_length} \
-    data.max_response_length=${max_response_length} \
-    data.train_batch_size=${train_prompt_bsz} \
-    data.gen_batch_size=${gen_prompt_bsz} \
-    data.return_raw_chat=${return_raw_chat} \
-    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
-    algorithm.adv_estimator=${adv_estimator} \
-    algorithm.use_kl_in_reward=${use_kl_in_reward} \
-    algorithm.kl_ctrl.kl_coef=${kl_coef} \
-    actor_rollout_ref.actor.strategy=fsdp2 \
-    critic.strategy=fsdp2 \
-    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
-    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
-    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
-    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
-    actor_rollout_ref.actor.clip_ratio_c=10.0 \
-    actor_rollout_ref.model.use_remove_padding=True \
-    actor_rollout_ref.hybrid_engine=False \
-    +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \
-    actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
-    actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
-    actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
-    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
-    actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
-    actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
-    actor_rollout_ref.model.path="${MODEL_PATH}" \
-    actor_rollout_ref.actor.optim.lr=1e-6 \
-    actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
-    actor_rollout_ref.actor.optim.weight_decay=0.1 \
-    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
-    actor_rollout_ref.actor.fsdp_config.param_offload=${actor_offload} \
-    actor_rollout_ref.actor.fsdp_config.optimizer_offload=${actor_offload} \
-    actor_rollout_ref.actor.entropy_coeff=0 \
-    actor_rollout_ref.actor.grad_clip=1.0 \
-    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
-    actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \
-    actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
-    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
-    actor_rollout_ref.rollout.enable_chunked_prefill=True \
-    actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
-    actor_rollout_ref.rollout.temperature=${temperature} \
-    actor_rollout_ref.rollout.top_p=${top_p} \
-    actor_rollout_ref.rollout.top_k=${top_k} \
-    actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
-    actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
-    actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
-    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
-    actor_rollout_ref.rollout.val_kwargs.n=1 \
-    actor_rollout_ref.ref.fsdp_config.param_offload=${ref_offload} \
-    actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \
-    actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \
-    actor_rollout_ref.rollout.name=${rollout_name} \
-    actor_rollout_ref.rollout.mode=${rollout_mode} \
-    actor_rollout_ref.rollout.calculate_log_probs=True \
-    reward_model.reward_manager=dapo \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
-    +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
-    trainer.logger=['console','tensorboard'] \
-    trainer.project_name="${project_name}" \
-    trainer.experiment_name="${exp_name}" \
-    trainer.val_before_train=True \
-    trainer.save_freq=-1 \
-    trainer.default_local_dir="${CKPTS_DIR}" \
-    trainer.resume_mode=auto \
-    trainer.nnodes="${NNODES_TRAIN}" \
-    trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
-    rollout.nnodes="${NNODES_ROLLOUT}" \
-    rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \
-    rollout.total_rollout_steps="${total_rollout_steps}" \
-    rollout.total_epochs=10 \
-    rollout.test_freq="${test_freq}" \
-    async_training.staleness_threshold="${staleness_threshold}" \
-    async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \
-    async_training.partial_rollout="${partial_rollout}"
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_24-40/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_24-40/runtime_env.yaml
deleted file mode 100644
index ef67409ba6f..00000000000
--- a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_24-40/runtime_env.yaml
+++ /dev/null
@@ -1,5 +0,0 @@
-env_vars:
-  VLLM_USE_V1: "1"
-  TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-7B-math-64/dapo_qwen2-7B-math_28k_fsdp2_fully-async_24-40_mbs32_tpf6"
-  NCCL_DEBUG: "INFO"
-  HYDRA_FULL_ERROR: "1"
\ No newline at end of file
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_32-32/run.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_32-32/run.sh
deleted file mode 100644
index cc26be4f100..00000000000
--- a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_32-32/run.sh
+++ /dev/null
@@ -1,168 +0,0 @@
-#!/usr/bin/env bash
-set -xeuo pipefail
-
-project_name='DAPO'
-exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_32-32_mbs32_tpf8_fixmcs'
-
-# Ray
-# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
-# WORKING_DIR=${WORKING_DIR:-"${PWD}"}
-# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
-# Paths
-RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
-# very important! please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface
-MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen2.5-Math-7B"}
-CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"}
-TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"}
-TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"}
-
-# MODEL_PATH=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/modelscope/hub/models/Qwen/Qwen2___5-Math-7B
-MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B
-CKPTS_DIR=./ckpts/${project_name}/${exp_name}
-# TRAIN_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/dapo-math-17k.parquet
-# TEST_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/aime-2024.parquet
-TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet
-TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet
-
-rollout_mode="async"
-rollout_name="vllm" # sglang or vllm
-if [ "$rollout_mode" = "async" ]; then
-    export VLLM_USE_V1=1
-    return_raw_chat="True"
-fi
-
-# Algorithm parameters
-adv_estimator=grpo
-
-use_kl_in_reward=False
-kl_coef=0.0
-use_kl_loss=False
-kl_loss_coef=0.0
-
-clip_ratio_low=0.2
-clip_ratio_high=0.28
-
-# Response length parameters
-max_prompt_length=$((1024 * 2))
-max_response_length=$((1024 * 28))
-enable_overlong_buffer=True
-overlong_buffer_len=$((1024 * 4))
-overlong_penalty_factor=1.0
-
-# Training parameters
-loss_agg_mode="token-mean"
-
-# Algorithm
-temperature=1.0
-top_p=1.0
-top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
-val_top_p=0.7
-
-# Performance Related Parameter
-use_dynamic_bsz=True
-actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
-infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
-ref_offload=True
-actor_offload=False
-gen_tp=4
-sp_size=4
-fsdp_size=2
-
-# Fully async specific parameters
-NNODES_ROLLOUT=${NNODES_ROLLOUT:-4}
-NNODES_TRAIN=${NNODES_TRAIN:-4}
-NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
-
-
-train_prompt_bsz=0
-gen_prompt_bsz=1
-n_resp_per_prompt=16
-train_prompt_mini_bsz=32
-total_rollout_steps=$(((512*400)))
-test_freq=20
-staleness_threshold=0.1
-trigger_parameter_sync_step=8
-partial_rollout=True
-
-python -m recipe.fully_async_policy.fully_async_main \
-    data.train_files="${TRAIN_FILE}" \
-    data.val_files="${TEST_FILE}" \
-    data.prompt_key=prompt \
-    data.truncation='left' \
-    data.max_prompt_length=${max_prompt_length} \
-    data.max_response_length=${max_response_length} \
-    data.train_batch_size=${train_prompt_bsz} \
-    data.gen_batch_size=${gen_prompt_bsz} \
-    data.return_raw_chat=${return_raw_chat} \
-    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
-    algorithm.adv_estimator=${adv_estimator} \
-    algorithm.use_kl_in_reward=${use_kl_in_reward} \
-    algorithm.kl_ctrl.kl_coef=${kl_coef} \
-    actor_rollout_ref.actor.strategy=fsdp2 \
-    critic.strategy=fsdp2 \
-    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
-    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
-    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
-    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
-    actor_rollout_ref.actor.clip_ratio_c=10.0 \
-    actor_rollout_ref.model.use_remove_padding=True \
-    actor_rollout_ref.hybrid_engine=False \
-    +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \
-    actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
-    actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
-    actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
-    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
-    actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
-    actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
-    actor_rollout_ref.model.path="${MODEL_PATH}" \
-    actor_rollout_ref.actor.optim.lr=1e-6 \
-    actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
-    actor_rollout_ref.actor.optim.weight_decay=0.1 \
-    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
-    actor_rollout_ref.actor.fsdp_config.param_offload=${actor_offload} \
-    actor_rollout_ref.actor.fsdp_config.optimizer_offload=${actor_offload} \
-    actor_rollout_ref.actor.entropy_coeff=0 \
-    actor_rollout_ref.actor.grad_clip=1.0 \
-    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
-    actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \
-    actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
-    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
-    actor_rollout_ref.rollout.enable_chunked_prefill=True \
-    actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
-    actor_rollout_ref.rollout.temperature=${temperature} \
-    actor_rollout_ref.rollout.top_p=${top_p} \
-    actor_rollout_ref.rollout.top_k=${top_k} \
-    actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
-    actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
-    actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
-    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
-    actor_rollout_ref.rollout.val_kwargs.n=1 \
-    actor_rollout_ref.ref.fsdp_config.param_offload=${ref_offload} \
-    actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \
-    actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \
-    actor_rollout_ref.rollout.name=${rollout_name} \
-    actor_rollout_ref.rollout.mode=${rollout_mode} \
-    actor_rollout_ref.rollout.calculate_log_probs=True \
-    reward_model.reward_manager=dapo \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
-    +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
-    trainer.logger=['console','tensorboard'] \
-    trainer.project_name="${project_name}" \
-    trainer.experiment_name="${exp_name}" \
-    trainer.val_before_train=True \
-    trainer.save_freq=-1 \
-    trainer.default_local_dir="${CKPTS_DIR}" \
-    trainer.resume_mode=auto \
-    trainer.nnodes="${NNODES_TRAIN}" \
-    trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
-    rollout.nnodes="${NNODES_ROLLOUT}" \
-    rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \
-    rollout.total_rollout_steps="${total_rollout_steps}" \
-    rollout.total_epochs=10 \
-    rollout.test_freq="${test_freq}" \
-    async_training.staleness_threshold="${staleness_threshold}" \
-    async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \
-    async_training.partial_rollout="${partial_rollout}"
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_32-32/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_32-32/runtime_env.yaml
deleted file mode 100644
index 160cd46c499..00000000000
--- a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_32-32/runtime_env.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-env_vars:
-  VLLM_USE_V1: "1"
-  TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-7B-math-64/dapo_qwen2-7B-math_28k_fsdp2_fully-async_32-32_mbs32_tpf8_fixmcs"
-  HYDRA_FULL_ERROR: "1"
\ No newline at end of file
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_40-24/run.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_40-24/run.sh
deleted file mode 100644
index 0a67a563819..00000000000
--- a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_40-24/run.sh
+++ /dev/null
@@ -1,168 +0,0 @@
-#!/usr/bin/env bash
-set -xeuo pipefail
-
-project_name='DAPO'
-exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_40-24_mbs32_tpf11'
-
-# Ray
-# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
-# WORKING_DIR=${WORKING_DIR:-"${PWD}"}
-# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
-# Paths
-RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
-# very important! please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface
-MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen2.5-Math-7B"}
-CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"}
-TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"}
-TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"}
-
-# MODEL_PATH=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/modelscope/hub/models/Qwen/Qwen2___5-Math-7B
-MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B
-CKPTS_DIR=./ckpts/${project_name}/${exp_name}
-# TRAIN_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/dapo-math-17k.parquet
-# TEST_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/aime-2024.parquet
-TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet
-TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet
-
-rollout_mode="async"
-rollout_name="vllm" # sglang or vllm
-if [ "$rollout_mode" = "async" ]; then
-    export VLLM_USE_V1=1
-    return_raw_chat="True"
-fi
-
-# Algorithm parameters
-adv_estimator=grpo
-
-use_kl_in_reward=False
-kl_coef=0.0
-use_kl_loss=False
-kl_loss_coef=0.0
-
-clip_ratio_low=0.2
-clip_ratio_high=0.28
-
-# Response length parameters
-max_prompt_length=$((1024 * 2))
-max_response_length=$((1024 * 28))
-enable_overlong_buffer=True
-overlong_buffer_len=$((1024 * 4))
-overlong_penalty_factor=1.0
-
-# Training parameters
-loss_agg_mode="token-mean"
-
-# Algorithm
-temperature=1.0
-top_p=1.0
-top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
-val_top_p=0.7
-
-# Performance Related Parameter
-use_dynamic_bsz=True
-actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
-infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
-ref_offload=True
-actor_offload=False
-gen_tp=4
-sp_size=4
-fsdp_size=2
-
-# Fully async specific parameters
-NNODES_ROLLOUT=${NNODES_ROLLOUT:-5}
-NNODES_TRAIN=${NNODES_TRAIN:-3}
-NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
-
-
-train_prompt_bsz=0
-gen_prompt_bsz=1
-n_resp_per_prompt=16
-train_prompt_mini_bsz=32
-total_rollout_steps=$(((512*400)))
-test_freq=20
-staleness_threshold=0.1
-trigger_parameter_sync_step=11
-partial_rollout=True
-
-python -m recipe.fully_async_policy.fully_async_main \
-    data.train_files="${TRAIN_FILE}" \
-    data.val_files="${TEST_FILE}" \
-    data.prompt_key=prompt \
-    data.truncation='left' \
-    data.max_prompt_length=${max_prompt_length} \
-    data.max_response_length=${max_response_length} \
-    data.train_batch_size=${train_prompt_bsz} \
-    data.gen_batch_size=${gen_prompt_bsz} \
-    data.return_raw_chat=${return_raw_chat} \
-    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
-    algorithm.adv_estimator=${adv_estimator} \
-    algorithm.use_kl_in_reward=${use_kl_in_reward} \
-    algorithm.kl_ctrl.kl_coef=${kl_coef} \
-    actor_rollout_ref.actor.strategy=fsdp2 \
-    critic.strategy=fsdp2 \
-    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
-    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
-    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
-    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
-    actor_rollout_ref.actor.clip_ratio_c=10.0 \
-    actor_rollout_ref.model.use_remove_padding=True \
-    actor_rollout_ref.hybrid_engine=False \
-    +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \
-    actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
-    actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
-    actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
-    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
-    actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
-    actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
-    actor_rollout_ref.model.path="${MODEL_PATH}" \
-    actor_rollout_ref.actor.optim.lr=1e-6 \
-    actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
-    actor_rollout_ref.actor.optim.weight_decay=0.1 \
-    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
-    actor_rollout_ref.actor.fsdp_config.param_offload=${actor_offload} \
-    actor_rollout_ref.actor.fsdp_config.optimizer_offload=${actor_offload} \
-    actor_rollout_ref.actor.entropy_coeff=0 \
-    actor_rollout_ref.actor.grad_clip=1.0 \
-    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
-    actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \
-    actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
-    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
-    actor_rollout_ref.rollout.enable_chunked_prefill=True \
-    actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
-    actor_rollout_ref.rollout.temperature=${temperature} \
-    actor_rollout_ref.rollout.top_p=${top_p} \
-    actor_rollout_ref.rollout.top_k=${top_k} \
-    actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
-    actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
-    actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
-    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
-    actor_rollout_ref.rollout.val_kwargs.n=1 \
-    actor_rollout_ref.ref.fsdp_config.param_offload=${ref_offload} \
-    actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \
-    actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \
-    actor_rollout_ref.rollout.name=${rollout_name} \
-    actor_rollout_ref.rollout.mode=${rollout_mode} \
-    actor_rollout_ref.rollout.calculate_log_probs=True \
-    reward_model.reward_manager=dapo \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
-    +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
-    trainer.logger=['console','tensorboard'] \
-    trainer.project_name="${project_name}" \
-    trainer.experiment_name="${exp_name}" \
-    trainer.val_before_train=True \
-    trainer.save_freq=-1 \
-    trainer.default_local_dir="${CKPTS_DIR}" \
-    trainer.resume_mode=auto \
-    trainer.nnodes="${NNODES_TRAIN}" \
-    trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
-    rollout.nnodes="${NNODES_ROLLOUT}" \
-    rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \
-    rollout.total_rollout_steps="${total_rollout_steps}" \
-    rollout.total_epochs=10 \
-    rollout.test_freq="${test_freq}" \
-    async_training.staleness_threshold="${staleness_threshold}" \
-    async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \
-    async_training.partial_rollout="${partial_rollout}"
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_40-24/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_40-24/runtime_env.yaml
deleted file mode 100644
index 93ae17ebb6f..00000000000
--- a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_40-24/runtime_env.yaml
+++ /dev/null
@@ -1,5 +0,0 @@
-env_vars:
-  VLLM_USE_V1: "1"
-  TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-7B-math-64/dapo_qwen2-7B-math_28k_fsdp2_fully-async_40-24_mbs32_tpf11"
-  NCCL_DEBUG: "INFO"
-  HYDRA_FULL_ERROR: "1"
\ No newline at end of file
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_64/megatron_colocate/run.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_64/megatron_colocate/run.sh
deleted file mode 100644
index f98aeb86b57..00000000000
--- a/recipe/fully_async_policy/exp/qwen2-7B-math_64/megatron_colocate/run.sh
+++ /dev/null
@@ -1,135 +0,0 @@
-#!/usr/bin/env bash
-set -xeuo pipefail
-
-project_name='DAPO'
-exp_name='dapo_qwen2-7B-math_28k_megatron_colocate_64_mbs32'
-
-adv_estimator=grpo
-
-use_kl_in_reward=False
-kl_coef=0.0
-use_kl_loss=False
-kl_loss_coef=0.0
-
-clip_ratio_low=0.2
-clip_ratio_high=0.28
-
-max_prompt_length=$((1024 * 2))
-max_response_length=$((1024 * 28))
-enable_overlong_buffer=True
-overlong_buffer_len=$((1024 * 4))
-overlong_penalty_factor=1.0
-
-loss_agg_mode="token-mean"
-
-train_prompt_bsz=512
-n_resp_per_prompt=16
-train_prompt_mini_bsz=32
-
-# Ray
-# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
-# WORKING_DIR=${WORKING_DIR:-"${PWD}"}
-# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
-NNODES=${NNODES:-8}
-NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
-# Paths
-MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B
-CKPTS_DIR=./ckpts/${project_name}/${exp_name}
-TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet
-TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet
-
-# Algorithm
-temperature=1.0
-top_p=1.0
-top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
-val_top_p=0.7
-
-# Performance Related Parameter
-use_dynamic_bsz=True
-actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
-infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
-offload=True
-gen_tp=4
-train_tp=4
-train_pp=2
-
-# TODO: support dynamic_bsz for megatron
-# actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
-# actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
-# actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
-# actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
-# actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
-# actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
-
-python3 -m verl.trainer.main_ppo \
-    --config-path=config \
-    --config-name='ppo_megatron_trainer.yaml' \
-    data.train_files="${TRAIN_FILE}" \
-    data.val_files="${TEST_FILE}" \
-    data.prompt_key=prompt \
-    data.truncation='left' \
-    data.max_prompt_length=${max_prompt_length} \
-    data.max_response_length=${max_response_length} \
-    data.train_batch_size=${train_prompt_bsz} \
-    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
-    algorithm.adv_estimator=${adv_estimator} \
-    algorithm.use_kl_in_reward=${use_kl_in_reward} \
-    algorithm.kl_ctrl.kl_coef=${kl_coef} \
-    actor_rollout_ref.actor.strategy=megatron \
-    critic.strategy=megatron \
-    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
-    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
-    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
-    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
-    actor_rollout_ref.actor.clip_ratio_c=10.0 \
-    +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \
-    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \
-    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
-    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
-    actor_rollout_ref.model.path="${MODEL_PATH}" \
-    actor_rollout_ref.actor.optim.lr=1e-6 \
-    actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
-    actor_rollout_ref.actor.optim.weight_decay=0.1 \
-    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
-    actor_rollout_ref.actor.megatron.param_offload=${offload} \
-    actor_rollout_ref.actor.megatron.optimizer_offload=${offload} \
-    actor_rollout_ref.actor.megatron.grad_offload=${offload} \
-    actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${train_pp} \
-    actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${train_tp} \
-    actor_rollout_ref.actor.entropy_coeff=0 \
-    actor_rollout_ref.actor.optim.clip_grad=1.0 \
-    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
-    actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
-    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
-    actor_rollout_ref.rollout.enable_chunked_prefill=True \
-    actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
-    actor_rollout_ref.rollout.temperature=${temperature} \
-    actor_rollout_ref.rollout.top_p=${top_p} \
-    actor_rollout_ref.rollout.top_k=${top_k} \
-    actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
-    actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
-    actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
-    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
-    actor_rollout_ref.rollout.val_kwargs.n=1 \
-    actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=${train_pp} \
-    actor_rollout_ref.ref.megatron.tensor_model_parallel_size=${train_tp} \
-    actor_rollout_ref.ref.megatron.param_offload=${offload} \
-    reward_model.reward_manager=dapo \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
-    +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
-    trainer.logger=['console','tensorboard'] \
-    trainer.project_name="${project_name}" \
-    trainer.experiment_name="${exp_name}" \
-    trainer.n_gpus_per_node=8 \
-    trainer.nnodes="${NNODES}" \
-    trainer.val_before_train=True \
-    trainer.test_freq=20 \
-    trainer.save_freq=-1 \
-    trainer.total_epochs=10 \
-    trainer.total_training_steps=400 \
-    trainer.default_local_dir="${CKPTS_DIR}" \
-    trainer.resume_mode=auto \
-    trainer.log_val_generations=10
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_64/megatron_colocate/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_64/megatron_colocate/runtime_env.yaml
deleted file mode 100644
index a8cd045e180..00000000000
--- a/recipe/fully_async_policy/exp/qwen2-7B-math_64/megatron_colocate/runtime_env.yaml
+++ /dev/null
@@ -1,3 +0,0 @@
-env_vars:
-  TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-7B-math-64/dapo_qwen2-7B-math_28k_megatron_colocate_64_mbs32"
-  HYDRA_FULL_ERROR: "1"
\ No newline at end of file
diff --git a/recipe/fully_async_policy/exp/qwen3-30BA3B_128/fsdp2_colocate/run.sh b/recipe/fully_async_policy/exp/qwen3-30BA3B_128/fsdp2_colocate/run.sh
deleted file mode 100644
index 591ac8533ee..00000000000
--- a/recipe/fully_async_policy/exp/qwen3-30BA3B_128/fsdp2_colocate/run.sh
+++ /dev/null
@@ -1,125 +0,0 @@
-#!/usr/bin/env bash
-set -xeuo pipefail
-
-project_name='DAPO'
-exp_name='dapo_qwen3-30BA3B-math_32k_fsdp2_colocate_128_mbs32'
-
-adv_estimator=grpo
-
-use_kl_in_reward=False
-kl_coef=0.0
-use_kl_loss=False
-kl_loss_coef=0.0
-
-clip_ratio_low=0.2
-clip_ratio_high=0.28
-
-max_prompt_length=$((1024 * 2))
-max_response_length=$((1024 * 32))
-enable_overlong_buffer=True
-overlong_buffer_len=$((1024 * 4))
-overlong_penalty_factor=1.0
-
-loss_agg_mode="token-mean"
-
-train_prompt_bsz=512
-n_resp_per_prompt=16
-train_prompt_mini_bsz=32
-
-# Ray
-# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
-# WORKING_DIR=${WORKING_DIR:-"${PWD}"}
-# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
-NNODES=${NNODES:-16}
-NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
-# Paths
-MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/.friday/models/basemodel/Qwen/Qwen3-30B-A3B
-CKPTS_DIR=./ckpts/${project_name}/${exp_name}
-TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet
-TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet
-
-# Algorithm
-temperature=1.0
-top_p=1.0
-top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
-val_top_p=0.7
-
-# Performance Related Parameter
-sp_size=4
-use_dynamic_bsz=True
-actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
-infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
-offload=True
-gen_tp=4
-fsdp_size=32
-
-python3 -m verl.trainer.main_ppo \
-    data.train_files="${TRAIN_FILE}" \
-    data.val_files="${TEST_FILE}" \
-    data.prompt_key=prompt \
-    data.truncation='left' \
-    data.max_prompt_length=${max_prompt_length} \
-    data.max_response_length=${max_response_length} \
-    data.train_batch_size=${train_prompt_bsz} \
-    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
-    algorithm.adv_estimator=${adv_estimator} \
-    algorithm.use_kl_in_reward=${use_kl_in_reward} \
-    algorithm.kl_ctrl.kl_coef=${kl_coef} \
-    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
-    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
-    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
-    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
-    actor_rollout_ref.actor.clip_ratio_c=10.0 \
-    actor_rollout_ref.model.use_remove_padding=True \
-    actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
-    actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
-    actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
-    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
-    actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
-    actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
-    actor_rollout_ref.model.path="${MODEL_PATH}" \
-    actor_rollout_ref.model.enable_gradient_checkpointing=True \
-    actor_rollout_ref.actor.optim.lr=1e-6 \
-    actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
-    actor_rollout_ref.actor.optim.weight_decay=0.1 \
-    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
-    actor_rollout_ref.actor.fsdp_config.param_offload=${offload} \
-    actor_rollout_ref.actor.fsdp_config.optimizer_offload=${offload} \
-    actor_rollout_ref.actor.entropy_coeff=0 \
-    actor_rollout_ref.actor.grad_clip=1.0 \
-    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
-    actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \
-    actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
-    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
-    actor_rollout_ref.rollout.enable_chunked_prefill=True \
-    actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
-    actor_rollout_ref.rollout.temperature=${temperature} \
-    actor_rollout_ref.rollout.top_p=${top_p} \
-    actor_rollout_ref.rollout.top_k=${top_k} \
-    actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
-    actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
-    actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
-    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
-    actor_rollout_ref.rollout.val_kwargs.n=1 \
-    actor_rollout_ref.ref.fsdp_config.param_offload=${offload} \
-    actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \
-    actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \
-    reward_model.reward_manager=dapo \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
-    +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
-    trainer.logger='["console","tensorboard"]' \
-    trainer.project_name="${project_name}" \
-    trainer.experiment_name="${exp_name}" \
-    trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
-    trainer.nnodes="${NNODES}" \
-    trainer.val_before_train=True \
-    trainer.test_freq=20 \
-    trainer.save_freq=-1 \
-    trainer.total_epochs=10 \
-    trainer.total_training_steps=400 \
-    trainer.default_local_dir="${CKPTS_DIR}" \
-    trainer.resume_mode=auto \
-    trainer.log_val_generations=10
diff --git a/recipe/fully_async_policy/exp/qwen3-30BA3B_128/fsdp2_colocate/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen3-30BA3B_128/fsdp2_colocate/runtime_env.yaml
deleted file mode 100644
index 069b1f14aa0..00000000000
--- a/recipe/fully_async_policy/exp/qwen3-30BA3B_128/fsdp2_colocate/runtime_env.yaml
+++ /dev/null
@@ -1,3 +0,0 @@
-env_vars:
-  TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/dapo_qwen3-30BA3B/dapo_qwen3-30BA3B-math_32k_fsdp2_colocate_128_mbs32"
-  HYDRA_FULL_ERROR: "1"
\ No newline at end of file
diff --git a/recipe/fully_async_policy/exp/qwen3-30BA3B_128/megatron_colocate/run.sh b/recipe/fully_async_policy/exp/qwen3-30BA3B_128/megatron_colocate/run.sh
deleted file mode 100644
index c666034ffc3..00000000000
--- a/recipe/fully_async_policy/exp/qwen3-30BA3B_128/megatron_colocate/run.sh
+++ /dev/null
@@ -1,161 +0,0 @@
-#!/usr/bin/env bash
-set -xeuo pipefail
-
-project_name='DAPO'
-exp_name='dapo_qwen3-30BA3B_32k_megatron_colocate_128_mbs32'
-
-adv_estimator=grpo
-
-use_kl_in_reward=False
-kl_coef=0.0
-use_kl_loss=False
-kl_loss_coef=0.0
-
-clip_ratio_low=0.2
-clip_ratio_high=0.28
-
-max_prompt_length=$((1024 * 2))
-max_response_length=$((1024 * 32))
-enable_overlong_buffer=True
-overlong_buffer_len=$((1024 * 4))
-overlong_penalty_factor=1.0
-
-loss_agg_mode="token-mean"
-
-train_prompt_bsz=512
-train_prompt_mini_bsz=32
-n_resp_per_prompt=16
-
-NNODES=${NNODES:-16}
-NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
-# Paths
-RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
-MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/.friday/models/basemodel/Qwen/Qwen3-30B-A3B
-CKPTS_DIR=./ckpts/${project_name}/${exp_name}
-TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet
-TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet
-
-# Algorithm
-temperature=1.0
-top_p=1.0
-top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
-val_top_p=0.7
-
-
-# Performance Related Parameter
-use_dynamic_bsz=True
-actor_ppo_max_token_len=$((max_prompt_length + max_response_length))
-infer_ppo_max_token_len=$((max_prompt_length + max_response_length))
-offload=True
-gen_tp=4
-train_tp=1
-train_pp=1
-EP=8
-ETP=1
-CP=1
-
-python3 -m verl.trainer.main_ppo \
-    --config-path=config \
-    --config-name='ppo_megatron_trainer.yaml' \
-    data.train_files="${TRAIN_FILE}" \
-    data.val_files="${TEST_FILE}" \
-    data.prompt_key=prompt \
-    data.truncation='left' \
-    data.max_prompt_length=${max_prompt_length} \
-    data.max_response_length=${max_response_length} \
-    data.train_batch_size=${train_prompt_bsz} \
-    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
-    algorithm.adv_estimator=${adv_estimator} \
-    algorithm.use_kl_in_reward=${use_kl_in_reward} \
-    algorithm.kl_ctrl.kl_coef=${kl_coef} \
-    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
-    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
-    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
-    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
-    actor_rollout_ref.actor.clip_ratio_c=10.0 \
-    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \
-    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
-    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
-    actor_rollout_ref.model.path="${MODEL_PATH}" \
-    actor_rollout_ref.actor.optim.lr=1e-6 \
-    actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
-    actor_rollout_ref.actor.optim.weight_decay=0.1 \
-    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
-    actor_rollout_ref.actor.entropy_coeff=0 \
-    actor_rollout_ref.actor.optim.clip_grad=1.0 \
-    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
-    actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
-    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
-    actor_rollout_ref.rollout.enable_chunked_prefill=True \
-    actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
-    actor_rollout_ref.rollout.temperature=${temperature} \
-    actor_rollout_ref.rollout.top_p=${top_p} \
-    actor_rollout_ref.rollout.top_k=${top_k} \
-    actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
-    actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
-    actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
-    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
-    actor_rollout_ref.rollout.val_kwargs.n=1 \
-    reward_model.reward_manager=dapo \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
-    +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
-    trainer.logger='["console","tensorboard"]' \
-    trainer.project_name="${project_name}" \
-    trainer.experiment_name="${exp_name}" \
-    trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
-    trainer.nnodes="${NNODES}" \
-    trainer.val_before_train=True \
-    trainer.test_freq=20 \
-    trainer.save_freq=-1 \
-    trainer.total_epochs=10 \
-    trainer.total_training_steps=400 \
-    trainer.default_local_dir="${CKPTS_DIR}" \
-    trainer.resume_mode=auto \
-    trainer.log_val_generations=10 \
-    critic.strategy=megatron \
-    actor_rollout_ref.actor.strategy=megatron \
-    actor_rollout_ref.actor.megatron.param_offload=${offload} \
-    actor_rollout_ref.actor.megatron.optimizer_offload=${offload} \
-    actor_rollout_ref.actor.megatron.grad_offload=${offload} \
-    actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${train_pp} \
-    actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${train_tp} \
-    actor_rollout_ref.actor.megatron.expert_model_parallel_size=${EP} \
-    actor_rollout_ref.actor.megatron.expert_tensor_parallel_size=${ETP} \
-    actor_rollout_ref.actor.megatron.context_parallel_size=${CP} \
-    +actor_rollout_ref.actor.megatron.override_transformer_config.apply_rope_fusion=True \
-    +actor_rollout_ref.actor.megatron.override_transformer_config.moe_router_dtype=fp32 \
-    +actor_rollout_ref.actor.megatron.override_transformer_config.moe_shared_expert_overlap=False \
-    +actor_rollout_ref.actor.megatron.override_transformer_config.moe_enable_deepep=True \
-    +actor_rollout_ref.actor.megatron.override_transformer_config.moe_token_dispatcher_type=flex \
-    +actor_rollout_ref.actor.megatron.override_transformer_config.moe_permute_fusion=True \
-    +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=selective \
-    +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_modules=["core_attn","moe_act","layernorm","mlp","moe"] \
-    +actor_rollout_ref.actor.megatron.override_transformer_config.gradient_accumulation_fusion=True \
-    +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_embedding_in_pipeline_split=False \
-    +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_loss_in_pipeline_split=False \
-    actor_rollout_ref.ref.megatron.param_offload=${offload} \
-    actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=${train_pp} \
-    actor_rollout_ref.ref.megatron.tensor_model_parallel_size=${train_tp} \
-    actor_rollout_ref.ref.megatron.expert_model_parallel_size=${EP} \
-    actor_rollout_ref.ref.megatron.expert_tensor_parallel_size=${ETP} \
-    actor_rollout_ref.ref.megatron.context_parallel_size=${CP} \
-    actor_rollout_ref.actor.megatron.use_mbridge=True
-
-    # +actor_rollout_ref.actor.megatron.override_transformer_config.apply_rope_fusion=False \
-    # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_router_dtype=fp32 \
-    # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_shared_expert_overlap=False \
-    # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_enable_deepep=False \
-    # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_token_dispatcher_type=alltoall \
-    # +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_method=uniform \
-    # +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=full \
-    # actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity="selective" \
-    # actor_rollout_ref.actor.megatron.override_transformer_config.recompute_modules=["core_attn","moe_act","layernorm","mlp","moe"] \
-    # +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_num_layers=1 \
-    # +actor_rollout_ref.actor.megatron.override_transformer_config.gradient_accumulation_fusion=True \
-    # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_permute_fusion=True \
-    # +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_embedding_in_pipeline_split=False \
-    # +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_loss_in_pipeline_split=False \
-    # +actor_rollout_ref.actor.megatron.override_transformer_config.num_layers_in_last_pipeline_stage=${last_layer} \
\ No newline at end of file
diff --git a/recipe/fully_async_policy/exp/qwen3-30BA3B_128/megatron_colocate/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen3-30BA3B_128/megatron_colocate/runtime_env.yaml
deleted file mode 100644
index 4a714f40f43..00000000000
--- a/recipe/fully_async_policy/exp/qwen3-30BA3B_128/megatron_colocate/runtime_env.yaml
+++ /dev/null
@@ -1,5 +0,0 @@
-env_vars:
-  TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen3-30BA3B-128/dapo_qwen3-30BA3B_32k_megatron_colocate_128_mbs32"
-  HYDRA_FULL_ERROR: "1"
-  TORCH_NCCL_AVOID_RECORD_STREAMS: "1"
-  CUDA_DEVICE_MAX_CONNECTIONS: "1"
\ No newline at end of file
diff --git a/recipe/fully_async_policy/exp/qwen3-32B_128/fsdp2_colocate/run.sh b/recipe/fully_async_policy/exp/qwen3-32B_128/fsdp2_colocate/run.sh
deleted file mode 100644
index 8f2e636c59f..00000000000
--- a/recipe/fully_async_policy/exp/qwen3-32B_128/fsdp2_colocate/run.sh
+++ /dev/null
@@ -1,125 +0,0 @@
-#!/usr/bin/env bash
-set -xeuo pipefail
-
-project_name='DAPO'
-exp_name='dapo_qwen3-32B-math_32k_fsdp2_colocate_128_mbs32'
-
-adv_estimator=grpo
-
-use_kl_in_reward=False
-kl_coef=0.0
-use_kl_loss=False
-kl_loss_coef=0.0
-
-clip_ratio_low=0.2
-clip_ratio_high=0.28
-
-max_prompt_length=$((1024 * 2))
-max_response_length=$((1024 * 32))
-enable_overlong_buffer=True
-overlong_buffer_len=$((1024 * 4))
-overlong_penalty_factor=1.0
-
-loss_agg_mode="token-mean"
-
-train_prompt_bsz=512
-n_resp_per_prompt=16
-train_prompt_mini_bsz=32
-
-# Ray
-# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
-# WORKING_DIR=${WORKING_DIR:-"${PWD}"}
-# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
-NNODES=${NNODES:-16}
-NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
-# Paths
-MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/.friday/models/basemodel/Qwen/Qwen3-32B
-CKPTS_DIR=./ckpts/${project_name}/${exp_name}
-TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet
-TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet
-
-# Algorithm
-temperature=1.0
-top_p=1.0
-top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
-val_top_p=0.7
-
-# Performance Related Parameter
-sp_size=4
-use_dynamic_bsz=True
-actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
-infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
-offload=True
-gen_tp=4
-fsdp_size=32
-
-python3 -m verl.trainer.main_ppo \
-    data.train_files="${TRAIN_FILE}" \
-    data.val_files="${TEST_FILE}" \
-    data.prompt_key=prompt \
-    data.truncation='left' \
-    data.max_prompt_length=${max_prompt_length} \
-    data.max_response_length=${max_response_length} \
-    data.train_batch_size=${train_prompt_bsz} \
-    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
-    algorithm.adv_estimator=${adv_estimator} \
-    algorithm.use_kl_in_reward=${use_kl_in_reward} \
-    algorithm.kl_ctrl.kl_coef=${kl_coef} \
-    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
-    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
-    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
-    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
-    actor_rollout_ref.actor.clip_ratio_c=10.0 \
-    actor_rollout_ref.model.use_remove_padding=True \
-    actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
-    actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
-    actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
-    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
-    actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
-    actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
-    actor_rollout_ref.model.path="${MODEL_PATH}" \
-    actor_rollout_ref.model.enable_gradient_checkpointing=True \
-    actor_rollout_ref.actor.optim.lr=1e-6 \
-    actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
-    actor_rollout_ref.actor.optim.weight_decay=0.1 \
-    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
-    actor_rollout_ref.actor.fsdp_config.param_offload=${offload} \
-    actor_rollout_ref.actor.fsdp_config.optimizer_offload=${offload} \
-    actor_rollout_ref.actor.entropy_coeff=0 \
-    actor_rollout_ref.actor.grad_clip=1.0 \
-    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
-    actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \
-    actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
-    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
-    actor_rollout_ref.rollout.enable_chunked_prefill=True \
-    actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
-    actor_rollout_ref.rollout.temperature=${temperature} \
-    actor_rollout_ref.rollout.top_p=${top_p} \
-    actor_rollout_ref.rollout.top_k=${top_k} \
-    actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
-    actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
-    actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
-    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
-    actor_rollout_ref.rollout.val_kwargs.n=1 \
-    actor_rollout_ref.ref.fsdp_config.param_offload=${offload} \
-    actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \
-    actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \
-    reward_model.reward_manager=dapo \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
-    +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
-    trainer.logger='["console","tensorboard"]' \
-    trainer.project_name="${project_name}" \
-    trainer.experiment_name="${exp_name}" \
-    trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
-    trainer.nnodes="${NNODES}" \
-    trainer.val_before_train=True \
-    trainer.test_freq=20 \
-    trainer.save_freq=-1 \
-    trainer.total_epochs=10 \
-    trainer.total_training_steps=400 \
-    trainer.default_local_dir="${CKPTS_DIR}" \
-    trainer.resume_mode=auto \
-    trainer.log_val_generations=10
diff --git a/recipe/fully_async_policy/exp/qwen3-32B_128/fsdp2_colocate/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen3-32B_128/fsdp2_colocate/runtime_env.yaml
deleted file mode 100644
index 1b4a8ff4b82..00000000000
--- a/recipe/fully_async_policy/exp/qwen3-32B_128/fsdp2_colocate/runtime_env.yaml
+++ /dev/null
@@ -1,3 +0,0 @@
-env_vars:
-  TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/dapo_qwen3-32B/dapo_qwen3-32B-math_32k_fsdp2_colocate_128_mbs32"
-  HYDRA_FULL_ERROR: "1"
\ No newline at end of file
diff --git a/recipe/fully_async_policy/exp/qwen3-32B_128/megatron_colocate/run.sh b/recipe/fully_async_policy/exp/qwen3-32B_128/megatron_colocate/run.sh
deleted file mode 100644
index a7535e3575d..00000000000
--- a/recipe/fully_async_policy/exp/qwen3-32B_128/megatron_colocate/run.sh
+++ /dev/null
@@ -1,156 +0,0 @@
-#!/usr/bin/env bash
-set -xeuo pipefail
-
-project_name='DAPO'
-exp_name='dapo_qwen3-32B_32k_megatron_colocate_128_mbs32'
-
-adv_estimator=grpo
-
-use_kl_in_reward=False
-kl_coef=0.0
-use_kl_loss=False
-kl_loss_coef=0.0
-
-clip_ratio_low=0.2
-clip_ratio_high=0.28
-
-max_prompt_length=$((1024 * 2))
-max_response_length=$((1024 * 32))
-enable_overlong_buffer=True
-overlong_buffer_len=$((1024 * 4))
-overlong_penalty_factor=1.0
-
-loss_agg_mode="token-mean"
-
-train_prompt_bsz=512
-train_prompt_mini_bsz=32
-n_resp_per_prompt=16
-
-NNODES=${NNODES:-16}
-NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
-# Paths
-RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
-MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/.friday/models/basemodel/Qwen/Qwen3-32B
-CKPTS_DIR=./ckpts/${project_name}/${exp_name}
-TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet
-TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet
-
-# Algorithm
-temperature=1.0
-top_p=1.0
-top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
-val_top_p=0.7
-
-
-# Performance Related Parameter
-use_dynamic_bsz=True
-actor_ppo_max_token_len=$((max_prompt_length + max_response_length))
-infer_ppo_max_token_len=$((max_prompt_length + max_response_length))
-offload=True
-gen_tp=4
-train_tp=4
-train_pp=2
-EP=1
-ETP=1
-CP=1
-
-python3 -m verl.trainer.main_ppo \
-    --config-path=config \
-    --config-name='ppo_megatron_trainer.yaml' \
-    data.train_files="${TRAIN_FILE}" \
-    data.val_files="${TEST_FILE}" \
-    data.prompt_key=prompt \
-    data.truncation='left' \
-    data.max_prompt_length=${max_prompt_length} \
-    data.max_response_length=${max_response_length} \
-    data.train_batch_size=${train_prompt_bsz} \
-    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
-    algorithm.adv_estimator=${adv_estimator} \
-    algorithm.use_kl_in_reward=${use_kl_in_reward} \
-    algorithm.kl_ctrl.kl_coef=${kl_coef} \
-    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
-    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
-    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
-    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
-    actor_rollout_ref.actor.clip_ratio_c=10.0 \
-    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \
-    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
-    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
-    actor_rollout_ref.model.path="${MODEL_PATH}" \
-    actor_rollout_ref.actor.optim.lr=1e-6 \
-    actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
-    actor_rollout_ref.actor.optim.weight_decay=0.1 \
-    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
-    actor_rollout_ref.actor.entropy_coeff=0 \
-    actor_rollout_ref.actor.optim.clip_grad=1.0 \
-    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
-    actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
-    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
-    actor_rollout_ref.rollout.enable_chunked_prefill=True \
-    actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
-    actor_rollout_ref.rollout.temperature=${temperature} \
-    actor_rollout_ref.rollout.top_p=${top_p} \
-    actor_rollout_ref.rollout.top_k=${top_k} \
-    actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
-    actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
-    actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
-    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
-    actor_rollout_ref.rollout.val_kwargs.n=1 \
-    reward_model.reward_manager=dapo \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
-    +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
-    trainer.logger='["console","tensorboard"]' \
-    trainer.project_name="${project_name}" \
-    trainer.experiment_name="${exp_name}" \
-    trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
-    trainer.nnodes="${NNODES}" \
-    trainer.val_before_train=True \
-    trainer.test_freq=20 \
-    trainer.save_freq=-1 \
-    trainer.total_epochs=10 \
-    trainer.total_training_steps=400 \
-    trainer.default_local_dir="${CKPTS_DIR}" \
-    trainer.resume_mode=auto \
-    trainer.log_val_generations=10 \
-    critic.strategy=megatron \
-    actor_rollout_ref.actor.strategy=megatron \
-    actor_rollout_ref.actor.megatron.param_offload=${offload} \
-    actor_rollout_ref.actor.megatron.optimizer_offload=${offload} \
-    actor_rollout_ref.actor.megatron.grad_offload=${offload} \
-    actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${train_pp} \
-    actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${train_tp} \
-    actor_rollout_ref.actor.megatron.expert_model_parallel_size=${EP} \
-    actor_rollout_ref.actor.megatron.expert_tensor_parallel_size=${ETP} \
-    actor_rollout_ref.actor.megatron.context_parallel_size=${CP} \
-    +actor_rollout_ref.actor.megatron.override_transformer_config.apply_rope_fusion=True \
-    +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=selective \
-    +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_modules=["core_attn","layernorm","mlp"] \
-    +actor_rollout_ref.actor.megatron.override_transformer_config.gradient_accumulation_fusion=True \
-    +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_embedding_in_pipeline_split=False \
-    +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_loss_in_pipeline_split=False \
-    actor_rollout_ref.ref.megatron.param_offload=${offload} \
-    actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=${train_pp} \
-    actor_rollout_ref.ref.megatron.tensor_model_parallel_size=${train_tp} \
-    actor_rollout_ref.ref.megatron.expert_model_parallel_size=${EP} \
-    actor_rollout_ref.ref.megatron.expert_tensor_parallel_size=${ETP} \
-    actor_rollout_ref.ref.megatron.context_parallel_size=${CP} \
-    actor_rollout_ref.actor.megatron.use_mbridge=True
-
-    # +actor_rollout_ref.actor.megatron.override_transformer_config.apply_rope_fusion=False \
-    # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_router_dtype=fp32 \
-    # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_shared_expert_overlap=False \
-    # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_enable_deepep=False \
-    # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_token_dispatcher_type=alltoall \
-    # +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_method=uniform \
-    # +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=full \
-    # actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity="selective" \
-    # actor_rollout_ref.actor.megatron.override_transformer_config.recompute_modules=["core_attn","moe_act","layernorm","mlp","moe"] \
-    # +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_num_layers=1 \
-    # +actor_rollout_ref.actor.megatron.override_transformer_config.gradient_accumulation_fusion=True \
-    # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_permute_fusion=True \
-    # +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_embedding_in_pipeline_split=False \
-    # +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_loss_in_pipeline_split=False \
-    # +actor_rollout_ref.actor.megatron.override_transformer_config.num_layers_in_last_pipeline_stage=${last_layer} \
\ No newline at end of file
diff --git a/recipe/fully_async_policy/exp/qwen3-32B_128/megatron_colocate/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen3-32B_128/megatron_colocate/runtime_env.yaml
deleted file mode 100644
index 1bbc3faadc9..00000000000
--- a/recipe/fully_async_policy/exp/qwen3-32B_128/megatron_colocate/runtime_env.yaml
+++ /dev/null
@@ -1,5 +0,0 @@
-env_vars:
-  TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen3-32B-128/dapo_qwen3-32B_32k_megatron_colocate_128_mbs32"
-  HYDRA_FULL_ERROR: "1"
-  TORCH_NCCL_AVOID_RECORD_STREAMS: "1"
-  CUDA_DEVICE_MAX_CONNECTIONS: "1"
\ No newline at end of file
diff --git a/recipe/fully_async_policy/fully_async_main.py b/recipe/fully_async_policy/fully_async_main.py
index b98b3f426e0..2f4ab8ccc6b 100644
--- a/recipe/fully_async_policy/fully_async_main.py
+++ b/recipe/fully_async_policy/fully_async_main.py
@@ -175,10 +175,6 @@ def _initialize_components(self, config) -> None:
         print("[ASYNC MAIN] Creating FullyAsyncTrainer...")
         self._create_trainer(config)
 
-        # sync require samples between rollouter and trainer
-        required_samples = ray.get(self.components["trainer"].get_required_samples.remote())
-        ray.get(self.components["rollouter"].set_required_samples.remote(required_samples))
-
         # sync total_train_steps between rollouter and trainer
         total_train_steps = ray.get(self.components["rollouter"].get_total_train_steps.remote())
         print(f"total_train_steps {total_train_steps}")
@@ -228,6 +224,8 @@ def _create_rollouter(self, config) -> None:
         )
 
         ray.get(rollouter.init_workers.remote())
+        ray.get(rollouter.set_max_required_samples.remote())
+
         self.components["rollouter"] = rollouter
         print("[ASYNC MAIN] Rollouter created and initialized successfully")
 
diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py
index d10c0684be4..ed6a279ed25 100644
--- a/recipe/fully_async_policy/fully_async_rollouter.py
+++ b/recipe/fully_async_policy/fully_async_rollouter.py
@@ -115,7 +115,8 @@ def __init__(
 
         # Config
         self.staleness_threshold: float = config.async_training.get("staleness_threshold", 1)
-        self.required_samples = None
+        # required_samples use ppo_mini_batch_size as the minimum number of samples.
+        self.required_samples = config.actor_rollout_ref.actor.ppo_mini_batch_size
         self.max_required_samples = None
         # 单次最多扔一次更新需要的样本
         self.max_concurrent_samples = None
@@ -153,9 +154,8 @@ async def set_message_queue_client(self, message_queue_client: MessageQueueClien
         async with self.lock:
             self.message_queue_client = message_queue_client
 
-    async def set_required_samples(self, required_samples: int):
+    async def set_max_required_samples(self):
         async with self.lock:
-            self.required_samples = int(required_samples)
             self.max_required_samples = int(
                 self.required_samples
                 * (self.staleness_threshold + 1)
diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py
index a4c59c33701..b20ca764a7a 100644
--- a/recipe/fully_async_policy/fully_async_trainer.py
+++ b/recipe/fully_async_policy/fully_async_trainer.py
@@ -97,7 +97,7 @@ def __init__(
         self.progress_bar = None
         self.trigger_parameter_sync_step = config.async_training.trigger_parameter_sync_step
 
-        # calculate required_samples
+        # required_samples use ppo_mini_batch_size as the minimum number of samples.
         self.required_samples = config.actor_rollout_ref.actor.ppo_mini_batch_size
         total_gpus = (
             config.trainer.nnodes * config.trainer.n_gpus_per_node
@@ -121,9 +121,6 @@ def get_actor_wg(self):
         """Get actor worker group"""
         return self.actor_wg
 
-    def get_required_samples(self):
-        return self.required_samples
-
     def _get_samples_from_queue(self) -> tuple[None, None] | tuple[int, Any]:
         """
         Get samples from message queue and compose gen_batch_output
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64_stal0.1/run.sh b/recipe/fully_async_policy/shell/dapo-32B_fsdp2_64_64.sh
similarity index 91%
rename from recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64_stal0.1/run.sh
rename to recipe/fully_async_policy/shell/dapo-32B_fsdp2_64_64.sh
index e9133e50eac..324a7d9470e 100644
--- a/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64_stal0.1/run.sh
+++ b/recipe/fully_async_policy/shell/dapo-32B_fsdp2_64_64.sh
@@ -2,7 +2,7 @@
 set -xeuo pipefail
 
 project_name='DAPO'
-exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_64-64_mbs32_tpf4_stal0.1'
+exp_name='dapo_qwen2-32B_20k_fsdp2_fully-async_64-64'
 
 # Ray
 # RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
@@ -16,11 +16,8 @@ CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"}
 TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"}
 TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"}
 
-# MODEL_PATH=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/modelscope/hub/models/Qwen/Qwen2___5-Math-7B
 MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B
 CKPTS_DIR=./ckpts/${project_name}/${exp_name}
-# TRAIN_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/dapo-math-17k.parquet
-# TEST_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/aime-2024.parquet
 TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet
 TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet
 
@@ -44,7 +41,7 @@ clip_ratio_high=0.28
 
 # Response length parameters
 max_prompt_length=$((1024 * 2))
-max_response_length=$((1024 * 28))
+max_response_length=$((1024 * 20))
 enable_overlong_buffer=True
 overlong_buffer_len=$((1024 * 4))
 overlong_penalty_factor=1.0
@@ -65,24 +62,23 @@ infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
 ref_offload=True
 actor_offload=False
 gen_tp=4
-sp_size=4
-fsdp_size=2
+sp_size=8
+fsdp_size=-1
 
 # Fully async specific parameters
 NNODES_ROLLOUT=${NNODES_ROLLOUT:-8}
 NNODES_TRAIN=${NNODES_TRAIN:-8}
 NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
 
-
 train_prompt_bsz=0
 gen_prompt_bsz=1
 n_resp_per_prompt=16
 train_prompt_mini_bsz=32
-total_rollout_steps=$(((512*400)))
+total_rollout_steps=$(((512*200)))
 test_freq=20
-staleness_threshold=0.1
-trigger_parameter_sync_step=4
-partial_rollout=True
+staleness_threshold=0
+trigger_parameter_sync_step=16
+partial_rollout=False
 
 python -m recipe.fully_async_policy.fully_async_main \
     data.train_files="${TRAIN_FILE}" \
diff --git a/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_4_12.sh b/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_4_12.sh
index fc9b2ad6607..f560468a4cf 100644
--- a/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_4_12.sh
+++ b/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_4_12.sh
@@ -2,7 +2,7 @@
 set -xeuo pipefail
 
 project_name='DAPO'
-exp_name='DAPO-Qwen2.5-7b-MATH-0527a1-fsdp2-fully-async-8-8'
+exp_name='DAPO-Qwen2.5-7b-MATH-0527a1-fsdp2-fully-async-4-12'
 
 # Ray
 # RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
@@ -16,10 +16,10 @@ CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"}
 TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"}
 TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"}
 
-MODEL_PATH=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/modelscope/hub/models/Qwen/Qwen2___5-Math-7B
+MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B
 CKPTS_DIR=./ckpts/${project_name}/${exp_name}
-TRAIN_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/dapo-math-17k.parquet
-TEST_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/aime-2024.parquet
+TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet
+TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet
 
 rollout_mode="async"
 rollout_name="vllm" # sglang or vllm
@@ -75,7 +75,7 @@ n_gpus_training=$((NGPUS_PER_NODE - n_gpus_rollout))
 train_prompt_bsz=0
 gen_prompt_bsz=1
 n_resp_per_prompt=16
-train_prompt_mini_bsz=64
+train_prompt_mini_bsz=32
 total_rollout_steps=$(((512*100)))
 test_freq=10
 staleness_threshold=0.1
diff --git a/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_2_6.sh b/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_4_4.sh
similarity index 94%
rename from recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_2_6.sh
rename to recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_4_4.sh
index 10563218878..ef00feb9d05 100644
--- a/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_2_6.sh
+++ b/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_4_4.sh
@@ -16,8 +16,12 @@ CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"}
 TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"}
 TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"}
 
-MODEL_PATH=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/modelscope/hub/models/Qwen/Qwen2___5-Math-7B
+MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B
 CKPTS_DIR=./ckpts/${project_name}/${exp_name}
+TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet
+TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet
+
+MODEL_PATH=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/modelscope/hub/models/Qwen/Qwen2___5-Math-7B
 TRAIN_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/dapo-math-17k.parquet
 TEST_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/aime-2024.parquet
 
@@ -65,19 +69,19 @@ gen_tp=1
 sp_size=1
 fsdp_size=2
 
+# Fully async specific parameters
 NNODES=${NNODES:-1}
 NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
 
-# Fully async specific parameters
-n_gpus_rollout=2
+n_gpus_rollout=4
 n_gpus_training=$((NGPUS_PER_NODE - n_gpus_rollout))
 
 train_prompt_bsz=0
 gen_prompt_bsz=1
 n_resp_per_prompt=16
-train_prompt_mini_bsz=64
+train_prompt_mini_bsz=32
 total_rollout_steps=$(((512*100)))
-test_freq=2
+test_freq=10
 staleness_threshold=0.1
 trigger_parameter_sync_step=16
 partial_rollout=True
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64/run.sh b/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_64_64.sh
similarity index 93%
rename from recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64/run.sh
rename to recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_64_64.sh
index 03ebab25cea..1d1958fda79 100644
--- a/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64/run.sh
+++ b/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_64_64.sh
@@ -2,7 +2,7 @@
 set -xeuo pipefail
 
 project_name='DAPO'
-exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_64-64_mbs32_tpf4_fixmcs'
+exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_64-64'
 
 # Ray
 # RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
@@ -16,11 +16,8 @@ CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"}
 TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"}
 TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"}
 
-# MODEL_PATH=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/modelscope/hub/models/Qwen/Qwen2___5-Math-7B
 MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B
 CKPTS_DIR=./ckpts/${project_name}/${exp_name}
-# TRAIN_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/dapo-math-17k.parquet
-# TEST_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/aime-2024.parquet
 TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet
 TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet
 
@@ -73,15 +70,14 @@ NNODES_ROLLOUT=${NNODES_ROLLOUT:-8}
 NNODES_TRAIN=${NNODES_TRAIN:-8}
 NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
 
-
 train_prompt_bsz=0
 gen_prompt_bsz=1
 n_resp_per_prompt=16
 train_prompt_mini_bsz=32
 total_rollout_steps=$(((512*400)))
-test_freq=20
+test_freq=10
 staleness_threshold=0.1
-trigger_parameter_sync_step=4
+trigger_parameter_sync_step=16
 partial_rollout=True
 
 python -m recipe.fully_async_policy.fully_async_main \
diff --git a/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_8_8.sh b/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_8_8.sh
index c59877d97f9..85cdaa03fc5 100644
--- a/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_8_8.sh
+++ b/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_8_8.sh
@@ -16,10 +16,10 @@ CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"}
 TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"}
 TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"}
 
-MODEL_PATH=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/modelscope/hub/models/Qwen/Qwen2___5-Math-7B
+MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B
 CKPTS_DIR=./ckpts/${project_name}/${exp_name}
-TRAIN_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/dapo-math-17k.parquet
-TEST_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/aime-2024.parquet
+TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet
+TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet
 
 rollout_mode="async"
 rollout_name="vllm" # sglang or vllm
@@ -65,17 +65,19 @@ gen_tp=1
 sp_size=1
 fsdp_size=2
 
-NNODES=${NNODES:-1}
-NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
 
 # Fully async specific parameters
+NNODES_ROLLOUT=${NNODES_ROLLOUT:-1}
+NNODES_TRAIN=${NNODES_TRAIN:-1}
+NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
+
 n_gpus_rollout=8
 n_gpus_training=8
 
 train_prompt_bsz=0
 gen_prompt_bsz=1
 n_resp_per_prompt=16
-train_prompt_mini_bsz=64
+train_prompt_mini_bsz=32
 total_rollout_steps=$(((512*100)))
 test_freq=10
 staleness_threshold=0.1
@@ -159,10 +161,10 @@ $PYTHON_INTERPRETER -m recipe.fully_async_policy.fully_async_main \
     trainer.save_freq=-1 \
     trainer.default_local_dir="${CKPTS_DIR}" \
     trainer.resume_mode=auto \
-    trainer.nnodes="${NNODES}" \
-    trainer.n_gpus_per_node="${n_gpus_training}" \
-    rollout.nnodes="${NNODES}" \
-    rollout.n_gpus_per_node="${n_gpus_rollout}" \
+    trainer.nnodes="${NNODES_TRAIN}" \
+    trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
+    rollout.nnodes="${NNODES_ROLLOUT}" \
+    rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \
     rollout.total_rollout_steps="${total_rollout_steps}" \
     rollout.total_epochs=10 \
     rollout.test_freq="${test_freq}" \
diff --git a/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_colocate.sh b/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_colocate.sh
deleted file mode 100644
index 33f9836e095..00000000000
--- a/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_colocate.sh
+++ /dev/null
@@ -1,141 +0,0 @@
-#!/usr/bin/env bash
-set -xeuo pipefail
-
-project_name='DAPO'
-exp_name='DAPO-Qwen2.5-7b-MATH-0527a1-fsdp2-colocate'
-
-adv_estimator=grpo
-
-use_kl_in_reward=False
-kl_coef=0.0
-use_kl_loss=False
-kl_loss_coef=0.0
-
-clip_ratio_low=0.2
-clip_ratio_high=0.28
-
-max_prompt_length=$((1024 * 2))
-max_response_length=$((1024 * 8))
-enable_overlong_buffer=True
-overlong_buffer_len=$((1024 * 4))
-overlong_penalty_factor=1.0
-
-loss_agg_mode="token-mean"
-
-train_prompt_bsz=512
-n_resp_per_prompt=16
-train_prompt_mini_bsz=32
-
-# Ray
-# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
-# WORKING_DIR=${WORKING_DIR:-"${PWD}"}
-# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
-NNODES=${NNODES:-2}
-NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
-# Paths
-RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
-# very important! please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface
-MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen2.5-Math-7B"}
-CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"}
-TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"}
-TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"}
-
-MODEL_PATH=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/modelscope/hub/models/Qwen/Qwen2___5-Math-7B
-CKPTS_DIR=./ckpts/${project_name}/${exp_name}
-TRAIN_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/dapo-math-17k.parquet
-TEST_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/aime-2024.parquet
-# Algorithm
-temperature=1.0
-top_p=1.0
-top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
-val_top_p=0.7
-
-# Performance Related Parameter
-use_dynamic_bsz=True
-actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
-infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
-offload=True
-gen_tp=2
-sp_size=4
-fsdp_size=2
-
-# reference run wandb: https://wandb.ai/verl-org/DAPO%20Reproduction%20on%20verl/runs/ow47vvon?nw=nwusertongyuxuan361
-
-PYTHON_INTERPRETER="/home/hadoop-djst-algoplat/miniconda3/bin/python"
-if [ ! -x "$PYTHON_INTERPRETER" ]; then
-    PYTHON_INTERPRETER="python3"
-fi
-
-$PYTHON_INTERPRETER -m verl.trainer.main_ppo \
-    data.train_files="${TRAIN_FILE}" \
-    data.val_files="${TEST_FILE}" \
-    data.prompt_key=prompt \
-    data.truncation='left' \
-    data.max_prompt_length=${max_prompt_length} \
-    data.max_response_length=${max_response_length} \
-    data.train_batch_size=${train_prompt_bsz} \
-    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
-    algorithm.adv_estimator=${adv_estimator} \
-    algorithm.use_kl_in_reward=${use_kl_in_reward} \
-    algorithm.kl_ctrl.kl_coef=${kl_coef} \
-    actor_rollout_ref.actor.strategy=fsdp2 \
-    critic.strategy=fsdp2 \
-    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
-    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
-    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
-    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
-    actor_rollout_ref.actor.clip_ratio_c=10.0 \
-    actor_rollout_ref.model.use_remove_padding=True \
-    +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \
-    actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
-    actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
-    actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
-    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
-    actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
-    actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
-    actor_rollout_ref.model.path="${MODEL_PATH}" \
-    actor_rollout_ref.model.enable_gradient_checkpointing=True \
-    actor_rollout_ref.actor.optim.lr=1e-6 \
-    actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
-    actor_rollout_ref.actor.optim.weight_decay=0.1 \
-    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
-    actor_rollout_ref.actor.fsdp_config.param_offload=${offload} \
-    actor_rollout_ref.actor.fsdp_config.optimizer_offload=${offload} \
-    actor_rollout_ref.actor.entropy_coeff=0 \
-    actor_rollout_ref.actor.grad_clip=1.0 \
-    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
-    actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \
-    actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
-    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
-    actor_rollout_ref.rollout.enable_chunked_prefill=True \
-    actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
-    actor_rollout_ref.rollout.temperature=${temperature} \
-    actor_rollout_ref.rollout.top_p=${top_p} \
-    actor_rollout_ref.rollout.top_k=${top_k} \
-    actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
-    actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
-    actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
-    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
-    actor_rollout_ref.rollout.val_kwargs.n=1 \
-    actor_rollout_ref.ref.fsdp_config.param_offload=${offload} \
-    actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \
-    actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \
-    reward_model.reward_manager=dapo \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
-    +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
-    trainer.logger=['console','tensorboard'] \
-    trainer.project_name="${project_name}" \
-    trainer.experiment_name="${exp_name}" \
-    trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
-    trainer.nnodes="${NNODES}" \
-    trainer.val_before_train=True \
-    trainer.test_freq=10 \
-    trainer.save_freq=-1 \
-    trainer.total_epochs=10 \
-    trainer.total_training_steps=100 \
-    trainer.default_local_dir="${CKPTS_DIR}" \
-    trainer.resume_mode=auto \
-    trainer.log_val_generations=10
diff --git a/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_server.sh b/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_server.sh
deleted file mode 100644
index 087dea05121..00000000000
--- a/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_server.sh
+++ /dev/null
@@ -1,148 +0,0 @@
-#!/usr/bin/env bash
-set -xeuo pipefail
-
-project_name='DAPO'
-exp_name='DAPO-Qwen2.5-7b-MATH-0527a1-fsdp2-server'
-
-
-rollout_mode="async"
-rollout_name="vllm" # sglang or vllm
-if [ "$rollout_mode" = "async" ]; then
-    export VLLM_USE_V1=1
-    return_raw_chat="True"
-fi
-
-
-adv_estimator=grpo
-
-use_kl_in_reward=False
-kl_coef=0.0
-use_kl_loss=False
-kl_loss_coef=0.0
-
-clip_ratio_low=0.2
-clip_ratio_high=0.28
-
-max_prompt_length=$((1024 * 2))
-max_response_length=$((1024 * 8))
-enable_overlong_buffer=True
-overlong_buffer_len=$((1024 * 4))
-overlong_penalty_factor=1.0
-
-loss_agg_mode="token-mean"
-
-train_prompt_bsz=512
-n_resp_per_prompt=16
-train_prompt_mini_bsz=32
-
-# Ray
-# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
-# WORKING_DIR=${WORKING_DIR:-"${PWD}"}
-# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
-NNODES=${NNODES:-1}
-NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
-# Paths
-RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
-# very important! please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface
-MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen2.5-Math-7B"}
-CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"}
-TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"}
-TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"}
-
-MODEL_PATH=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/modelscope/hub/models/Qwen/Qwen2___5-Math-7B
-CKPTS_DIR=./ckpts/${project_name}/${exp_name}
-TRAIN_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/dapo-math-17k.parquet
-TEST_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/aime-2024.parquet
-# Algorithm
-temperature=1.0
-top_p=1.0
-top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
-val_top_p=0.7
-
-# Performance Related Parameter
-use_dynamic_bsz=True
-actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
-infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
-offload=True
-gen_tp=1
-sp_size=1
-fsdp_size=2
-
-# reference run wandb: https://wandb.ai/verl-org/DAPO%20Reproduction%20on%20verl/runs/ow47vvon?nw=nwusertongyuxuan361
-
-/home/hadoop-djst-algoplat/miniconda3/bin/python -m verl.trainer.main_ppo \
-    data.train_files="${TRAIN_FILE}" \
-    data.val_files="${TEST_FILE}" \
-    data.prompt_key=prompt \
-    data.truncation='left' \
-    data.max_prompt_length=${max_prompt_length} \
-    data.max_response_length=${max_response_length} \
-    data.train_batch_size=${train_prompt_bsz} \
-    data.return_raw_chat=${return_raw_chat} \
-    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
-    algorithm.adv_estimator=${adv_estimator} \
-    algorithm.use_kl_in_reward=${use_kl_in_reward} \
-    algorithm.kl_ctrl.kl_coef=${kl_coef} \
-    actor_rollout_ref.actor.strategy=fsdp2 \
-    critic.strategy=fsdp2 \
-    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
-    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
-    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
-    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
-    actor_rollout_ref.actor.clip_ratio_c=10.0 \
-    actor_rollout_ref.model.use_remove_padding=True \
-    +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \
-    actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
-    actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
-    actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
-    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
-    actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
-    actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
-    actor_rollout_ref.model.path="${MODEL_PATH}" \
-    actor_rollout_ref.model.enable_gradient_checkpointing=True \
-    actor_rollout_ref.actor.optim.lr=1e-6 \
-    actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
-    actor_rollout_ref.actor.optim.weight_decay=0.1 \
-    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
-    actor_rollout_ref.actor.fsdp_config.param_offload=${offload} \
-    actor_rollout_ref.actor.fsdp_config.optimizer_offload=${offload} \
-    actor_rollout_ref.actor.entropy_coeff=0 \
-    actor_rollout_ref.actor.grad_clip=1.0 \
-    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
-    actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \
-    actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
-    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
-    actor_rollout_ref.rollout.enable_chunked_prefill=True \
-    actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
-    actor_rollout_ref.rollout.temperature=${temperature} \
-    actor_rollout_ref.rollout.top_p=${top_p} \
-    actor_rollout_ref.rollout.top_k=${top_k} \
-    actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
-    actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
-    actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
-    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
-    actor_rollout_ref.rollout.val_kwargs.n=1 \
-    actor_rollout_ref.rollout.name=${rollout_name} \
-    actor_rollout_ref.rollout.mode=${rollout_mode} \
-    actor_rollout_ref.ref.fsdp_config.param_offload=${offload} \
-    actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \
-    actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \
-    reward_model.reward_manager=dapo \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
-    +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
-    trainer.logger=['console','tensorboard'] \
-    trainer.project_name="${project_name}" \
-    trainer.experiment_name="${exp_name}" \
-    trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
-    trainer.nnodes="${NNODES}" \
-    trainer.val_before_train=True \
-    trainer.test_freq=-1 \
-    trainer.save_freq=-1 \
-    trainer.total_epochs=10 \
-    trainer.total_training_steps=10 \
-    trainer.default_local_dir="${CKPTS_DIR}" \
-    trainer.resume_mode=auto \
-    trainer.log_val_generations=10
diff --git a/recipe/fully_async_policy/shell/dapo_7b_math_megatron_colocate.sh b/recipe/fully_async_policy/shell/dapo_7b_math_megatron_colocate.sh
deleted file mode 100644
index d05f5571876..00000000000
--- a/recipe/fully_async_policy/shell/dapo_7b_math_megatron_colocate.sh
+++ /dev/null
@@ -1,142 +0,0 @@
-#!/usr/bin/env bash
-set -xeuo pipefail
-
-project_name='DAPO'
-exp_name='DAPO-Qwen2.5-7b-MATH-0519a1-megatron-colocate'
-
-adv_estimator=grpo
-
-use_kl_in_reward=False
-kl_coef=0.0
-use_kl_loss=False
-kl_loss_coef=0.0
-
-clip_ratio_low=0.2
-clip_ratio_high=0.28
-
-max_prompt_length=$((1024 * 2))
-max_response_length=$((1024 * 8))
-enable_overlong_buffer=True
-overlong_buffer_len=$((1024 * 4))
-overlong_penalty_factor=1.0
-
-loss_agg_mode="token-mean"
-
-train_prompt_bsz=512
-n_resp_per_prompt=16
-train_prompt_mini_bsz=32
-
-# Ray
-# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
-# WORKING_DIR=${WORKING_DIR:-"${PWD}"}
-# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
-NNODES=${NNODES:-2}
-NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
-# Paths
-RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
-# very important! please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface
-MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen2.5-Math-7B"}
-CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"}
-TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"}
-TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"}
-
-MODEL_PATH=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/modelscope/hub/models/Qwen/Qwen2___5-Math-7B
-CKPTS_DIR=./ckpts/${project_name}/${exp_name}
-TRAIN_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/dapo-math-17k.parquet
-TEST_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/aime-2024.parquet
-
-# Algorithm
-temperature=1.0
-top_p=1.0
-top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
-val_top_p=0.7
-
-# Performance Related Parameter
-use_dynamic_bsz=True
-actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
-infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
-offload=True
-gen_tp=2
-train_tp=2
-train_pp=2
-
-# TODO: support dynamic_bsz for megatron
-# actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
-# actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
-# actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
-# actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
-# actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
-# actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
-
-python3 -m verl.trainer.main_ppo \
-    --config-path=config \
-    --config-name='ppo_megatron_trainer.yaml' \
-    data.train_files="${TRAIN_FILE}" \
-    data.val_files="${TEST_FILE}" \
-    data.prompt_key=prompt \
-    data.truncation='left' \
-    data.max_prompt_length=${max_prompt_length} \
-    data.max_response_length=${max_response_length} \
-    data.train_batch_size=${train_prompt_bsz} \
-    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
-    algorithm.adv_estimator=${adv_estimator} \
-    algorithm.use_kl_in_reward=${use_kl_in_reward} \
-    algorithm.kl_ctrl.kl_coef=${kl_coef} \
-    actor_rollout_ref.actor.strategy=megatron \
-    critic.strategy=megatron \
-    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
-    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
-    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
-    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
-    actor_rollout_ref.actor.clip_ratio_c=10.0 \
-    +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \
-    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \
-    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
-    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
-    actor_rollout_ref.model.path="${MODEL_PATH}" \
-    actor_rollout_ref.actor.optim.lr=1e-6 \
-    actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
-    actor_rollout_ref.actor.optim.weight_decay=0.1 \
-    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
-    actor_rollout_ref.actor.megatron.param_offload=${offload} \
-    actor_rollout_ref.actor.megatron.optimizer_offload=${offload} \
-    actor_rollout_ref.actor.megatron.grad_offload=${offload} \
-    actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${train_pp} \
-    actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${train_tp} \
-    actor_rollout_ref.actor.entropy_coeff=0 \
-    actor_rollout_ref.actor.optim.clip_grad=1.0 \
-    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
-    actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
-    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
-    actor_rollout_ref.rollout.enable_chunked_prefill=True \
-    actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
-    actor_rollout_ref.rollout.temperature=${temperature} \
-    actor_rollout_ref.rollout.top_p=${top_p} \
-    actor_rollout_ref.rollout.top_k=${top_k} \
-    actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
-    actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
-    actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
-    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
-    actor_rollout_ref.rollout.val_kwargs.n=1 \
-    actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=${train_pp} \
-    actor_rollout_ref.ref.megatron.tensor_model_parallel_size=${train_tp} \
-    actor_rollout_ref.ref.megatron.param_offload=${offload} \
-    reward_model.reward_manager=dapo \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
-    +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
-    +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
-    trainer.logger=['console','tensorboard'] \
-    trainer.project_name="${project_name}" \
-    trainer.experiment_name="${exp_name}" \
-    trainer.n_gpus_per_node=8 \
-    trainer.nnodes="${NNODES}" \
-    trainer.val_before_train=True \
-    trainer.test_freq=10 \
-    trainer.save_freq=-1 \
-    trainer.total_epochs=10 \
-    trainer.total_training_steps=100 \
-    trainer.default_local_dir="${CKPTS_DIR}" \
-    trainer.resume_mode=auto \
-    trainer.log_val_generations=10
diff --git a/recipe/fully_async_policy/shell/runtime_env.yaml b/recipe/fully_async_policy/shell/runtime_env.yaml
index dcca08e67f7..88467b8c243 100644
--- a/recipe/fully_async_policy/shell/runtime_env.yaml
+++ b/recipe/fully_async_policy/shell/runtime_env.yaml
@@ -1,5 +1,4 @@
 env_vars:
   VLLM_USE_V1: "1"
-  TENSORBOARD_DIR: "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/tensorboard/dapo_7b_math_fsdp2_4_12"
   NCCL_DEBUG: "INFO"
   HYDRA_FULL_ERROR: "1"
\ No newline at end of file
diff --git a/recipe/one_step_off_policy/fsdp_workers.py b/recipe/one_step_off_policy/fsdp_workers.py
index 350c105087e..5bb96e2c0b3 100644
--- a/recipe/one_step_off_policy/fsdp_workers.py
+++ b/recipe/one_step_off_policy/fsdp_workers.py
@@ -281,7 +281,6 @@ def async_generate_sequences(self, prompts):
         output = output.to("cpu")
 
         # clear kv cache
-        get_torch_device().empty_cache()
         return output
 
     @register(dispatch_mode=Dispatch.ONE_TO_ALL)
diff --git a/verl/workers/rollout/vllm_rollout/vllm_rollout_spmd.py b/verl/workers/rollout/vllm_rollout/vllm_rollout_spmd.py
index baef0c9315e..456329f59ea 100644
--- a/verl/workers/rollout/vllm_rollout/vllm_rollout_spmd.py
+++ b/verl/workers/rollout/vllm_rollout/vllm_rollout_spmd.py
@@ -532,7 +532,6 @@ async def _loop_forever(self):
 
     def _init_worker(self, all_kwargs: list[dict[str, Any]]):
         """Initialize worker engine."""
-
         all_kwargs[0]["rank"] = int(os.environ["RANK"])
         device_name = "NPU" if is_npu_available else "GPU"
         all_kwargs[0]["local_rank"] = (

From 5adde906bea75ac59f775704dd9b3bfd5cd0705f Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Thu, 18 Sep 2025 22:13:05 +0800
Subject: [PATCH 157/182] update shel

---
 recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_4_12.sh | 7 +------
 recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_4_4.sh  | 7 +------
 recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_8_8.sh  | 7 +------
 3 files changed, 3 insertions(+), 18 deletions(-)

diff --git a/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_4_12.sh b/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_4_12.sh
index f560468a4cf..dbfbee8fdfc 100644
--- a/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_4_12.sh
+++ b/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_4_12.sh
@@ -82,12 +82,7 @@ staleness_threshold=0.1
 trigger_parameter_sync_step=16
 partial_rollout=True
 
-PYTHON_INTERPRETER="/home/hadoop-djst-algoplat/miniconda3/bin/python"
-if [ ! -x "$PYTHON_INTERPRETER" ]; then
-    PYTHON_INTERPRETER="python3"
-fi
-
-$PYTHON_INTERPRETER -m recipe.fully_async_policy.fully_async_main \
+python -m recipe.fully_async_policy.fully_async_main \
     data.train_files="${TRAIN_FILE}" \
     data.val_files="${TEST_FILE}" \
     data.prompt_key=prompt \
diff --git a/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_4_4.sh b/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_4_4.sh
index ef00feb9d05..6f64caaea0a 100644
--- a/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_4_4.sh
+++ b/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_4_4.sh
@@ -86,12 +86,7 @@ staleness_threshold=0.1
 trigger_parameter_sync_step=16
 partial_rollout=True
 
-PYTHON_INTERPRETER="/home/hadoop-djst-algoplat/miniconda3/bin/python"
-if [ ! -x "$PYTHON_INTERPRETER" ]; then
-    PYTHON_INTERPRETER="python3"
-fi
-
-$PYTHON_INTERPRETER -m recipe.fully_async_policy.fully_async_main \
+python -m recipe.fully_async_policy.fully_async_main \
     data.train_files="${TRAIN_FILE}" \
     data.val_files="${TEST_FILE}" \
     data.prompt_key=prompt \
diff --git a/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_8_8.sh b/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_8_8.sh
index 85cdaa03fc5..02f7664360f 100644
--- a/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_8_8.sh
+++ b/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_8_8.sh
@@ -84,12 +84,7 @@ staleness_threshold=0.1
 trigger_parameter_sync_step=16
 partial_rollout=True
 
-PYTHON_INTERPRETER="/home/hadoop-djst-algoplat/miniconda3/bin/python"
-if [ ! -x "$PYTHON_INTERPRETER" ]; then
-    PYTHON_INTERPRETER="python3"
-fi
-
-$PYTHON_INTERPRETER -m recipe.fully_async_policy.fully_async_main \
+python -m recipe.fully_async_policy.fully_async_main \
     data.train_files="${TRAIN_FILE}" \
     data.val_files="${TEST_FILE}" \
     data.prompt_key=prompt \

From 3155b444286e358583a2a3d988b44b7aa9dc1362 Mon Sep 17 00:00:00 2001
From: wangshulin02 <953550366@qq.com>
Date: Fri, 19 Sep 2025 11:53:13 +0800
Subject: [PATCH 158/182] fix notation

---
 recipe/fully_async_policy/fully_async_main.py      | 2 +-
 recipe/fully_async_policy/fully_async_rollouter.py | 1 -
 recipe/fully_async_policy/param_sync.py            | 4 ++--
 3 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/recipe/fully_async_policy/fully_async_main.py b/recipe/fully_async_policy/fully_async_main.py
index 2f4ab8ccc6b..4dafd44844f 100644
--- a/recipe/fully_async_policy/fully_async_main.py
+++ b/recipe/fully_async_policy/fully_async_main.py
@@ -110,7 +110,7 @@ def create_role_worker_mapping(config):
 
         role_worker_mapping[Role.RewardModel] = ray.remote(RewardModelWorker)
 
-    # 添加reference policy（如果需要KL loss或reward）
+    # Add reference policy (if KL loss or reward is required)
     if config.algorithm.use_kl_in_reward or config.actor_rollout_ref.actor.use_kl_loss:
         role_worker_mapping[Role.RefPolicy] = ray.remote(DetachActorWorker)
 
diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py
index b002d892e6a..00411b21a8f 100644
--- a/recipe/fully_async_policy/fully_async_rollouter.py
+++ b/recipe/fully_async_policy/fully_async_rollouter.py
@@ -165,7 +165,6 @@ async def set_max_required_samples(self):
                 / (self.required_samples * self.config.async_training.trigger_parameter_sync_step)
             )
 
-            # 单次最多扔一次更新需要的样本
             self.max_concurrent_samples = len(self.async_rollout_manager.server_handles) * 16
             self.max_concurrent_samples = min(self.max_concurrent_samples, self.max_required_samples)
             self.max_queue_size = self.max_required_samples
diff --git a/recipe/fully_async_policy/param_sync.py b/recipe/fully_async_policy/param_sync.py
index 55d11d236c0..b841019837a 100644
--- a/recipe/fully_async_policy/param_sync.py
+++ b/recipe/fully_async_policy/param_sync.py
@@ -96,10 +96,10 @@ def sync_weights(self, version, validate=False, global_steps=0):
         self.wait_last_resume = self.rollouter.resume.remote()
 
     def wait_last_valid(self):
-        print("[ParameterSynchronizer] waiting last validate...")
+        print("[ParameterSynchronizer] Waiting last validate...")
         start_time = time.time()
         if self.wait_last_update:
             ray.get(self.wait_last_update)
         if self.wait_last_resume:
             ray.get(self.wait_last_resume)
-        print(f"[ParameterSynchronizer], cost: {time.time() - start_time:.2f} seconds")
+        print(f"[ParameterSynchronizer] Wait last validate cost: {time.time() - start_time:.2f} seconds")

From c39f283ef4702d8cc57412ed8a2c83d2038d1380 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Fri, 19 Sep 2025 15:56:56 +0800
Subject: [PATCH 159/182] rm print

---
 recipe/fully_async_policy/fsdp_workers.py        | 3 ---
 recipe/fully_async_policy/fully_async_trainer.py | 5 +++--
 verl/experimental/agent_loop/agent_loop.py       | 8 ++++++++
 3 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/recipe/fully_async_policy/fsdp_workers.py b/recipe/fully_async_policy/fsdp_workers.py
index 1ee54112c54..ad6b0db8b51 100644
--- a/recipe/fully_async_policy/fsdp_workers.py
+++ b/recipe/fully_async_policy/fsdp_workers.py
@@ -47,9 +47,6 @@ def get_inference_model(rollout):
     Returns:
         model: model object
     """
-
-    print(rollout)
-    print(dir(rollout))
     inference_engine = rollout.inference_engine
     if hasattr(inference_engine, "llm_engine"):
         inference_model = inference_engine.llm_engine.model_executor.driver_worker.worker.model_runner.model
diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py
index eb2a23867f8..a391a0c9c38 100644
--- a/recipe/fully_async_policy/fully_async_trainer.py
+++ b/recipe/fully_async_policy/fully_async_trainer.py
@@ -137,7 +137,7 @@ def _get_samples_from_queue(self) -> tuple[None, None] | tuple[int, Any]:
         # Collect samples using a simple loop calling get_sample
         consumer_start = time.time()
         queue_samples = []
-
+        queue_len = 0
         while len(queue_samples) < self.required_samples:
             # Get a single sample and wait until there is a sample or None is received
             sample, queue_len = self.message_queue_client.get_sample_sync()
@@ -166,7 +166,8 @@ def _get_samples_from_queue(self) -> tuple[None, None] | tuple[int, Any]:
 
         print(
             f"[FullyAsyncTrainer] Loop collection completed: {len(queue_samples)}/{self.required_samples} samples, "
-            f"total wait time: {total_wait_time:.2f} seconds"
+            f"total wait time: {total_wait_time:.2f} seconds."
+            f"mq_len: {queue_len}"
         )
 
         queue_samples = [ray.cloudpickle.loads(x) for x in queue_samples]
diff --git a/verl/experimental/agent_loop/agent_loop.py b/verl/experimental/agent_loop/agent_loop.py
index ae56c2a187a..70b4d2e877e 100644
--- a/verl/experimental/agent_loop/agent_loop.py
+++ b/verl/experimental/agent_loop/agent_loop.py
@@ -691,9 +691,17 @@ def _postprocess(self, inputs: list[_InternalAgentLoopOutput]) -> DataProto:
 
 @ray.remote
 class AgentLoopWorker(AgentLoopWorkerBase):
+    """Agent loop worker takes a batch of messages and run each message in an agent loop."""
+
     def __init__(
         self, config: DictConfig, server_handles: list[ray.actor.ActorHandle], rm_executor: BatchExecutor = None
     ):
+        """Initialize agent loop manager.
+
+        Args:
+            config (DictConfig): YAML config.
+            server_handles (List[ray.actor.ActorHandle]): OpenAI compatible LLM server actor handles.
+        """
         super().__init__(config, server_handles, rm_executor)
 
 

From e5116944a6c331b2a8ecb4d6e89a5dc7b78f0d58 Mon Sep 17 00:00:00 2001
From: wangshulin02 <953550366@qq.com>
Date: Sun, 28 Sep 2025 17:20:34 +0800
Subject: [PATCH 160/182] fix log prob in hybird&streaming mode

---
 verl/trainer/config/actor/dp_actor.yaml | 3 +++
 verl/workers/actor/dp_actor.py          | 2 +-
 verl/workers/config/actor.py            | 1 +
 3 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/verl/trainer/config/actor/dp_actor.yaml b/verl/trainer/config/actor/dp_actor.yaml
index ab27304f736..9969f7635b9 100644
--- a/verl/trainer/config/actor/dp_actor.yaml
+++ b/verl/trainer/config/actor/dp_actor.yaml
@@ -40,3 +40,6 @@ entropy_checkpointing: False
 
 # Whether to remove padding tokens in inputs during training
 use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false}
+
+# Whether it's a hybrid engine
+hybrid_engine: ${oc.select:actor_rollout_ref.hybrid_engine, True}
\ No newline at end of file
diff --git a/verl/workers/actor/dp_actor.py b/verl/workers/actor/dp_actor.py
index d26a7244ee8..4c3b0dc9d23 100644
--- a/verl/workers/actor/dp_actor.py
+++ b/verl/workers/actor/dp_actor.py
@@ -394,7 +394,7 @@ def update_policy(self, data: DataProto):
         # See PPO paper for details. https://arxiv.org/abs/1707.06347
         mini_batches = data.split(self.config.ppo_mini_batch_size)
 
-        on_policy = len(mini_batches) == 1 and self.config.ppo_epochs == 1
+        on_policy = len(mini_batches) == 1 and self.config.ppo_epochs == 1 and self.config.hybrid_engine
 
         metrics = {}
         for _ in range(self.config.ppo_epochs):
diff --git a/verl/workers/config/actor.py b/verl/workers/config/actor.py
index af6199732b7..db1b5967abb 100644
--- a/verl/workers/config/actor.py
+++ b/verl/workers/config/actor.py
@@ -232,6 +232,7 @@ class FSDPActorConfig(ActorConfig):
     fsdp_config: FSDPEngineConfig = field(default_factory=FSDPEngineConfig)
     use_remove_padding: bool = False
     profiler: ProfilerConfig = field(default_factory=ProfilerConfig)
+    hybrid_engine: bool = True
 
     def __post_init__(self):
         """Validate FSDP actor configuration parameters."""

From 41cea0fb81d45005e81820dd96b47bb87a8104cd Mon Sep 17 00:00:00 2001
From: wangshulin02 <953550366@qq.com>
Date: Mon, 29 Sep 2025 16:06:57 +0800
Subject: [PATCH 161/182] fix stale_samples_processed and
 stale_trajectory_processed metrics

---
 recipe/fully_async_policy/fully_async_trainer.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py
index a391a0c9c38..8559433b625 100644
--- a/recipe/fully_async_policy/fully_async_trainer.py
+++ b/recipe/fully_async_policy/fully_async_trainer.py
@@ -314,10 +314,10 @@ def _collect_metrics_from_samples(self, batch, metrics):
         """
         if hasattr(batch, "meta_info") and batch.meta_info:
             samples_param_versions = batch.meta_info["rollout_param_versions"]
-            stale_count = sum(1 for v in samples_param_versions if self.current_param_version - v > 1)
+            stale_count = sum(1 for v in samples_param_versions if self.current_param_version - v >= 1)
             self.stale_samples_processed += stale_count
             trajectory_param_versions = batch.meta_info["trajectory_param_versions"]
-            stale_traj_count = sum(1 for v in trajectory_param_versions if self.current_param_version - v > 1)
+            stale_traj_count = sum(1 for v in trajectory_param_versions if self.current_param_version - v >= 1)
             self.stale_trajectory_processed += stale_traj_count
             metrics.update(
                 {

From 97615b4b3d13de63d317bd58c31142a95a0c24be Mon Sep 17 00:00:00 2001
From: wangshulin02 <953550366@qq.com>
Date: Sat, 11 Oct 2025 15:52:50 +0800
Subject: [PATCH 162/182] add require_batches config param

---
 .../fully_async_policy/config/fully_async_ppo_trainer.yaml   | 3 +++
 recipe/fully_async_policy/fully_async_rollouter.py           | 5 +++--
 recipe/fully_async_policy/fully_async_trainer.py             | 5 +++--
 3 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml b/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml
index c2708b975be..84a3cb7c290 100644
--- a/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml
+++ b/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml
@@ -15,6 +15,9 @@ async_training:
   # One step means trainer obtains a batch of required samples
   trigger_parameter_sync_step: 4
   
+  # The number of ppo_mini_batches that the FullyAsyncTrainer obtains once
+  require_batches: 1  
+
   # When synchronizing parameters, whether to interrupt rollouter and perform partial rollout
   partial_rollout: True
 
diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py
index 00411b21a8f..c3ba74b5640 100644
--- a/recipe/fully_async_policy/fully_async_rollouter.py
+++ b/recipe/fully_async_policy/fully_async_rollouter.py
@@ -115,8 +115,9 @@ def __init__(
 
         # Config
         self.staleness_threshold: float = config.async_training.get("staleness_threshold", 1)
-        # required_samples use ppo_mini_batch_size as the minimum number of samples.
-        self.required_samples = config.actor_rollout_ref.actor.ppo_mini_batch_size
+        # required_samples use ppo_mini_batch_size*require_batches as the minimum number of samples.
+        self.require_batches = config.async_training.require_batches
+        self.required_samples = config.actor_rollout_ref.actor.ppo_mini_batch_size * self.require_batches
         self.max_required_samples = None
         self.max_concurrent_samples = None
         # queue size
diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py
index 8559433b625..6693eac7406 100644
--- a/recipe/fully_async_policy/fully_async_trainer.py
+++ b/recipe/fully_async_policy/fully_async_trainer.py
@@ -97,8 +97,9 @@ def __init__(
         self.progress_bar = None
         self.trigger_parameter_sync_step = config.async_training.trigger_parameter_sync_step
 
-        # required_samples use ppo_mini_batch_size as the minimum number of samples.
-        self.required_samples = config.actor_rollout_ref.actor.ppo_mini_batch_size
+        # required_samples use ppo_mini_batch_size*require_batches as the minimum number of samples.
+        self.require_batches = config.async_training.require_batches
+        self.required_samples = config.actor_rollout_ref.actor.ppo_mini_batch_size * self.require_batches
         total_gpus = (
             config.trainer.nnodes * config.trainer.n_gpus_per_node
             + config.rollout.nnodes * config.rollout.n_gpus_per_node

From 211a441f3a3b124409a4462785ddfe6289aec6be Mon Sep 17 00:00:00 2001
From: wangshulin02 <953550366@qq.com>
Date: Mon, 13 Oct 2025 20:54:33 +0800
Subject: [PATCH 163/182] fix staleness_samples reset bug

---
 .../fully_async_rollouter.py                  | 28 +++++++++++--------
 recipe/fully_async_policy/param_sync.py       |  4 +--
 2 files changed, 18 insertions(+), 14 deletions(-)

diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py
index c3ba74b5640..4a81051a686 100644
--- a/recipe/fully_async_policy/fully_async_rollouter.py
+++ b/recipe/fully_async_policy/fully_async_rollouter.py
@@ -16,6 +16,7 @@
 from pprint import pformat
 
 import ray
+from ray import ObjectRef
 
 from recipe.fully_async_policy.detach_utils import (
     RolloutSample,
@@ -333,16 +334,7 @@ async def _processor_worker(self):
         """
         Streaming worker coroutines, a sample is submitted for processing without waiting for batches
         """
-
         while True:
-            simple_from_cancel_queue = False
-            if not self.cancel_queue.empty():
-                rollout_sample = await self.cancel_queue.get()
-                simple_from_cancel_queue = True
-            else:
-                rollout_sample = await self.pending_queue.get()
-                self.staleness_samples += 1
-
             if self.paused or await self._should_pause_generation():
                 print(
                     "[FullyAsyncRollouter][Processor] Received pause signal, waiting for remaining tasks to return..."
@@ -363,6 +355,15 @@ async def _processor_worker(self):
                     while self.paused:
                         self.idle_start_time = time.time()
                         await self.condition.wait()
+                continue
+
+            simple_from_cancel_queue = False
+            if not self.cancel_queue.empty():
+                rollout_sample = await self.cancel_queue.get()
+                simple_from_cancel_queue = True
+            else:
+                rollout_sample = await self.pending_queue.get()
+                self.staleness_samples += 1
 
             if rollout_sample == "DONE":
                 print(
@@ -567,6 +568,7 @@ async def _async_monitor_loop(self):
                     async with self.lock:
                         self.paused = False
                         self.condition.notify_all()
+                        print("[FullyAsyncRollouter][MonitorLoop] Trigger rollout recovery in MonitorLoop")
 
     async def _should_pause_generation(self) -> bool:
         """Determine whether the build should be paused"""
@@ -581,12 +583,12 @@ async def _should_pause_generation(self) -> bool:
                 )
             return True
 
-        if self.staleness_samples > self.max_required_samples:
+        if self.staleness_samples >= self.max_required_samples:
             if not self.paused:
                 print(
                     "[FullyAsyncRollouter][ShouldPause] "
                     f"due to "
-                    f"staleness_samples {self.staleness_samples} > max_required_samples {self.max_required_samples} "
+                    f"staleness_samples {self.staleness_samples} >= max_required_samples {self.max_required_samples} "
                 )
             return True
 
@@ -607,7 +609,9 @@ async def pause(self):
             await self.async_rollout_manager.reset_prefix_cache()
             self.monitor_loop_trigger = False
 
-    async def resume(self):
+    async def resume(self, dependency_ref: ObjectRef = None):
+        if dependency_ref is not None:
+            ray.get(dependency_ref)
         print("[FullyAsyncRollouter][Public][Resume]")
         async with self.lock:
             self.paused = False
diff --git a/recipe/fully_async_policy/param_sync.py b/recipe/fully_async_policy/param_sync.py
index b841019837a..d6c67ceb409 100644
--- a/recipe/fully_async_policy/param_sync.py
+++ b/recipe/fully_async_policy/param_sync.py
@@ -93,10 +93,10 @@ def sync_weights(self, version, validate=False, global_steps=0):
 
         # Async Update rollout version & validation
         self.wait_last_update = self.rollouter.update_param_version.remote(version, validate, global_steps)
-        self.wait_last_resume = self.rollouter.resume.remote()
+        self.wait_last_resume = self.rollouter.resume.remote(self.wait_last_update)
 
     def wait_last_valid(self):
-        print("[ParameterSynchronizer] Waiting last validate...")
+        print("[ParameterSynchronizer] Waiting last sync and validate...")
         start_time = time.time()
         if self.wait_last_update:
             ray.get(self.wait_last_update)

From f7a8e96608cb44eedccb1c02abbcd79bfe996665 Mon Sep 17 00:00:00 2001
From: wangshulin02 <953550366@qq.com>
Date: Tue, 14 Oct 2025 10:15:21 +0800
Subject: [PATCH 164/182] del debug code

---
 recipe/fully_async_policy/fully_async_rollouter.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py
index 4a81051a686..503c6ae6d5f 100644
--- a/recipe/fully_async_policy/fully_async_rollouter.py
+++ b/recipe/fully_async_policy/fully_async_rollouter.py
@@ -568,7 +568,6 @@ async def _async_monitor_loop(self):
                     async with self.lock:
                         self.paused = False
                         self.condition.notify_all()
-                        print("[FullyAsyncRollouter][MonitorLoop] Trigger rollout recovery in MonitorLoop")
 
     async def _should_pause_generation(self) -> bool:
         """Determine whether the build should be paused"""

From 3de3ed04cfc4294cdbc1b19597b0978246b4c12f Mon Sep 17 00:00:00 2001
From: arron <arron@MBP-VH9RV7LTJC-1907.local>
Date: Tue, 14 Oct 2025 11:31:54 +0800
Subject: [PATCH 165/182] add README_zh.md

---
 recipe/fully_async_policy/README.md    |  66 ------
 recipe/fully_async_policy/README_zh.md | 316 +++++++++++++++++++++++++
 2 files changed, 316 insertions(+), 66 deletions(-)
 delete mode 100644 recipe/fully_async_policy/README.md
 create mode 100644 recipe/fully_async_policy/README_zh.md

diff --git a/recipe/fully_async_policy/README.md b/recipe/fully_async_policy/README.md
deleted file mode 100644
index 0509969216b..00000000000
--- a/recipe/fully_async_policy/README.md
+++ /dev/null
@@ -1,66 +0,0 @@
-# 基于verl的改造方案
-
-## 方案
-
-### 方案1 (StreamRL, AsyncFlow)
-
-![StreamRL](
-https://github.com/ArronHZG/verl-community/blob/recipe/async_policy/docs/StreamRL.png?raw=true)
-
-在分离架构的基础上，修改在Rollout和Train的样本传递过程中，将离线策略生成一批global样本修改为生成一批batch的方式，实现生成和训练两阶段的高度重叠。
-训练阶段一收到足够样本就开始处理，训练一定步数后，将参数同步到PS侧， Rollout在每次样本生成完成后，check是否有新的参数，如果有就进行一次同步。
-
-### 方案2 (Mistralai, Areal)
-
-![mistralai](
-https://github.com/ArronHZG/verl-community/blob/recipe/async_policy/docs/mistralai.png?raw=true)
-
-在分离架构的基础上，实现Rollout的partial rollout逻辑。样本仍然修改为batch的方式进行传递，实现生成和训练两阶段的高度重叠。
-在参数同步方面，训练阶段主动触发Rollout的暂停，参数同步以及恢复。 Rollout使用Rollout Server的方式，支持样本生成的中断与恢复，
-产生的的样本所使用的参数版本会有所不同。
-
-### 折中
-
-上述两种方案的核心都是将训练与生成进行overlap，核心区别主要集中在参数同步的处理方式不同，方案1需要实现PS完成参数的异步加载。
-方案2使用同步的方式进行参数同步，但需要完成PartialRollout的逻辑。综合已有代码，以及社区进行中的工作，我们希望先将异步的工作流搭建完成，先以方案1进行开发，后续再进一步开发方案2。
-
-## 设计
-
-### 架构图
-
-![full_async](
-https://github.com/ArronHZG/verl-community/blob/recipe/async_policy/docs/full_async.svg?raw=true)
-
-为实现纯异步训练工作流，基于已有的 one step off policy 代码，扩增实现 Rollouter 以及 Message Queue，以及对Trainer进行更新。
-
-整体的训练流程参考StreamRL，将原有流程中生成 train_batch_size 个样本后进行下一步训练的过程，修改为流式的样本传递，train
-拿到一次前向的样本后就进行样本分发（ppo_mini_batch_size*worker）。与one-step-off相比，我们将一次step的异步，继续细化到一次batch的异步。
-
-**MessageQueue** 作为Ray的Actor存在，支持zeromq消息队列保存生成的样本，并提供给Trainer使用。Trainer 和 Rollouter 都持有
-MessageQueue 的Handler，通过接口完成样本的插入与消费。
-
-**FullyAsyncRollouter** 类似于现有的 Trainer，实现fit()工作流，循环调用 Rollout 进行样本的生成。FullyAsyncRollouter 对于已有的
-vLLMAsyncRollout SGLangAsyncRollout 进行封装。
-
-* 方案1，使用异步更新策略，FullyAsyncRollouter 根据样本生成的进展，自动访问PS，判断是否进行新的参数加载。
-* 方案2，参考PR https://github.com/volcengine/verl/pull/2246 https://github.com/volcengine/verl/pull/2200 Rollout
-  组件需要支持暂停及恢复，从而进行参数的更新。暂停时，需要保存进行中的rollout样本，下次继续恢复生产。
-
-**FullyAsyncTrainer** 与当前实现类似，区别是样本的获取修改为从Queue中获取，Queue有最少batch样本就开始进行分发。rainer完成一次step的训练后，
-与FullyAsyncRollouter的使用策略对应：
-
-* 方案1，使用异步更新策略，参数产生后，主动同步到PS中。
-* 方案2，直接调用Rollouter进行同步，主动通知Rollouter暂停生成，进行参数的同步更新。
-
-## 总结
-
-当Rollouter生产快于Trainer消费时，queue中会存在多步过期的样本，我们需要在Rollouter中设置“陈旧度 staleness”阈值，
-由当前的参数版本以及生成的样本数量，决定是否要暂停生成。zeromq 的最大长度应为 staleness * total_size，并且实现基于陈旧度的拒绝策略，进行防御性编程。
-
-* 当使用方案1时，参数的同步由FullyAsyncRollouter主动控制，触发时机取决预先设置的固定数量样本完成以及参数已就绪，产生的样本所使用的参数版本一致，
-  但是避免不了长尾的问题，会有"rollout空洞"产生。
-
-* 当使用方案2时，参数的同步会更加及时，陈旧度低的样本数量较多，但是长尾样本由不同的参数产生，长尾样本的不同token所对应的参数版本会传递给训练引擎，
-  后续可以根据这一信息对loss进行加权处理。
-
-当Rollouter生产慢于Trainer消费时，队列长时间为空，基本等价于同步训练。
\ No newline at end of file
diff --git a/recipe/fully_async_policy/README_zh.md b/recipe/fully_async_policy/README_zh.md
new file mode 100644
index 00000000000..6f88c8187be
--- /dev/null
+++ b/recipe/fully_async_policy/README_zh.md
@@ -0,0 +1,316 @@
+# Recipe: Fully Async Policy Async Trainer
+
+**Author:**  `https://github.com/meituan-search`
+
+Last updated: 10/13/2025.
+
+本文档介绍了完全异步PPO训练系统，该系统实现了 Trainer 和 Rollouter 的完全解耦，支持异步样本生成和训练。
+
+## Introduction
+
+### Background
+
+rollout和train分离架构相较于colocate的架构能够更加灵活地分配资源，设计更加灵活的训练逻辑，从而处理长尾等问题带来的GPU利用率低，训练效率低的问题。
+one_step_off_policy通过分离架构的设计并进行rollout和train一轮异步的训练方法，缓解了rollout时间过长的问题，并在训练效率上取得了一些收益，
+但其强制使用一轮异步的数据，存在不够灵活等问题，而且并不能完全去除长尾对训练效率带来的的影响；在其他框架如areal、Magistral、streamrl、asyncflow上，
+已经基于分离架构实现了异步训练、流式训练，并取得了收益；我们借鉴其方法，在verl上进行了实现。fully_async_policy支持异步、流式、partial
+rollout的训练，
+通过合理设置资源分配情况、参数同步频率等参数，fully_async_policy能够显著提高训练效率。
+
+> Magistral https://arxiv.org/abs/2506.10910
+>
+> AReaL: A Large-Scale Asynchronous Reinforcement Learning System for Language
+> Reasoning https://arxiv.org/abs/2505.24298
+>
+> StreamRL: Scalable, Heterogeneous, and Elastic RL for LLMs with Disaggregated Stream
+> Generation https://arxiv.org/abs/2504.15930
+>
+> AsyncFlow: An Asynchronous Streaming RL Framework for Efficient LLM Post-Training https://arxiv.org/abs/2507.01663
+>
+
+### 核心贡献
+
+* 资源隔离：与使用hybrid_engine不同，Rollouter和Trainer使用分离的计算资源，需要分别指定所占用的资源。
+* 生成与训练并行：Trainer在训练的同时，Rollouter在生成新的样本。
+* 多步异步: 相比 one step off policy 支持0.x步到多步的异步设定，异步方案更加灵活。
+* nccl参数同步：使用nccl通信原语进行Rollouter与Trainer参数的通信。
+* Stream推理与训练：Rollouter逐样本生成数据，同时数据传输以单个sample为最小传输单位。
+* 异步训练与新鲜度控制：通过设置参数async_training.staleness_threshold，支持使用旧参数生成的样本进行训练。
+* PartialRollout: Rollouter推理过程支持partial rollout逻辑，通过参数同步时，添加sleep()和resume()
+  逻辑，保存进行中的rollout的样本，并在下一次rollout中继续使用，减少参数同步等待进行中的任务结束时间。
+
+目前支持使用模式为 fsdp+vllm。vllm必须使用基于AgentLoop的server模式。
+
+## 设计
+
+fully_async_policy的整体架构如下图所示，fully_async_policy主要由Rollouter、MessageQueue、Trainer、ParameterSynchronizer四部分组成。
+
+![fully_async_policy_structure](
+https://github.com/ArronHZG/verl-community/blob/recipe/async_policy/docs/fully_async_policy_structure.svg?raw=true)
+
+1. Rollouter逐样本生成序列，并将生成的sample放入MessageQueue中，生产的速度受新鲜度控制。
+2. MessageQueue用于暂存Rollouter生成的sample。
+3. Trainer逐样本从MessageQueue中获取，获取到require_batches*
+   ppo_mini_batch_size数量的样本后，就会进行训练，训练async_training.trigger_parameter_sync_step轮后，触发与Rollouter的一次参数同步。
+4. ParameterSynchronizer 实现了Nccl的同步参数同步能力。
+
+当前方案对比base的收益来源，在于colocate情况下，rollout使用更多的资源无法解决长尾样本带来的空闲，当我们进行资源隔离后，rollout的时间和train的时间都可能相较于之前更长（因为使用的资源变少了），但是相互之间的耗时overlap，端到端的耗时反而有所缩减。
+
+![fully_async_policy_revenue](
+https://github.com/ArronHZG/verl-community/blob/recipe/async_policy/docs/fully_async_policy_revenue.svg?raw=true)
+
+## 使用方式
+
+### 参数说明
+
+| super params                                  | implication                                                     |
+|-----------------------------------------------|-----------------------------------------------------------------|
+| `trainer.nnodes`                              | 表示Trainer的node数量                                                |
+| `trainer.n_gpus_per_node`                     | 表示Trainer每个node上gpu的数量                                          |
+| `rollout.nnodes`                              | 表示Rollouter的node数量                                              |
+| `rollout.n_gpus_per_node`                     | 表示Rollouter每个node上gpu的数量                                        |
+| `data.train_batch_size`                       | 在fully async策略中，该值不生效（默认设置为0）                                   |
+| `data.gen_batch_size`                         | 在fully async策略中，使用流式的样本生产逻辑（默认设置为1)                             |
+| `rollout.total_rollout_steps`                 | 总的rollout的sample数量                                              |
+| `rollout.test_freq`                           | 表示Rollouter每更新多少次参数，进行一次validation                              |
+| `actor_rollout_ref.actor.ppo_mini_batch_size` | The ppo_mini_batch_size is a global num across all workers/gpus |
+| `async_training.require_batches`              | FullyAsyncTrainer一次性获取的ppo_mini_batch_size的数量                   |
+| `async_training.trigger_parameter_sync_step`  | 表示FullyAsyncTrainer进行多少次本地更新后,进行一次参数同步                          |
+| `async_training.staleness_threshold`          | 新鲜度控制                                                           |
+| `async_training.partial_rollout`              | 是否进行partial_rollout                                             |
+| `async_training.use_rollout_log_probs`        | 使用rollout产生的log_probs                                           |
+
+进一步的解释：
+
+`rollout.total_rollout_steps`
+
+rollout.total_rollout_steps = data.train_batch_size * step
+
+`async_training.trigger_parameter_sync_step`
+
+在fully async策略中，表示Trainer进行多少次本地更新后（也就是获取多少次require_batches
+*ppo_mini_batch_size数量样本），与Rollouter之间进行一次参数同步。
+每两次Rollouter和Trainer参数同步之间，Trainer将会处理trigger_parameter_sync_step*require_batches*
+ppo_mini_batch_size份sample。
+如果为了与colocate比较，在公平的情况下对比速度，trigger_parameter_sync_step应该设置为 data.train_batch_size / (
+require_batches * ppo_mini_batch_size)。
+
+`async_training.staleness_threshold`
+
+在fully async策略中，表示最大允许使用的staleness样本的比例。
+staleness_threshold=0，表示同步训练。
+Rollouter两次参数更新之间将会生成固定数量的样本，样本数为：
+
+$$$rollout\_num = (trigger\_parameter\_sync\_step*require\_batches*ppo\_mini\_batch\_size)$$$
+
+staleness_threshold>0，表示异步训练， 可以设置为小数，支持更灵活的异步调用。
+
+Rollouter两次参数更新之间将会最多生成的样本数为：
+
+$$$rollout\_num = (1+staleness\_threshold)*(trigger\_parameter\_sync\_step*require\_batches*ppo\_mini\_batch\_size) - num\_staleness\_sample $$$
+
+num_staleness_sample 表示上一次rollout多生成的陈旧样本数。
+由于是流式系统，rollout持续生成，trainer持续消费。如果rollouter较慢，trainer会更早触发参数同步，rollouter并不会实际生产rollout_num个样本。
+当rollout 足够快时，staleness_threshold设置为1，基本上等价于one_step_off policy。
+为了避免过期样本太多影响训练精度，建议该值设置小于1。
+
+`async_training.partial_rollout`
+partial_rollout只会在staleness_threshold>0时才实际上起作用。
+
+`async_training.use_rollout_log_probs`
+在强化学习算法中，log_probs与参数版本，token都存在隐性的相关性。由于PPO/GRPO/DAPO等算法的设定，我们在计算重要性采样时，
+即 old_log_prob必须使用rollout参数及token所对应log_probs，才能保证算法的正确性。在fully
+async策略中，我们默认old_log_prob是有rollout所计算的，而不是由trainer所计算。
+
+### 模式支持
+
+1. on policy pipeline:
+    1. trigger_parameter_sync_step=1，staleness_threshold=0;
+    2. Rollouter一次生产require_batches*
+       ppo_mini_batch_size的samples，Trainer获取这些samples后进行训练，训练完后Trainer和Rollouter之间进行一次参数同步;
+    3. 在rollout阶段，如果存在长尾的样本，但是rollout样本数较少时，较短的样本无法填充到空闲的资源中，会造成一定的资源浪费。
+    4. 如图a所示；
+
+2. stream off policy pipeline:
+    1. trigger_parameter_sync_step>1，staleness_threshold=0。
+    2. 将会进行同步的流式训练，Rollouter一次生产require_batches*ppo_mini_batch_size*
+       trigger_parameter_sync_step的samples，Trainer每获取require_batches*
+       ppo_mini_batch_size就进行一次本地训练，训练trigger_parameter_sync_step次后，Trainer和Rollouter之间进行一次参数同步;
+    3. 相较于a，由于一次生成的样本更多，资源的空闲会更低。
+    4. 在一次step训练中，会存在两次资源闲置的时间，分别是在第一次获取样本时，train等待require_batches*
+       ppo_mini_batch_size个样本生产，以及最后一次参数更新时，rollout等待训练完成。
+    5. 如图b所示；
+
+3. async stream pipeline with staleness samples:
+    1. trigger_parameter_sync_step>=1，staleness_threshold>0，partial_rollout=Flase。
+    2. Rollouter在每次参数更新后将计划最多生产rollout_num个样本（实际根据rollout速度，生成的样本可能会少与这个值）。
+    3.
+   如果rollout过程比较快，Rollouter将会在参数同步前额外生成一部分样本num_stale_samples，用于参数同步后立即给Trainer使用，如图c所示。触发参数同步时，如果Rollouter有正在生产的任务，将会等待任务完成，同时不会添加新的任务；
+    4. 相较于b，除第一次step训练外，后续的训练都不会有wait first batch rollout finish的时间，但是会有wait active task
+       finish的时间。
+
+4. async stream pipeline with partial rollout:
+    1. trigger_parameter_sync_step>=1，staleness_threshold>0，partial_rollout=True。
+    2. 相较于c，触发参数同步时，Rollouter如果有正在生产的sample，会打断rollout过程并进行参数同步，被中断的sample会在参数同步后继续生成。减少了wait
+       active task finish的时间。
+
+![fully_async_policy_mode](
+https://github.com/ArronHZG/verl-community/blob/recipe/async_policy/docs/fully_async_policy_mode.svg?raw=true)
+
+### 关键指标
+
+| metrics                                        | implication                                               |
+|------------------------------------------------|-----------------------------------------------------------|
+| `trainer/idle_ratio`                           | Trainer闲置率                                                |
+| `rollouter/idle_ratio`                         | Rollouter闲置率                                              |
+| `fully_async/count/stale_samples_processed`    | 训练使用的旧sample总数                                            |
+| `fully_async/count/stale_trajectory_processed` | 训练使用的旧trajectory总数(一个sample会生产rollout.n条trajectory)       |
+| `fully_async/partial/total_partial_num`        | 两次trigger_parameter_sync_step之间Trainer处理的partial样本数       |
+| `fully_async/partial/partial_ratio`            | 两次trigger_parameter_sync_step之间Trainer处理的partial样本的比例     |
+| `fully_async/partial/max_partial_span`         | 两次trigger_parameter_sync_step之间Trainer处理的partial样本的最大参数跨度 |
+
+### 调参建议
+
+* 资源分配与调整:
+    *
+  合理的资源分配是获得好的训练效率的前提。理想的资源分配情况应该是使得Rollout的时间和Train的时间接近，从而使得整个训练过程流水气泡最小，避免资源闲置，同时Trainer不会使用旧样本。在真实训练场景下，可以根据实际训练过程中rollout和train的空闲时间调整资源分配，可从rollouter/idle_ratio和trainer/idle_ratio获得，如果rollouter/idle_ratio较高trainer/idle_ratio较低，应该增多Trainer的资源减少Rollouter的资源，反之亦然。
+
+* 关键参数：
+    * staleness_threshold: 设置太大会导致较多的旧样本使用，影响模型效果，建议设置小于1。
+    * require_batches：越接近1，越接近纯流式过程，训练过程中bubble越小，能够在速度上获得更快的加速效果，但会对样本的处理顺序产生影响；
+    * trigger_parameter_sync_step: 设置的越小越接近on
+      policy但会导致频繁的参数同步，同时server模式，长尾样本浪费的资源无法被短样本填充，资源利用率低。设置的越大有更高的计算效率，但是精度上会受到off
+      policy的影响。
+    * rollout.test_freq: 会占用Rollouter资源，不建议设置太小。
+
+* 模式选择：正如[模式支持]章节介绍，通过调整不同的参数，Fully Async架构支持不同程度上的优化加速，适用于不同场景的任务。
+    * 对于小规模任务，需要保证训练的稳定性和 on-policy 性，对速度要求不高的场景，可以尝试使用on policy pipeline的模式（模式1）。
+    * 对于需要提高训练吞吐量，但对 staleness 敏感的场景，可以尝试使用 stream off policy pipeline 的模式。即通过
+      设置trigger_parameter_sync_step>1 ，提高 训练效率，但仍保持同步机制 (staleness_threshold=0 )（模式2）。
+    * 对于大规模任务，对训练速度有较高要求，且可以容忍一定 off-policy 程度、staleness的场景，可以设置staleness_threshold>
+      0、partial_rollout=True提高训练效率，使用 async stream pipeline 模式（模式 3 或 4）。
+
+### 快速开始
+
+```shell
+rollout_mode="async"
+rollout_name="vllm" # sglang or vllm
+if [ "$rollout_mode" = "async" ]; then
+    export VLLM_USE_V1=1
+    return_raw_chat="True"
+fi
+
+train_prompt_bsz=0
+gen_prompt_bsz=1
+n_resp_per_prompt=16
+train_prompt_mini_bsz=32
+total_rollout_steps=$(((512*400)))
+test_freq=10
+staleness_threshold=0
+trigger_parameter_sync_step=16
+partial_rollout=False
+
+
+python -m recipe.fully_async_policy.fully_async_main \
+	train_batch_size=${train_prompt_bsz} \
+    data.gen_batch_size=${gen_prompt_bsz} \
+    data.return_raw_chat=${return_raw_chat} \
+    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
+    actor_rollout_ref.actor.strategy=fsdp2 \
+    critic.strategy=fsdp2 \
+    actor_rollout_ref.hybrid_engine=False \
+    actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.rollout.name=${rollout_name} \
+    actor_rollout_ref.rollout.mode=${rollout_mode} \
+    actor_rollout_ref.rollout.calculate_log_probs=True \
+    trainer.nnodes="${NNODES_TRAIN}" \
+    trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
+    rollout.nnodes="${NNODES_ROLLOUT}" \
+    rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \
+    rollout.total_rollout_steps="${total_rollout_steps}" \
+    rollout.test_freq="${test_freq}" \
+    async_training.staleness_threshold="${staleness_threshold}" \
+    async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \
+    async_training.partial_rollout="${partial_rollout}"
+```
+
+## 实验
+
+### 在7B模型上进行异步训练
+
+* 机器：H20
+* 模型：Qwen2.5-Math-7B
+* rollout长度：max_response_length FSDP2: 28K tokens;
+* 算法：DAPO
+* 数据集： TRAIN_FILE: dapo-math-17k.parquet TEST_FILE: aime-2024.parquet
+* engine: vllm+FSDP2
+* rollout.n: 16
+* ppo_mini_batch_size: 32
+* test_freq: 20
+
+* colocate sync:
+    * step: 400
+    * train_batch_size: 512
+
+* fully_async_policy
+    * total_rollout_steps: 512*400
+    * require_batches: 4
+    * trigger_parameter_sync_step: 4
+    * staleness_threshold: 0.3
+    * partial_rollout: True
+
+| training mode      | Resource allocation | step | generate_sequences | old_log_prob | update_actor | total time | acc/best@32/mean |
+|--------------------|---------------------|------|--------------------|--------------|--------------|------------|------------------|
+| colocate sync      | 32                  |      |                    |              |              |            |                  |
+| fully_async_policy | 16:16               |      |                    |              |              |            |                  |
+| colocate sync      | 64                  |      |                    |              |              |            |                  |
+| fully_async_policy | 32:32               |      |                    |              |              |            |                  |
+| colocate sync      | 128                 |      |                    |              |              |            |                  |
+| fully_async_policy | 64:64               |      |                    |              |              |            |                  |
+
+> https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg
+
+### 30B模型模式实验
+
+* 机器: H20
+* 模型：Qwen2.5-32B
+* rollout长度：max_response_length FSDP2: 20K tokens;
+* 算法：DAPO
+* engine: vllm+FSDP2
+* rollout.n: 16
+* ppo_mini_batch_size: 32
+* test_freq: 20
+
+* colacate sync:
+    * step:200
+    * train_batch_size: 512
+
+* fully_async_policy
+    * total_rollout_steps: 512*200
+    * trigger_parameter_sync_step: 512/32 = 16
+    * staleness_threshold: 0
+    * partial_rollout: False
+
+| training mode      | Resource allocation | mode                                         | step | generate_sequences | old_log_prob | update_actor | total time | acc/best@32/mean |
+|--------------------|---------------------|----------------------------------------------|------|--------------------|--------------|--------------|------------|------------------|
+| colocate sync      | 128                 |                                              |      |                    |              |              |            |                  |
+| fully_async_policy | 64:64               | stream off policy pipeline                   |      |                    |              |              |            |                  |
+| fully_async_policy | 64:64               | async stream pipeline with staleness samples |      |                    |              |              |            |                  |
+| fully_async_policy | 64:64               | async stream pipeline with partial rollout   |      |                    |              |              |            |                  |
+
+### 128卡  require_batches 消融实验
+
+### 128卡 stale 消融实验
+
+## 后续计划
+
+* GRPO实验
+* megatron 适配
+* sglang 集成
+* transfer queue 集成 
+* 异步参数同步
+* Areal异步算法实现
+* TPPO算法实现
+* 多轮及Tool的支持
\ No newline at end of file

From f658643036cf6016538a1a185727560e92e10738 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Tue, 14 Oct 2025 11:49:19 +0800
Subject: [PATCH 166/182] update README_zh.md

---
 recipe/fully_async_policy/README_zh.md | 96 +++++++++++++++-----------
 1 file changed, 56 insertions(+), 40 deletions(-)

diff --git a/recipe/fully_async_policy/README_zh.md b/recipe/fully_async_policy/README_zh.md
index 6f88c8187be..8d7ce356335 100644
--- a/recipe/fully_async_policy/README_zh.md
+++ b/recipe/fully_async_policy/README_zh.md
@@ -14,8 +14,7 @@ rollout和train分离架构相较于colocate的架构能够更加灵活地分配
 one_step_off_policy通过分离架构的设计并进行rollout和train一轮异步的训练方法，缓解了rollout时间过长的问题，并在训练效率上取得了一些收益，
 但其强制使用一轮异步的数据，存在不够灵活等问题，而且并不能完全去除长尾对训练效率带来的的影响；在其他框架如areal、Magistral、streamrl、asyncflow上，
 已经基于分离架构实现了异步训练、流式训练，并取得了收益；我们借鉴其方法，在verl上进行了实现。fully_async_policy支持异步、流式、partial
-rollout的训练，
-通过合理设置资源分配情况、参数同步频率等参数，fully_async_policy能够显著提高训练效率。
+rollout的训练， 通过合理设置资源分配情况、参数同步频率等参数，fully_async_policy能够显著提高训练效率。
 
 > Magistral https://arxiv.org/abs/2506.10910
 >
@@ -30,13 +29,13 @@ rollout的训练，
 
 ### 核心贡献
 
-* 资源隔离：与使用hybrid_engine不同，Rollouter和Trainer使用分离的计算资源，需要分别指定所占用的资源。
-* 生成与训练并行：Trainer在训练的同时，Rollouter在生成新的样本。
-* 多步异步: 相比 one step off policy 支持0.x步到多步的异步设定，异步方案更加灵活。
-* nccl参数同步：使用nccl通信原语进行Rollouter与Trainer参数的通信。
-* Stream推理与训练：Rollouter逐样本生成数据，同时数据传输以单个sample为最小传输单位。
-* 异步训练与新鲜度控制：通过设置参数async_training.staleness_threshold，支持使用旧参数生成的样本进行训练。
-* PartialRollout: Rollouter推理过程支持partial rollout逻辑，通过参数同步时，添加sleep()和resume()
+* **资源隔离**：与使用hybrid_engine不同，Rollouter和Trainer使用分离的计算资源，需要分别指定所占用的资源。
+* **生成与训练并行**：Trainer在训练的同时，Rollouter在生成新的样本。
+* **多步异步**: 相比 one step off policy 支持0.x步到多步的异步设定，异步方案更加灵活。
+* **nccl参数同步**：使用nccl通信原语进行Rollouter与Trainer参数的通信。
+* **Stream推理与训练**：Rollouter逐样本生成数据，同时数据传输以单个sample为最小传输单位。
+* **异步训练与新鲜度控制**：通过设置参数async_training.staleness_threshold，支持使用旧参数生成的样本进行训练。
+* **PartialRollout**: Rollouter推理过程支持partial rollout逻辑，通过参数同步时，添加`sleep()`和`resume()`
   逻辑，保存进行中的rollout的样本，并在下一次rollout中继续使用，减少参数同步等待进行中的任务结束时间。
 
 目前支持使用模式为 fsdp+vllm。vllm必须使用基于AgentLoop的server模式。
@@ -50,11 +49,13 @@ https://github.com/ArronHZG/verl-community/blob/recipe/async_policy/docs/fully_a
 
 1. Rollouter逐样本生成序列，并将生成的sample放入MessageQueue中，生产的速度受新鲜度控制。
 2. MessageQueue用于暂存Rollouter生成的sample。
-3. Trainer逐样本从MessageQueue中获取，获取到require_batches*
-   ppo_mini_batch_size数量的样本后，就会进行训练，训练async_training.trigger_parameter_sync_step轮后，触发与Rollouter的一次参数同步。
+3. Trainer逐样本从MessageQueue中获取，获取到`require_batches*ppo_mini_batch_size`
+   数量的样本后，就会进行训练，训练async_training.trigger_parameter_sync_step轮后，触发与Rollouter的一次参数同步。
 4. ParameterSynchronizer 实现了Nccl的同步参数同步能力。
 
-当前方案对比base的收益来源，在于colocate情况下，rollout使用更多的资源无法解决长尾样本带来的空闲，当我们进行资源隔离后，rollout的时间和train的时间都可能相较于之前更长（因为使用的资源变少了），但是相互之间的耗时overlap，端到端的耗时反而有所缩减。
+当前方案对比base的收益来源，在于colocate情况下，rollout使用更多的资源无法解决长尾样本带来的空闲，
+当我们进行资源隔离后，rollout的时间和train的时间都可能相较于之前更长（因为使用的资源变少了），
+但是相互之间的耗时overlap，端到端的耗时反而有所缩减。
 
 ![fully_async_policy_revenue](
 https://github.com/ArronHZG/verl-community/blob/recipe/async_policy/docs/fully_async_policy_revenue.svg?raw=true)
@@ -65,14 +66,14 @@ https://github.com/ArronHZG/verl-community/blob/recipe/async_policy/docs/fully_a
 
 | super params                                  | implication                                                     |
 |-----------------------------------------------|-----------------------------------------------------------------|
-| `trainer.nnodes`                              | 表示Trainer的node数量                                                |
-| `trainer.n_gpus_per_node`                     | 表示Trainer每个node上gpu的数量                                          |
-| `rollout.nnodes`                              | 表示Rollouter的node数量                                              |
-| `rollout.n_gpus_per_node`                     | 表示Rollouter每个node上gpu的数量                                        |
+| `trainer.nnodes`                              | Trainer的node数量                                                  |
+| `trainer.n_gpus_per_node`                     | Trainer每个node上gpu的数量                                            |
+| `rollout.nnodes`                              | Rollouter的node数量                                                |
+| `rollout.n_gpus_per_node`                     | Rollouter每个node上gpu的数量                                          |
 | `data.train_batch_size`                       | 在fully async策略中，该值不生效（默认设置为0）                                   |
 | `data.gen_batch_size`                         | 在fully async策略中，使用流式的样本生产逻辑（默认设置为1)                             |
 | `rollout.total_rollout_steps`                 | 总的rollout的sample数量                                              |
-| `rollout.test_freq`                           | 表示Rollouter每更新多少次参数，进行一次validation                              |
+| `rollout.test_freq`                           | Rollouter每更新多少次参数，进行一次validation                                |
 | `actor_rollout_ref.actor.ppo_mini_batch_size` | The ppo_mini_batch_size is a global num across all workers/gpus |
 | `async_training.require_batches`              | FullyAsyncTrainer一次性获取的ppo_mini_batch_size的数量                   |
 | `async_training.trigger_parameter_sync_step`  | 表示FullyAsyncTrainer进行多少次本地更新后,进行一次参数同步                          |
@@ -80,44 +81,45 @@ https://github.com/ArronHZG/verl-community/blob/recipe/async_policy/docs/fully_a
 | `async_training.partial_rollout`              | 是否进行partial_rollout                                             |
 | `async_training.use_rollout_log_probs`        | 使用rollout产生的log_probs                                           |
 
-进一步的解释：
+**进一步的解释：**
 
 `rollout.total_rollout_steps`
 
-rollout.total_rollout_steps = data.train_batch_size * step
+与 colocate 相比，数量可以通过 train_batch_size 与 step 相乘对齐: rollout.total_rollout_steps = data.train_batch_size *
+step。
 
 `async_training.trigger_parameter_sync_step`
 
-在fully async策略中，表示Trainer进行多少次本地更新后（也就是获取多少次require_batches
-*ppo_mini_batch_size数量样本），与Rollouter之间进行一次参数同步。
-每两次Rollouter和Trainer参数同步之间，Trainer将会处理trigger_parameter_sync_step*require_batches*
+在fully async策略中，表示Trainer进行多少次本地更新后（也就是获取多少次require_batches\* ppo_mini_batch_size数量样本），
+与Rollouter之间进行一次参数同步。
+每两次Rollouter和Trainer参数同步之间，Trainer将会处理trigger_parameter_sync_step\* require_batches\*
 ppo_mini_batch_size份sample。
-如果为了与colocate比较，在公平的情况下对比速度，trigger_parameter_sync_step应该设置为 data.train_batch_size / (
-require_batches * ppo_mini_batch_size)。
+如果为了与colocate在公平的情况下对比速度，trigger_parameter_sync_step应该设置为 data.train_batch_size / (
+require_batches \* ppo_mini_batch_size)。
 
 `async_training.staleness_threshold`
 
 在fully async策略中，表示最大允许使用的staleness样本的比例。
-staleness_threshold=0，表示同步训练。
-Rollouter两次参数更新之间将会生成固定数量的样本，样本数为：
 
-$$$rollout\_num = (trigger\_parameter\_sync\_step*require\_batches*ppo\_mini\_batch\_size)$$$
-
-staleness_threshold>0，表示异步训练， 可以设置为小数，支持更灵活的异步调用。
-
-Rollouter两次参数更新之间将会最多生成的样本数为：
-
-$$$rollout\_num = (1+staleness\_threshold)*(trigger\_parameter\_sync\_step*require\_batches*ppo\_mini\_batch\_size) - num\_staleness\_sample $$$
+* staleness_threshold=0，表示同步训练。
+  Rollouter两次参数更新之间将会生成固定数量的样本，样本数为：
+  $$rollout\_num = (trigger\_parameter\_sync\_step*require\_batches*ppo\_mini\_batch\_size)$$
+* staleness_threshold>0，表示异步训练， 可以设置为小数，支持更灵活的异步调用。
+  Rollouter两次参数更新之间将会最多生成的样本数为：
+  $$rollout\_num = (1+staleness\_threshold)*(trigger\_parameter\_sync\_step*require\_batches*ppo\_mini\_batch\_size) - num\_staleness\_sample $$
 
 num_staleness_sample 表示上一次rollout多生成的陈旧样本数。
+
 由于是流式系统，rollout持续生成，trainer持续消费。如果rollouter较慢，trainer会更早触发参数同步，rollouter并不会实际生产rollout_num个样本。
 当rollout 足够快时，staleness_threshold设置为1，基本上等价于one_step_off policy。
 为了避免过期样本太多影响训练精度，建议该值设置小于1。
 
 `async_training.partial_rollout`
+
 partial_rollout只会在staleness_threshold>0时才实际上起作用。
 
 `async_training.use_rollout_log_probs`
+
 在强化学习算法中，log_probs与参数版本，token都存在隐性的相关性。由于PPO/GRPO/DAPO等算法的设定，我们在计算重要性采样时，
 即 old_log_prob必须使用rollout参数及token所对应log_probs，才能保证算法的正确性。在fully
 async策略中，我们默认old_log_prob是有rollout所计算的，而不是由trainer所计算。
@@ -144,15 +146,17 @@ async策略中，我们默认old_log_prob是有rollout所计算的，而不是
 3. async stream pipeline with staleness samples:
     1. trigger_parameter_sync_step>=1，staleness_threshold>0，partial_rollout=Flase。
     2. Rollouter在每次参数更新后将计划最多生产rollout_num个样本（实际根据rollout速度，生成的样本可能会少与这个值）。
-    3.
-   如果rollout过程比较快，Rollouter将会在参数同步前额外生成一部分样本num_stale_samples，用于参数同步后立即给Trainer使用，如图c所示。触发参数同步时，如果Rollouter有正在生产的任务，将会等待任务完成，同时不会添加新的任务；
+    3. 如果rollout过程比较快，Rollouter将会在参数同步前额外生成一部分样本num_stale_samples，用于参数同步后立即给Trainer使用。
+       触发参数同步时，如果Rollouter有正在生产的任务，将会等待任务完成，同时不会添加新的任务；
     4. 相较于b，除第一次step训练外，后续的训练都不会有wait first batch rollout finish的时间，但是会有wait active task
        finish的时间。
+    5. 如图c所示；
 
 4. async stream pipeline with partial rollout:
     1. trigger_parameter_sync_step>=1，staleness_threshold>0，partial_rollout=True。
     2. 相较于c，触发参数同步时，Rollouter如果有正在生产的sample，会打断rollout过程并进行参数同步，被中断的sample会在参数同步后继续生成。减少了wait
        active task finish的时间。
+    3. 如图d所示；
 
 ![fully_async_policy_mode](
 https://github.com/ArronHZG/verl-community/blob/recipe/async_policy/docs/fully_async_policy_mode.svg?raw=true)
@@ -178,12 +182,11 @@ https://github.com/ArronHZG/verl-community/blob/recipe/async_policy/docs/fully_a
 * 关键参数：
     * staleness_threshold: 设置太大会导致较多的旧样本使用，影响模型效果，建议设置小于1。
     * require_batches：越接近1，越接近纯流式过程，训练过程中bubble越小，能够在速度上获得更快的加速效果，但会对样本的处理顺序产生影响；
-    * trigger_parameter_sync_step: 设置的越小越接近on
-      policy但会导致频繁的参数同步，同时server模式，长尾样本浪费的资源无法被短样本填充，资源利用率低。设置的越大有更高的计算效率，但是精度上会受到off
-      policy的影响。
+    * trigger_parameter_sync_step: 设置的越小越接近on policy，但会导致频繁的参数同步，长尾样本浪费的资源无法被短样本填充，资源利用率低。
+      设置的越大有更高的计算效率，但是精度上会受到off policy的影响。
     * rollout.test_freq: 会占用Rollouter资源，不建议设置太小。
 
-* 模式选择：正如[模式支持]章节介绍，通过调整不同的参数，Fully Async架构支持不同程度上的优化加速，适用于不同场景的任务。
+* 模式选择：通过调整不同的参数，Fully Async架构支持不同程度上的优化加速，适用于不同场景的任务。
     * 对于小规模任务，需要保证训练的稳定性和 on-policy 性，对速度要求不高的场景，可以尝试使用on policy pipeline的模式（模式1）。
     * 对于需要提高训练吞吐量，但对 staleness 敏感的场景，可以尝试使用 stream off policy pipeline 的模式。即通过
       设置trigger_parameter_sync_step>1 ，提高 训练效率，但仍保持同步机制 (staleness_threshold=0 )（模式2）。
@@ -302,14 +305,27 @@ python -m recipe.fully_async_policy.fully_async_main \
 
 ### 128卡  require_batches 消融实验
 
+| training mode      | Resource allocation | require_size | step | generate_sequences | old_log_prob | update_actor | total time | acc/best@32/mean |
+|--------------------|---------------------|--------------|------|--------------------|--------------|--------------|------------|------------------|
+| fully_async_policy | 64:64               | 1            |      |                    |              |              |            |                  |
+| fully_async_policy | 64:64               | 2            |      |                    |              |              |            |                  |
+| fully_async_policy | 64:64               | 4            |      |                    |              |              |            |                  |
+
 ### 128卡 stale 消融实验
 
+| training mode      | Resource allocation | staleness | step | generate_sequences | old_log_prob | update_actor | total time | acc/best@32/mean |
+|--------------------|---------------------|-----------|------|--------------------|--------------|--------------|------------|------------------|
+| fully_async_policy | 64:64               | 0         |      |                    |              |              |            |                  |
+| fully_async_policy | 64:64               | 0.1       |      |                    |              |              |            |                  |
+| fully_async_policy | 64:64               | 0.3       |      |                    |              |              |            |                  |
+| fully_async_policy | 64:64               | 0.5       |      |                    |              |              |            |                  |
+
 ## 后续计划
 
 * GRPO实验
 * megatron 适配
 * sglang 集成
-* transfer queue 集成 
+* transfer queue 集成
 * 异步参数同步
 * Areal异步算法实现
 * TPPO算法实现

From 1a3759e30e7d390d803b0db1a9f1dcee579072ff Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Tue, 14 Oct 2025 11:50:57 +0800
Subject: [PATCH 167/182] update README_zh.md

---
 recipe/fully_async_policy/README_zh.md | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/recipe/fully_async_policy/README_zh.md b/recipe/fully_async_policy/README_zh.md
index 8d7ce356335..d1080d584a4 100644
--- a/recipe/fully_async_policy/README_zh.md
+++ b/recipe/fully_async_policy/README_zh.md
@@ -273,7 +273,7 @@ python -m recipe.fully_async_policy.fully_async_main \
 | colocate sync      | 128                 |      |                    |              |              |            |                  |
 | fully_async_policy | 64:64               |      |                    |              |              |            |                  |
 
-> https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg
+>source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg
 
 ### 30B模型模式实验
 
@@ -303,6 +303,8 @@ python -m recipe.fully_async_policy.fully_async_main \
 | fully_async_policy | 64:64               | async stream pipeline with staleness samples |      |                    |              |              |            |                  |
 | fully_async_policy | 64:64               | async stream pipeline with partial rollout   |      |                    |              |              |            |                  |
 
+>source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg
+
 ### 128卡  require_batches 消融实验
 
 | training mode      | Resource allocation | require_size | step | generate_sequences | old_log_prob | update_actor | total time | acc/best@32/mean |
@@ -311,6 +313,8 @@ python -m recipe.fully_async_policy.fully_async_main \
 | fully_async_policy | 64:64               | 2            |      |                    |              |              |            |                  |
 | fully_async_policy | 64:64               | 4            |      |                    |              |              |            |                  |
 
+>source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg
+
 ### 128卡 stale 消融实验
 
 | training mode      | Resource allocation | staleness | step | generate_sequences | old_log_prob | update_actor | total time | acc/best@32/mean |
@@ -320,6 +324,8 @@ python -m recipe.fully_async_policy.fully_async_main \
 | fully_async_policy | 64:64               | 0.3       |      |                    |              |              |            |                  |
 | fully_async_policy | 64:64               | 0.5       |      |                    |              |              |            |                  |
 
+>source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg
+
 ## 后续计划
 
 * GRPO实验

From ed73079fb67352d20ab0bf6b9bceffea611d7350 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Tue, 14 Oct 2025 18:56:25 +0800
Subject: [PATCH 168/182] update README

---
 recipe/fully_async_policy/README_zh.md | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/recipe/fully_async_policy/README_zh.md b/recipe/fully_async_policy/README_zh.md
index d1080d584a4..7d385e03abd 100644
--- a/recipe/fully_async_policy/README_zh.md
+++ b/recipe/fully_async_policy/README_zh.md
@@ -176,8 +176,10 @@ https://github.com/ArronHZG/verl-community/blob/recipe/async_policy/docs/fully_a
 ### 调参建议
 
 * 资源分配与调整:
-    *
-  合理的资源分配是获得好的训练效率的前提。理想的资源分配情况应该是使得Rollout的时间和Train的时间接近，从而使得整个训练过程流水气泡最小，避免资源闲置，同时Trainer不会使用旧样本。在真实训练场景下，可以根据实际训练过程中rollout和train的空闲时间调整资源分配，可从rollouter/idle_ratio和trainer/idle_ratio获得，如果rollouter/idle_ratio较高trainer/idle_ratio较低，应该增多Trainer的资源减少Rollouter的资源，反之亦然。
+    * 合理的资源分配是获得好的训练效率的前提。理想的资源分配情况应该是使得Rollout的时间和Train的时间接近，从而使得整个训练过程流水气泡最小，
+      避免资源闲置，同时Trainer不会使用旧样本。在真实训练场景下，可以根据实际训练过程中rollout和train的空闲时间调整资源分配，
+      可从rollouter/idle_ratio和trainer/idle_ratio获得，如果rollouter/idle_ratio较高trainer/idle_ratio较低，
+      应该增多Trainer的资源减少Rollouter的资源，反之亦然。
 
 * 关键参数：
     * staleness_threshold: 设置太大会导致较多的旧样本使用，影响模型效果，建议设置小于1。
@@ -273,7 +275,7 @@ python -m recipe.fully_async_policy.fully_async_main \
 | colocate sync      | 128                 |      |                    |              |              |            |                  |
 | fully_async_policy | 64:64               |      |                    |              |              |            |                  |
 
->source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg
+> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg
 
 ### 30B模型模式实验
 
@@ -303,7 +305,7 @@ python -m recipe.fully_async_policy.fully_async_main \
 | fully_async_policy | 64:64               | async stream pipeline with staleness samples |      |                    |              |              |            |                  |
 | fully_async_policy | 64:64               | async stream pipeline with partial rollout   |      |                    |              |              |            |                  |
 
->source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg
+> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg
 
 ### 128卡  require_batches 消融实验
 
@@ -313,7 +315,7 @@ python -m recipe.fully_async_policy.fully_async_main \
 | fully_async_policy | 64:64               | 2            |      |                    |              |              |            |                  |
 | fully_async_policy | 64:64               | 4            |      |                    |              |              |            |                  |
 
->source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg
+> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg
 
 ### 128卡 stale 消融实验
 
@@ -324,7 +326,7 @@ python -m recipe.fully_async_policy.fully_async_main \
 | fully_async_policy | 64:64               | 0.3       |      |                    |              |              |            |                  |
 | fully_async_policy | 64:64               | 0.5       |      |                    |              |              |            |                  |
 
->source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg
+> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg
 
 ## 后续计划
 

From ad595f719d8ec897672bf5bbd8945d678165085d Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Tue, 14 Oct 2025 21:05:58 +0800
Subject: [PATCH 169/182] add ci

---
 .github/workflows/e2e_fully_async_policy.yml | 149 +++++++++++++++++++
 docs/advance/fully_async.md                  |   0
 docs/index.rst                               |   1 +
 tests/special_e2e/run_fully_async_policy.sh  |   8 +-
 4 files changed, 154 insertions(+), 4 deletions(-)
 create mode 100644 .github/workflows/e2e_fully_async_policy.yml
 create mode 100644 docs/advance/fully_async.md

diff --git a/.github/workflows/e2e_fully_async_policy.yml b/.github/workflows/e2e_fully_async_policy.yml
new file mode 100644
index 00000000000..e2cf0d1c061
--- /dev/null
+++ b/.github/workflows/e2e_fully_async_policy.yml
@@ -0,0 +1,149 @@
+# # Tests layout
+
+# Each folder under tests/ corresponds to a test category for a sub-namespace in verl. For instance:
+# - `tests/trainer` for testing functionality related to `verl/trainer`
+# - `tests/models` for testing functionality related to `verl/models`
+# - ...
+
+# There are a few folders with `special_` prefix, created for special purposes:
+# - `special_distributed`: unit tests that must run with multiple GPUs
+# - `special_e2e`: end-to-end tests with training/generation scripts
+# - `special_npu`: tests for NPUs
+# - `special_sanity`: a suite of quick sanity tests
+# - `special_standalone`: a set of test that are designed to run in dedicated environments
+
+# Accelerators for tests
+# - By default tests are run with GPU available, except for the ones under `special_npu`, and any test script whose name ends with `on_cpu.py`.
+# - For test scripts with `on_cpu.py` name suffix would be tested on CPU resources in linux environment.
+
+# # Workflow layout
+
+# All CI tests are configured by yaml files in `.github/workflows/`. Here's an overview of all test configs:
+# 1. A list of always triggered CPU sanity tests: `check-pr-title.yml`, `secrets_scan.yml`, `check-pr-title,yml`, `pre-commit.yml`, `doc.yml`
+# 2. Some heavy multi-GPU unit tests, such as `model.yml`, `vllm.yml`, `sgl.yml`
+# 3. End-to-end tests: `e2e_*.yml`
+# 4. Unit tests
+#   - `cpu_unit_tests.yml`, run pytest on all scripts with file name pattern `tests/**/test_*_on_cpu.py`
+#   - `gpu_unit_tests.yml`, run pytest on all scripts with file without the `on_cpu.py` suffix.
+#   - Since cpu/gpu unit tests by default runs all tests under `tests`, please make sure tests are manually excluded in them when
+#     - new workflow yaml is added to `.github/workflows`
+#     - new tests are added to workflow mentioned in 2.
+
+
+name: e2e_fully_async_policy
+
+on:
+  # Trigger the workflow on push or pull request,
+  # but only for the main branch
+  # For push, for now only anti-patterns are specified so it is more conservative
+  # and achieves higher coverage.
+  push:
+    branches:
+      - main
+      - v0.*
+    paths:
+      - "**/*.py"
+      - "!**/*.md"
+      - "!**/*.sh"
+      # Other entrypoints
+      - "!examples/*trainer*"
+      - "!tests/**"
+      - "!verl/trainer/main_*.py"
+      - "!verl/trainer/fsdp_sft_trainer.py"
+      - "!recipe/**"
+      - "recipe/fully_async_policy"
+  pull_request:
+    branches:
+      - main
+      - v0.*
+    paths:
+      - "**/*.py"
+      - "!**/*.md"
+      - "!**/*.sh"
+      # Other entrypoints
+      - "!examples/**"
+      - "!tests/**"
+      - "!verl/trainer/main_*.py"
+      - "!verl/trainer/fsdp_sft_trainer.py"
+      # Other recipes
+      - "!recipe/**"
+      # Home
+      - "recipe/fully_async_policy"
+      # Entrypoints
+      - ".github/workflows/e2e_fully_async_policy.yml"
+      - "examples/data_preprocess/gsm8k.py"
+      - "tests/special_e2e/run_fully_async_policy.sh"
+
+# Cancel jobs on the same ref if a new one is triggered
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
+
+# Declare permissions just read content.
+permissions:
+  contents: read
+
+env:
+  IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:app-verl0.5-transformers4.55.4-vllm0.10.0-mcore0.13.0-te2.2"
+  DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner"
+  TRANSFORMERS_VERSION: "4.56.2"
+
+jobs:
+  setup:
+    if: github.repository_owner == 'volcengine'
+    runs-on: ubuntu-latest
+    outputs:
+      runner-label: ${{ steps.create-runner.outputs.runner-label }}
+      mlp-task-id: ${{ steps.create-runner.outputs.mlp-task-id }}
+    steps:
+      - uses: actions/checkout@v4
+      - id: create-runner
+        uses: volcengine/vemlp-github-runner@v1
+        with:
+          mode: "create"
+          faas-url: "${{ env.DYNAMIC_RUNNER_ENDPOINT }}"
+          mlp-image: "${{ env.IMAGE }}"
+
+  # Test FSDP2 strategy
+  e2e_fully_async_policy_fsdp2:
+    needs: setup
+    runs-on: [ "${{ needs.setup.outputs.runner-label || 'L20x8' }}" ]
+    timeout-minutes: 10 # Increase timeout for async training
+    env:
+      HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
+      HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
+      NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
+      HF_ENDPOINT: "https://hf-mirror.com"
+      HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
+      ACTOR_STRATEGY: "fsdp2"
+    steps:
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          fetch-depth: 0
+      - name: Install the current repository
+        run: |
+          pip3 install --no-deps -e .[test,gpu]
+          pip3 install transformers==$TRANSFORMERS_VERSION
+      - name: Prepare GSM8K dataset
+        run: |
+          python3 examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/models/hf_data/gsm8k
+      - name: Running the E2E test with fully_async_policy algorithm (FSDP2)
+        run: |
+          ray stop --force
+          bash tests/special_e2e/run_fully_async_policy.sh
+
+  cleanup:
+    runs-on: ubuntu-latest
+    needs:
+      [
+        setup,
+        e2e_fully_async_policy_fsdp2
+      ]
+    if: always()
+    steps:
+      - id: destroy-runner
+        uses: volcengine/vemlp-github-runner@v1
+        with:
+          mode: "destroy"
+          faas-url: "${{ env.DYNAMIC_RUNNER_ENDPOINT }}"
+          mlp-task-id: "${{ needs.setup.outputs.mlp-task-id }}"
\ No newline at end of file
diff --git a/docs/advance/fully_async.md b/docs/advance/fully_async.md
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/docs/index.rst b/docs/index.rst
index 68e37545dba..e8467dc965a 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -124,6 +124,7 @@ verl is fast with:
    advance/rollout_is_migration.md
    advance/one_step_off
    advance/agent_loop
+   advance/fully_async
 
 .. toctree::
    :maxdepth: 1
diff --git a/tests/special_e2e/run_fully_async_policy.sh b/tests/special_e2e/run_fully_async_policy.sh
index 096cb05c7a1..a2f99f0d67b 100644
--- a/tests/special_e2e/run_fully_async_policy.sh
+++ b/tests/special_e2e/run_fully_async_policy.sh
@@ -55,11 +55,11 @@ n_gpus_training=4
 train_prompt_bsz=0
 gen_prompt_bsz=1
 n_resp_per_prompt=16
-train_prompt_mini_bsz=32
-total_rollout_steps=$(((128*2)))
-test_freq=10
+train_prompt_mini_bsz=16
+total_rollout_steps=$(((128)))
+test_freq=-1
 staleness_threshold=0.1
-trigger_parameter_sync_step=16
+trigger_parameter_sync_step=4
 partial_rollout=True
 
 exp_name="$(basename "${MODEL_ID,,}")-fully-async-policy-${ACTOR_STRATEGY}-minimal"

From 1f51b0dfaea9105bf365d2bf07b7d48b23717a7f Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Tue, 14 Oct 2025 21:32:58 +0800
Subject: [PATCH 170/182] update readme

---
 recipe/fully_async_policy/README_zh.md | 143 ++++++++++++++-----------
 verl/trainer/ppo/ray_trainer.py        |   2 -
 2 files changed, 83 insertions(+), 62 deletions(-)

diff --git a/recipe/fully_async_policy/README_zh.md b/recipe/fully_async_policy/README_zh.md
index 7d385e03abd..040dfe47dd1 100644
--- a/recipe/fully_async_policy/README_zh.md
+++ b/recipe/fully_async_policy/README_zh.md
@@ -83,68 +83,75 @@ https://github.com/ArronHZG/verl-community/blob/recipe/async_policy/docs/fully_a
 
 **进一步的解释：**
 
-`rollout.total_rollout_steps`
+* `rollout.total_rollout_steps`
 
-与 colocate 相比，数量可以通过 train_batch_size 与 step 相乘对齐: rollout.total_rollout_steps = data.train_batch_size *
-step。
+  与 colocate 相比，数量可以通过 train_batch_size 与 step 相乘对齐:
+  `rollout.total_rollout_steps = data.train_batch_size * step`。
 
-`async_training.trigger_parameter_sync_step`
+* `async_training.trigger_parameter_sync_step`
 
-在fully async策略中，表示Trainer进行多少次本地更新后（也就是获取多少次require_batches\* ppo_mini_batch_size数量样本），
-与Rollouter之间进行一次参数同步。
-每两次Rollouter和Trainer参数同步之间，Trainer将会处理trigger_parameter_sync_step\* require_batches\*
-ppo_mini_batch_size份sample。
-如果为了与colocate在公平的情况下对比速度，trigger_parameter_sync_step应该设置为 data.train_batch_size / (
-require_batches \* ppo_mini_batch_size)。
+  在fully async策略中，表示Trainer进行多少次本地更新后（也就是获取多少次`require_batches * ppo_mini_batch_size`数量样本），
+  与Rollouter之间进行一次参数同步。
+  每两次Rollouter和Trainer参数同步之间，Trainer将会处理`trigger_parameter_sync_step* require_batches\
+  ppo_mini_batch_size`份sample。
+  如果为了与colocate在公平的情况下对比速度，trigger_parameter_sync_step应该设置为 `data.train_batch_size / (
+  require_batches * ppo_mini_batch_size)`。
 
-`async_training.staleness_threshold`
+* `async_training.staleness_threshold`
 
-在fully async策略中，表示最大允许使用的staleness样本的比例。
+  在fully async策略中，表示最大允许使用的staleness样本的比例。
 
-* staleness_threshold=0，表示同步训练。
-  Rollouter两次参数更新之间将会生成固定数量的样本，样本数为：
-  $$rollout\_num = (trigger\_parameter\_sync\_step*require\_batches*ppo\_mini\_batch\_size)$$
-* staleness_threshold>0，表示异步训练， 可以设置为小数，支持更灵活的异步调用。
-  Rollouter两次参数更新之间将会最多生成的样本数为：
-  $$rollout\_num = (1+staleness\_threshold)*(trigger\_parameter\_sync\_step*require\_batches*ppo\_mini\_batch\_size) - num\_staleness\_sample $$
+    * staleness_threshold=0，表示同步训练。
+      Rollouter两次参数更新之间将会生成固定数量的样本，样本数为：
+      $$rollout\_num = (trigger\_parameter\_sync\_step*require\_batches*ppo\_mini\_batch\_size)$$
+    * staleness_threshold>0，表示异步训练， 可以设置为小数，支持更灵活的异步调用。
+      Rollouter两次参数更新之间将会最多生成的样本数为：
+      $$rollout\_num = (1+staleness\_threshold)*(trigger\_parameter\_sync\_step*require\_batches*ppo\_mini\_batch\_size) - num\_staleness\_sample $$
 
-num_staleness_sample 表示上一次rollout多生成的陈旧样本数。
+  num_staleness_sample 表示上一次rollout多生成的陈旧样本数。
 
-由于是流式系统，rollout持续生成，trainer持续消费。如果rollouter较慢，trainer会更早触发参数同步，rollouter并不会实际生产rollout_num个样本。
-当rollout 足够快时，staleness_threshold设置为1，基本上等价于one_step_off policy。
-为了避免过期样本太多影响训练精度，建议该值设置小于1。
+  由于是流式系统，rollout持续生成，trainer持续消费。如果rollouter较慢，trainer会更早触发参数同步，rollouter并不会实际生产rollout_num个样本。
+  当rollout 足够快时，staleness_threshold设置为1，基本上等价于one_step_off policy。
+  为了避免过期样本太多影响训练精度，建议该值设置小于1。
 
-`async_training.partial_rollout`
+* `async_training.partial_rollout`
 
-partial_rollout只会在staleness_threshold>0时才实际上起作用。
+  partial_rollout只会在staleness_threshold>0时才实际上起作用。
 
-`async_training.use_rollout_log_probs`
+* `async_training.use_rollout_log_probs`
 
-在强化学习算法中，log_probs与参数版本，token都存在隐性的相关性。由于PPO/GRPO/DAPO等算法的设定，我们在计算重要性采样时，
-即 old_log_prob必须使用rollout参数及token所对应log_probs，才能保证算法的正确性。在fully
-async策略中，我们默认old_log_prob是有rollout所计算的，而不是由trainer所计算。
+  在强化学习算法中，log_probs与参数版本，token都存在隐性的相关性。由于PPO/GRPO/DAPO等算法的设定，我们在计算重要性采样时，
+  即 old_log_prob必须使用rollout参数及token所对应log_probs，才能保证算法的正确性。在fully
+  async策略中，我们默认old_log_prob是有rollout所计算的，而不是由trainer所计算。
+
+  * `async_training.require_batches`
+  
+  在流式训练中，require_batches 应该设置为1，表示生产够ppo_mini_batch_size样本后，就进行训练。
+  在实际测试中，我们发现，如果单次下发的样本较少，由于数据分发的顺序，会导致训练不稳定，response 长度变长。
+  在这里，我们额外提供 require_batches 进行流式分发，单次参与训练的样本数量控制。
+  
 
 ### 模式支持
 
 1. on policy pipeline:
-    1. trigger_parameter_sync_step=1，staleness_threshold=0;
-    2. Rollouter一次生产require_batches*
-       ppo_mini_batch_size的samples，Trainer获取这些samples后进行训练，训练完后Trainer和Rollouter之间进行一次参数同步;
+    1. **trigger_parameter_sync_step=1，staleness_threshold=0**
+    2. Rollouter一次生产`require_batches*ppo_mini_batch_size`
+       的samples，Trainer获取这些samples后进行训练，训练完后Trainer和Rollouter之间进行一次参数同步;
     3. 在rollout阶段，如果存在长尾的样本，但是rollout样本数较少时，较短的样本无法填充到空闲的资源中，会造成一定的资源浪费。
     4. 如图a所示；
 
 2. stream off policy pipeline:
-    1. trigger_parameter_sync_step>1，staleness_threshold=0。
-    2. 将会进行同步的流式训练，Rollouter一次生产require_batches*ppo_mini_batch_size*
-       trigger_parameter_sync_step的samples，Trainer每获取require_batches*
-       ppo_mini_batch_size就进行一次本地训练，训练trigger_parameter_sync_step次后，Trainer和Rollouter之间进行一次参数同步;
+    1. **trigger_parameter_sync_step>1，staleness_threshold=0**
+    2. 将会进行同步的流式训练，Rollouter一次生产`require_batches*ppo_mini_batch_size*trigger_parameter_sync_step`
+       的samples，Trainer每获取`require_batches*ppo_mini_batch_size`
+       就进行一次本地训练，训练trigger_parameter_sync_step次后，Trainer和Rollouter之间进行一次参数同步;
     3. 相较于a，由于一次生成的样本更多，资源的空闲会更低。
-    4. 在一次step训练中，会存在两次资源闲置的时间，分别是在第一次获取样本时，train等待require_batches*
-       ppo_mini_batch_size个样本生产，以及最后一次参数更新时，rollout等待训练完成。
+    4. 在一次step训练中，会存在两次资源闲置的时间，分别是在第一次获取样本时，train等待`require_batches*ppo_mini_batch_size`
+       个样本生产，以及最后一次参数更新时，rollout等待训练完成。
     5. 如图b所示；
 
 3. async stream pipeline with staleness samples:
-    1. trigger_parameter_sync_step>=1，staleness_threshold>0，partial_rollout=Flase。
+    1. **trigger_parameter_sync_step>=1，staleness_threshold>0，partial_rollout=Flase**
     2. Rollouter在每次参数更新后将计划最多生产rollout_num个样本（实际根据rollout速度，生成的样本可能会少与这个值）。
     3. 如果rollout过程比较快，Rollouter将会在参数同步前额外生成一部分样本num_stale_samples，用于参数同步后立即给Trainer使用。
        触发参数同步时，如果Rollouter有正在生产的任务，将会等待任务完成，同时不会添加新的任务；
@@ -153,7 +160,7 @@ async策略中，我们默认old_log_prob是有rollout所计算的，而不是
     5. 如图c所示；
 
 4. async stream pipeline with partial rollout:
-    1. trigger_parameter_sync_step>=1，staleness_threshold>0，partial_rollout=True。
+    1. **trigger_parameter_sync_step>=1，staleness_threshold>0，partial_rollout=True**
     2. 相较于c，触发参数同步时，Rollouter如果有正在生产的sample，会打断rollout过程并进行参数同步，被中断的sample会在参数同步后继续生成。减少了wait
        active task finish的时间。
     3. 如图d所示；
@@ -245,6 +252,8 @@ python -m recipe.fully_async_policy.fully_async_main \
 
 ### 在7B模型上进行异步训练
 
+我们使用 Qwen2.5-Math-7B 验证 fully async 策略在长候选下，各个资源的收益。
+
 * 机器：H20
 * 模型：Qwen2.5-Math-7B
 * rollout长度：max_response_length FSDP2: 28K tokens;
@@ -277,6 +286,41 @@ python -m recipe.fully_async_policy.fully_async_main \
 
 > source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg
 
+### 128卡  7B 异步模式实验
+
+我们使用 Qwen2.5-Math-7B 验证 fully async 所支持的各个模型的效果。
+
+| training mode      | Resource allocation | mode                                           | step | generate_sequences | old_log_prob | update_actor | total time | acc/best@32/mean |
+|--------------------|---------------------|------------------------------------------------|------|--------------------|--------------|--------------|------------|------------------|
+| fully_async_policy | 64:64               | `stream off policy pipeline`                   |      |                    |              |              |            |                  |
+| fully_async_policy | 64:64               | `async stream pipeline with staleness samples` |      |                    |              |              |            |                  |
+| fully_async_policy | 64:64               | `async stream pipeline with partial rollout`   |      |                    |              |              |            |                  |
+
+### 128卡 stale 消融实验
+
+在 `async stream pipeline with partial rollout` 模式下，我们验证 staleness 的设置对于训练效率的影响。
+
+| training mode      | Resource allocation | staleness | step | generate_sequences | old_log_prob | update_actor | total time | acc/best@32/mean |
+|--------------------|---------------------|-----------|------|--------------------|--------------|--------------|------------|------------------|
+| fully_async_policy | 64:64               | 0         |      |                    |              |              |            |                  |
+| fully_async_policy | 64:64               | 0.1       |      |                    |              |              |            |                  |
+| fully_async_policy | 64:64               | 0.3       |      |                    |              |              |            |                  |
+| fully_async_policy | 64:64               | 0.5       |      |                    |              |              |            |                  |
+
+> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg
+
+### 128卡  7B require_batches 消融实验
+
+在多次测试下，我们发现流式每次下发样本的数量，会影响训练的结果，我们通过修改 `async_training.require_batches` 验证对与结果的影响。
+
+| training mode      | Resource allocation | async_training.require_batches | step | generate_sequences | old_log_prob | update_actor | total time | acc/best@32/mean |
+|--------------------|---------------------|--------------------------------|------|--------------------|--------------|--------------|------------|------------------|
+| fully_async_policy | 64:64               | 1                              |      |                    |              |              |            |                  |
+| fully_async_policy | 64:64               | 2                              |      |                    |              |              |            |                  |
+| fully_async_policy | 64:64               | 4                              |      |                    |              |              |            |                  |
+
+> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg
+
 ### 30B模型模式实验
 
 * 机器: H20
@@ -307,27 +351,6 @@ python -m recipe.fully_async_policy.fully_async_main \
 
 > source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg
 
-### 128卡  require_batches 消融实验
-
-| training mode      | Resource allocation | require_size | step | generate_sequences | old_log_prob | update_actor | total time | acc/best@32/mean |
-|--------------------|---------------------|--------------|------|--------------------|--------------|--------------|------------|------------------|
-| fully_async_policy | 64:64               | 1            |      |                    |              |              |            |                  |
-| fully_async_policy | 64:64               | 2            |      |                    |              |              |            |                  |
-| fully_async_policy | 64:64               | 4            |      |                    |              |              |            |                  |
-
-> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg
-
-### 128卡 stale 消融实验
-
-| training mode      | Resource allocation | staleness | step | generate_sequences | old_log_prob | update_actor | total time | acc/best@32/mean |
-|--------------------|---------------------|-----------|------|--------------------|--------------|--------------|------------|------------------|
-| fully_async_policy | 64:64               | 0         |      |                    |              |              |            |                  |
-| fully_async_policy | 64:64               | 0.1       |      |                    |              |              |            |                  |
-| fully_async_policy | 64:64               | 0.3       |      |                    |              |              |            |                  |
-| fully_async_policy | 64:64               | 0.5       |      |                    |              |              |            |                  |
-
-> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg
-
 ## 后续计划
 
 * GRPO实验
diff --git a/verl/trainer/ppo/ray_trainer.py b/verl/trainer/ppo/ray_trainer.py
index 4a97158809e..0aae90517e8 100644
--- a/verl/trainer/ppo/ray_trainer.py
+++ b/verl/trainer/ppo/ray_trainer.py
@@ -602,11 +602,9 @@ def _validate(self):
             sample_scores.extend(scores)
 
             reward_extra_infos_dict["reward"].extend(scores)
-            print(f"len reward_extra_infos_dict['reward']: {len(reward_extra_infos_dict['reward'])}")
             if "reward_extra_info" in result:
                 for key, lst in result["reward_extra_info"].items():
                     reward_extra_infos_dict[key].extend(lst)
-                    print(f"len reward_extra_infos_dict['{key}']: {len(reward_extra_infos_dict[key])}")
 
             # collect num_turns of each prompt
             if "__num_turns__" in test_batch.non_tensor_batch:

From ead757a95e70386dcfb01641ce9bbe6abd2da594 Mon Sep 17 00:00:00 2001
From: arron <arron@MBP-JFQXPWR11F-1943.local>
Date: Thu, 16 Oct 2025 15:08:56 +0800
Subject: [PATCH 171/182] fix ci

---
 .../agent_loop/agent_loop.py                  | 38 +++++-----
 .../config/fully_async_ppo_trainer.yaml       |  4 ++
 recipe/fully_async_policy/param_sync.py       |  4 +-
 .../vllm_rollout/vllm_async_server.py         | 51 +++++++-------
 verl/trainer/config/actor/dp_actor.yaml       |  5 +-
 verl/workers/actor/dp_actor.py                | 14 ++--
 .../rollout/vllm_rollout/vllm_async_server.py | 70 +++++++++----------
 7 files changed, 97 insertions(+), 89 deletions(-)

diff --git a/recipe/fully_async_policy/agent_loop/agent_loop.py b/recipe/fully_async_policy/agent_loop/agent_loop.py
index 8dc7bbf609f..55489d705a5 100644
--- a/recipe/fully_async_policy/agent_loop/agent_loop.py
+++ b/recipe/fully_async_policy/agent_loop/agent_loop.py
@@ -70,13 +70,13 @@ class FullyAsyncAgentLoopOutput(AgentLoopOutput):
 @ray.remote
 class FullyAsyncAgentLoopWorker(AgentLoopWorkerBase):
     def __init__(
-            self, config: DictConfig, server_handles: list[ray.actor.ActorHandle], rm_executor: BatchExecutor = None
+        self, config: DictConfig, server_handles: list[ray.actor.ActorHandle], rm_executor: BatchExecutor = None
     ):
         self.server_manager = FullyAsyncLLMServerManager(config, server_handles)
         super().__init__(config, server_handles, rm_executor)
 
     async def generate_sequences_no_post(
-            self, batch: DataProto, partial_output_list: Optional[list[AgentLoopOutput]]
+        self, batch: DataProto, partial_output_list: Optional[list[AgentLoopOutput]]
     ) -> list[AgentLoopOutput]:
         """Generate sequences from agent loop.
 
@@ -126,19 +126,19 @@ async def generate_sequences_no_post(
         return await asyncio.gather(*tasks)
 
     async def _partial_run_agent_loop(
-            self,
-            sampling_params: dict[str, Any],
-            trajectory: dict[str, Any],
-            *,
-            agent_name: str,
-            **kwargs,
+        self,
+        sampling_params: dict[str, Any],
+        trajectory: dict[str, Any],
+        *,
+        agent_name: str,
+        **kwargs,
     ) -> AgentLoopOutput:
         with rollout_trace_attr(
-                step=trajectory["step"],
-                sample_index=trajectory["sample_index"],
-                rollout_n=trajectory["rollout_n"],
-                validate=trajectory["validate"],
-                name="agent_loop",
+            step=trajectory["step"],
+            sample_index=trajectory["sample_index"],
+            rollout_n=trajectory["rollout_n"],
+            validate=trajectory["validate"],
+            name="agent_loop",
         ):
             assert agent_name in _agent_loop_registry, (
                 f"Agent loop {agent_name} not registered, registered agent loops: {_agent_loop_registry.keys()}"
@@ -215,8 +215,10 @@ async def _initialize_llm_servers_async(self):
         model_config = self.config.actor_rollout_ref.model
         self.rollout_replicas = [
             self.rollout_replica_class(
-                replica_rank=replica_rank, config=rollout_config,
-                model_config=model_config, gpus_per_node=self.config.trainer.n_gpus_per_node
+                replica_rank=replica_rank,
+                config=rollout_config,
+                model_config=model_config,
+                gpus_per_node=self.config.trainer.n_gpus_per_node,
             )
             for replica_rank in range(num_replicas)
         ]
@@ -230,9 +232,9 @@ async def _initialize_llm_servers_async(self):
         self.server_addresses = [server._server_address for server in self.rollout_replicas]
 
     async def generate_single_sample_async(
-            self,
-            sample: DataProto,
-            partial_output_list: Optional[list[AgentLoopOutput]],
+        self,
+        sample: DataProto,
+        partial_output_list: Optional[list[AgentLoopOutput]],
     ) -> list[AgentLoopOutput]:
         """
         Asynchronously process a single sample
diff --git a/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml b/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml
index 84a3cb7c290..17c3b925476 100644
--- a/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml
+++ b/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml
@@ -48,3 +48,7 @@ rollout:
 data:
   # Number of samples generated, currently only support 1
   gen_batch_size: 1
+
+actor:
+  # Whether to use rollout log probs for training
+  use_rollout_log_probs: ${oc.select:async_training.use_rollout_log_probs, True}
diff --git a/recipe/fully_async_policy/param_sync.py b/recipe/fully_async_policy/param_sync.py
index d6c67ceb409..2fdcbb919db 100644
--- a/recipe/fully_async_policy/param_sync.py
+++ b/recipe/fully_async_policy/param_sync.py
@@ -18,6 +18,8 @@
 import ray
 from ray.util.collective import collective
 
+from verl.utils.device import get_nccl_backend
+
 logger = logging.getLogger(__name__)
 
 
@@ -69,7 +71,7 @@ def _init_sync_group(self):
             actor_rollout_workers,
             len(actor_rollout_workers),
             list(range(0, len(actor_rollout_workers))),
-            backend="nccl",
+            backend=get_nccl_backend(),
             group_name=self.sync_group_name,
         )
 
diff --git a/recipe/fully_async_policy/vllm_rollout/vllm_async_server.py b/recipe/fully_async_policy/vllm_rollout/vllm_async_server.py
index 7a4dc8e7d7d..93381e1bff0 100644
--- a/recipe/fully_async_policy/vllm_rollout/vllm_async_server.py
+++ b/recipe/fully_async_policy/vllm_rollout/vllm_async_server.py
@@ -16,13 +16,12 @@
 from typing import Any, Optional, Sequence
 
 import ray
-from omegaconf import DictConfig
 from ray.actor import ActorHandle
 from vllm import SamplingParams
 from vllm.inputs import TokensPrompt
 from vllm.outputs import RequestOutput
 
-from verl.workers.config import RolloutConfig, RewardModelConfig, HFModelConfig
+from verl.workers.config import HFModelConfig, RewardModelConfig, RolloutConfig
 from verl.workers.rollout.replica import RolloutMode
 from verl.workers.rollout.vllm_rollout.vllm_async_server import (
     _qwen2_5_vl_dedup_image_tokens,
@@ -37,15 +36,15 @@
 @ray.remote(num_cpus=1)
 class vLLMHttpServerForPartial(vLLMHttpServerBase):
     def __init__(
-            self,
-            config: RolloutConfig | RewardModelConfig,
-            model_config: HFModelConfig,
-            rollout_mode: RolloutMode,
-            workers: list[ActorHandle],
-            replica_rank: int,
-            node_rank: int,
-            gpus_per_node: int,
-            nnodes: int,
+        self,
+        config: RolloutConfig | RewardModelConfig,
+        model_config: HFModelConfig,
+        rollout_mode: RolloutMode,
+        workers: list[ActorHandle],
+        replica_rank: int,
+        node_rank: int,
+        gpus_per_node: int,
+        nnodes: int,
     ):
         super().__init__(config, model_config, rollout_mode, workers, replica_rank, node_rank, gpus_per_node, nnodes)
 
@@ -56,11 +55,11 @@ def __init__(
         self.req_output: dict[str, Optional[RequestOutput]] = {}
 
     async def _generate_step(
-            self,
-            prompt_ids: list[int],
-            sampling_params: dict[str, Any],
-            request_id: str,
-            image_data: Optional[list[Any]] = None,
+        self,
+        prompt_ids: list[int],
+        sampling_params: dict[str, Any],
+        request_id: str,
+        image_data: Optional[list[Any]] = None,
     ):
         max_tokens = self.config.max_model_len - len(prompt_ids)
         sampling_params["logprobs"] = 1
@@ -79,11 +78,11 @@ async def _generate_step(
         assert self.req_output[request_id] is not None
 
     async def generate_for_partial(
-            self,
-            prompt_ids: list[int],
-            sampling_params: dict[str, Any],
-            request_id: str,
-            image_data: Optional[list[Any]] = None,
+        self,
+        prompt_ids: list[int],
+        sampling_params: dict[str, Any],
+        request_id: str,
+        image_data: Optional[list[Any]] = None,
     ) -> tuple[list[Any], list[Any], bool] | tuple[Sequence[int], list[float], Any]:
         async with self.lock:
             if self.paused:
@@ -133,11 +132,11 @@ async def reset_prefix_cache(self):
 
 class FullyAsyncvLLMReplica(vLLMReplica):
     def __init__(
-            self,
-            replica_rank: int,
-            config: RolloutConfig | RewardModelConfig,
-            model_config: HFModelConfig,
-            gpus_per_node: int = 8,
+        self,
+        replica_rank: int,
+        config: RolloutConfig | RewardModelConfig,
+        model_config: HFModelConfig,
+        gpus_per_node: int = 8,
     ):
         super().__init__(replica_rank, config, model_config, gpus_per_node)
         self.server_class = vLLMHttpServerForPartial
diff --git a/verl/trainer/config/actor/dp_actor.yaml b/verl/trainer/config/actor/dp_actor.yaml
index 9969f7635b9..a2ff54d4854 100644
--- a/verl/trainer/config/actor/dp_actor.yaml
+++ b/verl/trainer/config/actor/dp_actor.yaml
@@ -39,7 +39,4 @@ entropy_from_logits_with_chunking: False
 entropy_checkpointing: False
 
 # Whether to remove padding tokens in inputs during training
-use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false}
-
-# Whether it's a hybrid engine
-hybrid_engine: ${oc.select:actor_rollout_ref.hybrid_engine, True}
\ No newline at end of file
+use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false}
\ No newline at end of file
diff --git a/verl/workers/actor/dp_actor.py b/verl/workers/actor/dp_actor.py
index fe78e360365..7dd531ad266 100644
--- a/verl/workers/actor/dp_actor.py
+++ b/verl/workers/actor/dp_actor.py
@@ -78,7 +78,7 @@ def __init__(self, config: ActorConfig, actor_module: nn.Module, actor_optimizer
 
         self.compute_entropy_from_logits = (
             torch.compile(entropy_from_logits, dynamic=True)
-            if self.config.get("use_torch_compile", True)  #  use torch compile by default
+            if self.config.get("use_torch_compile", True)  # use torch compile by default
             else entropy_from_logits
         )
         self.device_name = get_device_name()
@@ -387,7 +387,7 @@ def update_policy(self, data: DataProto):
         # See PPO paper for details. https://arxiv.org/abs/1707.06347
         mini_batches = data.split(self.config.ppo_mini_batch_size)
 
-        on_policy = len(mini_batches) == 1 and self.config.ppo_epochs == 1 and self.config.hybrid_engine
+        on_policy = len(mini_batches) == 1 and self.config.ppo_epochs == 1
 
         metrics = {}
         for _ in range(self.config.ppo_epochs):
@@ -427,10 +427,14 @@ def update_policy(self, data: DataProto):
                         model_inputs, temperature=temperature, calculate_entropy=calculate_entropy
                     )
 
-                    if on_policy:
-                        old_log_prob = log_prob.detach()
-                    else:
+                    # for fully_async_policy recipe
+                    if hasattr(self.config, "use_rollout_log_probs") and self.config.use_rollout_log_probs:
                         old_log_prob = model_inputs["old_log_probs"]
+                    else:
+                        if on_policy:
+                            old_log_prob = log_prob.detach()
+                        else:
+                            old_log_prob = model_inputs["old_log_probs"]
 
                     loss_mode = self.config.policy_loss.get("loss_mode", "vanilla")
                     # vanilla -> verl.trainer.ppo.core_algos.compute_policy_loss_vanilla
diff --git a/verl/workers/rollout/vllm_rollout/vllm_async_server.py b/verl/workers/rollout/vllm_rollout/vllm_async_server.py
index b82d4013cc0..42028ba7a4f 100644
--- a/verl/workers/rollout/vllm_rollout/vllm_async_server.py
+++ b/verl/workers/rollout/vllm_rollout/vllm_async_server.py
@@ -61,7 +61,7 @@ def _init_executor(self) -> None:
         tp_size = self.vllm_config.parallel_config.tensor_parallel_size
 
         addresses = os.environ["VERL_VLLM_ZMQ_ADDRESSES"].split(",")
-        addresses = addresses[dp_rank_local * tp_size: (dp_rank_local + 1) * tp_size]
+        addresses = addresses[dp_rank_local * tp_size : (dp_rank_local + 1) * tp_size]
         self.context = zmq.Context()
         self.sockets = []
         for address in addresses:
@@ -81,11 +81,11 @@ def _init_executor(self) -> None:
         self.collective_rpc("load_model")
 
     def collective_rpc(
-            self,
-            method: str | Callable,
-            timeout: Optional[float] = None,
-            args: tuple = (),
-            kwargs: Optional[dict[str, Any]] = None,
+        self,
+        method: str | Callable,
+        timeout: Optional[float] = None,
+        args: tuple = (),
+        kwargs: Optional[dict[str, Any]] = None,
     ) -> list[Any]:
         if isinstance(method, str):
             sent_method = method
@@ -114,15 +114,15 @@ class vLLMHttpServerBase:
     """
 
     def __init__(
-            self,
-            config: RolloutConfig | RewardModelConfig,
-            model_config: HFModelConfig,
-            rollout_mode: RolloutMode,
-            workers: list[ActorHandle],
-            replica_rank: int,
-            node_rank: int,
-            gpus_per_node: int,
-            nnodes: int,
+        self,
+        config: RolloutConfig | RewardModelConfig,
+        model_config: HFModelConfig,
+        rollout_mode: RolloutMode,
+        workers: list[ActorHandle],
+        replica_rank: int,
+        node_rank: int,
+        gpus_per_node: int,
+        nnodes: int,
     ):
         """
         Args:
@@ -337,11 +337,11 @@ async def run_headless(self, args: argparse.Namespace):
         )
 
     async def generate(
-            self,
-            prompt_ids: list[int],
-            sampling_params: dict[str, Any],
-            request_id: str,
-            image_data: Optional[list[Any]] = None,
+        self,
+        prompt_ids: list[int],
+        sampling_params: dict[str, Any],
+        request_id: str,
+        image_data: Optional[list[Any]] = None,
     ) -> TokenOutput:
         """Generate sequence with token-in-token-out."""
         # TODO(@wuxibin): switch to `/generate` http endpoint once multi-modal support ready.
@@ -403,15 +403,15 @@ class vLLMHttpServer(vLLMHttpServerBase):
     """
 
     def __init__(
-            self,
-            config: RolloutConfig | RewardModelConfig,
-            model_config: HFModelConfig,
-            rollout_mode: RolloutMode,
-            workers: list[ActorHandle],
-            replica_rank: int,
-            node_rank: int,
-            gpus_per_node: int,
-            nnodes: int,
+        self,
+        config: RolloutConfig | RewardModelConfig,
+        model_config: HFModelConfig,
+        rollout_mode: RolloutMode,
+        workers: list[ActorHandle],
+        replica_rank: int,
+        node_rank: int,
+        gpus_per_node: int,
+        nnodes: int,
     ):
         super().__init__(config, model_config, rollout_mode, workers, replica_rank, node_rank, gpus_per_node, nnodes)
 
@@ -421,11 +421,11 @@ def __init__(
 
 class vLLMReplica(RolloutReplica):
     def __init__(
-            self,
-            replica_rank: int,
-            config: RolloutConfig | RewardModelConfig,
-            model_config: HFModelConfig,
-            gpus_per_node: int = 8,
+        self,
+        replica_rank: int,
+        config: RolloutConfig | RewardModelConfig,
+        model_config: HFModelConfig,
+        gpus_per_node: int = 8,
     ):
         super().__init__(replica_rank, config, model_config, gpus_per_node)
         self.server_class = vLLMHttpServer
@@ -462,7 +462,7 @@ async def launch_servers(self):
 
         # create server actor in each node with node affinity
         for node_rank in range(nnodes):
-            workers = self.workers[node_rank * gpus_per_node: (node_rank + 1) * gpus_per_node]
+            workers = self.workers[node_rank * gpus_per_node : (node_rank + 1) * gpus_per_node]
             node_id = worker_node_ids[node_rank * gpus_per_node]
             server = self.server_class.options(
                 scheduling_strategy=ray.util.scheduling_strategies.NodeAffinitySchedulingStrategy(

From 2383a157410c6004fa7963f3a3c6c4e576e7cd42 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Thu, 16 Oct 2025 15:56:40 +0800
Subject: [PATCH 172/182] fix some ci

---
 docs/advance/fully_async.md                     |  5 +++++
 recipe/fully_async_policy/README_zh.md          |  2 +-
 .../config/fully_async_ppo_trainer.yaml         | 17 +++++++++--------
 recipe/fully_async_policy/param_sync.py         |  4 +---
 tests/special_e2e/run_fully_async_policy.sh     |  2 +-
 tests/special_sanity/check_device_api_usage.py  |  1 +
 verl/workers/actor/dp_actor.py                  |  1 +
 verl/workers/config/actor.py                    |  2 +-
 8 files changed, 20 insertions(+), 14 deletions(-)

diff --git a/docs/advance/fully_async.md b/docs/advance/fully_async.md
index e69de29bb2d..aa9e33ff99c 100644
--- a/docs/advance/fully_async.md
+++ b/docs/advance/fully_async.md
@@ -0,0 +1,5 @@
+# Recipe: Fully Async Policy Async Trainer
+
+**Author:**  `https://github.com/meituan-search`
+
+Last updated: 10/16/2025.
\ No newline at end of file
diff --git a/recipe/fully_async_policy/README_zh.md b/recipe/fully_async_policy/README_zh.md
index 040dfe47dd1..0a43a7ec406 100644
--- a/recipe/fully_async_policy/README_zh.md
+++ b/recipe/fully_async_policy/README_zh.md
@@ -2,7 +2,7 @@
 
 **Author:**  `https://github.com/meituan-search`
 
-Last updated: 10/13/2025.
+Last updated: 10/16/2025.
 
 本文档介绍了完全异步PPO训练系统，该系统实现了 Trainer 和 Rollouter 的完全解耦，支持异步样本生成和训练。
 
diff --git a/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml b/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml
index 17c3b925476..4a8b8fc32e7 100644
--- a/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml
+++ b/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml
@@ -10,19 +10,19 @@ async_training:
 
   # Maximum samples staleness threshold
   staleness_threshold: 0.1
-       
+
   # Frequency of parameter synchronization between rollouter and trainer, 
   # One step means trainer obtains a batch of required samples
   trigger_parameter_sync_step: 4
   
   # The number of ppo_mini_batches that the FullyAsyncTrainer obtains once
-  require_batches: 1  
+  require_batches: 1
 
   # When synchronizing parameters, whether to interrupt rollouter and perform partial rollout
   partial_rollout: True
 
   # Whether to use rollout log probs for training
-  use_rollout_log_probs: True         
+  use_rollout_log_probs: True
 
 # Rollout config
 rollout:
@@ -34,7 +34,7 @@ rollout:
   n_gpus_per_node: 8
 
   # number of responses (i.e. num sample times). > 1 for grpo
-  n: 4                                
+  n: 4
 
   # total rollout samples # TODO rename to total_rollout_samples
   total_rollout_steps: 100
@@ -43,12 +43,13 @@ rollout:
   total_epochs: 10
 
   # Test frequency, how many times a parameter update triggers a validation
-  test_freq: 1                   
+  test_freq: 1
 
 data:
   # Number of samples generated, currently only support 1
   gen_batch_size: 1
 
-actor:
-  # Whether to use rollout log probs for training
-  use_rollout_log_probs: ${oc.select:async_training.use_rollout_log_probs, True}
+actor_rollout_ref:
+  actor:
+    # Whether to use rollout log probs for training
+    use_rollout_log_probs: ${oc.select:async_training.use_rollout_log_probs, True}
diff --git a/recipe/fully_async_policy/param_sync.py b/recipe/fully_async_policy/param_sync.py
index 2fdcbb919db..d6c67ceb409 100644
--- a/recipe/fully_async_policy/param_sync.py
+++ b/recipe/fully_async_policy/param_sync.py
@@ -18,8 +18,6 @@
 import ray
 from ray.util.collective import collective
 
-from verl.utils.device import get_nccl_backend
-
 logger = logging.getLogger(__name__)
 
 
@@ -71,7 +69,7 @@ def _init_sync_group(self):
             actor_rollout_workers,
             len(actor_rollout_workers),
             list(range(0, len(actor_rollout_workers))),
-            backend=get_nccl_backend(),
+            backend="nccl",
             group_name=self.sync_group_name,
         )
 
diff --git a/tests/special_e2e/run_fully_async_policy.sh b/tests/special_e2e/run_fully_async_policy.sh
index a2f99f0d67b..e5386c9e4fe 100644
--- a/tests/special_e2e/run_fully_async_policy.sh
+++ b/tests/special_e2e/run_fully_async_policy.sh
@@ -143,7 +143,7 @@ if [ "${ACTOR_STRATEGY}" == "fsdp2" ]; then
     ref_offload=True
     actor_offload=False
 
-    python3 -m recipe.fully_async_policy.fully_async_main \
+    /home/hadoop-ai-search/miniconda3/bin/python -m recipe.fully_async_policy.fully_async_main \
         "${common_params[@]}" \
         actor_rollout_ref.actor.strategy=fsdp2 \
         critic.strategy=fsdp2 \
diff --git a/tests/special_sanity/check_device_api_usage.py b/tests/special_sanity/check_device_api_usage.py
index dae5ac4b43d..8d3cfda27c8 100644
--- a/tests/special_sanity/check_device_api_usage.py
+++ b/tests/special_sanity/check_device_api_usage.py
@@ -48,6 +48,7 @@
 NCCL_KEYWORD_CHECK_WHITELIST = [
     "verl/utils/device.py",
     "verl/third_party/sglang/parallel_state.py",  # appear in default backend
+    "verl/recipe/fully_async_policy/param_sync.py",  # fully_async_policy in default backend
 ]
 
 SEARCH_WHITELIST = CUDA_KEYWORD_CHECK_WHITELIST + NCCL_KEYWORD_CHECK_WHITELIST
diff --git a/verl/workers/actor/dp_actor.py b/verl/workers/actor/dp_actor.py
index 7dd531ad266..5955dfc33ed 100644
--- a/verl/workers/actor/dp_actor.py
+++ b/verl/workers/actor/dp_actor.py
@@ -429,6 +429,7 @@ def update_policy(self, data: DataProto):
 
                     # for fully_async_policy recipe
                     if hasattr(self.config, "use_rollout_log_probs") and self.config.use_rollout_log_probs:
+                        print("for fully_async_policy recipe")
                         old_log_prob = model_inputs["old_log_probs"]
                     else:
                         if on_policy:
diff --git a/verl/workers/config/actor.py b/verl/workers/config/actor.py
index 1ccab8e41c7..fe5b3e1193a 100644
--- a/verl/workers/config/actor.py
+++ b/verl/workers/config/actor.py
@@ -231,7 +231,7 @@ class FSDPActorConfig(ActorConfig):
     fsdp_config: FSDPEngineConfig = field(default_factory=FSDPEngineConfig)
     use_remove_padding: bool = False
     profiler: ProfilerConfig = field(default_factory=ProfilerConfig)
-    hybrid_engine: bool = True
+    use_rollout_log_probs: bool = False
 
     def __post_init__(self):
         """Validate FSDP actor configuration parameters."""

From 7298b65d6381442c470b128faa1e9ffac7d22c85 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Thu, 16 Oct 2025 16:01:16 +0800
Subject: [PATCH 173/182] fix e2e_fully_async_policy_fsdp2

---
 tests/special_e2e/run_fully_async_policy.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/special_e2e/run_fully_async_policy.sh b/tests/special_e2e/run_fully_async_policy.sh
index e5386c9e4fe..a2f99f0d67b 100644
--- a/tests/special_e2e/run_fully_async_policy.sh
+++ b/tests/special_e2e/run_fully_async_policy.sh
@@ -143,7 +143,7 @@ if [ "${ACTOR_STRATEGY}" == "fsdp2" ]; then
     ref_offload=True
     actor_offload=False
 
-    /home/hadoop-ai-search/miniconda3/bin/python -m recipe.fully_async_policy.fully_async_main \
+    python3 -m recipe.fully_async_policy.fully_async_main \
         "${common_params[@]}" \
         actor_rollout_ref.actor.strategy=fsdp2 \
         critic.strategy=fsdp2 \

From 0730b75275b7d3a8e2d8a40a3dbfdea6290b0d8b Mon Sep 17 00:00:00 2001
From: wangshulin02 <953550366@qq.com>
Date: Thu, 16 Oct 2025 16:15:22 +0800
Subject: [PATCH 174/182] update readme exp

---
 recipe/fully_async_policy/README_zh.md | 48 +++++++++++++-------------
 1 file changed, 24 insertions(+), 24 deletions(-)

diff --git a/recipe/fully_async_policy/README_zh.md b/recipe/fully_async_policy/README_zh.md
index 040dfe47dd1..b3c4ffe83d0 100644
--- a/recipe/fully_async_policy/README_zh.md
+++ b/recipe/fully_async_policy/README_zh.md
@@ -275,14 +275,14 @@ python -m recipe.fully_async_policy.fully_async_main \
     * staleness_threshold: 0.3
     * partial_rollout: True
 
-| training mode      | Resource allocation | step | generate_sequences | old_log_prob | update_actor | total time | acc/best@32/mean |
-|--------------------|---------------------|------|--------------------|--------------|--------------|------------|------------------|
-| colocate sync      | 32                  |      |                    |              |              |            |                  |
-| fully_async_policy | 16:16               |      |                    |              |              |            |                  |
-| colocate sync      | 64                  |      |                    |              |              |            |                  |
-| fully_async_policy | 32:32               |      |                    |              |              |            |                  |
-| colocate sync      | 128                 |      |                    |              |              |            |                  |
-| fully_async_policy | 64:64               |      |                    |              |              |            |                  |
+|    training mode   	| resource allocation 	|  step  	|   gen  	| old_log_prob 	| update_actor 	| total time<br>100 step 	| total time<br>200 step 	| total time<br>300 step 	| total time<br>400 step 	|          acc/mean@1          	|
+|:------------------:	|:-------------------:	|:------:	|:------:	|:------------:	|:------------:	|:----------------------:	|:----------------------:	|:----------------------:	|:----------------------:	|:----------------------------:	|
+| colocate sync      	| 32                  	| 790.10 	| 357.41 	| 107.71       	| 313.81       	| 13h 44m                	| 1d 3h 43m              	| 2d 9h 22m              	| 3d 17h 5m              	| max: 0.3313<br>last: 0.2448  	|
+| fully_async_policy 	| 16:16               	|        	|        	| \            	|              	|                        	|                        	|                        	|                        	| max: <br>last:               	|
+| colocate sync      	| 64                  	| 365.28 	| 150.72 	| 70.26        	| 133.41       	| 10h 22m                	| 20h 45m                	| 1d 7h 6m               	| 1d 17h 32m             	| max: 0.3365<br>last:  0.2333 	|
+| fully_async_policy 	| 32:32               	| 189.26 	| 28.46  	| \            	| 156.98       	| 4h 57m<br>(2.09x)      	| 10h 14m<br>(2.03x)     	| 16h 58m<br>(1.83x)     	| 21h 40m<br>(1.92x)     	| max: 0.3677<br>last: 0.3406  	|
+| colocate sync      	| 128                 	| 356.30 	| 177.85 	| 53.92        	| 113.81       	| 8h 36m                 	| 17h 56m                	| 1d 5h 6m               	| 1d 16h 48m             	| max: 0.3573<br>last: 0.2958  	|
+| fully_async_policy 	| 64:64               	| 150.63 	| 33.14  	| \            	| 113.16       	| 3h 13m<br>(2.67x)      	| 6h 46m<br>(2.65x)      	| 10h 53m<br>(2.67x)     	| 17h 22m<br>(2.35x)     	| max: 0.3521<br>last: 0.3094  	|
 
 > source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg
 
@@ -290,22 +290,22 @@ python -m recipe.fully_async_policy.fully_async_main \
 
 我们使用 Qwen2.5-Math-7B 验证 fully async 所支持的各个模型的效果。
 
-| training mode      | Resource allocation | mode                                           | step | generate_sequences | old_log_prob | update_actor | total time | acc/best@32/mean |
-|--------------------|---------------------|------------------------------------------------|------|--------------------|--------------|--------------|------------|------------------|
-| fully_async_policy | 64:64               | `stream off policy pipeline`                   |      |                    |              |              |            |                  |
-| fully_async_policy | 64:64               | `async stream pipeline with staleness samples` |      |                    |              |              |            |                  |
-| fully_async_policy | 64:64               | `async stream pipeline with partial rollout`   |      |                    |              |              |            |                  |
+|                                          mode                                         	|  step  	|   gen  	| old_log_prob 	| update_actor 	| total time<br>100 step 	| total time<br>200 step 	| total time<br>300 step 	| total time<br>400 step 	|          acc/mean@1         	|
+|:-------------------------------------------------------------------------------------:	|:------:	|:------:	|:------------:	|:------------:	|:----------------------:	|:----------------------:	|:----------------------:	|:----------------------:	|:---------------------------:	|
+| `stream off policy pipeline`<br>(trigger_parameter_sync_step= 4,<br>require_batches= 4) 	| 231.34 	| 128.47 	| \            	| 98.77        	| 4h 25m                 	| 9h 41m                 	| 15h 2m                 	| 1d 1h 53m              	| max: 0.2844<br>last: 0.2604 	|
+| `async stream pipeline with staleness samples`<br>(+staleness_threshold=0.5)            	|        	|        	|              	|              	|                        	|                        	|                        	|                        	|                             	|
+| `async stream pipeline with partial rollout`<br>(+partial_rollout=True)                 	| 150.63 	| 33.14  	| \            	| 113.16       	| 3h 13m                 	| 6h 46m                 	| 10h 53m                	| 17h 22m                	| max: 0.3521<br>last: 0.3094 	|
 
 ### 128卡 stale 消融实验
 
 在 `async stream pipeline with partial rollout` 模式下，我们验证 staleness 的设置对于训练效率的影响。
 
-| training mode      | Resource allocation | staleness | step | generate_sequences | old_log_prob | update_actor | total time | acc/best@32/mean |
-|--------------------|---------------------|-----------|------|--------------------|--------------|--------------|------------|------------------|
-| fully_async_policy | 64:64               | 0         |      |                    |              |              |            |                  |
-| fully_async_policy | 64:64               | 0.1       |      |                    |              |              |            |                  |
-| fully_async_policy | 64:64               | 0.3       |      |                    |              |              |            |                  |
-| fully_async_policy | 64:64               | 0.5       |      |                    |              |              |            |                  |
+| staleness_threshold 	|  step  	|   gen  	| old_log_prob 	| update_actor 	| total time<br>100 step 	| total time<br>200 step 	| total time<br>300 step 	| total time<br>400 step 	|          acc/mean@1         	|
+|:-------------------:	|:------:	|:------:	|:------------:	|:------------:	|:----------------------:	|:----------------------:	|:----------------------:	|:----------------------:	|:---------------------------:	|
+| 0                   	| 231.34 	| 128.47 	| \            	| 98.77        	| 4h 25m                 	| 9h 41m                 	| 15h 2m                 	| 1d 1h 53m              	| max: 0.2844<br>last: 0.2604 	|
+| 0.1                 	| 171.30 	| 58.17  	| \            	| 109.12       	| 3h 53m                 	| 8h 37m                 	| 14h 25m                	| 19h 59m                	| max: 0.3542<br>last: 0.2979 	|
+| 0.3                 	| 146.11 	| 38.88  	| \            	| 103.22       	| 3h 18m                 	| 6h 49m                 	| 11h 40m                	| 17h 20m                	| max: 0.3469<br>last: 0.2865 	|
+| 0.5                 	| 150.63 	| 33.14  	| \            	| 113.16       	| 3h 13m                 	| 6h 46m                 	| 10h 53m                	| 17h 22m                	| max: 0.3521<br>last: 0.3094 	|
 
 > source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg
 
@@ -313,11 +313,11 @@ python -m recipe.fully_async_policy.fully_async_main \
 
 在多次测试下，我们发现流式每次下发样本的数量，会影响训练的结果，我们通过修改 `async_training.require_batches` 验证对与结果的影响。
 
-| training mode      | Resource allocation | async_training.require_batches | step | generate_sequences | old_log_prob | update_actor | total time | acc/best@32/mean |
-|--------------------|---------------------|--------------------------------|------|--------------------|--------------|--------------|------------|------------------|
-| fully_async_policy | 64:64               | 1                              |      |                    |              |              |            |                  |
-| fully_async_policy | 64:64               | 2                              |      |                    |              |              |            |                  |
-| fully_async_policy | 64:64               | 4                              |      |                    |              |              |            |                  |
+| require_batches 	|  step  	|  gen  	| old_log_prob 	| update_actor 	| total time<br>100 step 	| total time<br>200 step 	| total time<br>300 step 	|          acc/mean@1         	|
+|:---------------:	|:------:	|:-----:	|:------------:	|:------------:	|:----------------------:	|:----------------------:	|:----------------------:	|:---------------------------:	|
+| 1               	| 203.47 	| 30.88 	| \            	| 181.08       	| 3h 31m                 	| 8h 29m                 	| 17h 36m                	| max: 0.349<br>last: 0.326   	|
+| 2               	| 158.72 	| 26.32 	| \            	| 128.08       	| 3h 35m                 	| 7h 38m                 	| 13h 57m                	| max: 0.351<br>last: 0.3406  	|
+| 4               	| 124.64 	| 25.62 	| \            	| 95.06        	| 3h 13m                 	| 6h 46m                 	| 10h 53m                	| max: 0.3521<br>last: 0.3521 	|
 
 > source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg
 

From c4a063374dd2317597a8fd18032bf0f4ae034131 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Thu, 16 Oct 2025 18:33:57 +0800
Subject: [PATCH 175/182] update readme

---
 docs/advance/fully_async.md            | 426 +++++++++++++++++++++++-
 recipe/fully_async_policy/README.md    | 427 +++++++++++++++++++++++++
 recipe/fully_async_policy/README_zh.md |  66 ++--
 verl/workers/actor/dp_actor.py         |   1 -
 4 files changed, 887 insertions(+), 33 deletions(-)
 create mode 100644 recipe/fully_async_policy/README.md

diff --git a/docs/advance/fully_async.md b/docs/advance/fully_async.md
index aa9e33ff99c..77498131e45 100644
--- a/docs/advance/fully_async.md
+++ b/docs/advance/fully_async.md
@@ -1,5 +1,427 @@
 # Recipe: Fully Async Policy Async Trainer
 
-**Author:**  `https://github.com/meituan-search`
+**Author:** `https://github.com/meituan-search`
 
-Last updated: 10/16/2025.
\ No newline at end of file
+Last updated: 10/16/2025.
+
+This document introduces a fully asynchronous PPO training system that completely decouples the Trainer and Rollouter,
+supporting asynchronous sample generation and training.
+Under this system, we achieved a 2.35x-2.67x performance improvement when training the Qwen2.5-7B model with 128 GPUs,
+without significantly affecting the results.
+
+## Introduction
+
+### Background
+
+The separated rollout and train architecture, compared to the colocate architecture, can allocate resources more
+flexibly and design more flexible training logic, thereby addressing issues such as low GPU utilization and training
+efficiency caused by long-tail problems.
+The one_step_off_policy alleviates the problem of long rollout times and achieves some gains in training efficiency by
+designing a separated architecture and performing asynchronous training between rollout and train for one round.
+However, it forcibly uses data from one round of asynchronous training, which is not flexible enough and cannot
+completely eliminate the impact of long-tail on training efficiency.
+In other frameworks such as AReaL, Magistral, StreamRL, and AsyncFlow, asynchronous training and streaming training have
+been implemented based on the separated architecture and have achieved gains.
+We借鉴 their methods and implemented them in VERL. The fully_async_policy supports asynchronous, streaming, and partial
+rollout training.
+By reasonably setting parameters such as resource allocation and parameter synchronization frequency, fully_async_policy
+can significantly improve training efficiency.
+
+> Magistral https://arxiv.org/abs/2506.10910
+>
+> AReaL: A Large-Scale Asynchronous Reinforcement Learning System for Language
+> Reasoning https://arxiv.org/abs/2505.24298
+>
+> StreamRL: Scalable, Heterogeneous, and Elastic RL for LLMs with Disaggregated Stream
+> Generation https://arxiv.org/abs/2504.15930
+>
+> AsyncFlow: An Asynchronous Streaming RL Framework for Efficient LLM Post-Training https://arxiv.org/abs/2507.01663
+>
+
+### Core Contributions
+
+* **Resource Isolation**: Unlike using hybrid_engine, Rollouter and Trainer use separate computing resources and need to
+  specify the resources they occupy separately.
+* **Parallel Generation and Training**: While the Trainer is training, the Rollouter is generating new samples.
+* **Multi-step Asynchronous**: Compared to one step off policy, it supports asynchronous settings from 0.x steps to
+  multiple steps, making the asynchronous solution more flexible.
+* **NCCL Parameter Synchronization**: Uses NCCL communication primitives for parameter communication between Rollouter
+  and Trainer.
+* **Stream Inference and Training**: Rollouter generates data sample by sample, and data transmission uses a single
+  sample as the minimum transmission unit.
+* **Asynchronous Training and Freshness Control**: By setting the parameter async_training.staleness_threshold, it
+  supports training with samples generated by old parameters.
+* **PartialRollout**: The Rollouter's inference process supports partial rollout logic. During parameter
+  synchronization, by adding `sleep() and resume()` logic, it
+  saves samples from ongoing rollouts and continues using them in the next rollout, reducing the time spent waiting for
+  ongoing tasks to finish during parameter synchronization.
+
+Currently, the supported usage mode is fsdp+vllm. vllm must use the server mode based on AgentLoop.
+
+## Design
+
+The overall architecture of fully_async_policy is shown in the figure below. fully_async_policy mainly consists of four
+parts: Rollouter, MessageQueue, Trainer, and ParameterSynchronizer.
+
+![fully_async_policy_structure](
+https://github.com/ArronHZG/verl-community/blob/recipe/async_policy/docs/fully_async_policy_structure.svg?raw=true)
+
+1. Rollouter generates sequences sample by sample and puts the generated samples into the MessageQueue, with the
+   production speed controlled by freshness.
+2. MessageQueue is used to temporarily store samples generated by Rollouter.
+3. Trainer fetches samples from MessageQueue sample by sample. After fetching `require_batches*ppo_mini_batch_size`
+   samples, it will perform training. After training for async_training.trigger_parameter_sync_step rounds, it triggers
+   a parameter synchronization with Rollouter.
+4. ParameterSynchronizer implements the NCCL synchronous parameter synchronization capability.
+
+The source of benefits compared to the base scheme lies in the fact that in the colocate case, using more resources for
+rollout cannot solve the idleness caused by long-tail samples.
+After we perform resource isolation, the time for rollout and train may be longer than before (because fewer resources
+are used),
+but the overlap in their time consumption reduces the end-to-end time consumption.
+
+![fully_async_policy_revenue](
+https://github.com/ArronHZG/verl-community/blob/recipe/async_policy/docs/fully_async_policy_revenue.svg?raw=true)
+
+## Usage
+
+### Parameter Description
+
+| super params                                  | implication                                                                                    |
+|-----------------------------------------------|------------------------------------------------------------------------------------------------|
+| `trainer.nnodes`                              | Number of nodes for Trainer                                                                    |
+| `trainer.n_gpus_per_node`                     | Number of GPUs per node for Trainer                                                            |
+| `rollout.nnodes`                              | Number of nodes for Rollouter                                                                  |
+| `rollout.n_gpus_per_node`                     | Number of GPUs per node for Rollouter                                                          |
+| `data.train_batch_size`                       | In the fully async strategy, this value is not effective (default is 0)                        |
+| `data.gen_batch_size`                         | In the fully async strategy, uses streaming sample production logic (default is 1)             |
+| `rollout.total_rollout_steps`                 | Total number of rollout samples                                                                |
+| `rollout.test_freq`                           | How many times Rollouter updates parameters before performing a validation                     |
+| `actor_rollout_ref.actor.ppo_mini_batch_size` | The ppo_mini_batch_size is a global num across all workers/gpus                                |
+| `async_training.require_batches`              | Number of ppo_mini_batch_size that FullyAsyncTrainer fetches at once                           |
+| `async_training.trigger_parameter_sync_step`  | Indicates how many local updates FullyAsyncTrainer performs before a parameter synchronization |
+| `async_training.staleness_threshold`          | Freshness control                                                                              |
+| `async_training.partial_rollout`              | Whether to perform partial_rollout                                                             |
+| `async_training.use_rollout_log_probs`        | Use log_probs generated by rollout                                                             |
+
+**Further Explanation:**
+
+* `rollout.total_rollout_steps`
+
+  Compared to colocate, the quantity can be aligned by multiplying train_batch_size and step:
+  `rollout.total_rollout_steps = data.train_batch_size * step`.
+
+* `async_training.trigger_parameter_sync_step`
+
+  In the fully async strategy, it indicates how many local updates the Trainer performs (i.e., how many times it fetches
+  `require_batches * ppo_mini_batch_size` samples) before a parameter synchronization with Rollouter.
+  Between every two parameter synchronizations between Rollouter and Trainer, the Trainer will process
+  `trigger_parameter_sync_step* require_batches*ppo_mini_batch_size` samples.
+  To fairly compare speed with colocate, trigger_parameter_sync_step should be set to
+  `data.train_batch_size / (require_batches * ppo_mini_batch_size)`.
+
+* `async_training.staleness_threshold`
+
+  In the fully async strategy, it indicates the maximum proportion of stale samples allowed to be used.
+
+    * staleness_threshold=0, indicates synchronous training.
+      Rollouter will generate a fixed number of samples between two parameter updates, the sample count is:
+      $$rollout\_num = (trigger\_parameter\_sync\_step*require\_batches*ppo\_mini\_batch\_size)$$
+    * staleness_threshold>0, indicates asynchronous training, can be set to a decimal for more flexible asynchronous
+      calls.
+      Rollouter will generate at most the following number of samples between two parameter updates:
+      $$rollout\_num = (1+staleness\_threshold)*(trigger\_parameter\_sync\_step*require\_batches*ppo\_mini\_batch\_size) - num\_staleness\_sample $$
+
+  num_staleness_sample represents the number of stale samples generated in excess during the last rollout.
+
+  Since it's a streaming system, rollout continues to generate and trainer continues to consume. If rollouter is slower,
+  trainer will trigger parameter synchronization earlier, and rollouter will not actually produce rollout_num samples.
+  When rollout is fast enough, setting staleness_threshold to 1 is basically equivalent to one_step_off policy.
+  To avoid too many expired samples affecting training accuracy, it is recommended to set this value to less than 1.
+
+* `async_training.partial_rollout`
+
+  partial_rollout only actually takes effect when staleness_threshold>0.
+
+* `async_training.use_rollout_log_probs`
+
+  In reinforcement learning algorithms, log_probs have implicit correlations with parameter versions and tokens. Due to
+  the settings of algorithms like PPO/GRPO/DAPO, when calculating importance sampling,
+  old_log_prob must use the log_probs corresponding to the rollout parameters and tokens to ensure algorithm
+  correctness. In the fully
+  async strategy, we default to old_log_prob being calculated by rollout rather than by trainer.
+
+    * `async_training.require_batches`
+
+  In streaming training, require_batches should be set to 1, indicating that training is performed after producing
+  enough ppo_mini_batch_size samples.
+  In actual testing, we found that if fewer samples are issued at once, due to the order of data distribution, it can
+  cause training instability and longer response lengths.
+  Here, we additionally provide require_batches for streaming distribution and control the number of samples
+  participating in training at once.
+
+### Supported Modes
+
+1. on policy pipeline:
+    1. **trigger_parameter_sync_step=1, staleness_threshold=0**
+    2. Rollouter produces `require_batches*ppo_mini_batch_size` samples at once, Trainer fetches these samples for
+       training, and after training completes, Trainer and Rollouter perform a parameter synchronization;
+    3. During the rollout phase, if there are long-tail samples but few rollout samples, shorter samples cannot fill
+       idle resources, causing some resource waste.
+    4. As shown in figure a;
+
+2. stream off policy pipeline:
+    1. **trigger_parameter_sync_step>1, staleness_threshold=0**
+    2. Synchronous streaming training will be performed. Rollouter produces
+       `require_batches*ppo_mini_batch_size*trigger_parameter_sync_step` samples at once, Trainer performs a local
+       training every time it fetches `require_batches*ppo_mini_batch_size` samples, and after training
+       trigger_parameter_sync_step times, Trainer and Rollouter perform a parameter synchronization;
+    3. Compared to a, since more samples are generated at once, resource idleness will be lower.
+    4. In one step training, there will be two periods of resource idleness: when fetching the first batch of samples,
+       train waits for `require_batches*ppo_mini_batch_size` samples to be produced, and during the last parameter
+       update, rollout waits for training to complete.
+    5. As shown in figure b;
+
+3. async stream pipeline with stale samples:
+    1. **trigger_parameter_sync_step>=1, staleness_threshold>0, partial_rollout=False**
+    2. After each parameter update, Rollouter will plan to produce at most rollout_num samples (in practice, the number
+       of samples generated may be less than this value depending on rollout speed).
+    3. If the rollout process is relatively fast, Rollouter will generate some additional samples num_stale_samples
+       before parameter synchronization for immediate use by Trainer after synchronization.
+       When triggering parameter synchronization, if Rollouter has ongoing tasks, it will wait for the tasks to complete
+       and not add new tasks;
+    4. Compared to b, except for the first step training, subsequent training will not have the time to wait for the
+       first batch rollout to finish, but will have the time to wait for active tasks to finish.
+    5. As shown in figure c;
+
+4. async stream pipeline with partial rollout:
+    1. **trigger_parameter_sync_step>=1, staleness_threshold>0, partial_rollout=True**
+    2. Compared to c, when triggering parameter synchronization, if Rollouter has samples being produced, it will
+       interrupt the rollout process and perform parameter synchronization. The interrupted samples will continue to be
+       generated after synchronization. This reduces the time to wait for active tasks to finish.
+    3. As shown in figure d;
+
+![fully_async_policy_mode](
+https://github.com/ArronHZG/verl-community/blob/recipe/async_policy/docs/fully_async_policy_mode.svg?raw=true)
+
+### Key Metrics
+
+| metrics                                        | implication                                                                                            |
+|------------------------------------------------|--------------------------------------------------------------------------------------------------------|
+| `trainer/idle_ratio`                           | Trainer idle rate                                                                                      |
+| `rollouter/idle_ratio`                         | Rollouter idle rate                                                                                    |
+| `fully_async/count/stale_samples_processed`    | Total number of old samples used in training                                                           |
+| `fully_async/count/stale_trajectory_processed` | Total number of old trajectories used in training (one sample produces rollout.n trajectories)         |
+| `fully_async/partial/total_partial_num`        | Number of partial samples processed by Trainer between two trigger_parameter_sync_step                 |
+| `fully_async/partial/partial_ratio`            | Ratio of partial samples processed by Trainer between two trigger_parameter_sync_step                  |
+| `fully_async/partial/max_partial_span`         | Maximum parameter span of partial samples processed by Trainer between two trigger_parameter_sync_step |
+
+### Parameter Tuning Recommendations
+
+* Resource Allocation and Adjustment:
+    * Reasonable resource allocation is the prerequisite for achieving good training efficiency. The ideal resource
+      allocation should make the rollout time and train time close, thereby minimizing pipeline bubbles in the entire
+      training process,
+      avoiding resource idleness, and ensuring Trainer does not use old samples. In real training scenarios, resource
+      allocation can be adjusted based on the idle time of rollout and train during actual training,
+      which can be obtained from rollouter/idle_ratio and trainer/idle_ratio. If rollouter/idle_ratio is high and
+      trainer/idle_ratio is low,
+      Trainer resources should be increased and Rollouter resources should be reduced, and vice versa.
+
+* Key Parameters:
+    * staleness_threshold: Setting it too high will cause more old samples to be used, affecting model performance. It
+      is recommended to set it to less than 1.
+    * require_batches: The closer to 1, the closer to a pure streaming process, the smaller the training bubbles, and
+      the faster the acceleration effect that can be achieved in terms of speed, but it will affect the order of sample
+      processing;
+    * trigger_parameter_sync_step: The smaller the setting, the closer to on policy, but it will cause frequent
+      parameter synchronization. Long-tail samples waste resources that cannot be filled by short samples, resulting in
+      low resource utilization.
+      The larger the setting, the higher the computational efficiency, but the accuracy will be affected by off policy.
+    * rollout.test_freq: It will occupy Rollouter resources and is not recommended to be set too small.
+
+* Mode Selection: By adjusting different parameters, the Fully Async architecture supports optimization acceleration at
+  different levels, suitable for tasks in different scenarios.
+    * For small-scale tasks that need to ensure training stability and on-policy nature, and have low speed
+      requirements, the on policy pipeline mode (Mode 1) can be tried.
+    * For scenarios that need to improve training throughput but are sensitive to staleness, the stream off policy
+      pipeline mode can be tried. That is, by
+      setting trigger_parameter_sync_step>1 to improve training efficiency, but still maintaining the synchronization
+      mechanism (staleness_threshold=0) (Mode 2).
+    * For large-scale tasks with high training speed requirements and can tolerate a certain degree of off-policy and
+      staleness, setting staleness_threshold>
+      0 and partial_rollout=True can improve training efficiency, using the async stream pipeline mode (Mode 3 or 4).
+
+### Quick Start
+
+```shell
+rollout_mode="async"
+rollout_name="vllm" # sglang or vllm
+if [ "$rollout_mode" = "async" ]; then
+    export VLLM_USE_V1=1
+    return_raw_chat="True"
+fi
+
+train_prompt_bsz=0
+gen_prompt_bsz=1
+n_resp_per_prompt=16
+train_prompt_mini_bsz=32
+total_rollout_steps=$(((512*400)))
+test_freq=10
+staleness_threshold=0
+trigger_parameter_sync_step=16
+partial_rollout=False
+
+
+python -m recipe.fully_async_policy.fully_async_main \
+	train_batch_size=${train_prompt_bsz} \
+    data.gen_batch_size=${gen_prompt_bsz} \
+    data.return_raw_chat=${return_raw_chat} \
+    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
+    actor_rollout_ref.actor.strategy=fsdp2 \
+    critic.strategy=fsdp2 \
+    actor_rollout_ref.hybrid_engine=False \
+    actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.rollout.name=${rollout_name} \
+    actor_rollout_ref.rollout.mode=${rollout_mode} \
+    actor_rollout_ref.rollout.calculate_log_probs=True \
+    trainer.nnodes="${NNODES_TRAIN}" \
+    trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
+    rollout.nnodes="${NNODES_ROLLOUT}" \
+    rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \
+    rollout.total_rollout_steps="${total_rollout_steps}" \
+    rollout.test_freq="${test_freq}" \
+    async_training.staleness_threshold="${staleness_threshold}" \
+    async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \
+    async_training.partial_rollout="${partial_rollout}"
+```
+
+## Experiments
+
+### Asynchronous Training on 7B Model
+
+We used Qwen2.5-Math-7B to verify the benefits of the fully async strategy under long candidates and multiple resources.
+Using the `async stream pipeline with stale samples` strategy, we achieved about 2x performance improvement on 32 cards,
+64 cards, and 128 cards without significantly affecting experimental results.
+
+* Machine: H20
+* Model: Qwen2.5-Math-7B
+* Rollout length: max_response_length FSDP2: 28K tokens;
+* Algorithm: DAPO
+* Dataset: TRAIN_FILE: dapo-math-17k.parquet TEST_FILE: aime-2024.parquet
+* Engine: vllm+FSDP2
+* rollout.n: 16
+* ppo_mini_batch_size: 32
+* test_freq: 20
+
+* colocate sync:
+    * step: 400
+    * train_batch_size: 512
+
+* fully_async_policy
+    * total_rollout_steps: 512*400
+    * require_batches: 4
+    * trigger_parameter_sync_step: 4
+    * staleness_threshold: 0.3
+    * partial_rollout: True
+
+|  training mode   	   | resource allocation 	 | step  	  |  gen  	  | old_log_prob 	 | update_actor 	 | total time<br>100 step 	 | total time<br>200 step 	 | total time<br>300 step 	 | total time<br>400 step 	 |     acc/mean@1          	      |
+|:--------------------:|:---------------------:|:--------:|:--------:|:--------------:|:--------------:|:------------------------:|:------------------------:|:------------------------:|:------------------------:|:------------------------------:|
+| colocate sync      	 | 32                  	 | 790.10 	 | 357.41 	 | 107.71       	 | 313.81       	 | 13h 44m                	 | 1d 3h 43m              	 | 2d 9h 22m              	 | 3d 17h 5m              	 | max: 0.3313<br>last: 0.2448  	 |
+| fully_async_policy 	 | 16:16               	 |    	     |    	     | \            	 |       	        |            	             |            	             |            	             |            	             | max: <br>last:               	 |
+| colocate sync      	 | 64                  	 | 365.28 	 | 150.72 	 | 70.26        	 | 133.41       	 | 10h 22m                	 | 20h 45m                	 | 1d 7h 6m               	 | 1d 17h 32m             	 | max: 0.3365<br>last:  0.2333 	 |
+| fully_async_policy 	 | 32:32               	 | 189.26 	 | 28.46  	 | \            	 | 156.98       	 | 4h 57m<br>(2.09x)      	 | 10h 14m<br>(2.03x)     	 | 16h 58m<br>(1.83x)     	 | 21h 40m<br>(1.92x)     	 | max: 0.3677<br>last: 0.3406  	 |
+| colocate sync      	 | 128                 	 | 356.30 	 | 177.85 	 | 53.92        	 | 113.81       	 | 8h 36m                 	 | 17h 56m                	 | 1d 5h 6m               	 | 1d 16h 48m             	 | max: 0.3573<br>last: 0.2958  	 |
+| fully_async_policy 	 | 64:64               	 | 150.63 	 | 33.14  	 | \            	 | 113.16       	 | 3h 13m<br>(2.67x)      	 | 6h 46m<br>(2.65x)      	 | 10h 53m<br>(2.67x)     	 | 17h 22m<br>(2.35x)     	 | max: 0.3521<br>last: 0.3094  	 |
+
+> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg
+
+### 128-card 7B Asynchronous Mode Experiment
+
+We used Qwen2.5-Math-7B to verify the effects of various modes supported by fully async.
+We can see that the benefit brought by streaming is approximately 0.6x, and after combining staleness and
+partial_rollout, the benefit reaches 2.35x.
+
+|                             mode                                         	                              |        step  	        |  gen  	  | old_log_prob 	 | update_actor 	 | total time<br>100 step 	 | total time<br>200 step 	 | total time<br>300 step 	 | total time<br>400 step 	 |     acc/mean@1         	      |
+|:-------------------------------------------------------------------------------------------------------:|:---------------------:|:--------:|:--------------:|:--------------:|:------------------------:|:------------------------:|:------------------------:|:------------------------:|:-----------------------------:|
+|                                          colocate sync      	                                           | 128                 	 | 356.30 	 |    177.85 	    | 53.92        	 |      113.81       	      | 8h 36m                 	 | 17h 56m                	 | 1d 5h 6m               	 |   1d 16h 48m             	    | max: 0.3573<br>last: 0.2958  	 |
+| `stream off policy pipeline`<br>(+fully async: trigger_parameter_sync_step= 4,<br>require_batches= 4) 	 |       231.34 	        | 128.47 	 | \            	 | 98.77        	 | 4h 25m                 	 | 9h 41m                 	 | 15h 2m                 	 | 1d 1h 53m              	 | max: 0.2844<br>last: 0.2604 	 |
+|          `async stream pipeline with stale samples`<br>(+staleness_threshold=0.5)            	          |           	           |    	     |       	        |       	        |            	             |            	             |            	             |            	             |               	               |
+|        `async stream pipeline with partial rollout`<br>(+partial_rollout=True)                 	        |       150.63 	        | 33.14  	 | \            	 | 113.16       	 | 3h 13m                 	 | 6h 46m                 	 | 10h 53m                	 | 17h 22m                	 | max: 0.3521<br>last: 0.3094 	 |
+
+### 128-card Stale Ablation Experiment
+
+Under the `async stream pipeline with partial rollout` mode, we verified the impact of staleness settings on training
+efficiency.
+We found that the larger the staleness, the more obvious the final gains.
+We also noticed that the times for staleness values of 0.3 and 0.5 are quite close, because as the training steps
+increase, the response length changes significantly, causing training instability.
+Further analysis and optimization are needed for this issue.
+
+| staleness_threshold 	 | step  	  |  gen  	  | old_log_prob 	 | update_actor 	 | total time<br>100 step 	 | total time<br>200 step 	 | total time<br>300 step 	 | total time<br>400 step 	 |     acc/mean@1         	      |
+|:---------------------:|:--------:|:--------:|:--------------:|:--------------:|:------------------------:|:------------------------:|:------------------------:|:------------------------:|:-----------------------------:|
+| 0                   	 | 231.34 	 | 128.47 	 | \            	 | 98.77        	 | 4h 25m                 	 | 9h 41m                 	 | 15h 2m                 	 | 1d 1h 53m              	 | max: 0.2844<br>last: 0.2604 	 |
+| 0.1                 	 | 171.30 	 | 58.17  	 | \            	 | 109.12       	 | 3h 53m                 	 | 8h 37m                 	 | 14h 25m                	 | 19h 59m                	 | max: 0.3542<br>last: 0.2979 	 |
+| 0.3                 	 | 146.11 	 | 38.88  	 | \            	 | 103.22       	 | 3h 18m                 	 | 6h 49m                 	 | 11h 40m                	 | 17h 20m                	 | max: 0.3469<br>last: 0.2865 	 |
+| 0.5                 	 | 150.63 	 | 33.14  	 | \            	 | 113.16       	 | 3h 13m                 	 | 6h 46m                 	 | 10h 53m                	 | 17h 22m                	 | max: 0.3521<br>last: 0.3094 	 |
+
+> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg
+
+### 128-card 7B require_batches Ablation Experiment
+
+In multiple tests, we found that the number of samples issued each time in streaming affects the response length during
+training, which in turn affects training time. We verified the impact on results by modifying
+`async_training.require_batches`.
+
+| require_batches 	 | step  	  | gen  	  | old_log_prob 	 | update_actor 	 | total time<br>100 step 	 | total time<br>200 step 	 | total time<br>300 step 	 |     acc/mean@1         	      |
+|:-----------------:|:--------:|:-------:|:--------------:|:--------------:|:------------------------:|:------------------------:|:------------------------:|:-----------------------------:|
+| 1               	 | 203.47 	 | 30.88 	 | \            	 | 181.08       	 | 3h 31m                 	 | 8h 29m                 	 | 17h 36m                	 | max: 0.349<br>last: 0.326   	 |
+| 2               	 | 158.72 	 | 26.32 	 | \            	 | 128.08       	 | 3h 35m                 	 | 7h 38m                 	 | 13h 57m                	 | max: 0.351<br>last: 0.3406  	 |
+| 4               	 | 124.64 	 | 25.62 	 | \            	 | 95.06        	 | 3h 13m                 	 | 6h 46m                 	 | 10h 53m                	 | max: 0.3521<br>last: 0.3521 	 |
+
+> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg
+
+### 30B Model Mode Experiment
+
+TODO: The 30B experiment is still in progress.
+
+* Machine: H20
+* Model: Qwen2.5-32B
+* Rollout length: max_response_length FSDP2: 20K tokens;
+* Algorithm: DAPO
+* Engine: vllm+FSDP2
+* rollout.n: 16
+* ppo_mini_batch_size: 32
+* test_freq: 20
+
+* colocate sync:
+    * step:200
+    * train_batch_size: 512
+
+* fully_async_policy
+    * total_rollout_steps: 512*200
+    * trigger_parameter_sync_step: 512/32 = 16
+    * staleness_threshold: 0
+    * partial_rollout: False
+
+| training mode      | Resource allocation | mode                                       | step | generate_sequences | old_log_prob | update_actor | total time | acc/best@32/mean |
+|--------------------|---------------------|--------------------------------------------|------|--------------------|--------------|--------------|------------|------------------|
+| colocate sync      | 128                 |                                            |      |                    |              |              |            |                  |
+| fully_async_policy | 64:64               | stream off policy pipeline                 |      |                    |              |              |            |                  |
+| fully_async_policy | 64:64               | async stream pipeline with stale samples   |      |                    |              |              |            |                  |
+| fully_async_policy | 64:64               | async stream pipeline with partial rollout |      |                    |              |              |            |                  |
+
+> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg
+
+## Future Plans
+
+* GRPO experiments
+* Megatron adaptation
+* SGLang integration
+* Transfer queue integration
+* Asynchronous parameter synchronization
+* AReaL asynchronous algorithm implementation
+* TPPO algorithm implementation
+* Multi-turn and Tool support
\ No newline at end of file
diff --git a/recipe/fully_async_policy/README.md b/recipe/fully_async_policy/README.md
new file mode 100644
index 00000000000..77498131e45
--- /dev/null
+++ b/recipe/fully_async_policy/README.md
@@ -0,0 +1,427 @@
+# Recipe: Fully Async Policy Async Trainer
+
+**Author:** `https://github.com/meituan-search`
+
+Last updated: 10/16/2025.
+
+This document introduces a fully asynchronous PPO training system that completely decouples the Trainer and Rollouter,
+supporting asynchronous sample generation and training.
+Under this system, we achieved a 2.35x-2.67x performance improvement when training the Qwen2.5-7B model with 128 GPUs,
+without significantly affecting the results.
+
+## Introduction
+
+### Background
+
+The separated rollout and train architecture, compared to the colocate architecture, can allocate resources more
+flexibly and design more flexible training logic, thereby addressing issues such as low GPU utilization and training
+efficiency caused by long-tail problems.
+The one_step_off_policy alleviates the problem of long rollout times and achieves some gains in training efficiency by
+designing a separated architecture and performing asynchronous training between rollout and train for one round.
+However, it forcibly uses data from one round of asynchronous training, which is not flexible enough and cannot
+completely eliminate the impact of long-tail on training efficiency.
+In other frameworks such as AReaL, Magistral, StreamRL, and AsyncFlow, asynchronous training and streaming training have
+been implemented based on the separated architecture and have achieved gains.
+We借鉴 their methods and implemented them in VERL. The fully_async_policy supports asynchronous, streaming, and partial
+rollout training.
+By reasonably setting parameters such as resource allocation and parameter synchronization frequency, fully_async_policy
+can significantly improve training efficiency.
+
+> Magistral https://arxiv.org/abs/2506.10910
+>
+> AReaL: A Large-Scale Asynchronous Reinforcement Learning System for Language
+> Reasoning https://arxiv.org/abs/2505.24298
+>
+> StreamRL: Scalable, Heterogeneous, and Elastic RL for LLMs with Disaggregated Stream
+> Generation https://arxiv.org/abs/2504.15930
+>
+> AsyncFlow: An Asynchronous Streaming RL Framework for Efficient LLM Post-Training https://arxiv.org/abs/2507.01663
+>
+
+### Core Contributions
+
+* **Resource Isolation**: Unlike using hybrid_engine, Rollouter and Trainer use separate computing resources and need to
+  specify the resources they occupy separately.
+* **Parallel Generation and Training**: While the Trainer is training, the Rollouter is generating new samples.
+* **Multi-step Asynchronous**: Compared to one step off policy, it supports asynchronous settings from 0.x steps to
+  multiple steps, making the asynchronous solution more flexible.
+* **NCCL Parameter Synchronization**: Uses NCCL communication primitives for parameter communication between Rollouter
+  and Trainer.
+* **Stream Inference and Training**: Rollouter generates data sample by sample, and data transmission uses a single
+  sample as the minimum transmission unit.
+* **Asynchronous Training and Freshness Control**: By setting the parameter async_training.staleness_threshold, it
+  supports training with samples generated by old parameters.
+* **PartialRollout**: The Rollouter's inference process supports partial rollout logic. During parameter
+  synchronization, by adding `sleep() and resume()` logic, it
+  saves samples from ongoing rollouts and continues using them in the next rollout, reducing the time spent waiting for
+  ongoing tasks to finish during parameter synchronization.
+
+Currently, the supported usage mode is fsdp+vllm. vllm must use the server mode based on AgentLoop.
+
+## Design
+
+The overall architecture of fully_async_policy is shown in the figure below. fully_async_policy mainly consists of four
+parts: Rollouter, MessageQueue, Trainer, and ParameterSynchronizer.
+
+![fully_async_policy_structure](
+https://github.com/ArronHZG/verl-community/blob/recipe/async_policy/docs/fully_async_policy_structure.svg?raw=true)
+
+1. Rollouter generates sequences sample by sample and puts the generated samples into the MessageQueue, with the
+   production speed controlled by freshness.
+2. MessageQueue is used to temporarily store samples generated by Rollouter.
+3. Trainer fetches samples from MessageQueue sample by sample. After fetching `require_batches*ppo_mini_batch_size`
+   samples, it will perform training. After training for async_training.trigger_parameter_sync_step rounds, it triggers
+   a parameter synchronization with Rollouter.
+4. ParameterSynchronizer implements the NCCL synchronous parameter synchronization capability.
+
+The source of benefits compared to the base scheme lies in the fact that in the colocate case, using more resources for
+rollout cannot solve the idleness caused by long-tail samples.
+After we perform resource isolation, the time for rollout and train may be longer than before (because fewer resources
+are used),
+but the overlap in their time consumption reduces the end-to-end time consumption.
+
+![fully_async_policy_revenue](
+https://github.com/ArronHZG/verl-community/blob/recipe/async_policy/docs/fully_async_policy_revenue.svg?raw=true)
+
+## Usage
+
+### Parameter Description
+
+| super params                                  | implication                                                                                    |
+|-----------------------------------------------|------------------------------------------------------------------------------------------------|
+| `trainer.nnodes`                              | Number of nodes for Trainer                                                                    |
+| `trainer.n_gpus_per_node`                     | Number of GPUs per node for Trainer                                                            |
+| `rollout.nnodes`                              | Number of nodes for Rollouter                                                                  |
+| `rollout.n_gpus_per_node`                     | Number of GPUs per node for Rollouter                                                          |
+| `data.train_batch_size`                       | In the fully async strategy, this value is not effective (default is 0)                        |
+| `data.gen_batch_size`                         | In the fully async strategy, uses streaming sample production logic (default is 1)             |
+| `rollout.total_rollout_steps`                 | Total number of rollout samples                                                                |
+| `rollout.test_freq`                           | How many times Rollouter updates parameters before performing a validation                     |
+| `actor_rollout_ref.actor.ppo_mini_batch_size` | The ppo_mini_batch_size is a global num across all workers/gpus                                |
+| `async_training.require_batches`              | Number of ppo_mini_batch_size that FullyAsyncTrainer fetches at once                           |
+| `async_training.trigger_parameter_sync_step`  | Indicates how many local updates FullyAsyncTrainer performs before a parameter synchronization |
+| `async_training.staleness_threshold`          | Freshness control                                                                              |
+| `async_training.partial_rollout`              | Whether to perform partial_rollout                                                             |
+| `async_training.use_rollout_log_probs`        | Use log_probs generated by rollout                                                             |
+
+**Further Explanation:**
+
+* `rollout.total_rollout_steps`
+
+  Compared to colocate, the quantity can be aligned by multiplying train_batch_size and step:
+  `rollout.total_rollout_steps = data.train_batch_size * step`.
+
+* `async_training.trigger_parameter_sync_step`
+
+  In the fully async strategy, it indicates how many local updates the Trainer performs (i.e., how many times it fetches
+  `require_batches * ppo_mini_batch_size` samples) before a parameter synchronization with Rollouter.
+  Between every two parameter synchronizations between Rollouter and Trainer, the Trainer will process
+  `trigger_parameter_sync_step* require_batches*ppo_mini_batch_size` samples.
+  To fairly compare speed with colocate, trigger_parameter_sync_step should be set to
+  `data.train_batch_size / (require_batches * ppo_mini_batch_size)`.
+
+* `async_training.staleness_threshold`
+
+  In the fully async strategy, it indicates the maximum proportion of stale samples allowed to be used.
+
+    * staleness_threshold=0, indicates synchronous training.
+      Rollouter will generate a fixed number of samples between two parameter updates, the sample count is:
+      $$rollout\_num = (trigger\_parameter\_sync\_step*require\_batches*ppo\_mini\_batch\_size)$$
+    * staleness_threshold>0, indicates asynchronous training, can be set to a decimal for more flexible asynchronous
+      calls.
+      Rollouter will generate at most the following number of samples between two parameter updates:
+      $$rollout\_num = (1+staleness\_threshold)*(trigger\_parameter\_sync\_step*require\_batches*ppo\_mini\_batch\_size) - num\_staleness\_sample $$
+
+  num_staleness_sample represents the number of stale samples generated in excess during the last rollout.
+
+  Since it's a streaming system, rollout continues to generate and trainer continues to consume. If rollouter is slower,
+  trainer will trigger parameter synchronization earlier, and rollouter will not actually produce rollout_num samples.
+  When rollout is fast enough, setting staleness_threshold to 1 is basically equivalent to one_step_off policy.
+  To avoid too many expired samples affecting training accuracy, it is recommended to set this value to less than 1.
+
+* `async_training.partial_rollout`
+
+  partial_rollout only actually takes effect when staleness_threshold>0.
+
+* `async_training.use_rollout_log_probs`
+
+  In reinforcement learning algorithms, log_probs have implicit correlations with parameter versions and tokens. Due to
+  the settings of algorithms like PPO/GRPO/DAPO, when calculating importance sampling,
+  old_log_prob must use the log_probs corresponding to the rollout parameters and tokens to ensure algorithm
+  correctness. In the fully
+  async strategy, we default to old_log_prob being calculated by rollout rather than by trainer.
+
+    * `async_training.require_batches`
+
+  In streaming training, require_batches should be set to 1, indicating that training is performed after producing
+  enough ppo_mini_batch_size samples.
+  In actual testing, we found that if fewer samples are issued at once, due to the order of data distribution, it can
+  cause training instability and longer response lengths.
+  Here, we additionally provide require_batches for streaming distribution and control the number of samples
+  participating in training at once.
+
+### Supported Modes
+
+1. on policy pipeline:
+    1. **trigger_parameter_sync_step=1, staleness_threshold=0**
+    2. Rollouter produces `require_batches*ppo_mini_batch_size` samples at once, Trainer fetches these samples for
+       training, and after training completes, Trainer and Rollouter perform a parameter synchronization;
+    3. During the rollout phase, if there are long-tail samples but few rollout samples, shorter samples cannot fill
+       idle resources, causing some resource waste.
+    4. As shown in figure a;
+
+2. stream off policy pipeline:
+    1. **trigger_parameter_sync_step>1, staleness_threshold=0**
+    2. Synchronous streaming training will be performed. Rollouter produces
+       `require_batches*ppo_mini_batch_size*trigger_parameter_sync_step` samples at once, Trainer performs a local
+       training every time it fetches `require_batches*ppo_mini_batch_size` samples, and after training
+       trigger_parameter_sync_step times, Trainer and Rollouter perform a parameter synchronization;
+    3. Compared to a, since more samples are generated at once, resource idleness will be lower.
+    4. In one step training, there will be two periods of resource idleness: when fetching the first batch of samples,
+       train waits for `require_batches*ppo_mini_batch_size` samples to be produced, and during the last parameter
+       update, rollout waits for training to complete.
+    5. As shown in figure b;
+
+3. async stream pipeline with stale samples:
+    1. **trigger_parameter_sync_step>=1, staleness_threshold>0, partial_rollout=False**
+    2. After each parameter update, Rollouter will plan to produce at most rollout_num samples (in practice, the number
+       of samples generated may be less than this value depending on rollout speed).
+    3. If the rollout process is relatively fast, Rollouter will generate some additional samples num_stale_samples
+       before parameter synchronization for immediate use by Trainer after synchronization.
+       When triggering parameter synchronization, if Rollouter has ongoing tasks, it will wait for the tasks to complete
+       and not add new tasks;
+    4. Compared to b, except for the first step training, subsequent training will not have the time to wait for the
+       first batch rollout to finish, but will have the time to wait for active tasks to finish.
+    5. As shown in figure c;
+
+4. async stream pipeline with partial rollout:
+    1. **trigger_parameter_sync_step>=1, staleness_threshold>0, partial_rollout=True**
+    2. Compared to c, when triggering parameter synchronization, if Rollouter has samples being produced, it will
+       interrupt the rollout process and perform parameter synchronization. The interrupted samples will continue to be
+       generated after synchronization. This reduces the time to wait for active tasks to finish.
+    3. As shown in figure d;
+
+![fully_async_policy_mode](
+https://github.com/ArronHZG/verl-community/blob/recipe/async_policy/docs/fully_async_policy_mode.svg?raw=true)
+
+### Key Metrics
+
+| metrics                                        | implication                                                                                            |
+|------------------------------------------------|--------------------------------------------------------------------------------------------------------|
+| `trainer/idle_ratio`                           | Trainer idle rate                                                                                      |
+| `rollouter/idle_ratio`                         | Rollouter idle rate                                                                                    |
+| `fully_async/count/stale_samples_processed`    | Total number of old samples used in training                                                           |
+| `fully_async/count/stale_trajectory_processed` | Total number of old trajectories used in training (one sample produces rollout.n trajectories)         |
+| `fully_async/partial/total_partial_num`        | Number of partial samples processed by Trainer between two trigger_parameter_sync_step                 |
+| `fully_async/partial/partial_ratio`            | Ratio of partial samples processed by Trainer between two trigger_parameter_sync_step                  |
+| `fully_async/partial/max_partial_span`         | Maximum parameter span of partial samples processed by Trainer between two trigger_parameter_sync_step |
+
+### Parameter Tuning Recommendations
+
+* Resource Allocation and Adjustment:
+    * Reasonable resource allocation is the prerequisite for achieving good training efficiency. The ideal resource
+      allocation should make the rollout time and train time close, thereby minimizing pipeline bubbles in the entire
+      training process,
+      avoiding resource idleness, and ensuring Trainer does not use old samples. In real training scenarios, resource
+      allocation can be adjusted based on the idle time of rollout and train during actual training,
+      which can be obtained from rollouter/idle_ratio and trainer/idle_ratio. If rollouter/idle_ratio is high and
+      trainer/idle_ratio is low,
+      Trainer resources should be increased and Rollouter resources should be reduced, and vice versa.
+
+* Key Parameters:
+    * staleness_threshold: Setting it too high will cause more old samples to be used, affecting model performance. It
+      is recommended to set it to less than 1.
+    * require_batches: The closer to 1, the closer to a pure streaming process, the smaller the training bubbles, and
+      the faster the acceleration effect that can be achieved in terms of speed, but it will affect the order of sample
+      processing;
+    * trigger_parameter_sync_step: The smaller the setting, the closer to on policy, but it will cause frequent
+      parameter synchronization. Long-tail samples waste resources that cannot be filled by short samples, resulting in
+      low resource utilization.
+      The larger the setting, the higher the computational efficiency, but the accuracy will be affected by off policy.
+    * rollout.test_freq: It will occupy Rollouter resources and is not recommended to be set too small.
+
+* Mode Selection: By adjusting different parameters, the Fully Async architecture supports optimization acceleration at
+  different levels, suitable for tasks in different scenarios.
+    * For small-scale tasks that need to ensure training stability and on-policy nature, and have low speed
+      requirements, the on policy pipeline mode (Mode 1) can be tried.
+    * For scenarios that need to improve training throughput but are sensitive to staleness, the stream off policy
+      pipeline mode can be tried. That is, by
+      setting trigger_parameter_sync_step>1 to improve training efficiency, but still maintaining the synchronization
+      mechanism (staleness_threshold=0) (Mode 2).
+    * For large-scale tasks with high training speed requirements and can tolerate a certain degree of off-policy and
+      staleness, setting staleness_threshold>
+      0 and partial_rollout=True can improve training efficiency, using the async stream pipeline mode (Mode 3 or 4).
+
+### Quick Start
+
+```shell
+rollout_mode="async"
+rollout_name="vllm" # sglang or vllm
+if [ "$rollout_mode" = "async" ]; then
+    export VLLM_USE_V1=1
+    return_raw_chat="True"
+fi
+
+train_prompt_bsz=0
+gen_prompt_bsz=1
+n_resp_per_prompt=16
+train_prompt_mini_bsz=32
+total_rollout_steps=$(((512*400)))
+test_freq=10
+staleness_threshold=0
+trigger_parameter_sync_step=16
+partial_rollout=False
+
+
+python -m recipe.fully_async_policy.fully_async_main \
+	train_batch_size=${train_prompt_bsz} \
+    data.gen_batch_size=${gen_prompt_bsz} \
+    data.return_raw_chat=${return_raw_chat} \
+    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
+    actor_rollout_ref.actor.strategy=fsdp2 \
+    critic.strategy=fsdp2 \
+    actor_rollout_ref.hybrid_engine=False \
+    actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.rollout.name=${rollout_name} \
+    actor_rollout_ref.rollout.mode=${rollout_mode} \
+    actor_rollout_ref.rollout.calculate_log_probs=True \
+    trainer.nnodes="${NNODES_TRAIN}" \
+    trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
+    rollout.nnodes="${NNODES_ROLLOUT}" \
+    rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \
+    rollout.total_rollout_steps="${total_rollout_steps}" \
+    rollout.test_freq="${test_freq}" \
+    async_training.staleness_threshold="${staleness_threshold}" \
+    async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \
+    async_training.partial_rollout="${partial_rollout}"
+```
+
+## Experiments
+
+### Asynchronous Training on 7B Model
+
+We used Qwen2.5-Math-7B to verify the benefits of the fully async strategy under long candidates and multiple resources.
+Using the `async stream pipeline with stale samples` strategy, we achieved about 2x performance improvement on 32 cards,
+64 cards, and 128 cards without significantly affecting experimental results.
+
+* Machine: H20
+* Model: Qwen2.5-Math-7B
+* Rollout length: max_response_length FSDP2: 28K tokens;
+* Algorithm: DAPO
+* Dataset: TRAIN_FILE: dapo-math-17k.parquet TEST_FILE: aime-2024.parquet
+* Engine: vllm+FSDP2
+* rollout.n: 16
+* ppo_mini_batch_size: 32
+* test_freq: 20
+
+* colocate sync:
+    * step: 400
+    * train_batch_size: 512
+
+* fully_async_policy
+    * total_rollout_steps: 512*400
+    * require_batches: 4
+    * trigger_parameter_sync_step: 4
+    * staleness_threshold: 0.3
+    * partial_rollout: True
+
+|  training mode   	   | resource allocation 	 | step  	  |  gen  	  | old_log_prob 	 | update_actor 	 | total time<br>100 step 	 | total time<br>200 step 	 | total time<br>300 step 	 | total time<br>400 step 	 |     acc/mean@1          	      |
+|:--------------------:|:---------------------:|:--------:|:--------:|:--------------:|:--------------:|:------------------------:|:------------------------:|:------------------------:|:------------------------:|:------------------------------:|
+| colocate sync      	 | 32                  	 | 790.10 	 | 357.41 	 | 107.71       	 | 313.81       	 | 13h 44m                	 | 1d 3h 43m              	 | 2d 9h 22m              	 | 3d 17h 5m              	 | max: 0.3313<br>last: 0.2448  	 |
+| fully_async_policy 	 | 16:16               	 |    	     |    	     | \            	 |       	        |            	             |            	             |            	             |            	             | max: <br>last:               	 |
+| colocate sync      	 | 64                  	 | 365.28 	 | 150.72 	 | 70.26        	 | 133.41       	 | 10h 22m                	 | 20h 45m                	 | 1d 7h 6m               	 | 1d 17h 32m             	 | max: 0.3365<br>last:  0.2333 	 |
+| fully_async_policy 	 | 32:32               	 | 189.26 	 | 28.46  	 | \            	 | 156.98       	 | 4h 57m<br>(2.09x)      	 | 10h 14m<br>(2.03x)     	 | 16h 58m<br>(1.83x)     	 | 21h 40m<br>(1.92x)     	 | max: 0.3677<br>last: 0.3406  	 |
+| colocate sync      	 | 128                 	 | 356.30 	 | 177.85 	 | 53.92        	 | 113.81       	 | 8h 36m                 	 | 17h 56m                	 | 1d 5h 6m               	 | 1d 16h 48m             	 | max: 0.3573<br>last: 0.2958  	 |
+| fully_async_policy 	 | 64:64               	 | 150.63 	 | 33.14  	 | \            	 | 113.16       	 | 3h 13m<br>(2.67x)      	 | 6h 46m<br>(2.65x)      	 | 10h 53m<br>(2.67x)     	 | 17h 22m<br>(2.35x)     	 | max: 0.3521<br>last: 0.3094  	 |
+
+> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg
+
+### 128-card 7B Asynchronous Mode Experiment
+
+We used Qwen2.5-Math-7B to verify the effects of various modes supported by fully async.
+We can see that the benefit brought by streaming is approximately 0.6x, and after combining staleness and
+partial_rollout, the benefit reaches 2.35x.
+
+|                             mode                                         	                              |        step  	        |  gen  	  | old_log_prob 	 | update_actor 	 | total time<br>100 step 	 | total time<br>200 step 	 | total time<br>300 step 	 | total time<br>400 step 	 |     acc/mean@1         	      |
+|:-------------------------------------------------------------------------------------------------------:|:---------------------:|:--------:|:--------------:|:--------------:|:------------------------:|:------------------------:|:------------------------:|:------------------------:|:-----------------------------:|
+|                                          colocate sync      	                                           | 128                 	 | 356.30 	 |    177.85 	    | 53.92        	 |      113.81       	      | 8h 36m                 	 | 17h 56m                	 | 1d 5h 6m               	 |   1d 16h 48m             	    | max: 0.3573<br>last: 0.2958  	 |
+| `stream off policy pipeline`<br>(+fully async: trigger_parameter_sync_step= 4,<br>require_batches= 4) 	 |       231.34 	        | 128.47 	 | \            	 | 98.77        	 | 4h 25m                 	 | 9h 41m                 	 | 15h 2m                 	 | 1d 1h 53m              	 | max: 0.2844<br>last: 0.2604 	 |
+|          `async stream pipeline with stale samples`<br>(+staleness_threshold=0.5)            	          |           	           |    	     |       	        |       	        |            	             |            	             |            	             |            	             |               	               |
+|        `async stream pipeline with partial rollout`<br>(+partial_rollout=True)                 	        |       150.63 	        | 33.14  	 | \            	 | 113.16       	 | 3h 13m                 	 | 6h 46m                 	 | 10h 53m                	 | 17h 22m                	 | max: 0.3521<br>last: 0.3094 	 |
+
+### 128-card Stale Ablation Experiment
+
+Under the `async stream pipeline with partial rollout` mode, we verified the impact of staleness settings on training
+efficiency.
+We found that the larger the staleness, the more obvious the final gains.
+We also noticed that the times for staleness values of 0.3 and 0.5 are quite close, because as the training steps
+increase, the response length changes significantly, causing training instability.
+Further analysis and optimization are needed for this issue.
+
+| staleness_threshold 	 | step  	  |  gen  	  | old_log_prob 	 | update_actor 	 | total time<br>100 step 	 | total time<br>200 step 	 | total time<br>300 step 	 | total time<br>400 step 	 |     acc/mean@1         	      |
+|:---------------------:|:--------:|:--------:|:--------------:|:--------------:|:------------------------:|:------------------------:|:------------------------:|:------------------------:|:-----------------------------:|
+| 0                   	 | 231.34 	 | 128.47 	 | \            	 | 98.77        	 | 4h 25m                 	 | 9h 41m                 	 | 15h 2m                 	 | 1d 1h 53m              	 | max: 0.2844<br>last: 0.2604 	 |
+| 0.1                 	 | 171.30 	 | 58.17  	 | \            	 | 109.12       	 | 3h 53m                 	 | 8h 37m                 	 | 14h 25m                	 | 19h 59m                	 | max: 0.3542<br>last: 0.2979 	 |
+| 0.3                 	 | 146.11 	 | 38.88  	 | \            	 | 103.22       	 | 3h 18m                 	 | 6h 49m                 	 | 11h 40m                	 | 17h 20m                	 | max: 0.3469<br>last: 0.2865 	 |
+| 0.5                 	 | 150.63 	 | 33.14  	 | \            	 | 113.16       	 | 3h 13m                 	 | 6h 46m                 	 | 10h 53m                	 | 17h 22m                	 | max: 0.3521<br>last: 0.3094 	 |
+
+> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg
+
+### 128-card 7B require_batches Ablation Experiment
+
+In multiple tests, we found that the number of samples issued each time in streaming affects the response length during
+training, which in turn affects training time. We verified the impact on results by modifying
+`async_training.require_batches`.
+
+| require_batches 	 | step  	  | gen  	  | old_log_prob 	 | update_actor 	 | total time<br>100 step 	 | total time<br>200 step 	 | total time<br>300 step 	 |     acc/mean@1         	      |
+|:-----------------:|:--------:|:-------:|:--------------:|:--------------:|:------------------------:|:------------------------:|:------------------------:|:-----------------------------:|
+| 1               	 | 203.47 	 | 30.88 	 | \            	 | 181.08       	 | 3h 31m                 	 | 8h 29m                 	 | 17h 36m                	 | max: 0.349<br>last: 0.326   	 |
+| 2               	 | 158.72 	 | 26.32 	 | \            	 | 128.08       	 | 3h 35m                 	 | 7h 38m                 	 | 13h 57m                	 | max: 0.351<br>last: 0.3406  	 |
+| 4               	 | 124.64 	 | 25.62 	 | \            	 | 95.06        	 | 3h 13m                 	 | 6h 46m                 	 | 10h 53m                	 | max: 0.3521<br>last: 0.3521 	 |
+
+> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg
+
+### 30B Model Mode Experiment
+
+TODO: The 30B experiment is still in progress.
+
+* Machine: H20
+* Model: Qwen2.5-32B
+* Rollout length: max_response_length FSDP2: 20K tokens;
+* Algorithm: DAPO
+* Engine: vllm+FSDP2
+* rollout.n: 16
+* ppo_mini_batch_size: 32
+* test_freq: 20
+
+* colocate sync:
+    * step:200
+    * train_batch_size: 512
+
+* fully_async_policy
+    * total_rollout_steps: 512*200
+    * trigger_parameter_sync_step: 512/32 = 16
+    * staleness_threshold: 0
+    * partial_rollout: False
+
+| training mode      | Resource allocation | mode                                       | step | generate_sequences | old_log_prob | update_actor | total time | acc/best@32/mean |
+|--------------------|---------------------|--------------------------------------------|------|--------------------|--------------|--------------|------------|------------------|
+| colocate sync      | 128                 |                                            |      |                    |              |              |            |                  |
+| fully_async_policy | 64:64               | stream off policy pipeline                 |      |                    |              |              |            |                  |
+| fully_async_policy | 64:64               | async stream pipeline with stale samples   |      |                    |              |              |            |                  |
+| fully_async_policy | 64:64               | async stream pipeline with partial rollout |      |                    |              |              |            |                  |
+
+> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg
+
+## Future Plans
+
+* GRPO experiments
+* Megatron adaptation
+* SGLang integration
+* Transfer queue integration
+* Asynchronous parameter synchronization
+* AReaL asynchronous algorithm implementation
+* TPPO algorithm implementation
+* Multi-turn and Tool support
\ No newline at end of file
diff --git a/recipe/fully_async_policy/README_zh.md b/recipe/fully_async_policy/README_zh.md
index 87f75db93a4..e6751213841 100644
--- a/recipe/fully_async_policy/README_zh.md
+++ b/recipe/fully_async_policy/README_zh.md
@@ -5,6 +5,7 @@
 Last updated: 10/16/2025.
 
 本文档介绍了完全异步PPO训练系统，该系统实现了 Trainer 和 Rollouter 的完全解耦，支持异步样本生成和训练。
+在该系统下，我们使用128卡训练qwen2.5-7B模型取得了2.35x-2.67x的性能提升,同时效果没有显著受到影响。
 
 ## Introduction
 
@@ -124,12 +125,11 @@ https://github.com/ArronHZG/verl-community/blob/recipe/async_policy/docs/fully_a
   即 old_log_prob必须使用rollout参数及token所对应log_probs，才能保证算法的正确性。在fully
   async策略中，我们默认old_log_prob是有rollout所计算的，而不是由trainer所计算。
 
-  * `async_training.require_batches`
-  
+    * `async_training.require_batches`
+
   在流式训练中，require_batches 应该设置为1，表示生产够ppo_mini_batch_size样本后，就进行训练。
   在实际测试中，我们发现，如果单次下发的样本较少，由于数据分发的顺序，会导致训练不稳定，response 长度变长。
   在这里，我们额外提供 require_batches 进行流式分发，单次参与训练的样本数量控制。
-  
 
 ### 模式支持
 
@@ -252,7 +252,8 @@ python -m recipe.fully_async_policy.fully_async_main \
 
 ### 在7B模型上进行异步训练
 
-我们使用 Qwen2.5-Math-7B 验证 fully async 策略在长候选下，各个资源的收益。
+我们使用 Qwen2.5-Math-7B 验证 fully async 策略在长候选下，多种资源下的收益情况。
+使用`async stream pipeline with staleness samples` 策略，我们在32卡，64卡，128卡都取得2x左右的性能提升，同时没有显著影响实验效果。
 
 * 机器：H20
 * 模型：Qwen2.5-Math-7B
@@ -275,49 +276,54 @@ python -m recipe.fully_async_policy.fully_async_main \
     * staleness_threshold: 0.3
     * partial_rollout: True
 
-|    training mode   	| resource allocation 	|  step  	|   gen  	| old_log_prob 	| update_actor 	| total time<br>100 step 	| total time<br>200 step 	| total time<br>300 step 	| total time<br>400 step 	|          acc/mean@1          	|
-|:------------------:	|:-------------------:	|:------:	|:------:	|:------------:	|:------------:	|:----------------------:	|:----------------------:	|:----------------------:	|:----------------------:	|:----------------------------:	|
-| colocate sync      	| 32                  	| 790.10 	| 357.41 	| 107.71       	| 313.81       	| 13h 44m                	| 1d 3h 43m              	| 2d 9h 22m              	| 3d 17h 5m              	| max: 0.3313<br>last: 0.2448  	|
-| fully_async_policy 	| 16:16               	|        	|        	| \            	|              	|                        	|                        	|                        	|                        	| max: <br>last:               	|
-| colocate sync      	| 64                  	| 365.28 	| 150.72 	| 70.26        	| 133.41       	| 10h 22m                	| 20h 45m                	| 1d 7h 6m               	| 1d 17h 32m             	| max: 0.3365<br>last:  0.2333 	|
-| fully_async_policy 	| 32:32               	| 189.26 	| 28.46  	| \            	| 156.98       	| 4h 57m<br>(2.09x)      	| 10h 14m<br>(2.03x)     	| 16h 58m<br>(1.83x)     	| 21h 40m<br>(1.92x)     	| max: 0.3677<br>last: 0.3406  	|
-| colocate sync      	| 128                 	| 356.30 	| 177.85 	| 53.92        	| 113.81       	| 8h 36m                 	| 17h 56m                	| 1d 5h 6m               	| 1d 16h 48m             	| max: 0.3573<br>last: 0.2958  	|
-| fully_async_policy 	| 64:64               	| 150.63 	| 33.14  	| \            	| 113.16       	| 3h 13m<br>(2.67x)      	| 6h 46m<br>(2.65x)      	| 10h 53m<br>(2.67x)     	| 17h 22m<br>(2.35x)     	| max: 0.3521<br>last: 0.3094  	|
+|  training mode   	   | resource allocation 	 | step  	  |  gen  	  | old_log_prob 	 | update_actor 	 | total time<br>100 step 	 | total time<br>200 step 	 | total time<br>300 step 	 | total time<br>400 step 	 |     acc/mean@1          	      |
+|:--------------------:|:---------------------:|:--------:|:--------:|:--------------:|:--------------:|:------------------------:|:------------------------:|:------------------------:|:------------------------:|:------------------------------:|
+| colocate sync      	 | 32                  	 | 790.10 	 | 357.41 	 | 107.71       	 | 313.81       	 | 13h 44m                	 | 1d 3h 43m              	 | 2d 9h 22m              	 | 3d 17h 5m              	 | max: 0.3313<br>last: 0.2448  	 |
+| fully_async_policy 	 | 16:16               	 |    	     |    	     | \            	 |  	             |            	             |            	             |            	             |            	             | max: <br>last:               	 |
+| colocate sync      	 | 64                  	 | 365.28 	 | 150.72 	 | 70.26        	 | 133.41       	 | 10h 22m                	 | 20h 45m                	 | 1d 7h 6m               	 | 1d 17h 32m             	 | max: 0.3365<br>last:  0.2333 	 |
+| fully_async_policy 	 | 32:32               	 | 189.26 	 | 28.46  	 | \            	 | 156.98       	 | 4h 57m<br>(2.09x)      	 | 10h 14m<br>(2.03x)     	 | 16h 58m<br>(1.83x)     	 | 21h 40m<br>(1.92x)     	 | max: 0.3677<br>last: 0.3406  	 |
+| colocate sync      	 | 128                 	 | 356.30 	 | 177.85 	 | 53.92        	 | 113.81       	 | 8h 36m                 	 | 17h 56m                	 | 1d 5h 6m               	 | 1d 16h 48m             	 | max: 0.3573<br>last: 0.2958  	 |
+| fully_async_policy 	 | 64:64               	 | 150.63 	 | 33.14  	 | \            	 | 113.16       	 | 3h 13m<br>(2.67x)      	 | 6h 46m<br>(2.65x)      	 | 10h 53m<br>(2.67x)     	 | 17h 22m<br>(2.35x)     	 | max: 0.3521<br>last: 0.3094  	 |
 
 > source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg
 
 ### 128卡  7B 异步模式实验
 
-我们使用 Qwen2.5-Math-7B 验证 fully async 所支持的各个模型的效果。
+我们使用 Qwen2.5-Math-7B 验证 fully async 所支持的各个模式的效果。
 
-|                                          mode                                         	|  step  	|   gen  	| old_log_prob 	| update_actor 	| total time<br>100 step 	| total time<br>200 step 	| total time<br>300 step 	| total time<br>400 step 	|          acc/mean@1         	|
-|:-------------------------------------------------------------------------------------:	|:------:	|:------:	|:------------:	|:------------:	|:----------------------:	|:----------------------:	|:----------------------:	|:----------------------:	|:---------------------------:	|
-| `stream off policy pipeline`<br>(trigger_parameter_sync_step= 4,<br>require_batches= 4) 	| 231.34 	| 128.47 	| \            	| 98.77        	| 4h 25m                 	| 9h 41m                 	| 15h 2m                 	| 1d 1h 53m              	| max: 0.2844<br>last: 0.2604 	|
-| `async stream pipeline with staleness samples`<br>(+staleness_threshold=0.5)            	|        	|        	|              	|              	|                        	|                        	|                        	|                        	|                             	|
-| `async stream pipeline with partial rollout`<br>(+partial_rollout=True)                 	| 150.63 	| 33.14  	| \            	| 113.16       	| 3h 13m                 	| 6h 46m                 	| 10h 53m                	| 17h 22m                	| max: 0.3521<br>last: 0.3094 	|
+|                             mode                                         	                              |        step  	        |  gen  	  | old_log_prob 	 | update_actor 	 | total time<br>100 step 	 | total time<br>200 step 	 | total time<br>300 step 	 | total time<br>400 step 	 |     acc/mean@1         	      |
+|:-------------------------------------------------------------------------------------------------------:|:---------------------:|:--------:|:--------------:|:--------------:|:------------------------:|:------------------------:|:------------------------:|:------------------------:|:-----------------------------:|
+|                                          colocate sync      	                                           | 128                 	 | 356.30 	 |    177.85 	    | 53.92        	 |      113.81       	      | 8h 36m                 	 | 17h 56m                	 | 1d 5h 6m               	 |   1d 16h 48m             	    | max: 0.3573<br>last: 0.2958  	 |
+| `stream off policy pipeline`<br>(+fully async: trigger_parameter_sync_step= 4,<br>require_batches= 4) 	 |       231.34 	        | 128.47 	 | \            	 | 98.77        	 | 4h 25m                 	 | 9h 41m                 	 | 15h 2m                 	 | 1d 1h 53m              	 | max: 0.2844<br>last: 0.2604 	 |
+|        `async stream pipeline with staleness samples`<br>(+staleness_threshold=0.5)            	        |           	           |    	     |       	        |       	        |            	             |            	             |            	             |            	             |               	               |
+|        `async stream pipeline with partial rollout`<br>(+partial_rollout=True)                 	        |       150.63 	        | 33.14  	 | \            	 | 113.16       	 | 3h 13m                 	 | 6h 46m                 	 | 10h 53m                	 | 17h 22m                	 | max: 0.3521<br>last: 0.3094 	 |
 
 ### 128卡 stale 消融实验
 
 在 `async stream pipeline with partial rollout` 模式下，我们验证 staleness 的设置对于训练效率的影响。
+我们可以发现，staleness 越大，最终取得的收益越明显。
+同时我们也注意到 staleness 取 0.3 和 0.5 的时间比较接近，原因是随着训练步数的增量，response 长度变化较大，训练出现了不稳定的问题。
+后续还需要针对该问题进行进一步的分析和优化。
 
-| staleness_threshold 	|  step  	|   gen  	| old_log_prob 	| update_actor 	| total time<br>100 step 	| total time<br>200 step 	| total time<br>300 step 	| total time<br>400 step 	|          acc/mean@1         	|
-|:-------------------:	|:------:	|:------:	|:------------:	|:------------:	|:----------------------:	|:----------------------:	|:----------------------:	|:----------------------:	|:---------------------------:	|
-| 0                   	| 231.34 	| 128.47 	| \            	| 98.77        	| 4h 25m                 	| 9h 41m                 	| 15h 2m                 	| 1d 1h 53m              	| max: 0.2844<br>last: 0.2604 	|
-| 0.1                 	| 171.30 	| 58.17  	| \            	| 109.12       	| 3h 53m                 	| 8h 37m                 	| 14h 25m                	| 19h 59m                	| max: 0.3542<br>last: 0.2979 	|
-| 0.3                 	| 146.11 	| 38.88  	| \            	| 103.22       	| 3h 18m                 	| 6h 49m                 	| 11h 40m                	| 17h 20m                	| max: 0.3469<br>last: 0.2865 	|
-| 0.5                 	| 150.63 	| 33.14  	| \            	| 113.16       	| 3h 13m                 	| 6h 46m                 	| 10h 53m                	| 17h 22m                	| max: 0.3521<br>last: 0.3094 	|
+| staleness_threshold 	 | step  	  |  gen  	  | old_log_prob 	 | update_actor 	 | total time<br>100 step 	 | total time<br>200 step 	 | total time<br>300 step 	 | total time<br>400 step 	 |     acc/mean@1         	      |
+|:---------------------:|:--------:|:--------:|:--------------:|:--------------:|:------------------------:|:------------------------:|:------------------------:|:------------------------:|:-----------------------------:|
+| 0                   	 | 231.34 	 | 128.47 	 | \            	 | 98.77        	 | 4h 25m                 	 | 9h 41m                 	 | 15h 2m                 	 | 1d 1h 53m              	 | max: 0.2844<br>last: 0.2604 	 |
+| 0.1                 	 | 171.30 	 | 58.17  	 | \            	 | 109.12       	 | 3h 53m                 	 | 8h 37m                 	 | 14h 25m                	 | 19h 59m                	 | max: 0.3542<br>last: 0.2979 	 |
+| 0.3                 	 | 146.11 	 | 38.88  	 | \            	 | 103.22       	 | 3h 18m                 	 | 6h 49m                 	 | 11h 40m                	 | 17h 20m                	 | max: 0.3469<br>last: 0.2865 	 |
+| 0.5                 	 | 150.63 	 | 33.14  	 | \            	 | 113.16       	 | 3h 13m                 	 | 6h 46m                 	 | 10h 53m                	 | 17h 22m                	 | max: 0.3521<br>last: 0.3094 	 |
 
 > source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg
 
 ### 128卡  7B require_batches 消融实验
 
-在多次测试下，我们发现流式每次下发样本的数量，会影响训练的结果，我们通过修改 `async_training.require_batches` 验证对与结果的影响。
+在多次测试下，我们发现流式每次下发样本的数量会影响训练的response长度，进而影响训练时长，我们通过修改
+`async_training.require_batches` 验证对与结果的影响。
 
-| require_batches 	|  step  	|  gen  	| old_log_prob 	| update_actor 	| total time<br>100 step 	| total time<br>200 step 	| total time<br>300 step 	|          acc/mean@1         	|
-|:---------------:	|:------:	|:-----:	|:------------:	|:------------:	|:----------------------:	|:----------------------:	|:----------------------:	|:---------------------------:	|
-| 1               	| 203.47 	| 30.88 	| \            	| 181.08       	| 3h 31m                 	| 8h 29m                 	| 17h 36m                	| max: 0.349<br>last: 0.326   	|
-| 2               	| 158.72 	| 26.32 	| \            	| 128.08       	| 3h 35m                 	| 7h 38m                 	| 13h 57m                	| max: 0.351<br>last: 0.3406  	|
-| 4               	| 124.64 	| 25.62 	| \            	| 95.06        	| 3h 13m                 	| 6h 46m                 	| 10h 53m                	| max: 0.3521<br>last: 0.3521 	|
+| require_batches 	 | step  	  | gen  	  | old_log_prob 	 | update_actor 	 | total time<br>100 step 	 | total time<br>200 step 	 | total time<br>300 step 	 |     acc/mean@1         	      |
+|:-----------------:|:--------:|:-------:|:--------------:|:--------------:|:------------------------:|:------------------------:|:------------------------:|:-----------------------------:|
+| 1               	 | 203.47 	 | 30.88 	 | \            	 | 181.08       	 | 3h 31m                 	 | 8h 29m                 	 | 17h 36m                	 | max: 0.349<br>last: 0.326   	 |
+| 2               	 | 158.72 	 | 26.32 	 | \            	 | 128.08       	 | 3h 35m                 	 | 7h 38m                 	 | 13h 57m                	 | max: 0.351<br>last: 0.3406  	 |
+| 4               	 | 124.64 	 | 25.62 	 | \            	 | 95.06        	 | 3h 13m                 	 | 6h 46m                 	 | 10h 53m                	 | max: 0.3521<br>last: 0.3521 	 |
 
 > source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg
 
diff --git a/verl/workers/actor/dp_actor.py b/verl/workers/actor/dp_actor.py
index 5955dfc33ed..7dd531ad266 100644
--- a/verl/workers/actor/dp_actor.py
+++ b/verl/workers/actor/dp_actor.py
@@ -429,7 +429,6 @@ def update_policy(self, data: DataProto):
 
                     # for fully_async_policy recipe
                     if hasattr(self.config, "use_rollout_log_probs") and self.config.use_rollout_log_probs:
-                        print("for fully_async_policy recipe")
                         old_log_prob = model_inputs["old_log_probs"]
                     else:
                         if on_policy:

From de055102c9451852c0384756321bcfe8f9f71958 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Thu, 16 Oct 2025 19:26:16 +0800
Subject: [PATCH 176/182] update readme

---
 docs/advance/fully_async.md            | 11 ++++++-----
 recipe/fully_async_policy/README.md    | 11 ++++++-----
 recipe/fully_async_policy/README_zh.md | 16 ++++++++++------
 3 files changed, 22 insertions(+), 16 deletions(-)

diff --git a/docs/advance/fully_async.md b/docs/advance/fully_async.md
index 77498131e45..6dac051922c 100644
--- a/docs/advance/fully_async.md
+++ b/docs/advance/fully_async.md
@@ -336,7 +336,7 @@ Using the `async stream pipeline with stale samples` strategy, we achieved about
 | colocate sync      	 | 128                 	 | 356.30 	 | 177.85 	 | 53.92        	 | 113.81       	 | 8h 36m                 	 | 17h 56m                	 | 1d 5h 6m               	 | 1d 16h 48m             	 | max: 0.3573<br>last: 0.2958  	 |
 | fully_async_policy 	 | 64:64               	 | 150.63 	 | 33.14  	 | \            	 | 113.16       	 | 3h 13m<br>(2.67x)      	 | 6h 46m<br>(2.65x)      	 | 10h 53m<br>(2.67x)     	 | 17h 22m<br>(2.35x)     	 | max: 0.3521<br>last: 0.3094  	 |
 
-> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg
+> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy-colocate_async?nw=nwuserhouzg
 
 ### 128-card 7B Asynchronous Mode Experiment
 
@@ -351,6 +351,8 @@ partial_rollout, the benefit reaches 2.35x.
 |          `async stream pipeline with stale samples`<br>(+staleness_threshold=0.5)            	          |           	           |    	     |       	        |       	        |            	             |            	             |            	             |            	             |               	               |
 |        `async stream pipeline with partial rollout`<br>(+partial_rollout=True)                 	        |       150.63 	        | 33.14  	 | \            	 | 113.16       	 | 3h 13m                 	 | 6h 46m                 	 | 10h 53m                	 | 17h 22m                	 | max: 0.3521<br>last: 0.3094 	 |
 
+> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy-stream_stale_partial?nw=nwuserhouzg
+
 ### 128-card Stale Ablation Experiment
 
 Under the `async stream pipeline with partial rollout` mode, we verified the impact of staleness settings on training
@@ -367,7 +369,7 @@ Further analysis and optimization are needed for this issue.
 | 0.3                 	 | 146.11 	 | 38.88  	 | \            	 | 103.22       	 | 3h 18m                 	 | 6h 49m                 	 | 11h 40m                	 | 17h 20m                	 | max: 0.3469<br>last: 0.2865 	 |
 | 0.5                 	 | 150.63 	 | 33.14  	 | \            	 | 113.16       	 | 3h 13m                 	 | 6h 46m                 	 | 10h 53m                	 | 17h 22m                	 | max: 0.3521<br>last: 0.3094 	 |
 
-> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg
+> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy-stream_stale_partial?nw=nwuserhouzg
 
 ### 128-card 7B require_batches Ablation Experiment
 
@@ -381,14 +383,14 @@ training, which in turn affects training time. We verified the impact on results
 | 2               	 | 158.72 	 | 26.32 	 | \            	 | 128.08       	 | 3h 35m                 	 | 7h 38m                 	 | 13h 57m                	 | max: 0.351<br>last: 0.3406  	 |
 | 4               	 | 124.64 	 | 25.62 	 | \            	 | 95.06        	 | 3h 13m                 	 | 6h 46m                 	 | 10h 53m                	 | max: 0.3521<br>last: 0.3521 	 |
 
-> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg
+> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy-ablation_require_batches?nw=nwuserhouzg
 
 ### 30B Model Mode Experiment
 
 TODO: The 30B experiment is still in progress.
 
 * Machine: H20
-* Model: Qwen2.5-32B
+* Model: Qwen2.5-32B~~~~
 * Rollout length: max_response_length FSDP2: 20K tokens;
 * Algorithm: DAPO
 * Engine: vllm+FSDP2
@@ -413,7 +415,6 @@ TODO: The 30B experiment is still in progress.
 | fully_async_policy | 64:64               | async stream pipeline with stale samples   |      |                    |              |              |            |                  |
 | fully_async_policy | 64:64               | async stream pipeline with partial rollout |      |                    |              |              |            |                  |
 
-> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg
 
 ## Future Plans
 
diff --git a/recipe/fully_async_policy/README.md b/recipe/fully_async_policy/README.md
index 77498131e45..6dac051922c 100644
--- a/recipe/fully_async_policy/README.md
+++ b/recipe/fully_async_policy/README.md
@@ -336,7 +336,7 @@ Using the `async stream pipeline with stale samples` strategy, we achieved about
 | colocate sync      	 | 128                 	 | 356.30 	 | 177.85 	 | 53.92        	 | 113.81       	 | 8h 36m                 	 | 17h 56m                	 | 1d 5h 6m               	 | 1d 16h 48m             	 | max: 0.3573<br>last: 0.2958  	 |
 | fully_async_policy 	 | 64:64               	 | 150.63 	 | 33.14  	 | \            	 | 113.16       	 | 3h 13m<br>(2.67x)      	 | 6h 46m<br>(2.65x)      	 | 10h 53m<br>(2.67x)     	 | 17h 22m<br>(2.35x)     	 | max: 0.3521<br>last: 0.3094  	 |
 
-> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg
+> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy-colocate_async?nw=nwuserhouzg
 
 ### 128-card 7B Asynchronous Mode Experiment
 
@@ -351,6 +351,8 @@ partial_rollout, the benefit reaches 2.35x.
 |          `async stream pipeline with stale samples`<br>(+staleness_threshold=0.5)            	          |           	           |    	     |       	        |       	        |            	             |            	             |            	             |            	             |               	               |
 |        `async stream pipeline with partial rollout`<br>(+partial_rollout=True)                 	        |       150.63 	        | 33.14  	 | \            	 | 113.16       	 | 3h 13m                 	 | 6h 46m                 	 | 10h 53m                	 | 17h 22m                	 | max: 0.3521<br>last: 0.3094 	 |
 
+> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy-stream_stale_partial?nw=nwuserhouzg
+
 ### 128-card Stale Ablation Experiment
 
 Under the `async stream pipeline with partial rollout` mode, we verified the impact of staleness settings on training
@@ -367,7 +369,7 @@ Further analysis and optimization are needed for this issue.
 | 0.3                 	 | 146.11 	 | 38.88  	 | \            	 | 103.22       	 | 3h 18m                 	 | 6h 49m                 	 | 11h 40m                	 | 17h 20m                	 | max: 0.3469<br>last: 0.2865 	 |
 | 0.5                 	 | 150.63 	 | 33.14  	 | \            	 | 113.16       	 | 3h 13m                 	 | 6h 46m                 	 | 10h 53m                	 | 17h 22m                	 | max: 0.3521<br>last: 0.3094 	 |
 
-> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg
+> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy-stream_stale_partial?nw=nwuserhouzg
 
 ### 128-card 7B require_batches Ablation Experiment
 
@@ -381,14 +383,14 @@ training, which in turn affects training time. We verified the impact on results
 | 2               	 | 158.72 	 | 26.32 	 | \            	 | 128.08       	 | 3h 35m                 	 | 7h 38m                 	 | 13h 57m                	 | max: 0.351<br>last: 0.3406  	 |
 | 4               	 | 124.64 	 | 25.62 	 | \            	 | 95.06        	 | 3h 13m                 	 | 6h 46m                 	 | 10h 53m                	 | max: 0.3521<br>last: 0.3521 	 |
 
-> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg
+> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy-ablation_require_batches?nw=nwuserhouzg
 
 ### 30B Model Mode Experiment
 
 TODO: The 30B experiment is still in progress.
 
 * Machine: H20
-* Model: Qwen2.5-32B
+* Model: Qwen2.5-32B~~~~
 * Rollout length: max_response_length FSDP2: 20K tokens;
 * Algorithm: DAPO
 * Engine: vllm+FSDP2
@@ -413,7 +415,6 @@ TODO: The 30B experiment is still in progress.
 | fully_async_policy | 64:64               | async stream pipeline with stale samples   |      |                    |              |              |            |                  |
 | fully_async_policy | 64:64               | async stream pipeline with partial rollout |      |                    |              |              |            |                  |
 
-> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg
 
 ## Future Plans
 
diff --git a/recipe/fully_async_policy/README_zh.md b/recipe/fully_async_policy/README_zh.md
index e6751213841..ea0e8c14679 100644
--- a/recipe/fully_async_policy/README_zh.md
+++ b/recipe/fully_async_policy/README_zh.md
@@ -279,17 +279,18 @@ python -m recipe.fully_async_policy.fully_async_main \
 |  training mode   	   | resource allocation 	 | step  	  |  gen  	  | old_log_prob 	 | update_actor 	 | total time<br>100 step 	 | total time<br>200 step 	 | total time<br>300 step 	 | total time<br>400 step 	 |     acc/mean@1          	      |
 |:--------------------:|:---------------------:|:--------:|:--------:|:--------------:|:--------------:|:------------------------:|:------------------------:|:------------------------:|:------------------------:|:------------------------------:|
 | colocate sync      	 | 32                  	 | 790.10 	 | 357.41 	 | 107.71       	 | 313.81       	 | 13h 44m                	 | 1d 3h 43m              	 | 2d 9h 22m              	 | 3d 17h 5m              	 | max: 0.3313<br>last: 0.2448  	 |
-| fully_async_policy 	 | 16:16               	 |    	     |    	     | \            	 |  	             |            	             |            	             |            	             |            	             | max: <br>last:               	 |
+| fully_async_policy 	 | 16:16               	 |    	     |    	     | \            	 |       	        |            	             |            	             |            	             |            	             | max: <br>last:               	 |
 | colocate sync      	 | 64                  	 | 365.28 	 | 150.72 	 | 70.26        	 | 133.41       	 | 10h 22m                	 | 20h 45m                	 | 1d 7h 6m               	 | 1d 17h 32m             	 | max: 0.3365<br>last:  0.2333 	 |
 | fully_async_policy 	 | 32:32               	 | 189.26 	 | 28.46  	 | \            	 | 156.98       	 | 4h 57m<br>(2.09x)      	 | 10h 14m<br>(2.03x)     	 | 16h 58m<br>(1.83x)     	 | 21h 40m<br>(1.92x)     	 | max: 0.3677<br>last: 0.3406  	 |
 | colocate sync      	 | 128                 	 | 356.30 	 | 177.85 	 | 53.92        	 | 113.81       	 | 8h 36m                 	 | 17h 56m                	 | 1d 5h 6m               	 | 1d 16h 48m             	 | max: 0.3573<br>last: 0.2958  	 |
 | fully_async_policy 	 | 64:64               	 | 150.63 	 | 33.14  	 | \            	 | 113.16       	 | 3h 13m<br>(2.67x)      	 | 6h 46m<br>(2.65x)      	 | 10h 53m<br>(2.67x)     	 | 17h 22m<br>(2.35x)     	 | max: 0.3521<br>last: 0.3094  	 |
 
-> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg
+> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy-colocate_async?nw=nwuserhouzg
 
 ### 128卡  7B 异步模式实验
 
 我们使用 Qwen2.5-Math-7B 验证 fully async 所支持的各个模式的效果。
+我们可以看到 stream 带来的收益大约0.6x，叠加 staleness 和 partial_rollout 后，收益为2.35x。
 
 |                             mode                                         	                              |        step  	        |  gen  	  | old_log_prob 	 | update_actor 	 | total time<br>100 step 	 | total time<br>200 step 	 | total time<br>300 step 	 | total time<br>400 step 	 |     acc/mean@1         	      |
 |:-------------------------------------------------------------------------------------------------------:|:---------------------:|:--------:|:--------------:|:--------------:|:------------------------:|:------------------------:|:------------------------:|:------------------------:|:-----------------------------:|
@@ -298,6 +299,8 @@ python -m recipe.fully_async_policy.fully_async_main \
 |        `async stream pipeline with staleness samples`<br>(+staleness_threshold=0.5)            	        |           	           |    	     |       	        |       	        |            	             |            	             |            	             |            	             |               	               |
 |        `async stream pipeline with partial rollout`<br>(+partial_rollout=True)                 	        |       150.63 	        | 33.14  	 | \            	 | 113.16       	 | 3h 13m                 	 | 6h 46m                 	 | 10h 53m                	 | 17h 22m                	 | max: 0.3521<br>last: 0.3094 	 |
 
+> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy-stream_stale_partial?nw=nwuserhouzg
+
 ### 128卡 stale 消融实验
 
 在 `async stream pipeline with partial rollout` 模式下，我们验证 staleness 的设置对于训练效率的影响。
@@ -312,9 +315,9 @@ python -m recipe.fully_async_policy.fully_async_main \
 | 0.3                 	 | 146.11 	 | 38.88  	 | \            	 | 103.22       	 | 3h 18m                 	 | 6h 49m                 	 | 11h 40m                	 | 17h 20m                	 | max: 0.3469<br>last: 0.2865 	 |
 | 0.5                 	 | 150.63 	 | 33.14  	 | \            	 | 113.16       	 | 3h 13m                 	 | 6h 46m                 	 | 10h 53m                	 | 17h 22m                	 | max: 0.3521<br>last: 0.3094 	 |
 
-> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg
+> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy-ablation_stale?nw=nwuserhouzg
 
-### 128卡  7B require_batches 消融实验
+### 128卡 7B require_batches 消融实验
 
 在多次测试下，我们发现流式每次下发样本的数量会影响训练的response长度，进而影响训练时长，我们通过修改
 `async_training.require_batches` 验证对与结果的影响。
@@ -325,10 +328,12 @@ python -m recipe.fully_async_policy.fully_async_main \
 | 2               	 | 158.72 	 | 26.32 	 | \            	 | 128.08       	 | 3h 35m                 	 | 7h 38m                 	 | 13h 57m                	 | max: 0.351<br>last: 0.3406  	 |
 | 4               	 | 124.64 	 | 25.62 	 | \            	 | 95.06        	 | 3h 13m                 	 | 6h 46m                 	 | 10h 53m                	 | max: 0.3521<br>last: 0.3521 	 |
 
-> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg
+> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy-ablation_require_batches?nw=nwuserhouzg
 
 ### 30B模型模式实验
 
+TODO: 30B 的实验，还在完善中。
+
 * 机器: H20
 * 模型：Qwen2.5-32B
 * rollout长度：max_response_length FSDP2: 20K tokens;
@@ -355,7 +360,6 @@ python -m recipe.fully_async_policy.fully_async_main \
 | fully_async_policy | 64:64               | async stream pipeline with staleness samples |      |                    |              |              |            |                  |
 | fully_async_policy | 64:64               | async stream pipeline with partial rollout   |      |                    |              |              |            |                  |
 
-> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg
 
 ## 后续计划
 

From 4e122bfd709fd0bcddedbe0a8d576649428a1b68 Mon Sep 17 00:00:00 2001
From: wangshulin02 <953550366@qq.com>
Date: Fri, 17 Oct 2025 10:46:15 +0800
Subject: [PATCH 177/182] update shell script

---
 ...2_64_64.sh => dapo_7b_math_fsdp2_16-16.sh} |  30 ++--
 .../shell/dapo_7b_math_fsdp2_32_32.sh         | 162 ++++++++++++++++++
 .../shell/dapo_7b_math_fsdp2_4_12.sh          |  12 +-
 .../shell/dapo_7b_math_fsdp2_4_4.sh           |  16 +-
 .../shell/dapo_7b_math_fsdp2_64_64.sh         |  20 +--
 .../shell/dapo_7b_math_fsdp2_8_8.sh           |  16 +-
 6 files changed, 200 insertions(+), 56 deletions(-)
 rename recipe/fully_async_policy/shell/{dapo-32B_fsdp2_64_64.sh => dapo_7b_math_fsdp2_16-16.sh} (89%)
 create mode 100644 recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_32_32.sh

diff --git a/recipe/fully_async_policy/shell/dapo-32B_fsdp2_64_64.sh b/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_16-16.sh
similarity index 89%
rename from recipe/fully_async_policy/shell/dapo-32B_fsdp2_64_64.sh
rename to recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_16-16.sh
index 324a7d9470e..82072c3a0eb 100644
--- a/recipe/fully_async_policy/shell/dapo-32B_fsdp2_64_64.sh
+++ b/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_16-16.sh
@@ -2,7 +2,7 @@
 set -xeuo pipefail
 
 project_name='DAPO'
-exp_name='dapo_qwen2-32B_20k_fsdp2_fully-async_64-64'
+exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_16-16'
 
 # Ray
 # RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
@@ -16,11 +16,6 @@ CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"}
 TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"}
 TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"}
 
-MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B
-CKPTS_DIR=./ckpts/${project_name}/${exp_name}
-TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet
-TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet
-
 rollout_mode="async"
 rollout_name="vllm" # sglang or vllm
 if [ "$rollout_mode" = "async" ]; then
@@ -41,7 +36,7 @@ clip_ratio_high=0.28
 
 # Response length parameters
 max_prompt_length=$((1024 * 2))
-max_response_length=$((1024 * 20))
+max_response_length=$((1024 * 28))
 enable_overlong_buffer=True
 overlong_buffer_len=$((1024 * 4))
 overlong_penalty_factor=1.0
@@ -62,23 +57,24 @@ infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
 ref_offload=True
 actor_offload=False
 gen_tp=4
-sp_size=8
-fsdp_size=-1
+sp_size=4
+fsdp_size=8
 
 # Fully async specific parameters
-NNODES_ROLLOUT=${NNODES_ROLLOUT:-8}
-NNODES_TRAIN=${NNODES_TRAIN:-8}
+NNODES_ROLLOUT=${NNODES_ROLLOUT:-2}
+NNODES_TRAIN=${NNODES_TRAIN:-2}
 NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
 
 train_prompt_bsz=0
 gen_prompt_bsz=1
 n_resp_per_prompt=16
 train_prompt_mini_bsz=32
-total_rollout_steps=$(((512*200)))
+total_rollout_steps=$(((512*400)))
 test_freq=20
-staleness_threshold=0
-trigger_parameter_sync_step=16
-partial_rollout=False
+staleness_threshold=0.1
+trigger_parameter_sync_step=4
+require_batches=4
+partial_rollout=True
 
 python -m recipe.fully_async_policy.fully_async_main \
     data.train_files="${TRAIN_FILE}" \
@@ -161,4 +157,6 @@ python -m recipe.fully_async_policy.fully_async_main \
     rollout.test_freq="${test_freq}" \
     async_training.staleness_threshold="${staleness_threshold}" \
     async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \
-    async_training.partial_rollout="${partial_rollout}"
+    async_training.require_batches="${require_batches}" \
+    async_training.partial_rollout="${partial_rollout}" \
+    async_training.use_rollout_log_probs=True
diff --git a/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_32_32.sh b/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_32_32.sh
new file mode 100644
index 00000000000..ded0b0d42cd
--- /dev/null
+++ b/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_32_32.sh
@@ -0,0 +1,162 @@
+#!/usr/bin/env bash
+set -xeuo pipefail
+
+project_name='DAPO'
+exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_32-32'
+
+# Ray
+# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
+# WORKING_DIR=${WORKING_DIR:-"${PWD}"}
+# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
+# Paths
+RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
+# very important! please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface
+MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen2.5-Math-7B"}
+CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"}
+TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"}
+TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"}
+
+rollout_mode="async"
+rollout_name="vllm" # sglang or vllm
+if [ "$rollout_mode" = "async" ]; then
+    export VLLM_USE_V1=1
+    return_raw_chat="True"
+fi
+
+# Algorithm parameters
+adv_estimator=grpo
+
+use_kl_in_reward=False
+kl_coef=0.0
+use_kl_loss=False
+kl_loss_coef=0.0
+
+clip_ratio_low=0.2
+clip_ratio_high=0.28
+
+# Response length parameters
+max_prompt_length=$((1024 * 2))
+max_response_length=$((1024 * 28))
+enable_overlong_buffer=True
+overlong_buffer_len=$((1024 * 4))
+overlong_penalty_factor=1.0
+
+# Training parameters
+loss_agg_mode="token-mean"
+
+# Algorithm
+temperature=1.0
+top_p=1.0
+top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
+val_top_p=0.7
+
+# Performance Related Parameter
+use_dynamic_bsz=True
+actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
+infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
+ref_offload=True
+actor_offload=False
+gen_tp=4
+sp_size=4
+fsdp_size=8
+
+# Fully async specific parameters
+NNODES_ROLLOUT=${NNODES_ROLLOUT:-4}
+NNODES_TRAIN=${NNODES_TRAIN:-4}
+NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
+
+train_prompt_bsz=0
+gen_prompt_bsz=1
+n_resp_per_prompt=16
+train_prompt_mini_bsz=32
+total_rollout_steps=$(((512*400)))
+test_freq=20
+staleness_threshold=0.1
+trigger_parameter_sync_step=4
+require_batches=4
+partial_rollout=True
+
+python -m recipe.fully_async_policy.fully_async_main \
+    data.train_files="${TRAIN_FILE}" \
+    data.val_files="${TEST_FILE}" \
+    data.prompt_key=prompt \
+    data.truncation='left' \
+    data.max_prompt_length=${max_prompt_length} \
+    data.max_response_length=${max_response_length} \
+    data.train_batch_size=${train_prompt_bsz} \
+    data.gen_batch_size=${gen_prompt_bsz} \
+    data.return_raw_chat=${return_raw_chat} \
+    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
+    algorithm.adv_estimator=${adv_estimator} \
+    algorithm.use_kl_in_reward=${use_kl_in_reward} \
+    algorithm.kl_ctrl.kl_coef=${kl_coef} \
+    actor_rollout_ref.actor.strategy=fsdp2 \
+    critic.strategy=fsdp2 \
+    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
+    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
+    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
+    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
+    actor_rollout_ref.actor.clip_ratio_c=10.0 \
+    actor_rollout_ref.model.use_remove_padding=True \
+    actor_rollout_ref.hybrid_engine=False \
+    +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \
+    actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
+    actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+    actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+    actor_rollout_ref.model.path="${MODEL_PATH}" \
+    actor_rollout_ref.actor.optim.lr=1e-6 \
+    actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
+    actor_rollout_ref.actor.optim.weight_decay=0.1 \
+    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
+    actor_rollout_ref.actor.fsdp_config.param_offload=${actor_offload} \
+    actor_rollout_ref.actor.fsdp_config.optimizer_offload=${actor_offload} \
+    actor_rollout_ref.actor.entropy_coeff=0 \
+    actor_rollout_ref.actor.grad_clip=1.0 \
+    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
+    actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \
+    actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
+    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
+    actor_rollout_ref.rollout.enable_chunked_prefill=True \
+    actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
+    actor_rollout_ref.rollout.temperature=${temperature} \
+    actor_rollout_ref.rollout.top_p=${top_p} \
+    actor_rollout_ref.rollout.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
+    actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
+    actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
+    actor_rollout_ref.rollout.val_kwargs.n=1 \
+    actor_rollout_ref.ref.fsdp_config.param_offload=${ref_offload} \
+    actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \
+    actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \
+    actor_rollout_ref.rollout.name=${rollout_name} \
+    actor_rollout_ref.rollout.mode=${rollout_mode} \
+    actor_rollout_ref.rollout.calculate_log_probs=True \
+    reward_model.reward_manager=dapo \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
+    +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
+    trainer.logger=['console','tensorboard'] \
+    trainer.project_name="${project_name}" \
+    trainer.experiment_name="${exp_name}" \
+    trainer.val_before_train=True \
+    trainer.save_freq=-1 \
+    trainer.default_local_dir="${CKPTS_DIR}" \
+    trainer.resume_mode=auto \
+    trainer.nnodes="${NNODES_TRAIN}" \
+    trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
+    rollout.nnodes="${NNODES_ROLLOUT}" \
+    rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \
+    rollout.total_rollout_steps="${total_rollout_steps}" \
+    rollout.total_epochs=10 \
+    rollout.test_freq="${test_freq}" \
+    async_training.staleness_threshold="${staleness_threshold}" \
+    async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \
+    async_training.require_batches="${require_batches}" \
+    async_training.partial_rollout="${partial_rollout}" \
+    async_training.use_rollout_log_probs=True
diff --git a/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_4_12.sh b/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_4_12.sh
index dbfbee8fdfc..18888fd161c 100644
--- a/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_4_12.sh
+++ b/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_4_12.sh
@@ -16,11 +16,6 @@ CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"}
 TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"}
 TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"}
 
-MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B
-CKPTS_DIR=./ckpts/${project_name}/${exp_name}
-TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet
-TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet
-
 rollout_mode="async"
 rollout_name="vllm" # sglang or vllm
 if [ "$rollout_mode" = "async" ]; then
@@ -79,7 +74,8 @@ train_prompt_mini_bsz=32
 total_rollout_steps=$(((512*100)))
 test_freq=10
 staleness_threshold=0.1
-trigger_parameter_sync_step=16
+trigger_parameter_sync_step=4
+require_batches=4
 partial_rollout=True
 
 python -m recipe.fully_async_policy.fully_async_main \
@@ -163,4 +159,6 @@ python -m recipe.fully_async_policy.fully_async_main \
     rollout.total_epochs=10 \
     async_training.staleness_threshold="${staleness_threshold}" \
     async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \
-    async_training.partial_rollout="${partial_rollout}"
+    async_training.require_batches="${require_batches}" \
+    async_training.partial_rollout="${partial_rollout}" \
+    async_training.use_rollout_log_probs=True
\ No newline at end of file
diff --git a/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_4_4.sh b/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_4_4.sh
index 6f64caaea0a..bd56bdd424b 100644
--- a/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_4_4.sh
+++ b/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_4_4.sh
@@ -16,15 +16,6 @@ CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"}
 TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"}
 TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"}
 
-MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B
-CKPTS_DIR=./ckpts/${project_name}/${exp_name}
-TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet
-TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet
-
-MODEL_PATH=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/modelscope/hub/models/Qwen/Qwen2___5-Math-7B
-TRAIN_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/dapo-math-17k.parquet
-TEST_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/aime-2024.parquet
-
 rollout_mode="async"
 rollout_name="vllm" # sglang or vllm
 if [ "$rollout_mode" = "async" ]; then
@@ -83,7 +74,8 @@ train_prompt_mini_bsz=32
 total_rollout_steps=$(((512*100)))
 test_freq=10
 staleness_threshold=0.1
-trigger_parameter_sync_step=16
+trigger_parameter_sync_step=4
+require_batches=4
 partial_rollout=True
 
 python -m recipe.fully_async_policy.fully_async_main \
@@ -167,4 +159,6 @@ python -m recipe.fully_async_policy.fully_async_main \
     rollout.test_freq="${test_freq}" \
     async_training.staleness_threshold="${staleness_threshold}" \
     async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \
-    async_training.partial_rollout="${partial_rollout}"
+    async_training.require_batches="${require_batches}" \
+    async_training.partial_rollout="${partial_rollout}" \
+    async_training.use_rollout_log_probs=True
\ No newline at end of file
diff --git a/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_64_64.sh b/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_64_64.sh
index 9e77ed3e567..c03e880eec8 100644
--- a/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_64_64.sh
+++ b/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_64_64.sh
@@ -16,11 +16,6 @@ CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"}
 TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"}
 TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"}
 
-MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B
-CKPTS_DIR=./ckpts/${project_name}/${exp_name}
-TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet
-TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet
-
 rollout_mode="async"
 rollout_name="vllm" # sglang or vllm
 if [ "$rollout_mode" = "async" ]; then
@@ -63,7 +58,7 @@ ref_offload=True
 actor_offload=False
 gen_tp=4
 sp_size=4
-fsdp_size=2
+fsdp_size=8
 
 # Fully async specific parameters
 NNODES_ROLLOUT=${NNODES_ROLLOUT:-8}
@@ -75,10 +70,11 @@ gen_prompt_bsz=1
 n_resp_per_prompt=16
 train_prompt_mini_bsz=32
 total_rollout_steps=$(((512*400)))
-test_freq=10
-staleness_threshold=0
-trigger_parameter_sync_step=16
-partial_rollout=False
+test_freq=20
+staleness_threshold=0.1
+trigger_parameter_sync_step=4
+require_batches=4
+partial_rollout=True
 
 python -m recipe.fully_async_policy.fully_async_main \
     data.train_files="${TRAIN_FILE}" \
@@ -161,4 +157,6 @@ python -m recipe.fully_async_policy.fully_async_main \
     rollout.test_freq="${test_freq}" \
     async_training.staleness_threshold="${staleness_threshold}" \
     async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \
-    async_training.partial_rollout="${partial_rollout}"
+    async_training.require_batches="${require_batches}" \
+    async_training.partial_rollout="${partial_rollout}" \
+    async_training.use_rollout_log_probs=True
diff --git a/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_8_8.sh b/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_8_8.sh
index 02f7664360f..ab9c98b1f4d 100644
--- a/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_8_8.sh
+++ b/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_8_8.sh
@@ -16,11 +16,6 @@ CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"}
 TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"}
 TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"}
 
-MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B
-CKPTS_DIR=./ckpts/${project_name}/${exp_name}
-TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet
-TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet
-
 rollout_mode="async"
 rollout_name="vllm" # sglang or vllm
 if [ "$rollout_mode" = "async" ]; then
@@ -65,15 +60,11 @@ gen_tp=1
 sp_size=1
 fsdp_size=2
 
-
 # Fully async specific parameters
 NNODES_ROLLOUT=${NNODES_ROLLOUT:-1}
 NNODES_TRAIN=${NNODES_TRAIN:-1}
 NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
 
-n_gpus_rollout=8
-n_gpus_training=8
-
 train_prompt_bsz=0
 gen_prompt_bsz=1
 n_resp_per_prompt=16
@@ -81,7 +72,8 @@ train_prompt_mini_bsz=32
 total_rollout_steps=$(((512*100)))
 test_freq=10
 staleness_threshold=0.1
-trigger_parameter_sync_step=16
+trigger_parameter_sync_step=4
+require_batches=4
 partial_rollout=True
 
 python -m recipe.fully_async_policy.fully_async_main \
@@ -165,4 +157,6 @@ python -m recipe.fully_async_policy.fully_async_main \
     rollout.test_freq="${test_freq}" \
     async_training.staleness_threshold="${staleness_threshold}" \
     async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \
-    async_training.partial_rollout="${partial_rollout}"
+    async_training.require_batches="${require_batches}" \
+    async_training.partial_rollout="${partial_rollout}" \
+    async_training.use_rollout_log_probs=True
\ No newline at end of file

From 7cae5d5ed3f11ddb75b1189da7719eaa0bdedb68 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Fri, 17 Oct 2025 13:04:09 +0800
Subject: [PATCH 178/182] update readme

---
 docs/advance/fully_async.md            | 2 +-
 recipe/fully_async_policy/README.md    | 2 +-
 recipe/fully_async_policy/README_zh.md | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/advance/fully_async.md b/docs/advance/fully_async.md
index 6dac051922c..a3ad5e5cf0c 100644
--- a/docs/advance/fully_async.md
+++ b/docs/advance/fully_async.md
@@ -2,7 +2,7 @@
 
 **Author:** `https://github.com/meituan-search`
 
-Last updated: 10/16/2025.
+Last updated: 10/17/2025.
 
 This document introduces a fully asynchronous PPO training system that completely decouples the Trainer and Rollouter,
 supporting asynchronous sample generation and training.
diff --git a/recipe/fully_async_policy/README.md b/recipe/fully_async_policy/README.md
index 6dac051922c..a3ad5e5cf0c 100644
--- a/recipe/fully_async_policy/README.md
+++ b/recipe/fully_async_policy/README.md
@@ -2,7 +2,7 @@
 
 **Author:** `https://github.com/meituan-search`
 
-Last updated: 10/16/2025.
+Last updated: 10/17/2025.
 
 This document introduces a fully asynchronous PPO training system that completely decouples the Trainer and Rollouter,
 supporting asynchronous sample generation and training.
diff --git a/recipe/fully_async_policy/README_zh.md b/recipe/fully_async_policy/README_zh.md
index ea0e8c14679..fbbed992d4d 100644
--- a/recipe/fully_async_policy/README_zh.md
+++ b/recipe/fully_async_policy/README_zh.md
@@ -2,7 +2,7 @@
 
 **Author:**  `https://github.com/meituan-search`
 
-Last updated: 10/16/2025.
+Last updated: 10/17/2025.
 
 本文档介绍了完全异步PPO训练系统，该系统实现了 Trainer 和 Rollouter 的完全解耦，支持异步样本生成和训练。
 在该系统下，我们使用128卡训练qwen2.5-7B模型取得了2.35x-2.67x的性能提升,同时效果没有显著受到影响。

From 62fb0d0a263b510ccd18f88b500149ba32b651c2 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Fri, 17 Oct 2025 14:10:37 +0800
Subject: [PATCH 179/182] trigger ci

---
 recipe/fully_async_policy/README_zh.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/recipe/fully_async_policy/README_zh.md b/recipe/fully_async_policy/README_zh.md
index fbbed992d4d..b30738dc4a3 100644
--- a/recipe/fully_async_policy/README_zh.md
+++ b/recipe/fully_async_policy/README_zh.md
@@ -361,6 +361,7 @@ TODO: 30B 的实验，还在完善中。
 | fully_async_policy | 64:64               | async stream pipeline with partial rollout   |      |                    |              |              |            |                  |
 
 
+
 ## 后续计划
 
 * GRPO实验

From fbae66a5c5358400b8ab8819cbcf88d73e2ed4b8 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Fri, 17 Oct 2025 14:18:00 +0800
Subject: [PATCH 180/182] trigger ci

---
 recipe/fully_async_policy/README_zh.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/recipe/fully_async_policy/README_zh.md b/recipe/fully_async_policy/README_zh.md
index b30738dc4a3..fbbed992d4d 100644
--- a/recipe/fully_async_policy/README_zh.md
+++ b/recipe/fully_async_policy/README_zh.md
@@ -361,7 +361,6 @@ TODO: 30B 的实验，还在完善中。
 | fully_async_policy | 64:64               | async stream pipeline with partial rollout   |      |                    |              |              |            |                  |
 
 
-
 ## 后续计划
 
 * GRPO实验

From 0565a5523019fd25dc81afa99062f7961b5e78a0 Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Fri, 17 Oct 2025 14:23:13 +0800
Subject: [PATCH 181/182] trigger ci

---
 recipe/fully_async_policy/README_zh.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/recipe/fully_async_policy/README_zh.md b/recipe/fully_async_policy/README_zh.md
index fbbed992d4d..b30738dc4a3 100644
--- a/recipe/fully_async_policy/README_zh.md
+++ b/recipe/fully_async_policy/README_zh.md
@@ -361,6 +361,7 @@ TODO: 30B 的实验，还在完善中。
 | fully_async_policy | 64:64               | async stream pipeline with partial rollout   |      |                    |              |              |            |                  |
 
 
+
 ## 后续计划
 
 * GRPO实验

From dda6c5d66f8d14326e486e2ca6e63bf4bede06db Mon Sep 17 00:00:00 2001
From: ArronHZG <hou.zg@foxmail.com>
Date: Fri, 17 Oct 2025 14:50:42 +0800
Subject: [PATCH 182/182] trigger ci

---
 recipe/fully_async_policy/README_zh.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/recipe/fully_async_policy/README_zh.md b/recipe/fully_async_policy/README_zh.md
index b30738dc4a3..fbbed992d4d 100644
--- a/recipe/fully_async_policy/README_zh.md
+++ b/recipe/fully_async_policy/README_zh.md
@@ -361,7 +361,6 @@ TODO: 30B 的实验，还在完善中。
 | fully_async_policy | 64:64               | async stream pipeline with partial rollout   |      |                    |              |              |            |                  |
 
 
-
 ## 后续计划
 
 * GRPO实验